diff --git a/engineering/docker-development/.claude-plugin/plugin.json b/engineering/docker-development/.claude-plugin/plugin.json new file mode 100644 index 0000000..56b0dbb --- /dev/null +++ b/engineering/docker-development/.claude-plugin/plugin.json @@ -0,0 +1,13 @@ +{ + "name": "docker-development", + "description": "Docker and container development agent skill and plugin for Dockerfile optimization, docker-compose orchestration, multi-stage builds, and container security hardening. Covers build performance, layer caching, and production-ready container patterns.", + "version": "1.0.0", + "author": { + "name": "Alireza Rezvani", + "url": "https://alirezarezvani.com" + }, + "homepage": "https://github.com/alirezarezvani/claude-skills/tree/main/engineering/docker-development", + "repository": "https://github.com/alirezarezvani/claude-skills", + "license": "MIT", + "skills": "./" +} diff --git a/engineering/docker-development/SKILL.md b/engineering/docker-development/SKILL.md new file mode 100644 index 0000000..bd65d70 --- /dev/null +++ b/engineering/docker-development/SKILL.md @@ -0,0 +1,366 @@ +--- +name: "docker-development" +description: "Docker and container development agent skill and plugin for Dockerfile optimization, docker-compose orchestration, multi-stage builds, and container security hardening. Use when: user wants to optimize a Dockerfile, create or improve docker-compose configurations, implement multi-stage builds, audit container security, reduce image size, or follow container best practices. Covers build performance, layer caching, secret management, and production-ready container patterns." +license: MIT +metadata: + version: 1.0.0 + author: Alireza Rezvani + category: engineering + updated: 2026-03-16 +--- + +# Docker Development + +> Smaller images. Faster builds. Secure containers. No guesswork. + +Opinionated Docker workflow that turns bloated Dockerfiles into production-grade containers. 
Covers optimization, multi-stage builds, compose orchestration, and security hardening. + +Not a Docker tutorial — a set of concrete decisions about how to build containers that don't waste time, space, or attack surface. + +--- + +## Slash Commands + +| Command | What it does | +|---------|-------------| +| `/docker:optimize` | Analyze and optimize a Dockerfile for size, speed, and layer caching | +| `/docker:compose` | Generate or improve docker-compose.yml with best practices | +| `/docker:security` | Audit a Dockerfile or running container for security issues | + +--- + +## When This Skill Activates + +Recognize these patterns from the user: + +- "Optimize this Dockerfile" +- "My Docker build is slow" +- "Create a docker-compose for this project" +- "Is this Dockerfile secure?" +- "Reduce my Docker image size" +- "Set up multi-stage builds" +- "Docker best practices for [language/framework]" +- Any request involving: Dockerfile, docker-compose, container, image size, build cache, Docker security + +If the user has a Dockerfile or wants to containerize something → this skill applies. + +--- + +## Workflow + +### `/docker:optimize` — Dockerfile Optimization + +1. **Analyze current state** + - Read the Dockerfile + - Identify base image and its size + - Count layers (each RUN/COPY/ADD = 1 layer) + - Check for common anti-patterns + +2. **Apply optimization checklist** + + ``` + BASE IMAGE + ├── Use specific tags, never :latest in production + ├── Prefer slim/alpine variants (debian-slim > ubuntu > debian) + ├── Pin digest for reproducibility in CI: image@sha256:... 
+ └── Match base to runtime needs (don't use python:3.12 for a compiled binary) + + LAYER OPTIMIZATION + ├── Combine related RUN commands with && \ + ├── Order layers: least-changing first (deps before source code) + ├── Clean package manager cache in the same RUN layer + ├── Use .dockerignore to exclude unnecessary files + └── Separate build deps from runtime deps + + BUILD CACHE + ├── COPY dependency files before source code (package.json, requirements.txt, go.mod) + ├── Install deps in a separate layer from code copy + ├── Use BuildKit cache mounts: --mount=type=cache,target=/root/.cache + └── Avoid COPY . . before dependency installation + + MULTI-STAGE BUILDS + ├── Stage 1: build (full SDK, build tools, dev deps) + ├── Stage 2: runtime (minimal base, only production artifacts) + ├── COPY --from=builder only what's needed + └── Final image should have NO build tools, NO source code, NO dev deps + ``` + +3. **Generate optimized Dockerfile** + - Apply all relevant optimizations + - Add inline comments explaining each decision + - Report estimated size reduction + +4. **Validate** + ```bash + python3 scripts/dockerfile_analyzer.py Dockerfile + ``` + +### `/docker:compose` — Docker Compose Configuration + +1. **Identify services** + - Application (web, API, worker) + - Database (postgres, mysql, redis, mongo) + - Cache (redis, memcached) + - Queue (rabbitmq, kafka) + - Reverse proxy (nginx, traefik, caddy) + +2. 
**Apply compose best practices** + + ``` + SERVICES + ├── Use depends_on with condition: service_healthy + ├── Add healthchecks for every service + ├── Set resource limits (mem_limit, cpus) + ├── Use named volumes for persistent data + └── Pin image versions + + NETWORKING + ├── Create explicit networks (don't rely on default) + ├── Separate frontend and backend networks + ├── Only expose ports that need external access + └── Use internal: true for backend-only networks + + ENVIRONMENT + ├── Use env_file for secrets, not inline environment + ├── Never commit .env files (add to .gitignore) + ├── Use variable substitution: ${VAR:-default} + └── Document all required env vars + + DEVELOPMENT vs PRODUCTION + ├── Use compose profiles or override files + ├── Dev: bind mounts for hot reload, debug ports exposed + ├── Prod: named volumes, no debug ports, restart: unless-stopped + └── docker-compose.override.yml for dev-only config + ``` + +3. **Generate compose file** + - Output docker-compose.yml with healthchecks, networks, volumes + - Generate .env.example with all required variables documented + - Add dev/prod profile annotations + +### `/docker:security` — Container Security Audit + +1. **Dockerfile audit** + + | Check | Severity | Fix | + |-------|----------|-----| + | Running as root | Critical | Add `USER nonroot` after creating user | + | Using :latest tag | High | Pin to specific version | + | Secrets in ENV/ARG | Critical | Use BuildKit secrets: `--mount=type=secret` | + | COPY with broad glob | Medium | Use specific paths, add .dockerignore | + | Unnecessary EXPOSE | Low | Only expose ports the app uses | + | No HEALTHCHECK | Medium | Add HEALTHCHECK with appropriate interval | + | Privileged instructions | High | Avoid `--privileged`, drop capabilities | + | Package manager cache retained | Low | Clean in same RUN layer | + +2. 
**Runtime security checks** + + | Check | Severity | Fix | + |-------|----------|-----| + | Container running as root | Critical | Set user in Dockerfile or compose | + | Writable root filesystem | Medium | Use `read_only: true` in compose | + | All capabilities retained | High | Drop all, add only needed: `cap_drop: [ALL]` | + | No resource limits | Medium | Set `mem_limit` and `cpus` | + | Host network mode | High | Use bridge or custom network | + | Sensitive mounts | Critical | Never mount /etc, /var/run/docker.sock in prod | + | No log driver configured | Low | Set `logging:` with size limits | + +3. **Generate security report** + ``` + SECURITY AUDIT — [Dockerfile/Image name] + Date: [timestamp] + + CRITICAL: [count] + HIGH: [count] + MEDIUM: [count] + LOW: [count] + + [Detailed findings with fix recommendations] + ``` + +--- + +## Tooling + +### `scripts/dockerfile_analyzer.py` + +CLI utility for static analysis of Dockerfiles. + +**Features:** +- Layer count and optimization suggestions +- Base image analysis with size estimates +- Anti-pattern detection (15+ rules) +- Security issue flagging +- Multi-stage build detection and validation +- JSON and text output + +**Usage:** +```bash +# Analyze a Dockerfile +python3 scripts/dockerfile_analyzer.py Dockerfile + +# JSON output +python3 scripts/dockerfile_analyzer.py Dockerfile --output json + +# Analyze with security focus +python3 scripts/dockerfile_analyzer.py Dockerfile --security + +# Check a specific directory +python3 scripts/dockerfile_analyzer.py path/to/Dockerfile +``` + +### `scripts/compose_validator.py` + +CLI utility for validating docker-compose files. 
+ +**Features:** +- Service dependency validation +- Healthcheck presence detection +- Network configuration analysis +- Volume mount validation +- Environment variable audit +- Port conflict detection +- Best practice scoring + +**Usage:** +```bash +# Validate a compose file +python3 scripts/compose_validator.py docker-compose.yml + +# JSON output +python3 scripts/compose_validator.py docker-compose.yml --output json + +# Strict mode (fail on warnings) +python3 scripts/compose_validator.py docker-compose.yml --strict +``` + +--- + +## Multi-Stage Build Patterns + +### Pattern 1: Compiled Language (Go, Rust, C++) + +```dockerfile +# Build stage +FROM golang:1.22-alpine AS builder +WORKDIR /app +COPY go.mod go.sum ./ +RUN go mod download +COPY . . +RUN CGO_ENABLED=0 go build -ldflags="-s -w" -o /app/server ./cmd/server + +# Runtime stage +FROM gcr.io/distroless/static-debian12 +COPY --from=builder /app/server /server +USER nonroot:nonroot +ENTRYPOINT ["/server"] +``` + +### Pattern 2: Node.js / TypeScript + +```dockerfile +# Dependencies stage +FROM node:20-alpine AS deps +WORKDIR /app +COPY package.json package-lock.json ./ +RUN npm ci --production=false + +# Build stage +FROM deps AS builder +COPY . . +RUN npm run build + +# Runtime stage +FROM node:20-alpine +WORKDIR /app +RUN addgroup -g 1001 -S appgroup && adduser -S appuser -u 1001 +COPY --from=builder /app/dist ./dist +COPY --from=deps /app/node_modules ./node_modules +COPY package.json ./ +USER appuser +EXPOSE 3000 +CMD ["node", "dist/index.js"] +``` + +### Pattern 3: Python + +```dockerfile +# Build stage +FROM python:3.12-slim AS builder +WORKDIR /app +COPY requirements.txt . +RUN pip install --no-cache-dir --prefix=/install -r requirements.txt + +# Runtime stage +FROM python:3.12-slim +WORKDIR /app +RUN groupadd -r appgroup && useradd -r -g appgroup appuser +COPY --from=builder /install /usr/local +COPY . . 
+USER appuser +EXPOSE 8000 +CMD ["python", "-m", "uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] +``` + +--- + +## Base Image Decision Tree + +``` +Is it a compiled binary (Go, Rust, C)? +├── Yes → distroless/static or scratch +└── No + ├── Need a shell for debugging? + │ ├── Yes → alpine variant (e.g., node:20-alpine) + │ └── No → distroless variant + ├── Need glibc (not musl)? + │ ├── Yes → slim variant (e.g., python:3.12-slim) + │ └── No → alpine variant + └── Need specific OS packages? + ├── Many → debian-slim + └── Few → alpine + apk add +``` + +--- + +## Proactive Triggers + +Flag these without being asked: + +- **Dockerfile uses :latest** → Suggest pinning to a specific version tag. +- **No .dockerignore** → Create one. At minimum: `.git`, `node_modules`, `__pycache__`, `.env`. +- **COPY . . before dependency install** → Cache bust. Reorder to install deps first. +- **Running as root** → Add USER instruction. No exceptions for production. +- **Secrets in ENV or ARG** → Use BuildKit secret mounts. Never bake secrets into layers. +- **Image over 1GB** → Multi-stage build required. No reason for a production image this large. +- **No healthcheck** → Add one. Orchestrators (Compose, K8s) need it for proper lifecycle management. +- **apt-get without cleanup in same layer** → `rm -rf /var/lib/apt/lists/*` in the same RUN. + +--- + +## Installation + +### One-liner (any tool) +```bash +git clone https://github.com/alirezarezvani/claude-skills.git +cp -r claude-skills/engineering/docker-development ~/.claude/skills/ +``` + +### Multi-tool install +```bash +./scripts/convert.sh --skill docker-development --tool codex|gemini|cursor|windsurf|openclaw +``` + +### OpenClaw +```bash +clawhub install cs-docker-development +``` + +--- + +## Related Skills + +- **senior-devops** — Broader DevOps scope (CI/CD, IaC, monitoring). Complementary — use docker-development for container-specific work, senior-devops for pipeline and infrastructure. 
+- **senior-security** — Application security. Complementary — docker-development covers container security, senior-security covers application-level threats. +- **autoresearch-agent** — Can optimize Docker build times or image sizes as measurable experiments. +- **ci-cd-pipeline-builder** — Pipeline construction. Complementary — docker-development builds the containers, ci-cd-pipeline-builder deploys them. diff --git a/engineering/docker-development/references/compose-patterns.md b/engineering/docker-development/references/compose-patterns.md new file mode 100644 index 0000000..51d08b2 --- /dev/null +++ b/engineering/docker-development/references/compose-patterns.md @@ -0,0 +1,282 @@ +# Docker Compose Patterns Reference + +## Production-Ready Patterns + +### Web App + Database + Cache + +```yaml +services: + app: + build: + context: . + dockerfile: Dockerfile + ports: + - "3000:3000" + env_file: + - .env + depends_on: + db: + condition: service_healthy + redis: + condition: service_healthy + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:3000/health"] + interval: 30s + timeout: 3s + retries: 3 + start_period: 10s + restart: unless-stopped + networks: + - frontend + - backend + mem_limit: 512m + cpus: 1.0 + + db: + image: postgres:16-alpine + volumes: + - pgdata:/var/lib/postgresql/data + env_file: + - .env.db + healthcheck: + test: ["CMD-SHELL", "pg_isready -U postgres"] + interval: 10s + timeout: 5s + retries: 5 + restart: unless-stopped + networks: + - backend + mem_limit: 256m + + redis: + image: redis:7-alpine + command: redis-server --maxmemory 64mb --maxmemory-policy allkeys-lru + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 10s + timeout: 3s + retries: 3 + restart: unless-stopped + networks: + - backend + mem_limit: 128m + +volumes: + pgdata: + +networks: + frontend: + backend: + internal: true +``` + +### Key Patterns +- **Healthchecks on every service** — enables depends_on with condition +- **Named volumes** — data persists 
across container recreation +- **Explicit networks** — backend is internal (no external access) +- **env_file** — secrets not in compose file +- **Resource limits** — prevent runaway containers + +--- + +## Development Override Pattern + +### docker-compose.yml (base — production-like) +```yaml +services: + app: + build: . + ports: + - "3000:3000" + restart: unless-stopped +``` + +### docker-compose.override.yml (dev — auto-loaded) +```yaml +services: + app: + build: + target: development + volumes: + - .:/app # Bind mount for hot reload + - /app/node_modules # Preserve container node_modules + environment: + - NODE_ENV=development + - DEBUG=true + ports: + - "9229:9229" # Debug port + restart: "no" +``` + +### Usage +```bash +# Development (auto-loads override) +docker compose up + +# Production (skip override) +docker compose -f docker-compose.yml up -d + +# Explicit profiles +docker compose --profile dev up +docker compose --profile prod up -d +``` + +--- + +## Network Isolation Pattern + +```yaml +services: + nginx: + image: nginx:alpine + ports: + - "80:80" + - "443:443" + networks: + - frontend + + app: + build: . + networks: + - frontend + - backend + + db: + image: postgres:16-alpine + networks: + - backend + + redis: + image: redis:7-alpine + networks: + - backend + +networks: + frontend: + # External traffic reaches nginx and app + backend: + internal: true + # DB and Redis only reachable by app +``` + +### Why This Matters +- Database and cache are **not accessible from outside** +- Only nginx and app handle external traffic +- Lateral movement limited if one container is compromised + +--- + +## Worker + Queue Pattern + +```yaml +services: + api: + build: + context: . + target: runtime + command: uvicorn main:app --host 0.0.0.0 --port 8000 + ports: + - "8000:8000" + depends_on: + rabbitmq: + condition: service_healthy + + worker: + build: + context: . 
+ target: runtime + command: celery -A tasks worker --loglevel=info + depends_on: + rabbitmq: + condition: service_healthy + + scheduler: + build: + context: . + target: runtime + command: celery -A tasks beat --loglevel=info + depends_on: + rabbitmq: + condition: service_healthy + + rabbitmq: + image: rabbitmq:3.13-management-alpine + ports: + - "15672:15672" # Management UI (dev only) + healthcheck: + test: ["CMD", "rabbitmq-diagnostics", "check_running"] + interval: 10s + timeout: 5s + retries: 5 +``` + +--- + +## Logging Configuration + +```yaml +services: + app: + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + tag: "{{.Name}}/{{.ID}}" +``` + +### Why +- **max-size** prevents disk exhaustion +- **max-file** rotates logs automatically +- Default Docker logging has NO size limit — production servers can run out of disk + +--- + +## Environment Variable Patterns + +### .env.example (committed to repo) +```env +# Database +DATABASE_URL=postgres://user:password@db:5432/appname +POSTGRES_USER=user +POSTGRES_PASSWORD=changeme +POSTGRES_DB=appname + +# Redis +REDIS_URL=redis://redis:6379/0 + +# Application +SECRET_KEY=changeme-generate-a-real-secret +NODE_ENV=production +LOG_LEVEL=info + +# External Services (BYOK) +# SMTP_HOST= +# SMTP_PORT=587 +# AWS_ACCESS_KEY_ID= +# AWS_SECRET_ACCESS_KEY= +``` + +### Variable Substitution in Compose +```yaml +services: + app: + image: myapp:${APP_VERSION:-latest} + environment: + - LOG_LEVEL=${LOG_LEVEL:-info} + - PORT=${PORT:-3000} +``` + +--- + +## Troubleshooting Checklist + +| Symptom | Likely Cause | Fix | +|---------|-------------|-----| +| Container exits immediately | CMD/ENTRYPOINT crashes, missing env vars | Check logs: `docker compose logs service` | +| Port already in use | Another service or host process on same port | Change host port: `"3001:3000"` | +| Volume permissions denied | Container user doesn't own mounted path | Match UID/GID or use named volumes | +| Build cache not working | 
COPY . . invalidates cache early | Reorder: copy deps first, then source | +| depends_on doesn't wait | No healthcheck condition | Add `condition: service_healthy` | +| Container OOM killed | No memory limit or limit too low | Set appropriate `mem_limit` | +| Network connectivity issues | Wrong network or service name | Services communicate by service name within shared network | diff --git a/engineering/docker-development/references/dockerfile-best-practices.md b/engineering/docker-development/references/dockerfile-best-practices.md new file mode 100644 index 0000000..eb96dd4 --- /dev/null +++ b/engineering/docker-development/references/dockerfile-best-practices.md @@ -0,0 +1,235 @@ +# Dockerfile Best Practices Reference + +## Layer Optimization + +### The Golden Rule +Every `RUN`, `COPY`, and `ADD` instruction creates a new layer. Fewer layers = smaller image. + +### Combine Related Commands +```dockerfile +# Bad — 3 layers +RUN apt-get update +RUN apt-get install -y curl git +RUN rm -rf /var/lib/apt/lists/* + +# Good — 1 layer +RUN apt-get update && \ + apt-get install -y --no-install-recommends curl git && \ + rm -rf /var/lib/apt/lists/* +``` + +### Order Layers by Change Frequency +```dockerfile +# Least-changing layers first +COPY package.json package-lock.json ./ # Changes rarely +RUN npm ci # Changes when deps change +COPY . . 
# Changes every build +RUN npm run build # Changes every build +``` + +### Use .dockerignore +``` +.git +node_modules +__pycache__ +*.pyc +.env +.env.* +dist +build +*.log +.DS_Store +.vscode +.idea +coverage +.pytest_cache +``` + +--- + +## Base Image Selection + +### Size Comparison (approximate) + +| Base | Size | Use Case | +|------|------|----------| +| `scratch` | 0MB | Static binaries (Go, Rust) | +| `distroless/static` | 2MB | Static binaries with CA certs | +| `alpine` | 7MB | Minimal Linux, shell access | +| `distroless/base` | 20MB | Dynamic binaries (C/C++) | +| `debian-slim` | 80MB | When you need glibc + apt | +| `ubuntu` | 78MB | Full Ubuntu ecosystem | +| `python:3.12-slim` | 130MB | Python apps (production) | +| `node:20-alpine` | 130MB | Node.js apps | +| `golang:1.22` | 800MB | Go build stage only | +| `python:3.12` | 900MB | Never use in production | +| `node:20` | 1000MB | Never use in production | + +### When to Use Alpine +- Small image size matters +- No dependency on glibc (musl works) +- Willing to handle occasional musl-related issues +- Not running Python with C extensions that need glibc + +### When to Use Slim +- Need glibc compatibility +- Python with compiled C extensions (numpy, pandas) +- Fewer musl compatibility issues +- Still much smaller than full images + +### When to Use Distroless +- Maximum security (no shell, no package manager) +- Compiled/static binaries +- Don't need debugging access inside container +- Production-only (not development) + +--- + +## Multi-Stage Builds + +### Why Multi-Stage +- Build tools and source code stay out of production image +- Final image contains only runtime artifacts +- Dramatically reduces image size and attack surface + +### Naming Stages +```dockerfile +FROM golang:1.22 AS builder # Named stage +FROM alpine:3.19 AS runtime # Named stage +COPY --from=builder /app /app # Reference by name +``` + +### Selective Copy +```dockerfile +# Only copy the built artifact — nothing else +COPY 
--from=builder /app/server /server +COPY --from=builder /app/config.yaml /config.yaml +# Don't COPY --from=builder /app/ /app/ (copies source code too) +``` + +--- + +## Security Hardening + +### Run as Non-Root +```dockerfile +# Create user +RUN groupadd -r appgroup && useradd -r -g appgroup -s /sbin/nologin appuser + +# Set ownership +COPY --chown=appuser:appgroup . . + +# Switch user (after all root-requiring operations) +USER appuser +``` + +### Secret Management +```dockerfile +# Bad — secret baked into layer +ENV API_KEY=sk-12345 + +# Good — BuildKit secret mount (never in layer) +RUN --mount=type=secret,id=api_key \ + export API_KEY=$(cat /run/secrets/api_key) && \ + ./configure --api-key=$API_KEY +``` + +Build with: +```bash +docker build --secret id=api_key,src=./api_key.txt . +``` + +### Read-Only Filesystem +```yaml +# docker-compose.yml +services: + app: + read_only: true + tmpfs: + - /tmp + - /var/run +``` + +### Drop Capabilities +```yaml +services: + app: + cap_drop: + - ALL + cap_add: + - NET_BIND_SERVICE # Only if binding to ports < 1024 +``` + +--- + +## Build Performance + +### BuildKit Cache Mounts +```dockerfile +# Cache pip downloads across builds +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install -r requirements.txt + +# Cache apt downloads +RUN --mount=type=cache,target=/var/cache/apt \ + apt-get update && apt-get install -y curl +``` + +### Parallel Builds +```dockerfile +# These stages build in parallel when using BuildKit +FROM node:20-alpine AS frontend +COPY frontend/ . +RUN npm ci && npm run build + +FROM golang:1.22 AS backend +COPY backend/ . +RUN go build -o server + +FROM alpine:3.19 +COPY --from=frontend /dist /static +COPY --from=backend /server /server +``` + +### Enable BuildKit +```bash +export DOCKER_BUILDKIT=1 +docker build . 
+ +# Or in daemon.json +{ "features": { "buildkit": true } } +``` + +--- + +## Health Checks + +### HTTP Service +```dockerfile +HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \ + CMD curl -f http://localhost:8000/health || exit 1 +``` + +### Without curl (using wget) +```dockerfile +HEALTHCHECK --interval=30s --timeout=3s --retries=3 \ + CMD wget --no-verbose --tries=1 --spider http://localhost:8000/health || exit 1 +``` + +### TCP Check +```dockerfile +HEALTHCHECK --interval=30s --timeout=3s --retries=3 \ + CMD nc -z localhost 8000 || exit 1 +``` + +### PostgreSQL +```dockerfile +HEALTHCHECK --interval=10s --timeout=5s --retries=5 \ + CMD pg_isready -U postgres || exit 1 +``` + +### Redis +```dockerfile +HEALTHCHECK --interval=10s --timeout=3s --retries=3 \ + CMD redis-cli ping | grep PONG || exit 1 +``` diff --git a/engineering/docker-development/scripts/compose_validator.py b/engineering/docker-development/scripts/compose_validator.py new file mode 100644 index 0000000..fa5c109 --- /dev/null +++ b/engineering/docker-development/scripts/compose_validator.py @@ -0,0 +1,390 @@ +#!/usr/bin/env python3 +""" +docker-development: Docker Compose Validator + +Validate docker-compose.yml files for best practices, missing healthchecks, +network configuration, port conflicts, and security issues. + +Usage: + python scripts/compose_validator.py docker-compose.yml + python scripts/compose_validator.py docker-compose.yml --output json + python scripts/compose_validator.py docker-compose.yml --strict +""" + +import argparse +import json +import re +import sys +from pathlib import Path + + +# --- Demo Compose File --- + +DEMO_COMPOSE = """ +version: '3.8' +services: + web: + build: . 
+ ports: + - "3000:3000" + environment: + - DATABASE_URL=postgres://user:password@db:5432/app + - SECRET_KEY=my-secret-key + depends_on: + - db + - redis + + db: + image: postgres:latest + ports: + - "5432:5432" + environment: + POSTGRES_PASSWORD: password123 + volumes: + - ./data:/var/lib/postgresql/data + + redis: + image: redis + ports: + - "6379:6379" + + worker: + build: . + command: python worker.py + environment: + - DATABASE_URL=postgres://user:password@db:5432/app +""" + + +def parse_yaml_simple(content): + """Simple YAML-like parser for docker-compose files (stdlib only). + + Handles the subset of YAML used in typical docker-compose files: + - Top-level keys + - Service definitions + - Lists (- items) + - Key-value pairs + - Nested indentation + """ + result = {"services": {}, "volumes": {}, "networks": {}} + current_section = None + current_service = None + current_key = None + indent_stack = [] + + for line in content.splitlines(): + stripped = line.strip() + if not stripped or stripped.startswith("#"): + continue + + indent = len(line) - len(line.lstrip()) + + # Top-level keys + if indent == 0 and ":" in stripped: + key = stripped.split(":")[0].strip() + if key == "services": + current_section = "services" + elif key == "volumes": + current_section = "volumes" + elif key == "networks": + current_section = "networks" + elif key == "version": + val = stripped.split(":", 1)[1].strip().strip("'\"") + result["version"] = val + current_service = None + current_key = None + continue + + if current_section == "services": + # Service name (indent level 2) + if indent == 2 and ":" in stripped and not stripped.startswith("-"): + key = stripped.split(":")[0].strip() + val = stripped.split(":", 1)[1].strip() if ":" in stripped else "" + if val and not val.startswith("{"): + # Simple key:value inside a service + if current_service and current_service in result["services"]: + result["services"][current_service][key] = val + else: + current_service = key + 
result["services"][current_service] = {} + current_key = None + else: + current_service = key + result["services"][current_service] = {} + current_key = None + continue + + if current_service and current_service in result["services"]: + svc = result["services"][current_service] + + # Service-level keys (indent 4) + if indent == 4 and ":" in stripped and not stripped.startswith("-"): + key = stripped.split(":")[0].strip() + val = stripped.split(":", 1)[1].strip() + current_key = key + if val: + svc[key] = val.strip("'\"") + else: + svc[key] = [] + continue + + # List items (indent 6 or 8) + if stripped.startswith("-") and current_key: + item = stripped[1:].strip().strip("'\"") + if current_key in svc: + if isinstance(svc[current_key], list): + svc[current_key].append(item) + else: + svc[current_key] = [svc[current_key], item] + else: + svc[current_key] = [item] + continue + + # Nested key:value under current_key (e.g., healthcheck test) + if indent >= 6 and ":" in stripped and not stripped.startswith("-"): + key = stripped.split(":")[0].strip() + val = stripped.split(":", 1)[1].strip() + if current_key and current_key in svc: + if isinstance(svc[current_key], list): + svc[current_key] = {} + if isinstance(svc[current_key], dict): + svc[current_key][key] = val + + return result + + +def validate_compose(parsed, strict=False): + """Run validation rules on parsed compose file.""" + findings = [] + services = parsed.get("services", {}) + + # --- Version check --- + version = parsed.get("version", "") + if version: + findings.append({ + "severity": "low", + "category": "deprecation", + "message": f"'version: {version}' is deprecated in Compose V2 — remove it", + "service": "(top-level)", + }) + + # --- Per-service checks --- + all_ports = [] + + for name, svc in services.items(): + # Healthcheck + if "healthcheck" not in svc: + findings.append({ + "severity": "medium", + "category": "reliability", + "message": f"No healthcheck defined — orchestrator can't detect 
unhealthy state", + "service": name, + }) + + # Image tag + image = svc.get("image", "") + if image: + if ":latest" in image: + findings.append({ + "severity": "high", + "category": "reproducibility", + "message": f"Using :latest tag on '{image}' — pin to specific version", + "service": name, + }) + elif ":" not in image and "/" not in image: + findings.append({ + "severity": "high", + "category": "reproducibility", + "message": f"No tag on image '{image}' — defaults to :latest", + "service": name, + }) + + # Ports + ports = svc.get("ports", []) + if isinstance(ports, list): + for p in ports: + p_str = str(p) + # Extract host port + match = re.match(r"(\d+):\d+", p_str) + if match: + host_port = match.group(1) + all_ports.append((host_port, name)) + + # Environment secrets + env = svc.get("environment", []) + if isinstance(env, list): + for e in env: + e_str = str(e) + if re.search(r"(?:PASSWORD|SECRET|TOKEN|KEY)=\S+", e_str, re.IGNORECASE): + if "env_file" not in svc: + findings.append({ + "severity": "critical", + "category": "security", + "message": f"Inline secret in environment: {e_str[:40]}...", + "service": name, + }) + elif isinstance(env, dict): + for k, v in env.items(): + if re.search(r"(?:PASSWORD|SECRET|TOKEN|KEY)", k, re.IGNORECASE) and v: + findings.append({ + "severity": "critical", + "category": "security", + "message": f"Inline secret: {k}={str(v)[:20]}...", + "service": name, + }) + + # depends_on without condition + depends = svc.get("depends_on", []) + if isinstance(depends, list) and depends: + findings.append({ + "severity": "medium", + "category": "reliability", + "message": "depends_on without condition: service_healthy — race condition risk", + "service": name, + }) + + # Bind mounts (./path style) + volumes = svc.get("volumes", []) + if isinstance(volumes, list): + for v in volumes: + v_str = str(v) + if v_str.startswith("./") or v_str.startswith("/"): + if "/var/run/docker.sock" in v_str: + findings.append({ + "severity": "critical", + 
def generate_report(content, output_format="text", strict=False):
    """Generate validation report.

    Parses the compose file, runs all validation checks, and either prints a
    JSON document or a human-readable text report.  Returns the result dict in
    both cases.
    """
    parsed = parse_yaml_simple(content)
    findings = validate_compose(parsed, strict)
    services = parsed.get("services", {})

    # Score: start from 100 and subtract a per-severity penalty per finding.
    penalties = {"critical": 25, "high": 15, "medium": 5, "low": 2}
    total_penalty = 0
    for entry in findings:
        total_penalty += penalties.get(entry["severity"], 0)
    score = max(0, 100 - total_penalty)

    # Tally how many findings landed in each severity bucket.
    counts = {
        level: sum(1 for entry in findings if entry["severity"] == level)
        for level in ("critical", "high", "medium", "low")
    }

    result = {
        "score": score,
        "services": list(services.keys()),
        "service_count": len(services),
        "findings": findings,
        "finding_counts": counts,
    }

    if output_format == "json":
        print(json.dumps(result, indent=2))
        return result

    # Human-readable text report.
    banner = "=" * 60
    print(f"\n{banner}")
    print(" Docker Compose Validation Report")
    print(banner)
    print(f" Score: {score}/100")
    print(f" Services: {', '.join(services.keys()) if services else 'none'}")
    print()
    print(f" Findings: {counts['critical']} critical | {counts['high']} high | {counts['medium']} medium | {counts['low']} low")
    print("─" * 60)

    icons = {"critical": "!!!", "high": "!!", "medium": "!", "low": "~"}
    for entry in findings:
        marker = icons.get(entry["severity"], "?")
        print(f"\n {marker} {entry['severity'].upper()} [{entry['category']}] — {entry['service']}")
        print(f" {entry['message']}")

    if not findings:
        print("\n No issues found. Compose file looks good.")

    print(f"\n{banner}\n")
    return result
# Anti-pattern rules.  Entries with a regex "pattern" are applied to the raw
# Dockerfile text with re.MULTILINE | re.IGNORECASE (see run_pattern_checks);
# entries with pattern=None are implemented as custom checks over the parsed
# instruction list.  Negative lookaheads deliberately scan the REST of the
# logical line (`(?!.*flag)`), so flag order within the command is irrelevant.
ANTI_PATTERNS = [
    {
        "id": "AP001",
        "name": "latest_tag",
        "severity": "high",
        "pattern": r"^FROM\s+\S+:latest",
        "message": "Using :latest tag — pin to a specific version for reproducibility",
        "fix": "Use a specific tag like :3.12-slim or pin by digest",
    },
    {
        "id": "AP002",
        "name": "no_tag",
        "severity": "high",
        # Allows registry/namespace segments ("org/image") and an optional
        # "AS stage" alias; images with an explicit ":tag" or "@digest" fail
        # the character class / end anchor and are not flagged.
        "pattern": r"^FROM\s+([a-z][a-z0-9_./-]+)(?:\s+as\s+\w+)?\s*$",
        "message": "No tag specified on base image — defaults to :latest",
        "fix": "Add a specific version tag",
    },
    {
        "id": "AP003",
        "name": "run_apt_no_clean",
        "severity": "medium",
        "pattern": r"^RUN\s+.*apt-get\s+install(?!.*rm\s+-rf\s+/var/lib/apt/lists)",
        "message": "apt-get install without cleanup in same layer — bloats image",
        "fix": "Add && rm -rf /var/lib/apt/lists/* in the same RUN instruction",
    },
    {
        "id": "AP004",
        "name": "run_apk_no_cache",
        "severity": "medium",
        # Scan the whole remainder of the line for --no-cache so that
        # "apk add curl --no-cache" (flag after packages) is not a false positive.
        "pattern": r"^RUN\s+.*apk\s+add\b(?!.*--no-cache\b)",
        "message": "apk add without --no-cache — retains package index",
        "fix": "Use: apk add --no-cache <packages>",
    },
    {
        "id": "AP005",
        "name": "add_instead_of_copy",
        "severity": "low",
        "pattern": r"^ADD\s+(?!https?://)\S+",
        "message": "Using ADD for local files — COPY is more explicit and predictable",
        "fix": "Use COPY instead of ADD unless you need tar auto-extraction or URL fetching",
    },
    {
        "id": "AP006",
        "name": "multiple_cmd",
        "severity": "medium",
        "pattern": None,  # Custom check over parsed instructions
        "message": "Multiple CMD instructions — only the last one takes effect",
        "fix": "Keep exactly one CMD instruction",
    },
    {
        "id": "AP007",
        "name": "env_secrets",
        "severity": "critical",
        "pattern": r"^(?:ENV|ARG)\s+\S*(?:PASSWORD|SECRET|TOKEN|KEY|API_KEY)\s*=",
        "message": "Secrets in ENV/ARG — baked into image layers and visible in history",
        "fix": "Use BuildKit secrets: RUN --mount=type=secret,id=mytoken",
    },
    {
        "id": "AP008",
        "name": "broad_copy",
        "severity": "medium",
        "pattern": r"^COPY\s+\.\s+\.",
        "message": "COPY . . copies everything — may include secrets, git history, node_modules",
        "fix": "Use .dockerignore and copy specific directories, or copy after dependency install",
    },
    {
        "id": "AP009",
        "name": "no_user",
        "severity": "critical",
        "pattern": None,  # Custom check over parsed instructions
        "message": "No USER instruction — container runs as root",
        "fix": "Add USER nonroot or create a dedicated user",
    },
    {
        "id": "AP010",
        "name": "pip_no_cache",
        "severity": "low",
        # Lookahead scans the rest of the line so --no-cache-dir is honored in
        # any position; also matches "pip3".
        "pattern": r"^RUN\s+.*pip3?\s+install(?!.*--no-cache-dir)",
        "message": "pip install without --no-cache-dir — retains pip cache in layer",
        "fix": "Use: pip install --no-cache-dir -r requirements.txt",
    },
    {
        "id": "AP011",
        "name": "npm_install_dev",
        "severity": "medium",
        "pattern": r"^RUN\s+.*npm\s+install\s*$",
        "message": "npm install includes devDependencies — use npm ci --omit=dev for production",
        "fix": "Use: npm ci --omit=dev (or npm ci --production)",
    },
    {
        "id": "AP012",
        "name": "expose_all",
        "severity": "low",
        "pattern": r"^EXPOSE\s+\d+(?:\s+\d+){3,}",
        "message": "Exposing many ports — only expose what the application actually needs",
        "fix": "Remove unnecessary EXPOSE directives",
    },
    {
        "id": "AP013",
        "name": "curl_wget_without_cleanup",
        "severity": "low",
        # Flag downloads only when the same RUN does NOT remove anything
        # afterwards; the previous trailing lookahead after a greedy .* never
        # suppressed a match, so every curl/wget line was flagged.
        "pattern": r"^RUN\s+(?!.*(?:&&|;)\s*rm\s).*\b(?:curl|wget)\s",
        "message": "Download without cleanup — downloaded archives may remain in layer",
        "fix": "Download, extract, and remove archive in the same RUN instruction",
    },
    {
        "id": "AP014",
        "name": "no_healthcheck",
        "severity": "medium",
        "pattern": None,  # Custom check over parsed instructions
        "message": "No HEALTHCHECK instruction — orchestrators can't determine container health",
        "fix": "Add HEALTHCHECK CMD curl -f http://localhost:PORT/health || exit 1",
    },
    {
        "id": "AP015",
        "name": "shell_form_cmd",
        "severity": "low",
        "pattern": r'^(?:CMD|ENTRYPOINT)\s+(?!\[)["\']?\w',
        "message": "Using shell form for CMD/ENTRYPOINT — exec form is preferred for signal handling",
        "fix": 'Use exec form: CMD ["executable", "arg1", "arg2"]',
    },
]
def parse_dockerfile(content):
    """Parse Dockerfile content into structured instructions.

    Joins backslash line continuations into one logical line, skips blank
    lines and comments, and returns a list of dicts with keys:
    ``instruction`` (uppercased keyword), ``args``, and ``raw``.
    """
    instructions = []
    current = ""

    def _flush():
        # Emit the accumulated logical line as an instruction if well-formed.
        # `current` is reset unconditionally so a malformed line can never
        # bleed into the next instruction (previously it kept accumulating).
        nonlocal current
        match = re.match(r"^(\w+)\s+(.*)", current.strip())
        if match:
            instructions.append({
                "instruction": match.group(1).upper(),
                "args": match.group(2),
                "raw": current.strip(),
            })
        current = ""

    for line in content.splitlines():
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            continue
        if stripped.endswith("\\"):
            # Continuation: drop the backslash and keep accumulating.
            current += stripped[:-1] + " "
            continue
        current += stripped
        _flush()

    if current.strip():
        # File ended while a continuation was still open — keep the partial
        # instruction instead of silently dropping it.
        _flush()

    return instructions


def analyze_layers(instructions):
    """Count and classify image layers.

    Only FROM/RUN/COPY/ADD create filesystem layers; other instructions add
    metadata.  Returns layer totals, stage count, and per-instruction counts.
    """
    layer_instructions = {"FROM", "RUN", "COPY", "ADD"}
    layers = [i for i in instructions if i["instruction"] in layer_instructions]
    stages = [i for i in instructions if i["instruction"] == "FROM"]
    return {
        "total_layers": len(layers),
        "stages": len(stages),
        "is_multistage": len(stages) > 1,  # more than one FROM = multi-stage build
        "run_count": sum(1 for i in instructions if i["instruction"] == "RUN"),
        "copy_count": sum(1 for i in instructions if i["instruction"] == "COPY"),
        "add_count": sum(1 for i in instructions if i["instruction"] == "ADD"),
    }
def run_pattern_checks(content, instructions):
    """Run anti-pattern checks.

    Applies every regex-based rule to the raw text, then the three custom
    checks (multiple CMD, missing USER, missing HEALTHCHECK) against the
    parsed instruction list.  Returns a list of finding dicts.
    """
    flags = re.MULTILINE | re.IGNORECASE
    rules_by_id = {rule["id"]: rule for rule in ANTI_PATTERNS}

    def make_finding(rule, line_text):
        # Shape every finding identically, regardless of how it was detected.
        return {
            "id": rule["id"],
            "severity": rule["severity"],
            "message": rule["message"],
            "fix": rule["fix"],
            "line": line_text,
        }

    findings = []

    # Regex-driven rules (pattern=None rules are handled below).
    for rule in ANTI_PATTERNS:
        if rule["pattern"] is None:
            continue
        for match in re.finditer(rule["pattern"], content, flags):
            findings.append(make_finding(rule, match.group(0).strip()[:80]))

    # AP006: more than one CMD — only the last takes effect.
    cmd_total = sum(1 for instr in instructions if instr["instruction"] == "CMD")
    if cmd_total > 1:
        findings.append(make_finding(rules_by_id["AP006"], f"{cmd_total} CMD instructions found"))

    # AP009: no USER instruction anywhere — container runs as root.
    if instructions and not any(instr["instruction"] == "USER" for instr in instructions):
        findings.append(make_finding(rules_by_id["AP009"], "(no USER instruction found)"))

    # AP014: no HEALTHCHECK instruction.
    if instructions and not any(instr["instruction"] == "HEALTHCHECK" for instr in instructions):
        findings.append(make_finding(rules_by_id["AP014"], "(no HEALTHCHECK instruction found)"))

    return findings
"AP008"} + security_severities = {"critical", "high"} + findings = [f for f in findings if f["id"] in security_ids or f["severity"] in security_severities] + + # Deduplicate findings by id + seen_ids = set() + unique_findings = [] + for f in findings: + key = (f["id"], f["line"]) + if key not in seen_ids: + seen_ids.add(key) + unique_findings.append(f) + findings = unique_findings + + # Sort by severity + severity_order = {"critical": 0, "high": 1, "medium": 2, "low": 3} + findings.sort(key=lambda f: severity_order.get(f["severity"], 4)) + + # Score (100 minus deductions) + deductions = {"critical": 25, "high": 15, "medium": 5, "low": 2} + score = max(0, 100 - sum(deductions.get(f["severity"], 0) for f in findings)) + + result = { + "score": score, + "base_image": base, + "layers": layers, + "findings": findings, + "finding_counts": { + "critical": sum(1 for f in findings if f["severity"] == "critical"), + "high": sum(1 for f in findings if f["severity"] == "high"), + "medium": sum(1 for f in findings if f["severity"] == "medium"), + "low": sum(1 for f in findings if f["severity"] == "low"), + }, + } + + if output_format == "json": + print(json.dumps(result, indent=2)) + return result + + # Text output + print(f"\n{'=' * 60}") + print(f" Dockerfile Analysis Report") + print(f"{'=' * 60}") + print(f" Score: {score}/100") + print(f" Base: {base['image']}:{base['tag']} (~{base['estimated_size_mb']}MB)") + print(f" Layers: {layers['total_layers']} | Stages: {layers['stages']} | Multi-stage: {'Yes' if layers['is_multistage'] else 'No'}") + print(f" RUN: {layers['run_count']} | COPY: {layers['copy_count']} | ADD: {layers['add_count']}") + print() + + counts = result["finding_counts"] + print(f" Findings: {counts['critical']} critical | {counts['high']} high | {counts['medium']} medium | {counts['low']} low") + print(f"{'─' * 60}") + + for f in findings: + icon = {"critical": "!!!", "high": "!!", "medium": "!", "low": "~"}.get(f["severity"], "?") + print(f"\n [{f['id']}] 
def main():
    """CLI entry point: parse arguments, load the Dockerfile, run the analysis."""
    parser = argparse.ArgumentParser(
        description="docker-development: Dockerfile static analyzer"
    )
    parser.add_argument("dockerfile", nargs="?", help="Path to Dockerfile (omit for demo)")
    parser.add_argument(
        "--output", "-o",
        choices=["text", "json"],
        default="text",
        help="Output format (default: text)",
    )
    parser.add_argument(
        "--security",
        action="store_true",
        help="Security-focused analysis only",
    )
    args = parser.parse_args()

    if not args.dockerfile:
        # No path given — analyze the built-in demo Dockerfile instead.
        print("No Dockerfile provided. Running demo analysis...\n")
        content = DEMO_DOCKERFILE
    else:
        dockerfile_path = Path(args.dockerfile)
        if not dockerfile_path.exists():
            print(f"Error: File not found: {args.dockerfile}", file=sys.stderr)
            sys.exit(1)
        content = dockerfile_path.read_text(encoding="utf-8")

    generate_report(content, args.output, args.security)


if __name__ == "__main__":
    main()
Summarize academic papers, compare web articles, extract citations, and produce actionable research briefs.", + "version": "1.0.0", + "author": { + "name": "Alireza Rezvani", + "url": "https://alirezarezvani.com" + }, + "homepage": "https://github.com/alirezarezvani/claude-skills/tree/main/product-team/research-summarizer", + "repository": "https://github.com/alirezarezvani/claude-skills", + "license": "MIT", + "skills": "./" +} diff --git a/product-team/research-summarizer/SKILL.md b/product-team/research-summarizer/SKILL.md new file mode 100644 index 0000000..158fa53 --- /dev/null +++ b/product-team/research-summarizer/SKILL.md @@ -0,0 +1,274 @@ +--- +name: "research-summarizer" +description: "Structured research summarization agent skill for non-dev users. Handles academic papers, web articles, reports, and documentation. Extracts key findings, generates comparative analyses, and produces properly formatted citations. Use when: user wants to summarize a research paper, compare multiple sources, extract citations from documents, or create structured research briefs. Plugin for Claude Code, Codex, Gemini CLI, and OpenClaw." +license: MIT +metadata: + version: 1.0.0 + author: Alireza Rezvani + category: product + updated: 2026-03-16 +--- + +# Research Summarizer + +> Read less. Understand more. Cite correctly. + +Structured research summarization workflow that turns dense source material into actionable briefs. Built for product managers, analysts, founders, and anyone who reads more than they should have to. + +Not a generic "summarize this" — a repeatable framework that extracts what matters, compares across sources, and formats citations properly. 
+ +--- + +## Slash Commands + +| Command | What it does | +|---------|-------------| +| `/research:summarize` | Summarize a single source into a structured brief | +| `/research:compare` | Compare 2-5 sources side-by-side with synthesis | +| `/research:cite` | Extract and format all citations from a document | + +--- + +## When This Skill Activates + +Recognize these patterns from the user: + +- "Summarize this paper / article / report" +- "What are the key findings in this document?" +- "Compare these sources" +- "Extract citations from this PDF" +- "Give me a research brief on [topic]" +- "Break down this whitepaper" +- Any request involving: summarize, research brief, literature review, citation, source comparison + +If the user has a document and wants structured understanding → this skill applies. + +--- + +## Workflow + +### `/research:summarize` — Single Source Summary + +1. **Identify source type** + - Academic paper → use IMRAD structure (Introduction, Methods, Results, Analysis, Discussion) + - Web article → use claim-evidence-implication structure + - Technical report → use executive summary structure + - Documentation → use reference summary structure + +2. **Extract structured brief** + ``` + Title: [exact title] + Author(s): [names] + Date: [publication date] + Source Type: [paper | article | report | documentation] + + ## Key Thesis + [1-2 sentences: the central argument or finding] + + ## Key Findings + 1. [Finding with supporting evidence] + 2. [Finding with supporting evidence] + 3. [Finding with supporting evidence] + + ## Methodology + [How they arrived at these findings — data sources, sample size, approach] + + ## Limitations + - [What the source doesn't cover or gets wrong] + + ## Actionable Takeaways + - [What to do with this information] + + ## Notable Quotes + > "[Direct quote]" (p. X) + ``` + +3. 
**Assess quality** + - Source credibility (peer-reviewed, reputable outlet, primary vs secondary) + - Evidence strength (data-backed, anecdotal, theoretical) + - Recency (when published, still relevant?) + - Bias indicators (funding source, author affiliation, methodology gaps) + +### `/research:compare` — Multi-Source Comparison + +1. **Collect sources** (2-5 documents) +2. **Summarize each** using the single-source workflow above +3. **Build comparison matrix** + + ``` + | Dimension | Source A | Source B | Source C | + |------------------|-----------------|-----------------|-----------------| + | Central Thesis | ... | ... | ... | + | Methodology | ... | ... | ... | + | Key Finding | ... | ... | ... | + | Sample/Scope | ... | ... | ... | + | Credibility | High/Med/Low | High/Med/Low | High/Med/Low | + ``` + +4. **Synthesize** + - Where do sources agree? (convergent findings = stronger signal) + - Where do they disagree? (divergent findings = needs investigation) + - What gaps exist across all sources? + - What's the weight of evidence for each position? + +5. **Produce synthesis brief** + ``` + ## Consensus Findings + [What most sources agree on] + + ## Contested Points + [Where sources disagree, with strongest evidence for each side] + + ## Gaps + [What none of the sources address] + + ## Recommendation + [Based on weight of evidence, what should the reader believe/do?] + ``` + +### `/research:cite` — Citation Extraction + +1. **Scan document** for all references, footnotes, in-text citations +2. **Extract and format** using the requested style (APA 7 default) +3. **Classify citations** by type: + - Primary sources (original research, data) + - Secondary sources (reviews, meta-analyses, commentary) + - Tertiary sources (textbooks, encyclopedias) +4. 
**Output** sorted bibliography with classification tags + +Supported citation formats: +- **APA 7** (default) — social sciences, business +- **IEEE** — engineering, computer science +- **Chicago** — humanities, history +- **Harvard** — general academic +- **MLA 9** — arts, humanities + +--- + +## Tooling + +### `scripts/extract_citations.py` + +CLI utility for extracting and formatting citations from text. + +**Features:** +- Regex-based citation detection (DOI, URL, author-year, numbered references) +- Multiple output formats (APA, IEEE, Chicago, Harvard, MLA) +- JSON export for integration with reference managers +- Deduplication of repeated citations + +**Usage:** +```bash +# Extract citations from a file (APA format, default) +python3 scripts/extract_citations.py document.txt + +# Specify format +python3 scripts/extract_citations.py document.txt --format ieee + +# JSON output +python3 scripts/extract_citations.py document.txt --format apa --output json + +# From stdin +cat paper.txt | python3 scripts/extract_citations.py --stdin +``` + +### `scripts/format_summary.py` + +CLI utility for generating structured research summaries. 
+ +**Features:** +- Multiple summary templates (academic, article, report, executive) +- Configurable output length (brief, standard, detailed) +- Markdown and plain text output +- Key findings extraction with evidence tagging + +**Usage:** +```bash +# Generate structured summary template +python3 scripts/format_summary.py --template academic + +# Brief executive summary format +python3 scripts/format_summary.py --template executive --length brief + +# All templates listed +python3 scripts/format_summary.py --list-templates + +# JSON output +python3 scripts/format_summary.py --template article --output json +``` + +--- + +## Quality Assessment Framework + +Rate every source on four dimensions: + +| Dimension | High | Medium | Low | +|-----------|------|--------|-----| +| **Credibility** | Peer-reviewed, established author | Reputable outlet, known author | Blog, unknown author, no review | +| **Evidence** | Large sample, rigorous method | Moderate data, sound approach | Anecdotal, no data, opinion | +| **Recency** | Published within 2 years | 2-5 years old | 5+ years, may be outdated | +| **Objectivity** | No conflicts, balanced view | Minor affiliations disclosed | Funded by interested party, one-sided | + +**Overall Rating:** +- 4 Highs = Strong source — cite with confidence +- 2+ Mediums = Adequate source — cite with caveats +- 2+ Lows = Weak source — verify independently before citing + +--- + +## Summary Templates + +See `references/summary-templates.md` for: +- Academic paper summary template (IMRAD) +- Web article summary template (claim-evidence-implication) +- Technical report template (executive summary) +- Comparative analysis template (matrix + synthesis) +- Literature review template (thematic organization) + +See `references/citation-formats.md` for: +- APA 7 formatting rules and examples +- IEEE formatting rules and examples +- Chicago, Harvard, MLA quick reference + +--- + +## Proactive Triggers + +Flag these without being asked: + +- **Source has 
no date** → Note it. Undated sources lose credibility points. +- **Source contradicts other sources** → Highlight the contradiction explicitly. Don't paper over disagreements. +- **Source is behind a paywall** → Note limited access. Suggest alternatives if known. +- **User provides only one source for a compare** → Ask for at least one more. Comparison needs 2+. +- **Citations are incomplete** → Flag missing fields (year, author, title). Don't invent metadata. +- **Source is 5+ years old in a fast-moving field** → Warn about potential obsolescence. + +--- + +## Installation + +### One-liner (any tool) +```bash +git clone https://github.com/alirezarezvani/claude-skills.git +cp -r claude-skills/product-team/research-summarizer ~/.claude/skills/ +``` + +### Multi-tool install +```bash +./scripts/convert.sh --skill research-summarizer --tool codex|gemini|cursor|windsurf|openclaw +``` + +### OpenClaw +```bash +clawhub install cs-research-summarizer +``` + +--- + +## Related Skills + +- **product-analytics** — Quantitative analysis. Complementary — use research-summarizer for qualitative sources, product-analytics for metrics. +- **competitive-teardown** — Competitive research. Complementary — use research-summarizer for individual source analysis, competitive-teardown for market landscape. +- **content-production** — Content writing. Research-summarizer feeds content-production — summarize sources first, then write. +- **product-discovery** — Discovery frameworks. Complementary — research-summarizer for desk research, product-discovery for user research. 
diff --git a/product-team/research-summarizer/references/citation-formats.md b/product-team/research-summarizer/references/citation-formats.md new file mode 100644 index 0000000..2f07b77 --- /dev/null +++ b/product-team/research-summarizer/references/citation-formats.md @@ -0,0 +1,105 @@ +# Citation Formats Quick Reference + +## APA 7 (American Psychological Association) + +Default format for social sciences, business, and product research. + +### Journal Article +Author, A. A., & Author, B. B. (Year). Title of article. *Title of Periodical*, *volume*(issue), page–page. https://doi.org/xxxxx + +**Example:** +Smith, J., & Jones, K. (2023). Agile adoption in enterprise organizations. *Journal of Product Management*, *15*(2), 45–62. https://doi.org/10.1234/jpm.2023.001 + +### Book +Author, A. A. (Year). *Title of work: Capital letter also for subtitle*. Publisher. + +**Example:** +Cagan, M. (2018). *Inspired: How to create tech products customers love*. Wiley. + +### Web Page +Author, A. A. (Year, Month Day). *Title of page*. Site Name. URL + +**Example:** +Torres, T. (2024, January 15). *Continuous discovery in practice*. Product Talk. https://www.producttalk.org/discovery + +### In-Text Citation +- Parenthetical: (Smith & Jones, 2023) +- Narrative: Smith and Jones (2023) found that... +- 3+ authors: (Patel et al., 2022) + +--- + +## IEEE (Institute of Electrical and Electronics Engineers) + +Standard for engineering, computer science, and technical research. + +### Format +[N] A. Author, "Title of article," *Journal*, vol. X, no. Y, pp. Z–Z, Month Year, doi: 10.xxxx. + +### Journal Article +[1] J. Smith and K. Jones, "Agile adoption in enterprise organizations," *J. Prod. Mgmt.*, vol. 15, no. 2, pp. 45–62, Mar. 2023, doi: 10.1234/jpm.2023.001. + +### Conference Paper +[2] A. Patel, B. Chen, and C. Kumar, "Cross-functional team performance metrics," in *Proc. Int. Conf. Software Eng.*, 2022, pp. 112–119. + +### Book +[3] M. 
Cagan, *Inspired: How to Create Tech Products Customers Love*. Hoboken, NJ, USA: Wiley, 2018. + +### In-Text Citation +As shown in [1], agile adoption has increased... +Multiple: [1], [3], [5]–[7] + +--- + +## Chicago (Notes-Bibliography) + +Standard for humanities, history, and some business writing. + +### Footnote Format +1. First Name Last Name, *Title of Book* (Place: Publisher, Year), page. +2. First Name Last Name, "Title of Article," *Journal* Volume, no. Issue (Year): pages. + +### Bibliography Entry +Last Name, First Name. *Title of Book*. Place: Publisher, Year. +Last Name, First Name. "Title of Article." *Journal* Volume, no. Issue (Year): pages. + +--- + +## Harvard + +Common in UK and Australian academic writing. + +### Format +Author, A.A. (Year) *Title of book*. Edition. Place: Publisher. +Author, A.A. (Year) 'Title of article', *Journal*, Volume(Issue), pp. X–Y. + +### In-Text Citation +(Smith and Jones, 2023) +Smith and Jones (2023) argue that... + +--- + +## MLA 9 (Modern Language Association) + +Standard for arts and humanities. + +### Format +Last, First. *Title of Book*. Publisher, Year. +Last, First. "Title of Article." *Journal*, vol. X, no. Y, Year, pp. Z–Z. + +### In-Text Citation +(Smith and Jones 45) +Smith and Jones argue that "direct quote" (45). 
+ +--- + +## Quick Decision Guide + +| Field / Context | Recommended Format | +|----------------|-------------------| +| Social sciences, business, psychology | APA 7 | +| Engineering, computer science, technical | IEEE | +| Humanities, history, arts | Chicago or MLA | +| UK/Australian academic | Harvard | +| Internal business reports | APA 7 (most widely recognized) | +| Product research briefs | APA 7 | diff --git a/product-team/research-summarizer/references/summary-templates.md b/product-team/research-summarizer/references/summary-templates.md new file mode 100644 index 0000000..e2205db --- /dev/null +++ b/product-team/research-summarizer/references/summary-templates.md @@ -0,0 +1,120 @@ +# Summary Templates Reference + +## Academic Paper (IMRAD) + +Use for peer-reviewed journal articles, conference papers, and research studies. + +### Structure +1. **Introduction** — What problem does the paper address? Why does it matter? +2. **Methods** — How was the study conducted? What data, what approach? +3. **Results** — What did they find? Key numbers, key patterns. +4. **Analysis** — What do the results mean? How do they compare to prior work? +5. **Discussion** — What are the implications? Limitations? Future work? + +### Quality Signals +- Published in a peer-reviewed venue +- Clear methodology section with reproducible steps +- Statistical significance reported (p-values, confidence intervals) +- Limitations acknowledged openly +- Conflicts of interest disclosed + +### Red Flags +- No methodology section +- Claims without supporting data +- Funded by an entity that benefits from specific results +- Published in a predatory journal (check Beall's List) + +--- + +## Web Article (Claim-Evidence-Implication) + +Use for blog posts, news articles, opinion pieces, and online publications. + +### Structure +1. **Claim** — What is the author arguing or reporting? +2. **Evidence** — What data, examples, or sources support the claim? +3. **Implication** — So what? 
What should the reader do or think differently? + +### Quality Signals +- Author has relevant expertise or credentials +- Sources are linked and verifiable +- Multiple perspectives acknowledged +- Published on a reputable platform +- Date of publication is clear + +### Red Flags +- No author attribution +- No sources or citations +- Sensationalist headline vs. measured content +- Affiliate links or sponsored content without disclosure + +--- + +## Technical Report (Executive Summary) + +Use for industry reports, whitepapers, market research, and internal documents. + +### Structure +1. **Executive Summary** — Bottom line in 2-3 sentences +2. **Scope** — What does this report cover? +3. **Key Data** — Most important numbers and findings +4. **Methodology** — How was the data gathered? +5. **Recommendations** — What should be done based on findings? +6. **Relevance** — Why does this matter for our specific context? + +### Quality Signals +- Clear methodology for data collection +- Sample size and composition disclosed +- Published by a recognized research firm or organization +- Methodology section available (even if separate document) + +### Red Flags +- "Report" is actually a marketing piece for a product +- Data from a single, small, unrepresentative sample +- No methodology disclosure +- Conclusions far exceed what the data supports + +--- + +## Comparative Analysis (Matrix + Synthesis) + +Use when evaluating 2-5 sources on the same topic. + +### Comparison Dimensions +- **Central thesis** — What is each source's main argument? +- **Methodology** — How did each source arrive at its conclusions? +- **Key finding** — What is the headline result? +- **Sample/scope** — How broad or narrow is the evidence? +- **Credibility** — How trustworthy is the source? +- **Recency** — When was it published? + +### Synthesis Framework +1. **Convergent findings** — Where sources agree (stronger signal) +2. **Divergent findings** — Where sources disagree (investigate further) +3. 
**Gaps** — What no source addresses +4. **Weight of evidence** — Which position has stronger support? + +--- + +## Literature Review (Thematic) + +Use when synthesizing 5+ sources into a research overview. + +### Organization Approaches +- **Thematic** — Group by topic (preferred for most use cases) +- **Chronological** — Group by time period (good for showing evolution) +- **Methodological** — Group by research approach (good for methods papers) + +### Per-Theme Structure +1. Theme name and scope +2. Key sources that address this theme +3. What the sources say (points of agreement) +4. What the sources disagree on +5. Strength of evidence for each position + +### Synthesis Checklist +- [ ] All sources categorized into themes +- [ ] Gaps in literature identified +- [ ] Contradictions highlighted (not hidden) +- [ ] Overall state of knowledge summarized +- [ ] Future research directions suggested diff --git a/product-team/research-summarizer/scripts/extract_citations.py b/product-team/research-summarizer/scripts/extract_citations.py new file mode 100644 index 0000000..ee4de0d --- /dev/null +++ b/product-team/research-summarizer/scripts/extract_citations.py @@ -0,0 +1,309 @@ +#!/usr/bin/env python3 +""" +research-summarizer: Citation Extractor + +Extract and format citations from text documents. Detects DOIs, URLs, +author-year patterns, and numbered references. Outputs in APA, IEEE, +Chicago, Harvard, or MLA format. 
#!/usr/bin/env python3
"""
research-summarizer: Citation Extractor

Extract and format citations from text documents. Detects DOIs, URLs,
author-year patterns (both narrative "Smith (2023)" and parenthetical
"(Smith, 2023)" forms), and numbered references. Outputs in APA, IEEE,
Chicago, Harvard, or MLA format.

Usage:
    python scripts/extract_citations.py document.txt
    python scripts/extract_citations.py document.txt --format ieee
    python scripts/extract_citations.py document.txt --format apa --output json
    python scripts/extract_citations.py --stdin < document.txt
"""

import argparse
import json
import re
import sys
from collections import OrderedDict


# --- Citation Detection Patterns ---

PATTERNS = {
    # "doi: 10.xxxx/..." or "https://doi.org/10.xxxx/..."
    "doi": re.compile(
        r"(?:https?://doi\.org/|doi:\s*)(10\.\d{4,}/[^\s,;}\]]+)", re.IGNORECASE
    ),
    "url": re.compile(
        r"https?://[^\s,;}\])\"'>]+", re.IGNORECASE
    ),
    # Narrative form: Smith (2023), Smith & Jones (2021), Patel et al. (2022)
    "author_year": re.compile(
        r"(?:^|\(|\s)([A-Z][a-z]+(?:\s(?:&|and)\s[A-Z][a-z]+)?(?:\set\sal\.?)?)\s*\((\d{4})\)",
    ),
    # Parenthetical form: (Smith, 2023), (Patel et al., 2022).
    # FIX: this form was documented in extract_author_year but never matched
    # by the narrative pattern above.
    "author_year_paren": re.compile(
        r"\(([A-Z][a-z]+(?:\s(?:&|and)\s[A-Z][a-z]+)?(?:\set\sal\.?)?),\s*(\d{4})\)",
    ),
    # Reference-list entries: "[1] Author. Title..."
    "numbered_ref": re.compile(
        r"^\[(\d+)\]\s+(.+)$", re.MULTILINE
    ),
    # NOTE(review): defined but not used by any extractor yet; kept for
    # forward compatibility with footnote-style bibliographies.
    "footnote": re.compile(
        r"^\d+\.\s+([A-Z].+?(?:\d{4}).+)$", re.MULTILINE
    ),
}


def extract_dois(text):
    """Extract DOI references.

    Returns a list of dicts with keys: type, doi, raw, url.
    """
    citations = []
    for match in PATTERNS["doi"].finditer(text):
        doi = match.group(1).rstrip(".")  # drop a sentence-final period
        citations.append({
            "type": "doi",
            "doi": doi,
            "raw": match.group(0).strip(),
            "url": f"https://doi.org/{doi}",
        })
    return citations


def extract_urls(text):
    """Extract URL references (excluding DOI URLs already captured)."""
    citations = []
    for match in PATTERNS["url"].finditer(text):
        url = match.group(0).rstrip(".,;)")  # trim trailing punctuation
        if "doi.org" in url:
            continue  # handled by extract_dois
        citations.append({
            "type": "url",
            "url": url,
            "raw": url,
        })
    return citations


def extract_author_year(text):
    """Extract author-year citations like (Smith, 2023) or Smith & Jones (2021).

    Scans for both the narrative form ("Smith (2023)") and the parenthetical
    form ("(Smith, 2023)"). Overlapping hits from the two patterns normalize
    to the same "raw" string and are collapsed later by deduplicate().
    """
    citations = []
    for pattern_key in ("author_year", "author_year_paren"):
        for match in PATTERNS[pattern_key].finditer(text):
            author = match.group(1).strip()
            year = match.group(2)
            citations.append({
                "type": "author_year",
                "author": author,
                "year": year,
                "raw": f"{author} ({year})",
            })
    return citations


def extract_numbered_refs(text):
    """Extract numbered reference list entries like [1] Author. Title..."""
    citations = []
    for match in PATTERNS["numbered_ref"].finditer(text):
        num = match.group(1)
        content = match.group(2).strip()
        citations.append({
            "type": "numbered",
            "number": int(num),
            "content": content,
            "raw": f"[{num}] {content}",
        })
    return citations


def deduplicate(citations):
    """Remove duplicate citations, keeping first occurrence (insertion order).

    Dedup key is the DOI when present, else the URL, else the raw text —
    all case-insensitive.
    """
    seen = OrderedDict()
    for c in citations:
        key = c.get("doi") or c.get("url") or c.get("raw", "")
        key = key.lower().strip()
        if key and key not in seen:
            seen[key] = c
    return list(seen.values())


def classify_source(citation):
    """Classify citation as primary, secondary, or tertiary.

    Heuristic keyword match on the citation text; defaults to "primary"
    when no review/reference-work keywords are found.
    """
    raw = citation.get("content", citation.get("raw", "")).lower()
    if any(kw in raw for kw in ["meta-analysis", "systematic review", "literature review", "survey of"]):
        return "secondary"
    if any(kw in raw for kw in ["textbook", "encyclopedia", "handbook", "dictionary"]):
        return "tertiary"
    return "primary"


# --- Formatting ---

def format_apa(citation):
    """Format citation in APA 7 style."""
    if citation["type"] == "doi":
        return f"https://doi.org/{citation['doi']}"
    if citation["type"] == "url":
        return f"Retrieved from {citation['url']}"
    if citation["type"] == "author_year":
        return f"{citation['author']} ({citation['year']})."
    if citation["type"] == "numbered":
        return citation["content"]
    return citation.get("raw", "")


def format_ieee(citation):
    """Format citation in IEEE style."""
    if citation["type"] == "doi":
        return f"doi: {citation['doi']}"
    if citation["type"] == "url":
        return f"[Online]. Available: {citation['url']}"
    if citation["type"] == "author_year":
        return f"{citation['author']}, {citation['year']}."
    if citation["type"] == "numbered":
        return f"[{citation['number']}] {citation['content']}"
    return citation.get("raw", "")


def format_chicago(citation):
    """Format citation in Chicago style."""
    if citation["type"] == "doi":
        return f"https://doi.org/{citation['doi']}."
    if citation["type"] == "url":
        return f"{citation['url']}."
    if citation["type"] == "author_year":
        return f"{citation['author']}. {citation['year']}."
    if citation["type"] == "numbered":
        return citation["content"]
    return citation.get("raw", "")


def format_harvard(citation):
    """Format citation in Harvard style."""
    if citation["type"] == "doi":
        return f"doi:{citation['doi']}"
    if citation["type"] == "url":
        return f"Available at: {citation['url']}"
    if citation["type"] == "author_year":
        return f"{citation['author']} ({citation['year']})"
    if citation["type"] == "numbered":
        return citation["content"]
    return citation.get("raw", "")


def format_mla(citation):
    """Format citation in MLA 9 style."""
    if citation["type"] == "doi":
        return f"doi:{citation['doi']}."
    if citation["type"] == "url":
        return f"{citation['url']}."
    if citation["type"] == "author_year":
        return f"{citation['author']}. {citation['year']}."
    if citation["type"] == "numbered":
        return citation["content"]
    return citation.get("raw", "")


# Dispatch table: --format value -> formatter function.
FORMATTERS = {
    "apa": format_apa,
    "ieee": format_ieee,
    "chicago": format_chicago,
    "harvard": format_harvard,
    "mla": format_mla,
}


# --- Demo Data ---

DEMO_TEXT = """
Recent studies in product management have shown significant shifts in methodology.
According to Smith & Jones (2023), agile adoption has increased by 47% since 2020.
Patel et al. (2022) found that cross-functional teams deliver 2.3x faster.

Several frameworks have been proposed:
[1] Cagan, M. Inspired: How to Create Tech Products Customers Love. Wiley, 2018.
[2] Torres, T. Continuous Discovery Habits. Product Talk LLC, 2021.
[3] Gothelf, J. & Seiden, J. Lean UX. O'Reilly Media, 2021. doi: 10.1234/leanux.2021

For further reading, see https://www.svpg.com/articles/ and the meta-analysis
by Chen (2024) on product discovery effectiveness.

Related work: doi: 10.1145/3544548.3581388
"""


def run_extraction(text, fmt, output_mode):
    """Run full extraction pipeline.

    Extracts all citation types from *text*, deduplicates, classifies each
    citation, prints the result in *fmt* (one of FORMATTERS) as text or JSON
    per *output_mode*, and returns the deduplicated citation list.
    """
    all_citations = []
    all_citations.extend(extract_dois(text))
    all_citations.extend(extract_author_year(text))
    all_citations.extend(extract_numbered_refs(text))
    all_citations.extend(extract_urls(text))

    citations = deduplicate(all_citations)

    for c in citations:
        c["classification"] = classify_source(c)

    # Unknown format falls back to APA rather than crashing.
    formatter = FORMATTERS.get(fmt, format_apa)

    if output_mode == "json":
        result = {
            "format": fmt,
            "total": len(citations),
            "citations": [],
        }
        for i, c in enumerate(citations, 1):
            result["citations"].append({
                "index": i,
                "type": c["type"],
                "classification": c["classification"],
                "formatted": formatter(c),
                "raw": c.get("raw", ""),
            })
        print(json.dumps(result, indent=2))
    else:
        print(f"Citations ({fmt.upper()}) — {len(citations)} found\n")
        primary = [c for c in citations if c["classification"] == "primary"]
        secondary = [c for c in citations if c["classification"] == "secondary"]
        tertiary = [c for c in citations if c["classification"] == "tertiary"]

        for label, group in [("Primary Sources", primary), ("Secondary Sources", secondary), ("Tertiary Sources", tertiary)]:
            if group:
                print(f"### {label}")
                for i, c in enumerate(group, 1):
                    print(f"  {i}. {formatter(c)}")
                print()

    return citations


def main():
    """CLI entry point: parse args, load input (file/stdin/demo), extract."""
    parser = argparse.ArgumentParser(
        description="research-summarizer: Extract and format citations from text"
    )
    parser.add_argument("file", nargs="?", help="Input text file (omit for demo)")
    parser.add_argument(
        "--format", "-f",
        choices=["apa", "ieee", "chicago", "harvard", "mla"],
        default="apa",
        help="Citation format (default: apa)",
    )
    parser.add_argument(
        "--output", "-o",
        choices=["text", "json"],
        default="text",
        help="Output mode (default: text)",
    )
    parser.add_argument(
        "--stdin",
        action="store_true",
        help="Read from stdin instead of file",
    )
    args = parser.parse_args()

    if args.stdin:
        text = sys.stdin.read()
    elif args.file:
        try:
            with open(args.file, "r", encoding="utf-8") as f:
                text = f.read()
        except FileNotFoundError:
            print(f"Error: File not found: {args.file}", file=sys.stderr)
            sys.exit(1)
        except IOError as e:
            print(f"Error reading file: {e}", file=sys.stderr)
            sys.exit(1)
    else:
        # No input at all: run against the bundled demo text.
        print("No input file provided. Running demo...\n")
        text = DEMO_TEXT

    run_extraction(text, args.format, args.output)


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""
research-summarizer: Summary Formatter

Generate structured research summary templates for different source types.
Produces fill-in-the-blank frameworks for academic papers, web articles,
technical reports, and executive briefs.

Usage:
    python scripts/format_summary.py --template academic
    python scripts/format_summary.py --template executive --length brief
    python scripts/format_summary.py --list-templates
    python scripts/format_summary.py --template article --output json
"""

import argparse
import json
import sys
import textwrap  # NOTE(review): currently unused; kept to avoid breaking any external reference
from datetime import datetime


# --- Templates ---
# Each template: name, description, and an ordered list of
# (section heading, placeholder content) pairs. Placeholders may contain
# embedded newlines for multi-line guidance.

TEMPLATES = {
    "academic": {
        "name": "Academic Paper Summary",
        "description": "IMRAD structure for peer-reviewed papers and research studies",
        "sections": [
            ("Title", "[Full paper title]"),
            ("Author(s)", "[Author names, affiliations]"),
            ("Publication", "[Journal/Conference, Year, DOI]"),
            ("Source Type", "Academic Paper"),
            ("Key Thesis", "[1-2 sentences: the central research question and answer]"),
            ("Methodology", "[Study design, sample size, data sources, analytical approach]"),
            ("Key Findings", "1. [Finding 1 with supporting data]\n2. [Finding 2 with supporting data]\n3. [Finding 3 with supporting data]"),
            ("Statistical Significance", "[Key p-values, effect sizes, confidence intervals]"),
            ("Limitations", "- [Limitation 1: scope, sample, methodology gap]\n- [Limitation 2]"),
            ("Implications", "- [What this means for practice]\n- [What this means for future research]"),
            ("Notable Quotes", '> "[Direct quote]" (p. X)'),
            ("Quality Assessment", "Credibility: [High/Med/Low] | Evidence: [High/Med/Low] | Recency: [High/Med/Low] | Objectivity: [High/Med/Low]"),
        ],
    },
    "article": {
        "name": "Web Article Summary",
        "description": "Claim-evidence-implication structure for online articles and blog posts",
        "sections": [
            ("Title", "[Article title]"),
            ("Author", "[Author name]"),
            ("Source", "[Publication/Website, Date, URL]"),
            ("Source Type", "Web Article"),
            ("Central Claim", "[1-2 sentences: main argument or thesis]"),
            ("Supporting Evidence", "1. [Evidence point 1]\n2. [Evidence point 2]\n3. [Evidence point 3]"),
            ("Counterarguments Addressed", "- [Counterargument and author's response]"),
            ("Implications", "- [What this means for the reader]"),
            ("Bias Check", "Author affiliation: [?] | Funding: [?] | Balanced perspective: [Yes/No]"),
            ("Actionable Takeaways", "- [What to do with this information]\n- [Next step]"),
            ("Quality Assessment", "Credibility: [High/Med/Low] | Evidence: [High/Med/Low] | Recency: [High/Med/Low] | Objectivity: [High/Med/Low]"),
        ],
    },
    "report": {
        "name": "Technical Report Summary",
        "description": "Structured summary for industry reports, whitepapers, and technical documentation",
        "sections": [
            ("Title", "[Report title]"),
            ("Organization", "[Publishing organization]"),
            ("Date", "[Publication date]"),
            ("Source Type", "Technical Report"),
            ("Executive Summary", "[2-3 sentences: scope, key conclusion, recommendation]"),
            ("Scope", "[What the report covers and what it excludes]"),
            ("Key Data Points", "1. [Statistic or data point with context]\n2. [Statistic or data point with context]\n3. [Statistic or data point with context]"),
            ("Methodology", "[How data was collected — survey, analysis, case study]"),
            ("Recommendations", "1. [Recommendation with supporting rationale]\n2. [Recommendation with supporting rationale]"),
            ("Limitations", "- [Sample bias, geographic scope, time period]"),
            ("Relevance", "[Why this matters for our context — specific applicability]"),
            ("Quality Assessment", "Credibility: [High/Med/Low] | Evidence: [High/Med/Low] | Recency: [High/Med/Low] | Objectivity: [High/Med/Low]"),
        ],
    },
    "executive": {
        "name": "Executive Brief",
        "description": "Condensed decision-focused summary for leadership consumption",
        "sections": [
            ("Source", "[Title, Author, Date]"),
            ("Bottom Line", "[1 sentence: the single most important takeaway]"),
            ("Key Facts", "1. [Fact]\n2. [Fact]\n3. [Fact]"),
            ("So What?", "[Why this matters for our business/product/strategy]"),
            ("Action Required", "- [Specific next step with owner and timeline]"),
            ("Confidence", "[High/Medium/Low] — based on source quality and evidence strength"),
        ],
    },
    "comparison": {
        "name": "Comparative Analysis",
        "description": "Side-by-side comparison matrix for 2-5 sources on the same topic",
        "sections": [
            ("Topic", "[Research topic or question being compared]"),
            ("Sources Compared", "1. [Source A — Author, Year]\n2. [Source B — Author, Year]\n3. [Source C — Author, Year]"),
            ("Comparison Matrix", "| Dimension | Source A | Source B | Source C |\n|-----------|---------|---------|---------|"
                                  "\n| Central Thesis | ... | ... | ... |"
                                  "\n| Methodology | ... | ... | ... |"
                                  "\n| Key Finding | ... | ... | ... |"
                                  "\n| Sample/Scope | ... | ... | ... |"
                                  "\n| Credibility | High/Med/Low | High/Med/Low | High/Med/Low |"),
            ("Consensus Findings", "[What most sources agree on]"),
            ("Contested Points", "[Where sources disagree — with strongest evidence for each side]"),
            ("Gaps", "[What none of the sources address]"),
            ("Synthesis", "[Weight-of-evidence recommendation: what to believe and do]"),
        ],
    },
    "literature": {
        "name": "Literature Review",
        "description": "Thematic organization of multiple sources for research synthesis",
        "sections": [
            ("Research Question", "[The question this review addresses]"),
            ("Search Scope", "[Databases, keywords, date range, inclusion/exclusion criteria]"),
            ("Sources Reviewed", "[Total count, breakdown by type]"),
            ("Theme 1: [Name]", "Summary: [Theme overview]\nKey Sources: [Author (Year), Author (Year)]\nFindings: [What sources say about this theme]"),
            ("Theme 2: [Name]", "Summary: [Theme overview]\nKey Sources: [Author (Year), Author (Year)]\nFindings: [What sources say about this theme]"),
            ("Theme 3: [Name]", "Summary: [Theme overview]\nKey Sources: [Author (Year), Author (Year)]\nFindings: [What sources say about this theme]"),
            ("Gaps in Literature", "- [Under-researched area 1]\n- [Under-researched area 2]"),
            ("Synthesis", "[Overall state of knowledge — what we know, what we don't, where to go next]"),
        ],
    },
}

# Section budget per --length choice. "brief" keeps only the leading sections;
# standard/detailed budgets exceed every template, so they pass through whole.
LENGTH_CONFIGS = {
    "brief": {"max_sections": 4, "label": "Brief (key points only)"},
    "standard": {"max_sections": 99, "label": "Standard (full template)"},
    "detailed": {"max_sections": 99, "label": "Detailed (full template with extended guidance)"},
}


def render_template(template_key, length="standard", output_format="text"):
    """Render a summary template.

    Args:
        template_key: A key of TEMPLATES (raises KeyError if unknown).
        length: A key of LENGTH_CONFIGS; "brief" truncates to the first
            few sections, the others emit the full template.
        output_format: "json" for machine-readable output, anything else
            for markdown text.

    Returns:
        The rendered template as a string.
    """
    template = TEMPLATES[template_key]
    # FIX: use the configured section budget instead of a hardcoded 4, so
    # LENGTH_CONFIGS stays the single source of truth for truncation.
    sections = template["sections"][:LENGTH_CONFIGS[length]["max_sections"]]

    if output_format == "json":
        result = {
            "template": template_key,
            "name": template["name"],
            "description": template["description"],
            "length": length,
            "generated": datetime.now().strftime("%Y-%m-%d"),
            "sections": [],
        }
        for title, content in sections:
            result["sections"].append({
                "heading": title,
                "placeholder": content,
            })
        return json.dumps(result, indent=2)

    # Text/Markdown output
    lines = []
    lines.append(f"# {template['name']}")
    lines.append(f"_{template['description']}_\n")
    lines.append(f"Length: {LENGTH_CONFIGS[length]['label']}")
    lines.append(f"Generated: {datetime.now().strftime('%Y-%m-%d')}\n")
    lines.append("---\n")

    for title, content in sections:
        lines.append(f"## {title}\n")
        # Multi-line placeholders carry embedded newlines; emit each line
        # of guidance on its own output line.
        lines.extend(content.split("\n"))
        lines.append("")

    lines.append("---")
    lines.append("_Template from research-summarizer skill_")

    return "\n".join(lines)


def list_templates(output_format="text"):
    """List all available templates.

    Returns a JSON array (output_format == "json") or an aligned text table.
    """
    if output_format == "json":
        result = []
        for key, tmpl in TEMPLATES.items():
            result.append({
                "key": key,
                "name": tmpl["name"],
                "description": tmpl["description"],
                "sections": len(tmpl["sections"]),
            })
        return json.dumps(result, indent=2)

    lines = []
    lines.append("Available Summary Templates\n")
    lines.append(f"{'KEY':<15} {'NAME':<30} {'SECTIONS':>8} DESCRIPTION")
    lines.append(f"{'─' * 90}")
    for key, tmpl in TEMPLATES.items():
        lines.append(
            f"{key:<15} {tmpl['name']:<30} {len(tmpl['sections']):>8} {tmpl['description'][:40]}"
        )
    return "\n".join(lines)


def main():
    """CLI entry point: list templates or render the requested one."""
    parser = argparse.ArgumentParser(
        description="research-summarizer: Generate structured summary templates"
    )
    parser.add_argument(
        "--template", "-t",
        choices=list(TEMPLATES.keys()),
        help="Template type to generate",
    )
    parser.add_argument(
        "--length", "-l",
        choices=["brief", "standard", "detailed"],
        default="standard",
        help="Output length (default: standard)",
    )
    parser.add_argument(
        "--output", "-o",
        choices=["text", "json"],
        default="text",
        help="Output format (default: text)",
    )
    parser.add_argument(
        "--list-templates",
        action="store_true",
        help="List all available templates",
    )
    args = parser.parse_args()

    if args.list_templates:
        print(list_templates(args.output))
        return

    if not args.template:
        # No template requested: show the catalog plus a usage hint.
        print("No template specified. Available templates:\n")
        print(list_templates(args.output))
        print("\nUsage: python scripts/format_summary.py --template academic")
        return

    print(render_template(args.template, args.length, args.output))


if __name__ == "__main__":
    main()