diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 631fe66..9344801 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -4,11 +4,11 @@ "name": "Alireza Rezvani", "url": "https://alirezarezvani.com" }, - "description": "223 production-ready skill packages for Claude AI across 9 domains: marketing (44), engineering (38+36), C-level advisory (34), regulatory/QMS (14), product (15), project management (7), business growth (5), and finance (3). Includes 298 Python tools, 416 reference documents, 23 agents, and 22 slash commands.", + "description": "248 production-ready skill packages for Claude AI across 9 domains: marketing (45), engineering (42+37), C-level advisory (34), regulatory/QMS (14), product (15), project management (9), business growth (5), and finance (4). Includes 332 Python tools, 460 reference documents, 23 agents, and 22 slash commands.", "homepage": "https://github.com/alirezarezvani/claude-skills", "repository": "https://github.com/alirezarezvani/claude-skills", "metadata": { - "description": "223 production-ready skill packages across 9 domains with 298 Python tools, 416 reference documents, 23 agents, and 22 slash commands. Compatible with Claude Code, Codex CLI, Gemini CLI, and OpenClaw.", + "description": "248 production-ready skill packages across 9 domains with 332 Python tools, 460 reference documents, 23 agents, and 22 slash commands. Compatible with Claude Code, Codex CLI, Gemini CLI, and OpenClaw.", "version": "2.2.0" }, "plugins": [ @@ -59,7 +59,7 @@ { "name": "engineering-advanced-skills", "source": "./engineering", - "description": "38 advanced engineering skills: agent designer, agent workflow designer, AgentHub, RAG architect, database designer, focused-fix, browser-automation, spec-driven-workflow, secrets-vault-manager, sql-database-assistant, migration architect, observability designer, dependency auditor, release manager, API reviewer, CI/CD pipeline builder, MCP server builder, skill security auditor, performance profiler, Helm chart builder, Terraform patterns, self-eval, llm-cost-optimizer, prompt-governance, and more.", + "description": "42 advanced engineering skills: agent designer, agent workflow designer, AgentHub, RAG architect, database designer, focused-fix, browser-automation, spec-driven-workflow, secrets-vault-manager, sql-database-assistant, migration architect, observability designer, dependency auditor, release manager, API reviewer, CI/CD pipeline builder, MCP server builder, skill security auditor, performance profiler, Helm chart builder, Terraform patterns, self-eval, llm-cost-optimizer, prompt-governance, behuman, code-tour, demo-video, data-quality-auditor, and more.", "version": "2.2.0", "author": { "name": "Alireza Rezvani" @@ -546,6 +546,58 @@ "product" ], "category": "product" + }, + { + "name": "code-tour", + "source": "./engineering/code-tour", + "description": "Create CodeTour .tour files — persona-targeted, step-by-step walkthroughs that link to real files and line numbers. 10 developer personas, all CodeTour step types, SMIG description formula.", + "version": "2.2.0", + "author": { + "name": "Alireza Rezvani" + }, + "keywords": [ + "codetour", + "walkthrough", + "onboarding", + "code-review", + "documentation" + ], + "category": "development" + }, + { + "name": "demo-video", + "source": "./engineering/demo-video", + "description": "Create polished demo videos from screenshots and scene descriptions. 
Orchestrates playwright, ffmpeg, and edge-tts with story structure, scene design system, and narration guidance.", + "version": "2.2.0", + "author": { + "name": "Alireza Rezvani" + }, + "keywords": [ + "video", + "demo", + "product-demo", + "walkthrough", + "ffmpeg", + "tts" + ], + "category": "development" + }, + { + "name": "data-quality-auditor", + "source": "./engineering/data-quality-auditor", + "description": "Audit datasets for completeness, consistency, accuracy, and validity. 3 stdlib-only Python tools: data profiler with DQS scoring, missing value analyzer with MCAR/MAR/MNAR classification, and multi-method outlier detector.", + "version": "2.2.0", + "author": { + "name": "Alireza Rezvani" + }, + "keywords": [ + "data-quality", + "profiling", + "outlier-detection", + "missing-values", + "data-audit" + ], + "category": "development" } ] } diff --git a/.codex/skills-index.json b/.codex/skills-index.json index e1624fb..41c7592 100644 --- a/.codex/skills-index.json +++ b/.codex/skills-index.json @@ -3,7 +3,7 @@ "name": "claude-code-skills", "description": "Production-ready skill packages for AI agents - Marketing, Engineering, Product, C-Level, PM, and RA/QM", "repository": "https://github.com/alirezarezvani/claude-skills", - "total_skills": 190, + "total_skills": 192, "skills": [ { "name": "contract-and-proposal-writer", @@ -485,6 +485,12 @@ "category": "engineering-advanced", "description": "Codebase Onboarding" }, + { + "name": "data-quality-auditor", + "source": "../../engineering/data-quality-auditor", + "category": "engineering-advanced", + "description": "Audit datasets for completeness, consistency, accuracy, and validity. Profile data distributions, detect anomalies and outliers, surface structural issues, and produce an actionable remediation plan." + }, { "name": "database-designer", "source": "../../engineering/database-designer", @@ -497,6 +503,12 @@ "category": "engineering-advanced", "description": "Use when the user asks to create ERD diagrams, normalize database schemas, design table relationships, or plan schema migrations." }, + { + "name": "demo-video", + "source": "../../engineering/demo-video", + "category": "engineering-advanced", + "description": "Use when the user asks to create a demo video, product walkthrough, feature showcase, animated presentation, marketing video, or GIF from screenshots or scene descriptions. Orchestrates playwright, ffmpeg, and edge-tts MCPs to produce polished video content." 
+ }, { "name": "dependency-auditor", "source": "../../engineering/dependency-auditor", @@ -1163,7 +1175,7 @@ "description": "Software engineering and technical skills" }, "engineering-advanced": { - "count": 40, + "count": 42, "source": "../../engineering", "description": "Advanced engineering skills - agents, RAG, MCP, CI/CD, databases, observability" }, diff --git a/.codex/skills/data-quality-auditor b/.codex/skills/data-quality-auditor new file mode 120000 index 0000000..cf09eae --- /dev/null +++ b/.codex/skills/data-quality-auditor @@ -0,0 +1 @@ +../../engineering/data-quality-auditor \ No newline at end of file diff --git a/.codex/skills/demo-video b/.codex/skills/demo-video new file mode 120000 index 0000000..9ebf313 --- /dev/null +++ b/.codex/skills/demo-video @@ -0,0 +1 @@ +../../engineering/demo-video \ No newline at end of file diff --git a/.gemini/skills-index.json b/.gemini/skills-index.json index 73f4743..308e2d9 100644 --- a/.gemini/skills-index.json +++ b/.gemini/skills-index.json @@ -1,7 +1,7 @@ { "version": "1.0.0", "name": "gemini-cli-skills", - "total_skills": 274, + "total_skills": 280, "skills": [ { "name": "README", @@ -718,6 +718,11 @@ "category": "engineering-advanced", "description": "Autonomous experiment loop that optimizes any file by a measurable metric. Inspired by Karpathy's autoresearch. The agent edits a target file, runs a fixed evaluation, keeps improvements (git commit), discards failures (git reset), and loops indefinitely. Use when: user wants to optimize code speed, reduce bundle/image size, improve test pass rate, optimize prompts, improve content quality (headlines, copy, CTR), or run any measurable improvement loop. Requires: a target file, an evaluation command that outputs a metric, and a git repo." }, + { + "name": "behuman", + "category": "engineering-advanced", + "description": "Use when the user wants more human-like AI responses \u2014 less robotic, less listy, more authentic. Triggers: 'behuman', 'be real', 'like a human', 'more human', 'less AI', 'talk like a person', 'mirror mode', 'stop being so AI', or when conversations are emotionally charged (grief, job loss, relationship advice, fear). NOT for technical questions, code generation, or factual lookups." + }, { "name": "board", "category": "engineering-advanced", @@ -738,11 +743,21 @@ "category": "engineering-advanced", "description": "CI/CD Pipeline Builder" }, + { + "name": "code-tour", + "category": "engineering-advanced", + "description": "Use when the user asks to create a CodeTour .tour file \u2014 persona-targeted, step-by-step walkthroughs that link to real files and line numbers. Trigger for: create a tour, onboarding tour, architecture tour, PR review tour, explain how X works, vibe check, RCA tour, contributor guide, or any structured code walkthrough request." + }, { "name": "codebase-onboarding", "category": "engineering-advanced", "description": "Codebase Onboarding" }, + { + "name": "data-quality-auditor", + "category": "engineering-advanced", + "description": "Audit datasets for completeness, consistency, accuracy, and validity. Profile data distributions, detect anomalies and outliers, surface structural issues, and produce an actionable remediation plan." + }, { "name": "database-designer", "category": "engineering-advanced", @@ -753,6 +768,11 @@ "category": "engineering-advanced", "description": "Use when the user asks to create ERD diagrams, normalize database schemas, design table relationships, or plan schema migrations." 
}, + { + "name": "demo-video", + "category": "engineering-advanced", + "description": "Use when the user asks to create a demo video, product walkthrough, feature showcase, animated presentation, marketing video, or GIF from screenshots or scene descriptions. Orchestrates playwright, ffmpeg, and edge-tts MCPs to produce polished video content." + }, { "name": "dependency-auditor", "category": "engineering-advanced", @@ -1288,6 +1308,11 @@ "category": "project-management", "description": "Atlassian Jira expert for creating and managing projects, planning, product discovery, JQL queries, workflows, custom fields, automation, reporting, and all Jira features. Use for Jira project setup, configuration, advanced search, dashboard creation, workflow design, and technical Jira operations." }, + { + "name": "meeting-analyzer", + "category": "project-management", + "description": "Analyzes meeting transcripts and recordings to surface behavioral patterns, communication anti-patterns, and actionable coaching feedback. Use this skill whenever the user uploads or points to meeting transcripts (.txt, .md, .vtt, .srt, .docx), asks about their communication habits, wants feedback on how they run meetings, requests speaking ratio analysis, mentions filler words or conflict avoidance, or wants to compare their communication across time periods. Also trigger when users mention tools like Granola, Otter, Fireflies, or Zoom transcripts. Even if the user just says \"look at my meetings\" or \"how do I come across in meetings\" \u2014 use this skill." + }, { "name": "project-management-bundle", "category": "project-management", @@ -1303,6 +1328,11 @@ "category": "project-management", "description": "Senior Project Manager for enterprise software, SaaS, and digital transformation projects. Specializes in portfolio management, quantitative risk analysis, resource optimization, stakeholder alignment, and executive reporting. Uses advanced methodologies including EMV analysis, Monte Carlo simulation, WSJF prioritization, and multi-dimensional health scoring. Use when a user needs help with project plans, project status reports, risk assessments, resource allocation, project roadmaps, milestone tracking, team capacity planning, portfolio health reviews, program management, or executive-level project reporting \u2014 especially for enterprise-scale initiatives with multiple workstreams, complex dependencies, or multi-million dollar budgets." }, + { + "name": "team-communications", + "category": "project-management", + "description": "Write internal company communications \u2014 3P updates (Progress/Plans/Problems), company-wide newsletters, FAQ roundups, incident reports, leadership updates, status reports, project updates, and general internal comms. Use this skill any time the user asks to draft, edit, or format something meant for internal audiences. Trigger on keywords like \"3P\", \"weekly update\", \"newsletter\", \"FAQ\", \"internal comms\", \"status report\", \"company update\", \"team update\", \"incident report\", or any request to summarize work for leadership, teammates, or the broader company. Even casual requests like \"write my update\" or \"summarize what my team did this week\" should trigger this skill." 
+ }, { "name": "capa-officer", "category": "ra-qm", @@ -1396,7 +1426,7 @@ "description": "Engineering resources" }, "engineering-advanced": { - "count": 52, + "count": 56, "description": "Engineering-advanced resources" }, "finance": { @@ -1412,7 +1442,7 @@ "description": "Product resources" }, "project-management": { - "count": 7, + "count": 9, "description": "Project-management resources" }, "ra-qm": { diff --git a/.gemini/skills/behuman/SKILL.md b/.gemini/skills/behuman/SKILL.md new file mode 120000 index 0000000..d947e34 --- /dev/null +++ b/.gemini/skills/behuman/SKILL.md @@ -0,0 +1 @@ +../../../engineering/behuman/SKILL.md \ No newline at end of file diff --git a/.gemini/skills/code-tour/SKILL.md b/.gemini/skills/code-tour/SKILL.md new file mode 120000 index 0000000..9f299f1 --- /dev/null +++ b/.gemini/skills/code-tour/SKILL.md @@ -0,0 +1 @@ +../../../engineering/code-tour/SKILL.md \ No newline at end of file diff --git a/.gemini/skills/data-quality-auditor/SKILL.md b/.gemini/skills/data-quality-auditor/SKILL.md new file mode 120000 index 0000000..fcdd3a0 --- /dev/null +++ b/.gemini/skills/data-quality-auditor/SKILL.md @@ -0,0 +1 @@ +../../../engineering/data-quality-auditor/SKILL.md \ No newline at end of file diff --git a/.gemini/skills/demo-video/SKILL.md b/.gemini/skills/demo-video/SKILL.md new file mode 120000 index 0000000..292026f --- /dev/null +++ b/.gemini/skills/demo-video/SKILL.md @@ -0,0 +1 @@ +../../../engineering/demo-video/SKILL.md \ No newline at end of file diff --git a/.gemini/skills/meeting-analyzer/SKILL.md b/.gemini/skills/meeting-analyzer/SKILL.md new file mode 120000 index 0000000..23c361a --- /dev/null +++ b/.gemini/skills/meeting-analyzer/SKILL.md @@ -0,0 +1 @@ +../../../project-management/meeting-analyzer/SKILL.md \ No newline at end of file diff --git a/.gemini/skills/team-communications/SKILL.md b/.gemini/skills/team-communications/SKILL.md new file mode 120000 index 0000000..46ab50e --- /dev/null +++ b/.gemini/skills/team-communications/SKILL.md @@ -0,0 +1 @@ +../../../project-management/team-communications/SKILL.md \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md index 9901b66..4462229 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -6,7 +6,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co This is a **comprehensive skills library** for Claude AI and Claude Code - reusable, production-ready skill packages that bundle domain expertise, best practices, analysis tools, and strategic frameworks. The repository provides modular skills that teams can download and use directly in their workflows. -**Current Scope:** 223 production-ready skills across 9 domains with 298 Python automation tools, 416 reference guides, 23 agents, and 22 slash commands. +**Current Scope:** 248 production-ready skills across 9 domains with 332 Python automation tools, 460 reference guides, 23 agents, and 22 slash commands. **Key Distinction**: This is NOT a traditional application. It's a library of skill packages meant to be extracted and deployed by users into their own Claude workflows. @@ -38,15 +38,15 @@ claude-code-skills/ ├── .claude-plugin/ # Plugin registry (marketplace.json) ├── agents/ # 23 agents across all domains ├── commands/ # 22 slash commands (changelog, tdd, saas-health, prd, code-to-prd, plugin-audit, sprint-plan, etc.) -├── engineering-team/ # 36 core engineering skills + Playwright Pro + Self-Improving Agent + Security Suite -├── engineering/ # 36 POWERFUL-tier advanced skills (incl. 
AgentHub, self-eval) +├── engineering-team/ # 37 core engineering skills + Playwright Pro + Self-Improving Agent + Security Suite +├── engineering/ # 42 POWERFUL-tier advanced skills (incl. AgentHub, self-eval) ├── product-team/ # 15 product skills + Python tools -├── marketing-skill/ # 44 marketing skills (7 pods) + Python tools +├── marketing-skill/ # 45 marketing skills (7 pods) + Python tools ├── c-level-advisor/ # 34 C-level advisory skills (10 roles + orchestration) -├── project-management/ # 7 PM skills + Atlassian MCP +├── project-management/ # 9 PM skills + Atlassian MCP ├── ra-qm-team/ # 14 RA/QM compliance skills ├── business-growth/ # 5 business & growth skills + Python tools -├── finance/ # 3 finance skills + Python tools +├── finance/ # 4 finance skills + Python tools ├── eval-workspace/ # Skill evaluation results (Tessl) ├── standards/ # 5 standards library files ├── templates/ # Reusable templates @@ -130,7 +130,7 @@ See [standards/git/git-workflow-standards.md](standards/git/git-workflow-standar - **Security skills suite** — 6 new engineering-team skills: adversarial-reviewer, ai-security, cloud-security, incident-response, red-team, threat-detection (5 Python tools, 4 reference guides) - **Self-eval skill** — Honest AI work quality evaluation with two-axis scoring, score inflation detection, and session persistence - **Snowflake development** — Data warehouse development, SQL optimization, and data pipeline patterns -- 223 total skills across 9 domains, 298 Python tools, 416 references, 23 agents, 22 commands +- 248 total skills across 9 domains, 332 Python tools, 460 references, 23 agents, 22 commands - MkDocs docs site expanded to 269 generated pages (301 HTML pages) **v2.1.2 (2026-03-10):** @@ -153,9 +153,9 @@ See [standards/git/git-workflow-standards.md](standards/git/git-workflow-standar ## Roadmap -**Phase 1-3 Complete:** 223 production-ready skills deployed across 9 domains -- Engineering Core (36), Engineering POWERFUL (36), Product (15), Marketing (44), PM (7), C-Level (34), RA/QM (14), Business & Growth (5), Finance (3) -- 298 Python automation tools, 416 reference guides, 23 agents, 22 commands +**Phase 1-3 Complete:** 248 production-ready skills deployed across 9 domains +- Engineering Core (37), Engineering POWERFUL (42), Product (15), Marketing (45), PM (9), C-Level (34), RA/QM (14), Business & Growth (5), Finance (4) +- 332 Python automation tools, 460 reference guides, 23 agents, 22 commands - Complete enterprise coverage from engineering through regulatory compliance, sales, customer success, and finance - MkDocs Material docs site with 269+ indexed pages for SEO @@ -208,4 +208,4 @@ This repository publishes skills to **ClawHub** (clawhub.com) as the distributio **Last Updated:** March 31, 2026 **Version:** v2.2.0 -**Status:** 223 skills deployed across 9 domains, 28 marketplace plugins, docs site live +**Status:** 248 skills deployed across 9 domains, 28 marketplace plugins, docs site live diff --git a/README.md b/README.md index 5e4e338..1e0e5f5 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ # Claude Code Skills & Plugins — Agent Skills for Every Coding Tool -**223 production-ready Claude Code skills, plugins, and agent skills for 11 AI coding tools.** +**248 production-ready Claude Code skills, plugins, and agent skills for 11 AI coding tools.** The most comprehensive open-source library of Claude Code skills and agent plugins — also works with OpenAI Codex, Gemini CLI, Cursor, and 7 more coding agents. 
Reusable expertise packages covering engineering, DevOps, marketing, compliance, C-level advisory, and more. **Works with:** Claude Code · OpenAI Codex · Gemini CLI · OpenClaw · Cursor · Aider · Windsurf · Kilo Code · OpenCode · Augment · Antigravity [![License: MIT](https://img.shields.io/badge/License-MIT-yellow?style=for-the-badge)](https://opensource.org/licenses/MIT) -[![Skills](https://img.shields.io/badge/Skills-223-brightgreen?style=for-the-badge)](#skills-overview) +[![Skills](https://img.shields.io/badge/Skills-248-brightgreen?style=for-the-badge)](#skills-overview) [![Agents](https://img.shields.io/badge/Agents-23-blue?style=for-the-badge)](#agents) [![Personas](https://img.shields.io/badge/Personas-3-purple?style=for-the-badge)](#personas) [![Commands](https://img.shields.io/badge/Commands-22-orange?style=for-the-badge)](#commands) @@ -23,10 +23,10 @@ The most comprehensive open-source library of Claude Code skills and agent plugi Claude Code skills (also called agent skills or coding agent plugins) are modular instruction packages that give AI coding agents domain expertise they don't have out of the box. Each skill includes: - **SKILL.md** — structured instructions, workflows, and decision frameworks -- **Python tools** — 298 CLI scripts (all stdlib-only, zero pip installs) +- **Python tools** — 332 CLI scripts (all stdlib-only, zero pip installs) - **Reference docs** — templates, checklists, and domain-specific knowledge -**One repo, eleven platforms.** Works natively as Claude Code plugins, Codex agent skills, Gemini CLI skills, and converts to 8 more tools via `scripts/convert.sh`. All 298 Python tools run anywhere Python runs. +**One repo, eleven platforms.** Works natively as Claude Code plugins, Codex agent skills, Gemini CLI skills, and converts to 8 more tools via `scripts/convert.sh`. All 332 Python tools run anywhere Python runs. ### Skills vs Agents vs Personas @@ -145,21 +145,21 @@ Run `./scripts/convert.sh --tool all` to generate tool-specific outputs locally. 
## Skills Overview -**223 skills across 9 domains:** +**248 skills across 9 domains:** | Domain | Skills | Highlights | Details | |--------|--------|------------|---------| -| **🔧 Engineering — Core** | 36 | Architecture, frontend, backend, fullstack, QA, DevOps, SecOps, AI/ML, data, Playwright, self-improving agent, security suite (6), a11y audit | [engineering-team/](engineering-team/) | +| **🔧 Engineering — Core** | 37 | Architecture, frontend, backend, fullstack, QA, DevOps, SecOps, AI/ML, data, Playwright, self-improving agent, security suite (6), a11y audit | [engineering-team/](engineering-team/) | | **🎭 Playwright Pro** | 9+3 | Test generation, flaky fix, Cypress/Selenium migration, TestRail, BrowserStack, 55 templates | [engineering-team/playwright-pro](engineering-team/playwright-pro/) | | **🧠 Self-Improving Agent** | 5+2 | Auto-memory curation, pattern promotion, skill extraction, memory health | [engineering-team/self-improving-agent](engineering-team/self-improving-agent/) | -| **⚡ Engineering — POWERFUL** | 36 | Agent designer, RAG architect, database designer, CI/CD builder, security auditor, MCP builder, AgentHub, Helm charts, Terraform, self-eval | [engineering/](engineering/) | -| **🎯 Product** | 14 | Product manager, agile PO, strategist, UX researcher, UI design, landing pages, SaaS scaffolder, analytics, experiment designer, discovery, roadmap communicator, code-to-prd | [product-team/](product-team/) | -| **📣 Marketing** | 43 | 7 pods: Content (8), SEO (5), CRO (6), Channels (6), Growth (4), Intelligence (4), Sales (2) + context foundation + orchestration router. 32 Python tools. | [marketing-skill/](marketing-skill/) | -| **📋 Project Management** | 6 | Senior PM, scrum master, Jira, Confluence, Atlassian admin, templates | [project-management/](project-management/) | +| **⚡ Engineering — POWERFUL** | 42 | Agent designer, RAG architect, database designer, CI/CD builder, security auditor, MCP builder, AgentHub, Helm charts, Terraform, self-eval | [engineering/](engineering/) | +| **🎯 Product** | 15 | Product manager, agile PO, strategist, UX researcher, UI design, landing pages, SaaS scaffolder, analytics, experiment designer, discovery, roadmap communicator, code-to-prd | [product-team/](product-team/) | +| **📣 Marketing** | 45 | 7 pods: Content (8), SEO (5), CRO (6), Channels (6), Growth (4), Intelligence (4), Sales (2) + context foundation + orchestration router. 32 Python tools. 
| [marketing-skill/](marketing-skill/) | +| **📋 Project Management** | 9 | Senior PM, scrum master, Jira, Confluence, Atlassian admin, templates | [project-management/](project-management/) | | **🏥 Regulatory & QM** | 14 | ISO 13485, MDR 2017/745, FDA, ISO 27001, GDPR, CAPA, risk management | [ra-qm-team/](ra-qm-team/) | -| **💼 C-Level Advisory** | 28 | Full C-suite (10 roles) + orchestration + board meetings + culture & collaboration | [c-level-advisor/](c-level-advisor/) | -| **📈 Business & Growth** | 4 | Customer success, sales engineer, revenue ops, contracts & proposals | [business-growth/](business-growth/) | -| **💰 Finance** | 2 | Financial analyst (DCF, budgeting, forecasting), SaaS metrics coach (ARR, MRR, churn, LTV, CAC) | [finance/](finance/) | +| **💼 C-Level Advisory** | 34 | Full C-suite (10 roles) + orchestration + board meetings + culture & collaboration | [c-level-advisor/](c-level-advisor/) | +| **📈 Business & Growth** | 5 | Customer success, sales engineer, revenue ops, contracts & proposals | [business-growth/](business-growth/) | +| **💰 Finance** | 4 | Financial analyst (DCF, budgeting, forecasting), SaaS metrics coach (ARR, MRR, churn, LTV, CAC) | [finance/](finance/) | --- @@ -296,7 +296,7 @@ for MDR Annex II compliance gaps. ## Python Analysis Tools -298 CLI tools ship with the skills (all verified, stdlib-only): +332 CLI tools ship with the skills (all verified, stdlib-only): ```bash # SaaS health check @@ -342,7 +342,7 @@ Yes. Skills work natively with 11 tools: Claude Code, OpenAI Codex, Gemini CLI, No. We follow semantic versioning and maintain backward compatibility within patch releases. Existing script arguments, plugin source paths, and SKILL.md structures are never changed in patch versions. See the [CHANGELOG](CHANGELOG.md) for details on each release. **Are the Python tools dependency-free?** -Yes. All 298 Python CLI tools use the standard library only — zero pip installs required. Every script is verified to run with `--help`. +Yes. All 332 Python CLI tools use the standard library only — zero pip installs required. Every script is verified to run with `--help`. **How do I create my own Claude Code skill?** Each skill is a folder with a `SKILL.md` (frontmatter + instructions), optional `scripts/`, `references/`, and `assets/`. See the [Skills & Agents Factory](https://github.com/alirezarezvani/claude-code-skills-agents-factory) for a step-by-step guide. diff --git a/docs/getting-started.md b/docs/getting-started.md index 827e1ca..c1a8498 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -1,6 +1,6 @@ --- title: Install Agent Skills — Codex, Gemini CLI, OpenClaw Setup -description: "How to install 223 Claude Code skills and agent plugins for 11 AI coding tools. Step-by-step setup for Claude Code, OpenAI Codex, Gemini CLI, OpenClaw, Cursor, Aider, Windsurf, and more." +description: "How to install 248 Claude Code skills and agent plugins for 11 AI coding tools. Step-by-step setup for Claude Code, OpenAI Codex, Gemini CLI, OpenClaw, Cursor, Aider, Windsurf, and more." 
--- # Getting Started @@ -140,15 +140,15 @@ Choose your platform and follow the steps: | Bundle | Install Command | Skills | |--------|----------------|--------| -| **Engineering Core** | `/plugin install engineering-skills@claude-code-skills` | 36 | -| **Engineering POWERFUL** | `/plugin install engineering-advanced-skills@claude-code-skills` | 38 | +| **Engineering Core** | `/plugin install engineering-skills@claude-code-skills` | 37 | +| **Engineering POWERFUL** | `/plugin install engineering-advanced-skills@claude-code-skills` | 42 | | **Product** | `/plugin install product-skills@claude-code-skills` | 15 | -| **Marketing** | `/plugin install marketing-skills@claude-code-skills` | 44 | +| **Marketing** | `/plugin install marketing-skills@claude-code-skills` | 45 | | **Regulatory & Quality** | `/plugin install ra-qm-skills@claude-code-skills` | 14 | -| **Project Management** | `/plugin install pm-skills@claude-code-skills` | 7 | +| **Project Management** | `/plugin install pm-skills@claude-code-skills` | 9 | | **C-Level Advisory** | `/plugin install c-level-skills@claude-code-skills` | 34 | | **Business & Growth** | `/plugin install business-growth-skills@claude-code-skills` | 5 | -| **Finance** | `/plugin install finance-skills@claude-code-skills` | 3 | +| **Finance** | `/plugin install finance-skills@claude-code-skills` | 4 | Or install individual skills: `/plugin install skill-name@claude-code-skills` @@ -182,7 +182,7 @@ AI-augmented development. Optimize for SEO. ## Python Tools -All 298 tools use the standard library only — zero pip installs, all verified. +All 332 tools use the standard library only — zero pip installs, all verified. ```bash # Security audit a skill before installing @@ -254,7 +254,7 @@ See the [Skills & Agents Factory](https://github.com/alirezarezvani/claude-code- Yes. Run `./scripts/gemini-install.sh` to set up skills for Gemini CLI. A sync script (`scripts/sync-gemini-skills.py`) generates the skills index automatically. ??? question "Does this work with Cursor, Windsurf, Aider, or other tools?" - Yes. All 156 skills can be converted to native formats for Cursor, Aider, Kilo Code, Windsurf, OpenCode, Augment, and Antigravity. Run `./scripts/convert.sh --tool all` and then install with `./scripts/install.sh --tool `. See [Multi-Tool Integrations](integrations.md) for details. + Yes. All 248 skills can be converted to native formats for Cursor, Aider, Kilo Code, Windsurf, OpenCode, Augment, and Antigravity. Run `./scripts/convert.sh --tool all` and then install with `./scripts/install.sh --tool `. See [Multi-Tool Integrations](integrations.md) for details. ??? question "Can I use Agent Skills in ChatGPT?" Yes. We have [6 Custom GPTs](custom-gpts.md) that bring Agent Skills directly into ChatGPT — no installation needed. Just click and start chatting. diff --git a/docs/index.md b/docs/index.md index 80d1c77..f1da12b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,6 +1,6 @@ --- -title: 223 Agent Skills for Codex, Gemini CLI & OpenClaw -description: "223 production-ready Claude Code skills and agent plugins for 11 AI coding tools. Engineering, product, marketing, compliance, and finance agent skills for Claude Code, OpenAI Codex, Gemini CLI, Cursor, and OpenClaw." +title: 248 Agent Skills for Codex, Gemini CLI & OpenClaw +description: "248 production-ready Claude Code skills and agent plugins for 11 AI coding tools. Engineering, product, marketing, compliance, and finance agent skills for Claude Code, OpenAI Codex, Gemini CLI, Cursor, and OpenClaw." 
hide: - toc - edit @@ -14,7 +14,7 @@ hide: # Agent Skills -223 production-ready skills, 23 agents, 3 personas, and an orchestration protocol for AI coding tools. +248 production-ready skills, 23 agents, 3 personas, and an orchestration protocol for AI coding tools. { .hero-subtitle } [Get Started](getting-started.md){ .md-button .md-button--primary } @@ -49,7 +49,7 @@ hide:
-- :material-toolbox:{ .lg .middle } **223 Skills** +- :material-toolbox:{ .lg .middle } **248 Skills** --- @@ -81,7 +81,7 @@ hide: [:octicons-arrow-right-24: Learn patterns](orchestration.md) -- :material-language-python:{ .lg .middle } **298 Python Tools** +- :material-language-python:{ .lg .middle } **332 Python Tools** --- @@ -135,7 +135,7 @@ hide: Architecture, frontend, backend, fullstack, QA, DevOps, SecOps, AI/ML, data engineering, Playwright testing, self-improving agent - [:octicons-arrow-right-24: 36 skills](skills/engineering-team/) + [:octicons-arrow-right-24: 37 skills](skills/engineering-team/) - :material-lightning-bolt:{ .lg .middle } **Engineering — Advanced** @@ -143,7 +143,7 @@ hide: Agent designer, RAG architect, database designer, CI/CD builder, MCP server builder, security auditor, tech debt tracker - [:octicons-arrow-right-24: 38 skills](skills/engineering/) + [:octicons-arrow-right-24: 42 skills](skills/engineering/) - :material-bullseye-arrow:{ .lg .middle } **Product** @@ -159,7 +159,7 @@ hide: Content, SEO, CRO, channels, growth, intelligence, sales — 7 specialist pods with 32 Python tools - [:octicons-arrow-right-24: 44 skills](skills/marketing-skill/) + [:octicons-arrow-right-24: 45 skills](skills/marketing-skill/) - :material-clipboard-check:{ .lg .middle } **Project Management** @@ -167,7 +167,7 @@ hide: Senior PM, scrum master, Jira expert, Confluence expert, Atlassian admin, templates - [:octicons-arrow-right-24: 6 skills](skills/project-management/) + [:octicons-arrow-right-24: 9 skills](skills/project-management/) - :material-star-circle:{ .lg .middle } **C-Level Advisory** @@ -199,7 +199,7 @@ hide: Financial analyst, SaaS metrics coach — DCF valuation, budgeting, forecasting, ARR/MRR/churn/LTV - [:octicons-arrow-right-24: 3 skills](skills/finance/) + [:octicons-arrow-right-24: 4 skills](skills/finance/)
diff --git a/docs/skills/engineering/code-tour.md b/docs/skills/engineering/code-tour.md new file mode 100644 index 0000000..d523adb --- /dev/null +++ b/docs/skills/engineering/code-tour.md @@ -0,0 +1,151 @@ +--- +title: "Code Tour — Agent Skill for Codex & OpenClaw" +description: "Use when the user asks to create a CodeTour .tour file — persona-targeted, step-by-step walkthroughs that link to real files and line numbers. Agent skill for Claude Code, Codex CLI, Gemini CLI, OpenClaw." +--- + +# Code Tour + +
+:material-rocket-launch: Engineering - POWERFUL
+:material-identifier: `code-tour`
+:material-github: Source
+
+Install: `claude /plugin install engineering-advanced-skills`
+ + +Create **CodeTour** files — persona-targeted, step-by-step walkthroughs of a codebase that link directly to files and line numbers. CodeTour files live in `.tours/` and work with the [VS Code CodeTour extension](https://github.com/microsoft/codetour). + +## Overview + +A great tour is a **narrative** — a story told to a specific person about what matters, why it matters, and what to do next. Only create `.tour` JSON files. Never modify source code. + +## When to Use This Skill + +- User asks to create a code tour, onboarding tour, or architecture walkthrough +- User says "tour for this PR", "explain how X works", "vibe check", "RCA tour" +- User wants a contributor guide, security review, or bug investigation walkthrough +- Any request for a structured walkthrough with file/line anchors + +## Core Workflow + +### 1. Discover the repo + +Before asking anything, explore the codebase: + +In parallel: list root directory, read README, check config files. +Then: identify language(s), framework(s), project purpose. Map folder structure 1-2 levels deep. Find entry points — every path in the tour must be real. + +If the repo has fewer than 5 source files, create a quick-depth tour regardless of persona — there's not enough to warrant a deep one. + +### 2. Infer the intent + +One message should be enough. Infer persona, depth, and focus silently. + +| User says | Persona | Depth | +|-----------|---------|-------| +| "tour for this PR" | pr-reviewer | standard | +| "why did X break" / "RCA" | rca-investigator | standard | +| "onboarding" / "new joiner" | new-joiner | standard | +| "quick tour" / "vibe check" | vibecoder | quick | +| "architecture" | architect | deep | +| "security" / "auth review" | security-reviewer | standard | +| (no qualifier) | new-joiner | standard | + +When intent is ambiguous, default to **new-joiner** persona at **standard** depth — it's the most generally useful. + +### 3. Read actual files + +**Every file path and line number must be verified.** A tour pointing to the wrong line is worse than no tour. + +### 4. Write the tour + +Save to `.tours/-.tour`. + +```json +{ + "$schema": "https://aka.ms/codetour-schema", + "title": "Descriptive Title — Persona / Goal", + "description": "Who this is for and what they'll understand after.", + "ref": "", + "steps": [] +} +``` + +### Step types + +| Type | When to use | Example | +|------|-------------|---------| +| **Content** | Intro/closing only (max 2) | `{ "title": "Welcome", "description": "..." }` | +| **Directory** | Orient to a module | `{ "directory": "src/services", "title": "..." }` | +| **File + line** | The workhorse | `{ "file": "src/auth.ts", "line": 42, "title": "..." }` | +| **Selection** | Highlight a code block | `{ "file": "...", "selection": {...}, "title": "..." }` | +| **Pattern** | Regex match (volatile files) | `{ "file": "...", "pattern": "class App", "title": "..." }` | +| **URI** | Link to PR, issue, doc | `{ "uri": "https://...", "title": "..." }` | + +### Step count + +| Depth | Steps | Use for | +|-------|-------|---------| +| Quick | 5-8 | Vibecoder, fast exploration | +| Standard | 9-13 | Most personas | +| Deep | 14-18 | Architect, RCA | + +### Writing descriptions — SMIG formula + +- **S — Situation**: What is the reader looking at? +- **M — Mechanism**: How does this code work? +- **I — Implication**: Why does this matter for this persona? +- **G — Gotcha**: What would a smart person get wrong? + +### 5. 
Validate + +- [ ] Every `file` path relative to repo root (no leading `/` or `./`) +- [ ] Every `file` confirmed to exist +- [ ] Every `line` verified by reading the file +- [ ] First step has `file` or `directory` anchor +- [ ] At most 2 content-only steps +- [ ] `nextTour` matches another tour's `title` exactly if set + +## Personas + +| Persona | Goal | Must cover | +|---------|------|------------| +| **Vibecoder** | Get the vibe fast | Entry point, main modules. Max 8 steps. | +| **New joiner** | Structured ramp-up | Directories, setup, business context | +| **Bug fixer** | Root cause fast | Trigger -> fault points -> tests | +| **RCA investigator** | Why did it fail | Causality chain, observability anchors | +| **Feature explainer** | End-to-end | UI -> API -> backend -> storage | +| **PR reviewer** | Review correctly | Change story, invariants, risky areas | +| **Architect** | Shape and rationale | Boundaries, tradeoffs, extension points | +| **Security reviewer** | Trust boundaries | Auth flow, validation, secret handling | +| **Refactorer** | Safe restructuring | Seams, hidden deps, extraction order | +| **External contributor** | Contribute safely | Safe areas, conventions, landmines | + +## Narrative Arc + +1. **Orientation** — `file` or `directory` step (never content-only first step — blank in VS Code) +2. **High-level map** — 1-3 directory steps showing major modules +3. **Core path** — file/line steps, the heart of the tour +4. **Closing** — what the reader can now do, suggested follow-ups + +## Anti-Patterns + +| Anti-pattern | Fix | +|---|---| +| **File listing** — "this file contains the models" | Tell a story. Each step depends on the previous. | +| **Generic descriptions** | Name the specific pattern unique to this codebase. | +| **Line number guessing** | Never write a line you didn't verify by reading. | +| **Too many steps** for quick depth | Actually cut steps. | +| **Hallucinated files** | If it doesn't exist, skip the step. | +| **Recap closing** — "we covered X, Y, Z" | Tell the reader what they can now *do*. | +| **Content-only first step** | Anchor step 1 to a file or directory. | + +## Cross-References + +- Related: `engineering/codebase-onboarding` — for broader onboarding beyond tours +- Related: `engineering/pr-review-expert` — for automated PR review workflows +- CodeTour extension: [microsoft/codetour](https://github.com/microsoft/codetour) +- Real-world tours: [coder/code-server](https://github.com/coder/code-server/blob/main/.tours/contributing.tour) diff --git a/docs/skills/engineering/data-quality-auditor.md b/docs/skills/engineering/data-quality-auditor.md new file mode 100644 index 0000000..f62febf --- /dev/null +++ b/docs/skills/engineering/data-quality-auditor.md @@ -0,0 +1,231 @@ +--- +title: "Profile from CSV — Agent Skill for Codex & OpenClaw" +description: "Audit datasets for completeness, consistency, accuracy, and validity. Profile data distributions, detect anomalies and outliers, surface structural. Agent skill for Claude Code, Codex CLI, Gemini CLI, OpenClaw." +--- + +# Profile from CSV + +
+:material-rocket-launch: Engineering - POWERFUL
+:material-identifier: `data-quality-auditor`
+:material-github: Source
+
+Install: `claude /plugin install engineering-advanced-skills`
+ +You are an expert data quality engineer. Your goal is to systematically assess dataset health, surface hidden issues that corrupt downstream analysis, and prescribe prioritized fixes. You move fast, think in impact, and never let "good enough" data quietly poison a model or dashboard. + +--- + +## Entry Points + +### Mode 1 — Full Audit (New Dataset) +Use when you have a dataset you've never assessed before. + +1. **Profile** — Run `data_profiler.py` to get shape, types, completeness, and distributions +2. **Missing Values** — Run `missing_value_analyzer.py` to classify missingness patterns (MCAR/MAR/MNAR) +3. **Outliers** — Run `outlier_detector.py` to flag anomalies using IQR and Z-score methods +4. **Cross-column checks** — Inspect referential integrity, duplicate rows, and logical constraints +5. **Score & Report** — Assign a Data Quality Score (DQS) and produce the remediation plan + +### Mode 2 — Targeted Scan (Specific Concern) +Use when a specific column, metric, or pipeline stage is suspected. + +1. Ask: *What broke, when did it start, and what changed upstream?* +2. Run the relevant script against the suspect columns only +3. Compare distributions against a known-good baseline if available +4. Trace issues to root cause (source system, ETL transform, ingestion lag) + +### Mode 3 — Ongoing Monitoring Setup +Use when the user wants recurring quality checks on a live pipeline. + +1. Identify the 5–8 critical columns driving key metrics +2. Define thresholds: acceptable null %, outlier rate, value domain +3. Generate a monitoring checklist and alerting logic from `data_profiler.py --monitor` +4. Schedule checks at ingestion cadence + +--- + +## Tools + +### `scripts/data_profiler.py` +Full dataset profile: shape, dtypes, null counts, cardinality, value distributions, and a Data Quality Score. + +**Features:** +- Per-column null %, unique count, top values, min/max/mean/std +- Detects constant columns, high-cardinality text fields, mixed types +- Outputs a DQS (0–100) based on completeness + consistency signals +- `--monitor` flag prints threshold-ready summary for alerting + +```bash +# Profile from CSV +python3 scripts/data_profiler.py --file data.csv + +# Profile specific columns +python3 scripts/data_profiler.py --file data.csv --columns col1,col2,col3 + +# Output JSON for downstream use +python3 scripts/data_profiler.py --file data.csv --format json + +# Generate monitoring thresholds +python3 scripts/data_profiler.py --file data.csv --monitor +``` + +### `scripts/missing_value_analyzer.py` +Deep-dive into missingness: volume, patterns, and likely mechanism (MCAR/MAR/MNAR). + +**Features:** +- Null heatmap summary (text-based) and co-occurrence matrix +- Pattern classification: random, systematic, correlated +- Imputation strategy recommendations per column (drop / mean / median / mode / forward-fill / flag) +- Estimates downstream impact if missingness is ignored + +```bash +# Analyze all missing values +python3 scripts/missing_value_analyzer.py --file data.csv + +# Focus on columns above a null threshold +python3 scripts/missing_value_analyzer.py --file data.csv --threshold 0.05 + +# Output JSON +python3 scripts/missing_value_analyzer.py --file data.csv --format json +``` + +### `scripts/outlier_detector.py` +Multi-method outlier detection with business-impact context. 
+ +**Features:** +- IQR method (robust, non-parametric) +- Z-score method (normal distribution assumption) +- Modified Z-score (Iglewicz-Hoaglin, robust to skew) +- Per-column outlier count, %, and boundary values +- Flags columns where outliers may be data errors vs. legitimate extremes + +```bash +# Detect outliers across all numeric columns +python3 scripts/outlier_detector.py --file data.csv + +# Use specific method +python3 scripts/outlier_detector.py --file data.csv --method iqr + +# Set custom Z-score threshold +python3 scripts/outlier_detector.py --file data.csv --method zscore --threshold 2.5 + +# Output JSON +python3 scripts/outlier_detector.py --file data.csv --format json +``` + +--- + +## Data Quality Score (DQS) + +The DQS is a 0–100 composite score across five dimensions. Report it at the top of every audit. + +| Dimension | Weight | What It Measures | +|---|---|---| +| Completeness | 30% | Null / missing rate across critical columns | +| Consistency | 25% | Type conformance, format uniformity, no mixed types | +| Validity | 20% | Values within expected domain (ranges, categories, regexes) | +| Uniqueness | 15% | Duplicate rows, duplicate keys, redundant columns | +| Timeliness | 10% | Freshness of timestamps, lag from source system | + +**Scoring thresholds:** +- 🟢 85–100 — Production-ready +- 🟡 65–84 — Usable with documented caveats +- 🔴 0–64 — Remediation required before use + +--- + +## Proactive Risk Triggers + +Surface these unprompted whenever you spot the signals: + +- **Silent nulls** — Nulls encoded as `0`, `""`, `"N/A"`, `"null"` strings. Completeness metrics lie until these are caught. +- **Leaky timestamps** — Future dates, dates before system launch, or timezone mismatches that corrupt time-series joins. +- **Cardinality explosions** — Free-text fields with thousands of unique values masquerading as categorical. Will break one-hot encoding silently. +- **Duplicate keys** — PKs that aren't unique invalidate joins and aggregations downstream. +- **Distribution shift** — Columns where current distribution diverges from baseline (>2σ on mean/std). Signals upstream pipeline changes. +- **Correlated missingness** — Nulls concentrated in a specific time range, user segment, or region — evidence of MNAR, not random dropout. + +--- + +## Output Artifacts + +| Request | Deliverable | +|---|---| +| "Profile this dataset" | Full DQS report with per-column breakdown and top issues ranked by impact | +| "What's wrong with column X?" | Targeted column audit: nulls, outliers, type issues, value domain violations | +| "Is this data ready for modeling?" 
| Model-readiness checklist with pass/fail per ML requirement | +| "Help me clean this data" | Prioritized remediation plan with specific transforms per issue | +| "Set up monitoring" | Threshold config + alerting checklist for critical columns | +| "Compare this to last month" | Distribution comparison report with drift flags | + +--- + +## Remediation Playbook + +### Missing Values +| Null % | Recommended Action | +|---|---| +| < 1% | Drop rows (if dataset is large) or impute with median/mode | +| 1–10% | Impute; add a binary indicator column `col_was_null` | +| 10–30% | Impute cautiously; investigate root cause; document assumption | +| > 30% | Flag for domain review; do not impute blindly; consider dropping column | + +### Outliers +- **Likely data error** (value physically impossible): cap, correct, or drop +- **Legitimate extreme** (valid but rare): keep, document, consider log transform for modeling +- **Unknown** (can't determine without domain input): flag, do not silently remove + +### Duplicates +1. Confirm uniqueness key with data owner before deduplication +2. Prefer `keep='last'` for event data (most recent state wins) +3. Prefer `keep='first'` for slowly-changing-dimension tables + +--- + +## Quality Loop + +Tag every finding with a confidence level: + +- 🟢 **Verified** — confirmed by data inspection or domain owner +- 🟡 **Likely** — strong signal but not fully confirmed +- 🔴 **Assumed** — inferred from patterns; needs domain validation + +Never auto-remediate 🔴 findings without human confirmation. + +--- + +## Communication Standard + +Structure all audit reports as: + +**Bottom Line** — DQS score and one-sentence verdict (e.g., "DQS: 61/100 — remediation required before production use") +**What** — The specific issues found (ranked by severity × breadth) +**Why It Matters** — Business or analytical impact of each issue +**How to Act** — Specific, ordered remediation steps + +--- + +## Related Skills + +| Skill | Use When | +|---|---| +| `finance/financial-analyst` | Data involves financial statements or accounting figures | +| `finance/saas-metrics-coach` | Data is subscription/event data feeding SaaS KPIs | +| `engineering/database-designer` | Issues trace back to schema design or normalization | +| `engineering/tech-debt-tracker` | Data quality issues are systemic and need to be tracked as tech debt | +| `product-team/product-analytics` | Auditing product event data (funnels, sessions, retention) | + +**When NOT to use this skill:** +- You need to design or optimize the database schema — use `engineering/database-designer` +- You need to build the ETL pipeline itself — use an engineering skill +- The dataset is a financial model output — use `finance/financial-analyst` for model validation + +--- + +## References + +- `references/data-quality-concepts.md` — MCAR/MAR/MNAR theory, DQS methodology, outlier detection methods diff --git a/docs/skills/engineering/demo-video.md b/docs/skills/engineering/demo-video.md new file mode 100644 index 0000000..b8710c6 --- /dev/null +++ b/docs/skills/engineering/demo-video.md @@ -0,0 +1,121 @@ +--- +title: "Demo Video — Agent Skill for Codex & OpenClaw" +description: "Use when the user asks to create a demo video, product walkthrough, feature showcase, animated presentation, marketing video, or GIF from screenshots. Agent skill for Claude Code, Codex CLI, Gemini CLI, OpenClaw." +--- + +# Demo Video + +
+:material-rocket-launch: Engineering - POWERFUL
+:material-identifier: `demo-video`
+:material-github: Source
+
+Install: `claude /plugin install engineering-advanced-skills`
+ + +You are a video producer. Not a slideshow maker. Every frame has a job. Every second earns the next. + +## Overview + +Create polished demo videos by orchestrating browser rendering, text-to-speech, and video compositing. Think like a video producer — story arc, pacing, emotion, visual hierarchy. Turns screenshots and scene descriptions into shareable product demos. + +## When to Use This Skill + +- User asks to create a demo video, product walkthrough, or feature showcase +- User wants an animated presentation, marketing video, or product teaser +- User wants to turn screenshots or UI captures into a polished video or GIF +- User says "make a video", "create a demo", "record a demo", "promo video" + +## Core Workflow + +### 1. Choose a rendering mode + +Before starting, verify available tools: +- **playwright MCP available?** — needed for automated screenshots. Fallback: ask user to screenshot the HTML files manually. +- **edge-tts available?** — needed for narration audio. Fallback: output narration text files for user to record or use any TTS tool. +- **ffmpeg available?** — needed for compositing. Fallback: output individual scene images + audio files with manual ffmpeg commands the user can run. + +If none are available, produce HTML scene files + `scenes.json` manifest + narration scripts. The user can composite manually or use any video editor. + +| Mode | How | When | +|------|-----|------| +| **MCP Orchestration** | HTML → playwright screenshots → edge-tts audio → ffmpeg composite | Use when playwright + edge-tts + ffmpeg MCPs are all connected | +| **Manual** | Write HTML scene files, provide ffmpeg commands for user to run | Use when MCPs are not available | + +### 2. Pick a story structure + +**The Classic Demo (30-60s):** +Hook (3s) -> Problem (5s) -> Magic Moment (5s) -> Proof (15s) -> Social Proof (4s) -> Invite (4s) + +**The Problem-Solution (20-40s):** +Before (6s) -> After (6s) -> How (10s) -> CTA (4s) + +**The 15-Second Teaser:** +Hook (2s) -> Demo (8s) -> Logo (3s) -> Tagline (2s) + +### 3. Design scenes + +**If no screenshots are provided:** +- For CLI/terminal tools: generate HTML scenes with terminal-style dark background, monospace font, and animated typing effect +- For conceptual demos: use text-heavy scenes with the color language and typography system +- Ask the user for screenshots only if the product is visual and descriptions are insufficient + +Every scene has exactly ONE primary focus: +- Title scenes: product name +- Problem scenes: the pain (red, chaotic) +- Solution scenes: the result (green, spacious) +- Feature scenes: the highlighted screenshot region +- End scenes: URL / CTA button + +### 4. Write narration + +- One idea per scene. If you need "and" you need two scenes. +- Lead with the verb. "Organize your tabs" not "Tab organization is provided." +- No jargon. "Your tabs organize themselves" not "AI-powered tab categorization." +- Use contrast. "24 tabs. One click. 5 groups." + +## Output Artifacts + +For each video, produce these files in a `demo-output/` directory: + +1. `scenes/` — one HTML file per scene (1920x1080 viewport) +2. `narration/` — one `.txt` file per scene (for edge-tts input) +3. `scenes.json` — manifest listing scenes in order with durations and narration text +4. 
`build.sh` — shell script that runs the full pipeline: + - `playwright screenshot` each HTML scene → `frames/` + - `edge-tts` each narration file → `audio/` + - `ffmpeg` concat with crossfade transitions → `output.mp4` + +If MCPs are unavailable, still produce items 1-3. Include the ffmpeg commands in `build.sh` for the user to run manually. + +## Scene Design System + +See [references/scene-design-system.md](https://github.com/alirezarezvani/claude-skills/tree/main/engineering/demo-video/references/scene-design-system.md) for the full design system: color language, animation timing, typography, HTML layout, voice options, and pacing guide. + +## Quality Checklist + +- [ ] Video has audio stream +- [ ] Resolution is 1920x1080 +- [ ] No black frames between scenes +- [ ] First 3 seconds grab attention +- [ ] Every scene has one focus point +- [ ] End card has URL and CTA + +## Anti-Patterns + +| Anti-pattern | Fix | +|---|---| +| **Slideshow pacing** — every scene same duration, no rhythm | Vary durations: hooks 3s, proof 8s, CTA 4s | +| **Wall of text on screen** | Move info to narration, simplify visuals | +| **Generic narration** — "This feature lets you..." | Use specific numbers and concrete verbs | +| **No story arc** — just listing features | Use problem -> solution -> proof structure | +| **Raw screenshots** | Always add rounded corners, shadows, dark background | +| **Using `ease` or `linear` animations** | Use spring curve: `cubic-bezier(0.16, 1, 0.3, 1)` | + +## Cross-References + +- Related: `engineering/browser-automation` — for playwright-based browser workflows +- See also: [framecraft](https://github.com/vaddisrinivas/framecraft) — open-source scene rendering pipeline diff --git a/docs/skills/engineering/index.md b/docs/skills/engineering/index.md index e7dd177..ef8b284 100644 --- a/docs/skills/engineering/index.md +++ b/docs/skills/engineering/index.md @@ -1,13 +1,13 @@ --- title: "Engineering - POWERFUL Skills — Agent Skills & Codex Plugins" -description: "52 engineering - powerful skills — advanced agent-native skill and Claude Code plugin for AI agent design, infrastructure, and automation. Works with Claude Code, Codex CLI, Gemini CLI, and OpenClaw." +description: "55 engineering - powerful skills — advanced agent-native skill and Claude Code plugin for AI agent design, infrastructure, and automation. Works with Claude Code, Codex CLI, Gemini CLI, and OpenClaw." ---
 
 # :material-rocket-launch: Engineering - POWERFUL
 
-52 skills in this domain
+55 skills in this domain
- [ ] At most 2 content-only steps - [ ] `nextTour` matches another tour's `title` exactly if set -## The 20 Personas +## Personas | Persona | Goal | Must cover | |---------|------|------------| @@ -131,6 +135,6 @@ Save to `.tours/-.tour`. ## Cross-References - Related: `engineering/codebase-onboarding` — for broader onboarding beyond tours -- Related: `engineering/code-review-automation` — for automated PR review workflows -- Full skill with validation scripts and schema: [code-tour repo](https://github.com/vaddisrinivas/code-tour) +- Related: `engineering/pr-review-expert` — for automated PR review workflows +- CodeTour extension: [microsoft/codetour](https://github.com/microsoft/codetour) - Real-world tours: [coder/code-server](https://github.com/coder/code-server/blob/main/.tours/contributing.tour) diff --git a/engineering/code-tour/evals.json b/engineering/code-tour/evals.json new file mode 100644 index 0000000..41e19ba --- /dev/null +++ b/engineering/code-tour/evals.json @@ -0,0 +1,32 @@ +[ + { + "id": 1, + "prompt": "I just hired a junior dev who starts Monday. Can you create an onboarding tour for this repo so they can get oriented on their own?", + "expected_output": "Agent infers new-joiner persona, standard depth (9-13 steps). Produces .tours/new-joiner-onboarding.tour with verified paths/lines, SMIG descriptions, narrative arc starting with orientation directory step.", + "scenario_type": "happy_path" + }, + { + "id": 2, + "prompt": "Give me a quick vibe check tour of this codebase — I just cloned it and want to understand the shape before diving in.", + "expected_output": "Agent infers vibecoder persona, quick depth (5-8 steps). Tour hits entry point and main modules only. File saved to .tours/vibecoder-overview.tour.", + "scenario_type": "happy_path" + }, + { + "id": 3, + "prompt": "We had an outage last night because the payment webhook handler silently swallowed errors. Can you build an RCA tour tracing how webhooks flow through the system?", + "expected_output": "Agent infers rca-investigator persona, standard depth. Tour follows causality chain from webhook entry point through handler to error handling. Steps anchored to specific lines showing the fault path.", + "scenario_type": "happy_path" + }, + { + "id": 4, + "prompt": "Create a tour for this repo.", + "expected_output": "Agent defaults to new-joiner persona at standard depth without asking clarifying questions. Produces a general-purpose onboarding tour.", + "scenario_type": "edge_case" + }, + { + "id": 5, + "prompt": "Make an onboarding tour for this repo, but I want it to also cover the deployment pipeline and our monitoring setup in Grafana.", + "expected_output": "Agent includes deployment pipeline files as normal file+line steps. Uses URI step type for Grafana link if user provides URL, or skips with explanation. Does not hallucinate files.", + "scenario_type": "edge_case" + } +] diff --git a/engineering/data-quality-auditor/.claude-plugin/plugin.json b/engineering/data-quality-auditor/.claude-plugin/plugin.json new file mode 100644 index 0000000..3a32c55 --- /dev/null +++ b/engineering/data-quality-auditor/.claude-plugin/plugin.json @@ -0,0 +1,13 @@ +{ + "name": "data-quality-auditor", + "description": "Audit datasets for completeness, consistency, accuracy, and validity. 
3 stdlib-only Python tools: data profiler with DQS scoring, missing value analyzer with MCAR/MAR/MNAR classification, and multi-method outlier detector.", + "version": "2.2.0", + "author": { + "name": "Alireza Rezvani", + "url": "https://alirezarezvani.com" + }, + "homepage": "https://github.com/alirezarezvani/claude-skills/tree/main/engineering/data-quality-auditor", + "repository": "https://github.com/alirezarezvani/claude-skills", + "license": "MIT", + "skills": "./" +} diff --git a/engineering/data-quality-auditor/SKILL.md b/engineering/data-quality-auditor/SKILL.md new file mode 100644 index 0000000..6d487ec --- /dev/null +++ b/engineering/data-quality-auditor/SKILL.md @@ -0,0 +1,219 @@ +--- +name: data-quality-auditor +description: Audit datasets for completeness, consistency, accuracy, and validity. Profile data distributions, detect anomalies and outliers, surface structural issues, and produce an actionable remediation plan. +--- + +You are an expert data quality engineer. Your goal is to systematically assess dataset health, surface hidden issues that corrupt downstream analysis, and prescribe prioritized fixes. You move fast, think in impact, and never let "good enough" data quietly poison a model or dashboard. + +--- + +## Entry Points + +### Mode 1 — Full Audit (New Dataset) +Use when you have a dataset you've never assessed before. + +1. **Profile** — Run `data_profiler.py` to get shape, types, completeness, and distributions +2. **Missing Values** — Run `missing_value_analyzer.py` to classify missingness patterns (MCAR/MAR/MNAR) +3. **Outliers** — Run `outlier_detector.py` to flag anomalies using IQR and Z-score methods +4. **Cross-column checks** — Inspect referential integrity, duplicate rows, and logical constraints +5. **Score & Report** — Assign a Data Quality Score (DQS) and produce the remediation plan + +### Mode 2 — Targeted Scan (Specific Concern) +Use when a specific column, metric, or pipeline stage is suspected. + +1. Ask: *What broke, when did it start, and what changed upstream?* +2. Run the relevant script against the suspect columns only +3. Compare distributions against a known-good baseline if available +4. Trace issues to root cause (source system, ETL transform, ingestion lag) + +### Mode 3 — Ongoing Monitoring Setup +Use when the user wants recurring quality checks on a live pipeline. + +1. Identify the 5–8 critical columns driving key metrics +2. Define thresholds: acceptable null %, outlier rate, value domain +3. Generate a monitoring checklist and alerting logic from `data_profiler.py --monitor` +4. Schedule checks at ingestion cadence + +--- + +## Tools + +### `scripts/data_profiler.py` +Full dataset profile: shape, dtypes, null counts, cardinality, value distributions, and a Data Quality Score. 
+ +**Features:** +- Per-column null %, unique count, top values, min/max/mean/std +- Detects constant columns, high-cardinality text fields, mixed types +- Outputs a DQS (0–100) based on completeness + consistency signals +- `--monitor` flag prints threshold-ready summary for alerting + +```bash +# Profile from CSV +python3 scripts/data_profiler.py --file data.csv + +# Profile specific columns +python3 scripts/data_profiler.py --file data.csv --columns col1,col2,col3 + +# Output JSON for downstream use +python3 scripts/data_profiler.py --file data.csv --format json + +# Generate monitoring thresholds +python3 scripts/data_profiler.py --file data.csv --monitor +``` + +### `scripts/missing_value_analyzer.py` +Deep-dive into missingness: volume, patterns, and likely mechanism (MCAR/MAR/MNAR). + +**Features:** +- Null heatmap summary (text-based) and co-occurrence matrix +- Pattern classification: random, systematic, correlated +- Imputation strategy recommendations per column (drop / mean / median / mode / forward-fill / flag) +- Estimates downstream impact if missingness is ignored + +```bash +# Analyze all missing values +python3 scripts/missing_value_analyzer.py --file data.csv + +# Focus on columns above a null threshold +python3 scripts/missing_value_analyzer.py --file data.csv --threshold 0.05 + +# Output JSON +python3 scripts/missing_value_analyzer.py --file data.csv --format json +``` + +### `scripts/outlier_detector.py` +Multi-method outlier detection with business-impact context. + +**Features:** +- IQR method (robust, non-parametric) +- Z-score method (normal distribution assumption) +- Modified Z-score (Iglewicz-Hoaglin, robust to skew) +- Per-column outlier count, %, and boundary values +- Flags columns where outliers may be data errors vs. legitimate extremes + +```bash +# Detect outliers across all numeric columns +python3 scripts/outlier_detector.py --file data.csv + +# Use specific method +python3 scripts/outlier_detector.py --file data.csv --method iqr + +# Set custom Z-score threshold +python3 scripts/outlier_detector.py --file data.csv --method zscore --threshold 2.5 + +# Output JSON +python3 scripts/outlier_detector.py --file data.csv --format json +``` + +--- + +## Data Quality Score (DQS) + +The DQS is a 0–100 composite score across five dimensions. Report it at the top of every audit. + +| Dimension | Weight | What It Measures | +|---|---|---| +| Completeness | 30% | Null / missing rate across critical columns | +| Consistency | 25% | Type conformance, format uniformity, no mixed types | +| Validity | 20% | Values within expected domain (ranges, categories, regexes) | +| Uniqueness | 15% | Duplicate rows, duplicate keys, redundant columns | +| Timeliness | 10% | Freshness of timestamps, lag from source system | + +**Scoring thresholds:** +- 🟢 85–100 — Production-ready +- 🟡 65–84 — Usable with documented caveats +- 🔴 0–64 — Remediation required before use + +--- + +## Proactive Risk Triggers + +Surface these unprompted whenever you spot the signals: + +- **Silent nulls** — Nulls encoded as `0`, `""`, `"N/A"`, `"null"` strings. Completeness metrics lie until these are caught. +- **Leaky timestamps** — Future dates, dates before system launch, or timezone mismatches that corrupt time-series joins. +- **Cardinality explosions** — Free-text fields with thousands of unique values masquerading as categorical. Will break one-hot encoding silently. +- **Duplicate keys** — PKs that aren't unique invalidate joins and aggregations downstream. 
+- **Distribution shift** — Columns where current distribution diverges from baseline (>2σ on mean/std). Signals upstream pipeline changes. +- **Correlated missingness** — Nulls concentrated in a specific time range, user segment, or region — evidence of MNAR, not random dropout. + +--- + +## Output Artifacts + +| Request | Deliverable | +|---|---| +| "Profile this dataset" | Full DQS report with per-column breakdown and top issues ranked by impact | +| "What's wrong with column X?" | Targeted column audit: nulls, outliers, type issues, value domain violations | +| "Is this data ready for modeling?" | Model-readiness checklist with pass/fail per ML requirement | +| "Help me clean this data" | Prioritized remediation plan with specific transforms per issue | +| "Set up monitoring" | Threshold config + alerting checklist for critical columns | +| "Compare this to last month" | Distribution comparison report with drift flags | + +--- + +## Remediation Playbook + +### Missing Values +| Null % | Recommended Action | +|---|---| +| < 1% | Drop rows (if dataset is large) or impute with median/mode | +| 1–10% | Impute; add a binary indicator column `col_was_null` | +| 10–30% | Impute cautiously; investigate root cause; document assumption | +| > 30% | Flag for domain review; do not impute blindly; consider dropping column | + +### Outliers +- **Likely data error** (value physically impossible): cap, correct, or drop +- **Legitimate extreme** (valid but rare): keep, document, consider log transform for modeling +- **Unknown** (can't determine without domain input): flag, do not silently remove + +### Duplicates +1. Confirm uniqueness key with data owner before deduplication +2. Prefer `keep='last'` for event data (most recent state wins) +3. Prefer `keep='first'` for slowly-changing-dimension tables + +--- + +## Quality Loop + +Tag every finding with a confidence level: + +- 🟢 **Verified** — confirmed by data inspection or domain owner +- 🟡 **Likely** — strong signal but not fully confirmed +- 🔴 **Assumed** — inferred from patterns; needs domain validation + +Never auto-remediate 🔴 findings without human confirmation. 
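+
+A minimal remediation sketch tying the playbook rules together — assuming pandas is available (the bundled scripts themselves are stdlib-only); `remediate` is an illustrative helper, not one of the shipped tools:
+
+```python
+import pandas as pd
+
+def remediate(df: pd.DataFrame, col: str, key: str) -> pd.DataFrame:
+    """Apply the playbook's missing-value and duplicate rules to one numeric column."""
+    null_pct = df[col].isna().mean() * 100
+    if 1 <= null_pct <= 30:
+        # Preserve missingness as a signal, then impute with the median
+        df[f"{col}_was_null"] = df[col].isna()
+        df[col] = df[col].fillna(df[col].median())
+    elif null_pct > 30:
+        # Do not impute blindly — flag for domain review instead
+        print(f"{col}: {null_pct:.1f}% null — escalate to domain owner")
+    # Event data: last occurrence per key wins (assumes time-ordered rows)
+    return df.drop_duplicates(subset=[key], keep="last")
+```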
+ +--- + +## Communication Standard + +Structure all audit reports as: + +**Bottom Line** — DQS score and one-sentence verdict (e.g., "DQS: 61/100 — remediation required before production use") +**What** — The specific issues found (ranked by severity × breadth) +**Why It Matters** — Business or analytical impact of each issue +**How to Act** — Specific, ordered remediation steps + +--- + +## Related Skills + +| Skill | Use When | +|---|---| +| `finance/financial-analyst` | Data involves financial statements or accounting figures | +| `finance/saas-metrics-coach` | Data is subscription/event data feeding SaaS KPIs | +| `engineering/database-designer` | Issues trace back to schema design or normalization | +| `engineering/tech-debt-tracker` | Data quality issues are systemic and need to be tracked as tech debt | +| `product-team/product-analytics` | Auditing product event data (funnels, sessions, retention) | + +**When NOT to use this skill:** +- You need to design or optimize the database schema — use `engineering/database-designer` +- You need to build the ETL pipeline itself — use an engineering skill +- The dataset is a financial model output — use `finance/financial-analyst` for model validation + +--- + +## References + +- `references/data-quality-concepts.md` — MCAR/MAR/MNAR theory, DQS methodology, outlier detection methods diff --git a/engineering/data-quality-auditor/references/data-quality-concepts.md b/engineering/data-quality-auditor/references/data-quality-concepts.md new file mode 100644 index 0000000..b07b303 --- /dev/null +++ b/engineering/data-quality-auditor/references/data-quality-concepts.md @@ -0,0 +1,106 @@ +# Data Quality Concepts Reference + +Deep-dive reference for the Data Quality Auditor skill. Keep SKILL.md lean — this is where the theory lives. + +--- + +## Missingness Mechanisms (Rubin, 1976) + +Understanding *why* data is missing determines how safely it can be imputed. + +### MCAR — Missing Completely At Random +- The probability of missingness is independent of both observed and unobserved data. +- **Example:** A sensor drops a reading due to random hardware noise. +- **Safe to impute?** Yes. Imputing with mean/median introduces no systematic bias. +- **Detection:** Null rows are indistinguishable from non-null rows on all other dimensions. + +### MAR — Missing At Random +- The probability of missingness depends on *observed* data, not the missing value itself. +- **Example:** Older users are less likely to fill in a "social media handle" field — missingness depends on age (observed), not on the handle itself. +- **Safe to impute?** Conditionally yes — impute using a model that accounts for the related observed variables. +- **Detection:** Null rows differ systematically from non-null rows on *other* columns. + +### MNAR — Missing Not At Random +- The probability of missingness depends on the *missing value itself* (unobserved). +- **Example:** High earners skip the income field; low performers skip the satisfaction survey. +- **Safe to impute?** No — imputation will introduce systematic bias. Escalate to domain owner. +- **Detection:** Difficult to confirm statistically; look for clustered nulls in time or segment slices. 
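+
+A rough MAR screen, as a sketch: compare rows where the column is null against rows where it is observed, on some other numeric observed column. The `mar_gap` helper, the empty-string null marker, and the ~0.5 cut-off are illustrative choices, not a formal test (Little's MCAR test is the rigorous alternative):
+
+```python
+import statistics
+
+def mar_gap(rows: list[dict], col: str, other: str) -> float:
+    """Gap (in pooled std-dev units of `other`) between null and non-null groups of `col`."""
+    # "" stands in for null here; the shipped scripts use a richer NULL_STRINGS set
+    nulls = [float(r[other]) for r in rows if not r[col] and r[other]]
+    obs = [float(r[other]) for r in rows if r[col] and r[other]]
+    if len(nulls) < 2 or len(obs) < 2:
+        return 0.0  # not enough data to say anything
+    spread = statistics.stdev(nulls + obs)
+    if spread == 0:
+        return 0.0
+    return abs(statistics.mean(nulls) - statistics.mean(obs)) / spread
+
+# A gap well above ~0.5 suggests MAR: missingness depends on `other`.
+```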
+ +--- + +## Data Quality Score (DQS) Methodology + +The DQS is a weighted composite of five ISO 8000 / DAMA-aligned dimensions: + +| Dimension | Weight | Rationale | +|---|---|---| +| Completeness | 30% | Nulls are the most common and impactful quality failure | +| Consistency | 25% | Type/format violations corrupt joins and aggregations silently | +| Validity | 20% | Out-of-domain values (negative ages, future birth dates) create invisible errors | +| Uniqueness | 15% | Duplicate rows inflate metrics and invalidate joins | +| Timeliness | 10% | Stale data causes decisions based on outdated state | + +**Scoring thresholds** align to production-readiness standards: +- 85–100: Ready for production use in models and dashboards +- 65–84: Usable for exploratory analysis with documented caveats +- 0–64: Unreliable; remediation required before use in any decision-making context + +--- + +## Outlier Detection Methods + +### IQR (Interquartile Range) +- **Formula:** Outlier if `x < Q1 − 1.5×IQR` or `x > Q3 + 1.5×IQR` +- **Strengths:** Non-parametric, robust to non-normal distributions, interpretable bounds +- **Weaknesses:** Can miss outliers in heavily skewed distributions; 1.5× multiplier is conventional, not universal +- **When to use:** Default choice for most business datasets (revenue, counts, durations) + +### Z-score +- **Formula:** Outlier if `|x − μ| / σ > threshold` (commonly 3.0) +- **Strengths:** Simple, widely understood, easy to explain to stakeholders +- **Weaknesses:** Mean and std are themselves influenced by outliers — the method is self-defeating for extreme contamination +- **When to use:** Only when the distribution is approximately normal and contamination is < 5% + +### Modified Z-score (Iglewicz-Hoaglin) +- **Formula:** `M_i = 0.6745 × |x_i − median| / MAD`; outlier if `M_i > 3.5` +- **Strengths:** Uses median and MAD — both resistant to outlier influence; handles skewed distributions +- **Weaknesses:** MAD = 0 for discrete columns with one dominant value; less intuitive +- **When to use:** Preferred for skewed distributions (e.g. revenue, latency, page views) + +--- + +## Imputation Strategies + +| Method | When | Risk | +|---|---|---| +| Mean | MCAR, continuous, symmetric distribution | Distorts variance; don't use with skewed data | +| Median | MCAR/MAR, continuous, skewed distribution | Safe for skewed; loses variance | +| Mode | MCAR/MAR, categorical | Can over-represent one category | +| Forward-fill | Time series with MCAR/MAR gaps | Assumes value persists — valid for slowly-changing fields | +| Binary indicator | Null % 1–30% | Preserves information about missingness without imputing | +| Model-based | MAR, high-value columns | Most accurate but computationally expensive | +| Drop column | > 50% missing with no business justification | Safest option if column has no predictive value | + +**Golden rule:** Always add a `col_was_null` indicator column when imputing with null% > 1%. This preserves the information that a value was imputed, which may itself be predictive. + +--- + +## Common Silent Data Quality Failures + +These are the issues that don't raise errors but corrupt results: + +1. **Sentinel values** — `0`, `-1`, `9999`, `""` used to mean "unknown" in legacy systems +2. **Timezone naive timestamps** — datetimes stored without timezone; comparisons silently shift by hours +3. **Trailing whitespace** — `"active "` ≠ `"active"` causes silent join mismatches +4. **Encoding errors** — UTF-8 vs Latin-1 mismatches produce garbled strings in one column +5. 
**Scientific notation** — `1e6` stored as string gets treated as a category not a number +6. **Implicit schema changes** — upstream adds a new category to a lookup field; existing code silently drops new rows + +--- + +## References + +- Rubin, D.B. (1976). "Inference and Missing Data." *Biometrika* 63(3): 581–592. +- Iglewicz, B. & Hoaglin, D. (1993). *How to Detect and Handle Outliers*. ASQC Quality Press. +- DAMA International (2017). *DAMA-DMBOK: Data Management Body of Knowledge*. 2nd ed. +- ISO 8000-8: Data quality — Concepts and measuring. diff --git a/engineering/data-quality-auditor/scripts/data_profiler.py b/engineering/data-quality-auditor/scripts/data_profiler.py new file mode 100644 index 0000000..9adef1a --- /dev/null +++ b/engineering/data-quality-auditor/scripts/data_profiler.py @@ -0,0 +1,258 @@ +#!/usr/bin/env python3 +from __future__ import annotations +""" +data_profiler.py — Full dataset profile with Data Quality Score (DQS). + +Usage: + python3 data_profiler.py --file data.csv + python3 data_profiler.py --file data.csv --columns col1,col2 + python3 data_profiler.py --file data.csv --format json + python3 data_profiler.py --file data.csv --monitor +""" + +import argparse +import csv +import json +import math +import sys +from collections import Counter, defaultdict + + +def load_csv(filepath: str) -> tuple[list[str], list[dict]]: + with open(filepath, newline="", encoding="utf-8") as f: + reader = csv.DictReader(f) + rows = list(reader) + headers = reader.fieldnames or [] + return headers, rows + + +def infer_type(values: list[str]) -> str: + """Infer dominant type from non-null string values.""" + counts = {"int": 0, "float": 0, "bool": 0, "string": 0} + for v in values: + v = v.strip() + if v.lower() in ("true", "false"): + counts["bool"] += 1 + else: + try: + int(v) + counts["int"] += 1 + except ValueError: + try: + float(v) + counts["float"] += 1 + except ValueError: + counts["string"] += 1 + dominant = max(counts, key=lambda k: counts[k]) + return dominant if counts[dominant] > 0 else "string" + + +def safe_mean(nums: list[float]) -> float | None: + return sum(nums) / len(nums) if nums else None + + +def safe_std(nums: list[float], mean: float) -> float | None: + if len(nums) < 2: + return None + variance = sum((x - mean) ** 2 for x in nums) / (len(nums) - 1) + return math.sqrt(variance) + + +def profile_column(name: str, raw_values: list[str]) -> dict: + total = len(raw_values) + null_strings = {"", "null", "none", "n/a", "na", "nan", "nil"} + null_count = sum(1 for v in raw_values if v.strip().lower() in null_strings) + non_null = [v for v in raw_values if v.strip().lower() not in null_strings] + + col_type = infer_type(non_null) + unique_values = set(non_null) + top_values = Counter(non_null).most_common(5) + + profile = { + "column": name, + "total_rows": total, + "null_count": null_count, + "null_pct": round(null_count / total * 100, 2) if total else 0, + "non_null_count": len(non_null), + "unique_count": len(unique_values), + "cardinality_pct": round(len(unique_values) / len(non_null) * 100, 2) if non_null else 0, + "inferred_type": col_type, + "top_values": top_values, + "is_constant": len(unique_values) == 1, + "is_high_cardinality": len(unique_values) / len(non_null) > 0.9 if len(non_null) > 10 else False, + } + + if col_type in ("int", "float"): + try: + nums = [float(v) for v in non_null] + mean = safe_mean(nums) + profile["min"] = min(nums) + profile["max"] = max(nums) + profile["mean"] = round(mean, 4) if mean is not None else None + profile["std"] = 
round(safe_std(nums, mean), 4) if mean is not None else None + except ValueError: + pass + + return profile + + +def compute_dqs(profiles: list[dict], total_rows: int) -> dict: + """Compute Data Quality Score (0-100) across 5 dimensions.""" + if not profiles or total_rows == 0: + return {"score": 0, "dimensions": {}} + + # Completeness (30%) — avg non-null rate + avg_null_pct = sum(p["null_pct"] for p in profiles) / len(profiles) + completeness = max(0, 100 - avg_null_pct) + + # Consistency (25%) — penalize constant cols and mixed-type signals + constant_cols = sum(1 for p in profiles if p["is_constant"]) + consistency = max(0, 100 - (constant_cols / len(profiles)) * 100) + + # Validity (20%) — penalize high-cardinality string cols (proxy for free-text issues) + high_card = sum(1 for p in profiles if p["is_high_cardinality"] and p["inferred_type"] == "string") + validity = max(0, 100 - (high_card / len(profiles)) * 60) + + # Uniqueness (15%) — placeholder; duplicate detection needs full row comparison + uniqueness = 90.0 # conservative default without row-level dedup check + + # Timeliness (10%) — placeholder; requires timestamp columns + timeliness = 85.0 # conservative default + + score = ( + completeness * 0.30 + + consistency * 0.25 + + validity * 0.20 + + uniqueness * 0.15 + + timeliness * 0.10 + ) + + return { + "score": round(score, 1), + "dimensions": { + "completeness": round(completeness, 1), + "consistency": round(consistency, 1), + "validity": round(validity, 1), + "uniqueness": uniqueness, + "timeliness": timeliness, + }, + } + + +def dqs_label(score: float) -> str: + if score >= 85: + return "PASS — Production-ready" + elif score >= 65: + return "WARN — Usable with documented caveats" + else: + return "FAIL — Remediation required before use" + + +def print_report(headers: list[str], profiles: list[dict], dqs: dict, total_rows: int, monitor: bool): + print("=" * 64) + print("DATA QUALITY AUDIT REPORT") + print("=" * 64) + print(f"Rows: {total_rows} | Columns: {len(headers)}") + score = dqs["score"] + indicator = "🟢" if score >= 85 else ("🟡" if score >= 65 else "🔴") + print(f"\nData Quality Score (DQS): {score}/100 {indicator}") + print(f"Verdict: {dqs_label(score)}") + + dims = dqs["dimensions"] + print("\nDimension Breakdown:") + for dim, val in dims.items(): + bar = int(val / 5) + print(f" {dim.capitalize():<14} {val:>5.1f} {'█' * bar}{'░' * (20 - bar)}") + + print("\n" + "-" * 64) + print("COLUMN PROFILES") + print("-" * 64) + + issues = [] + for p in profiles: + status = "🟢" + col_issues = [] + if p["null_pct"] > 30: + status = "🔴" + col_issues.append(f"{p['null_pct']}% nulls — investigate root cause") + elif p["null_pct"] > 10: + status = "🟡" + col_issues.append(f"{p['null_pct']}% nulls — impute cautiously") + elif p["null_pct"] > 1: + col_issues.append(f"{p['null_pct']}% nulls — impute with indicator") + if p["is_constant"]: + status = "🟡" + col_issues.append("Constant column — zero variance, likely useless") + if p["is_high_cardinality"] and p["inferred_type"] == "string": + col_issues.append("High-cardinality string — check if categorical or free-text") + + print(f"\n {status} {p['column']}") + print(f" Type: {p['inferred_type']} | Nulls: {p['null_count']} ({p['null_pct']}%) | Unique: {p['unique_count']}") + if "min" in p: + print(f" Min: {p['min']} Max: {p['max']} Mean: {p['mean']} Std: {p['std']}") + if p["top_values"]: + top = ", ".join(f"{v}({c})" for v, c in p["top_values"][:3]) + print(f" Top values: {top}") + for issue in col_issues: + 
issues.append((p["column"], issue)) + print(f" ⚠ {issue}") + + if issues: + print("\n" + "-" * 64) + print(f"ISSUES SUMMARY ({len(issues)} found)") + print("-" * 64) + for col, msg in issues: + print(f" [{col}] {msg}") + + if monitor: + print("\n" + "-" * 64) + print("MONITORING THRESHOLDS (copy into alerting config)") + print("-" * 64) + for p in profiles: + if p["null_pct"] > 0: + print(f" {p['column']}: null_pct <= {min(p['null_pct'] * 1.5, 100):.1f}%") + if "mean" in p and p["mean"] is not None: + drift = abs(p.get("std", 0) or 0) * 2 + print(f" {p['column']}: mean within [{p['mean'] - drift:.2f}, {p['mean'] + drift:.2f}]") + + print("\n" + "=" * 64) + + +def main(): + parser = argparse.ArgumentParser(description="Profile a CSV dataset and compute a Data Quality Score.") + parser.add_argument("--file", required=True, help="Path to CSV file") + parser.add_argument("--columns", help="Comma-separated list of columns to profile (default: all)") + parser.add_argument("--format", choices=["text", "json"], default="text") + parser.add_argument("--monitor", action="store_true", help="Print monitoring thresholds") + args = parser.parse_args() + + try: + headers, rows = load_csv(args.file) + except FileNotFoundError: + print(f"Error: file not found: {args.file}", file=sys.stderr) + sys.exit(1) + except Exception as e: + print(f"Error reading file: {e}", file=sys.stderr) + sys.exit(1) + + if not rows: + print("Error: CSV file is empty or has no data rows.", file=sys.stderr) + sys.exit(1) + + selected = args.columns.split(",") if args.columns else headers + missing_cols = [c for c in selected if c not in headers] + if missing_cols: + print(f"Error: columns not found: {', '.join(missing_cols)}", file=sys.stderr) + sys.exit(1) + + profiles = [profile_column(col, [row.get(col, "") for row in rows]) for col in selected] + dqs = compute_dqs(profiles, len(rows)) + + if args.format == "json": + print(json.dumps({"total_rows": len(rows), "dqs": dqs, "columns": profiles}, indent=2)) + else: + print_report(selected, profiles, dqs, len(rows), args.monitor) + + +if __name__ == "__main__": + main() diff --git a/engineering/data-quality-auditor/scripts/missing_value_analyzer.py b/engineering/data-quality-auditor/scripts/missing_value_analyzer.py new file mode 100644 index 0000000..fcc59e1 --- /dev/null +++ b/engineering/data-quality-auditor/scripts/missing_value_analyzer.py @@ -0,0 +1,242 @@ +#!/usr/bin/env python3 +""" +missing_value_analyzer.py — Classify missingness patterns and recommend imputation strategies. 
+ +Usage: + python3 missing_value_analyzer.py --file data.csv + python3 missing_value_analyzer.py --file data.csv --threshold 0.05 + python3 missing_value_analyzer.py --file data.csv --format json +""" + +import argparse +import csv +import json +import sys +from collections import defaultdict + + +NULL_STRINGS = {"", "null", "none", "n/a", "na", "nan", "nil", "undefined", "missing"} + + +def load_csv(filepath: str) -> tuple[list[str], list[dict]]: + with open(filepath, newline="", encoding="utf-8") as f: + reader = csv.DictReader(f) + rows = list(reader) + headers = reader.fieldnames or [] + return headers, rows + + +def is_null(val: str) -> bool: + return val.strip().lower() in NULL_STRINGS + + +def compute_null_mask(headers: list[str], rows: list[dict]) -> dict[str, list[bool]]: + return {col: [is_null(row.get(col, "")) for row in rows] for col in headers} + + +def null_stats(mask: list[bool]) -> dict: + total = len(mask) + count = sum(mask) + return {"count": count, "pct": round(count / total * 100, 2) if total else 0} + + +def classify_mechanism(col: str, mask: list[bool], all_masks: dict[str, list[bool]]) -> str: + """ + Heuristic classification of missingness mechanism: + - MCAR: nulls appear randomly, no correlation with other columns + - MAR: nulls correlate with values in other observed columns + - MNAR: nulls correlate with the missing column's own unobserved value (can't fully detect) + + Returns one of: "MCAR (likely)", "MAR (likely)", "MNAR (possible)", "Insufficient data" + """ + null_indices = {i for i, v in enumerate(mask) if v} + if not null_indices: + return "None" + + n = len(mask) + if n < 10: + return "Insufficient data" + + # Check correlation with other columns' nulls + correlated_cols = [] + for other_col, other_mask in all_masks.items(): + if other_col == col: + continue + other_null_indices = {i for i, v in enumerate(other_mask) if v} + if not other_null_indices: + continue + overlap = len(null_indices & other_null_indices) + union = len(null_indices | other_null_indices) + jaccard = overlap / union if union else 0 + if jaccard > 0.5: + correlated_cols.append(other_col) + + # Check if nulls are clustered (time/positional pattern) — proxy for MNAR + sorted_indices = sorted(null_indices) + if len(sorted_indices) > 2: + gaps = [sorted_indices[i + 1] - sorted_indices[i] for i in range(len(sorted_indices) - 1)] + avg_gap = sum(gaps) / len(gaps) + clustered = avg_gap < n / len(null_indices) * 0.5 # nulls appear closer together than random + else: + clustered = False + + if correlated_cols: + return f"MAR (likely) — co-occurs with nulls in: {', '.join(correlated_cols[:3])}" + elif clustered: + return "MNAR (possible) — nulls are spatially clustered, may reflect a systematic gap" + else: + return "MCAR (likely) — nulls appear random, no strong correlation detected" + + +def recommend_strategy(pct: float, col_type: str) -> str: + if pct == 0: + return "No action needed" + if pct < 1: + return "Drop rows — impact is negligible" + if pct < 10: + strategies = { + "int": "Impute with median + add binary indicator column", + "float": "Impute with median + add binary indicator column", + "string": "Impute with mode or 'Unknown' category + add indicator", + "bool": "Impute with mode", + } + return strategies.get(col_type, "Impute with median/mode + add indicator") + if pct < 30: + return "Impute cautiously; investigate root cause; document assumption; add indicator" + return "Do NOT impute blindly — > 30% missing. 
Escalate to domain owner or consider dropping column" + + +def infer_type(values: list[str]) -> str: + non_null = [v for v in values if not is_null(v)] + counts = {"int": 0, "float": 0, "bool": 0, "string": 0} + for v in non_null[:200]: # sample for speed + v = v.strip() + if v.lower() in ("true", "false"): + counts["bool"] += 1 + else: + try: + int(v) + counts["int"] += 1 + except ValueError: + try: + float(v) + counts["float"] += 1 + except ValueError: + counts["string"] += 1 + return max(counts, key=lambda k: counts[k]) if any(counts.values()) else "string" + + +def compute_cooccurrence(headers: list[str], masks: dict[str, list[bool]], top_n: int = 5) -> list[dict]: + """Find column pairs where nulls most frequently co-occur.""" + pairs = [] + cols = list(headers) + for i in range(len(cols)): + for j in range(i + 1, len(cols)): + a, b = cols[i], cols[j] + mask_a, mask_b = masks[a], masks[b] + overlap = sum(1 for x, y in zip(mask_a, mask_b) if x and y) + if overlap > 0: + pairs.append({"col_a": a, "col_b": b, "co_null_rows": overlap}) + pairs.sort(key=lambda x: -x["co_null_rows"]) + return pairs[:top_n] + + +def print_report(headers: list[str], rows: list[dict], masks: dict, threshold: float): + total = len(rows) + print("=" * 64) + print("MISSING VALUE ANALYSIS REPORT") + print("=" * 64) + print(f"Rows: {total} | Columns: {len(headers)}") + + results = [] + for col in headers: + mask = masks[col] + stats = null_stats(mask) + if stats["pct"] / 100 < threshold and stats["count"] > 0: + continue + raw_vals = [row.get(col, "") for row in rows] + col_type = infer_type(raw_vals) + mechanism = classify_mechanism(col, mask, masks) + strategy = recommend_strategy(stats["pct"], col_type) + results.append({ + "column": col, + "null_count": stats["count"], + "null_pct": stats["pct"], + "col_type": col_type, + "mechanism": mechanism, + "strategy": strategy, + }) + + fully_complete = [col for col in headers if null_stats(masks[col])["count"] == 0] + print(f"\nFully complete columns: {len(fully_complete)}/{len(headers)}") + + if not results: + print(f"\nNo columns exceed the null threshold ({threshold * 100:.1f}%).") + else: + print(f"\nColumns with missing values (threshold >= {threshold * 100:.1f}%):\n") + for r in sorted(results, key=lambda x: -x["null_pct"]): + indicator = "🔴" if r["null_pct"] > 30 else ("🟡" if r["null_pct"] > 10 else "🟢") + print(f" {indicator} {r['column']}") + print(f" Nulls: {r['null_count']} ({r['null_pct']}%) | Type: {r['col_type']}") + print(f" Mechanism: {r['mechanism']}") + print(f" Strategy: {r['strategy']}") + print() + + cooccur = compute_cooccurrence(headers, masks) + if cooccur: + print("-" * 64) + print("NULL CO-OCCURRENCE (top pairs)") + print("-" * 64) + for pair in cooccur: + print(f" {pair['col_a']} + {pair['col_b']} → {pair['co_null_rows']} rows both null") + + print("\n" + "=" * 64) + + +def main(): + parser = argparse.ArgumentParser(description="Analyze missing values in a CSV dataset.") + parser.add_argument("--file", required=True, help="Path to CSV file") + parser.add_argument("--threshold", type=float, default=0.0, + help="Only show columns with null fraction above this (e.g. 
0.05 = 5%%)") + parser.add_argument("--format", choices=["text", "json"], default="text") + args = parser.parse_args() + + try: + headers, rows = load_csv(args.file) + except FileNotFoundError: + print(f"Error: file not found: {args.file}", file=sys.stderr) + sys.exit(1) + except Exception as e: + print(f"Error reading file: {e}", file=sys.stderr) + sys.exit(1) + + if not rows: + print("Error: CSV file is empty.", file=sys.stderr) + sys.exit(1) + + masks = compute_null_mask(headers, rows) + + if args.format == "json": + output = [] + for col in headers: + mask = masks[col] + stats = null_stats(mask) + raw_vals = [row.get(col, "") for row in rows] + col_type = infer_type(raw_vals) + mechanism = classify_mechanism(col, mask, masks) + strategy = recommend_strategy(stats["pct"], col_type) + output.append({ + "column": col, + "null_count": stats["count"], + "null_pct": stats["pct"], + "col_type": col_type, + "mechanism": mechanism, + "strategy": strategy, + }) + print(json.dumps({"total_rows": len(rows), "columns": output}, indent=2)) + else: + print_report(headers, rows, masks, args.threshold) + + +if __name__ == "__main__": + main() diff --git a/engineering/data-quality-auditor/scripts/outlier_detector.py b/engineering/data-quality-auditor/scripts/outlier_detector.py new file mode 100644 index 0000000..7049fa9 --- /dev/null +++ b/engineering/data-quality-auditor/scripts/outlier_detector.py @@ -0,0 +1,263 @@ +#!/usr/bin/env python3 +from __future__ import annotations +""" +outlier_detector.py — Multi-method outlier detection for numeric columns. + +Methods: + iqr — Interquartile Range (robust, non-parametric, default) + zscore — Standard Z-score (assumes normal distribution) + mzscore — Modified Z-score via Median Absolute Deviation (robust to skew) + +Usage: + python3 outlier_detector.py --file data.csv + python3 outlier_detector.py --file data.csv --method iqr + python3 outlier_detector.py --file data.csv --method zscore --threshold 2.5 + python3 outlier_detector.py --file data.csv --columns col1,col2 + python3 outlier_detector.py --file data.csv --format json +""" + +import argparse +import csv +import json +import math +import sys + + +NULL_STRINGS = {"", "null", "none", "n/a", "na", "nan", "nil", "undefined", "missing"} + + +def load_csv(filepath: str) -> tuple[list[str], list[dict]]: + with open(filepath, newline="", encoding="utf-8") as f: + reader = csv.DictReader(f) + rows = list(reader) + headers = reader.fieldnames or [] + return headers, rows + + +def is_null(val: str) -> bool: + return val.strip().lower() in NULL_STRINGS + + +def to_float(val: str) -> float | None: + try: + return float(val.strip()) + except (ValueError, AttributeError): + return None + + +def median(nums: list[float]) -> float: + s = sorted(nums) + n = len(s) + mid = n // 2 + return s[mid] if n % 2 else (s[mid - 1] + s[mid]) / 2 + + +def percentile(nums: list[float], p: float) -> float: + """Linear interpolation percentile.""" + s = sorted(nums) + n = len(s) + if n == 1: + return s[0] + idx = p / 100 * (n - 1) + lo = int(idx) + hi = lo + 1 + frac = idx - lo + if hi >= n: + return s[-1] + return s[lo] + frac * (s[hi] - s[lo]) + + +def mean(nums: list[float]) -> float: + return sum(nums) / len(nums) + + +def std(nums: list[float], mu: float) -> float: + if len(nums) < 2: + return 0.0 + variance = sum((x - mu) ** 2 for x in nums) / (len(nums) - 1) + return math.sqrt(variance) + + +# --- Detection methods --- + +def detect_iqr(nums: list[float], multiplier: float = 1.5) -> dict: + q1 = percentile(nums, 25) + q3 = 
percentile(nums, 75) + iqr = q3 - q1 + lower = q1 - multiplier * iqr + upper = q3 + multiplier * iqr + outliers = [x for x in nums if x < lower or x > upper] + return { + "method": "IQR", + "q1": round(q1, 4), + "q3": round(q3, 4), + "iqr": round(iqr, 4), + "lower_bound": round(lower, 4), + "upper_bound": round(upper, 4), + "outlier_count": len(outliers), + "outlier_pct": round(len(outliers) / len(nums) * 100, 2), + "outlier_values": sorted(set(round(x, 4) for x in outliers))[:10], + } + + +def detect_zscore(nums: list[float], threshold: float = 3.0) -> dict: + mu = mean(nums) + sigma = std(nums, mu) + if sigma == 0: + return {"method": "Z-score", "outlier_count": 0, "outlier_pct": 0.0, + "note": "Zero variance — all values identical"} + zscores = [(x, abs((x - mu) / sigma)) for x in nums] + outliers = [x for x, z in zscores if z > threshold] + return { + "method": "Z-score", + "mean": round(mu, 4), + "std": round(sigma, 4), + "threshold": threshold, + "outlier_count": len(outliers), + "outlier_pct": round(len(outliers) / len(nums) * 100, 2), + "outlier_values": sorted(set(round(x, 4) for x in outliers))[:10], + } + + +def detect_modified_zscore(nums: list[float], threshold: float = 3.5) -> dict: + """Iglewicz-Hoaglin modified Z-score using Median Absolute Deviation.""" + med = median(nums) + mad = median([abs(x - med) for x in nums]) + if mad == 0: + return {"method": "Modified Z-score (MAD)", "outlier_count": 0, "outlier_pct": 0.0, + "note": "MAD is zero — consider Z-score instead"} + mzscores = [(x, 0.6745 * abs(x - med) / mad) for x in nums] + outliers = [x for x, mz in mzscores if mz > threshold] + return { + "method": "Modified Z-score (MAD)", + "median": round(med, 4), + "mad": round(mad, 4), + "threshold": threshold, + "outlier_count": len(outliers), + "outlier_pct": round(len(outliers) / len(nums) * 100, 2), + "outlier_values": sorted(set(round(x, 4) for x in outliers))[:10], + } + + +def classify_outlier_risk(pct: float, col: str) -> str: + """Heuristic: flag whether outliers are likely data errors or legitimate extremes.""" + if pct > 10: + return "High outlier rate — likely systematic data quality issue or wrong data type" + if pct > 5: + return "Elevated outlier rate — investigate source; may be mixed populations" + if pct > 1: + return "Moderate — review individually; could be legitimate extremes or entry errors" + if pct > 0: + return "Low — verify extreme values against source; likely legitimate but worth checking" + return "Clean — no outliers detected" + + +def analyze_column(col: str, nums: list[float], method: str, threshold: float) -> dict: + if len(nums) < 4: + return {"column": col, "status": "Skipped — fewer than 4 numeric values"} + + if method == "iqr": + result = detect_iqr(nums, multiplier=threshold if threshold != 3.0 else 1.5) + elif method == "zscore": + result = detect_zscore(nums, threshold=threshold) + elif method == "mzscore": + result = detect_modified_zscore(nums, threshold=threshold) + else: + result = detect_iqr(nums) + + result["column"] = col + result["total_numeric"] = len(nums) + result["risk_assessment"] = classify_outlier_risk(result.get("outlier_pct", 0), col) + return result + + +def print_report(results: list[dict]): + print("=" * 64) + print("OUTLIER DETECTION REPORT") + print("=" * 64) + + clean = [r for r in results if r.get("outlier_count", 0) == 0 and "status" not in r] + flagged = [r for r in results if r.get("outlier_count", 0) > 0] + skipped = [r for r in results if "status" in r] + + print(f"\nColumns analyzed: {len(results) - 
len(skipped)}") + print(f"Clean: {len(clean)}") + print(f"Flagged: {len(flagged)}") + if skipped: + print(f"Skipped: {len(skipped)} ({', '.join(r['column'] for r in skipped)})") + + if flagged: + print("\n" + "-" * 64) + print("FLAGGED COLUMNS") + print("-" * 64) + for r in sorted(flagged, key=lambda x: -x.get("outlier_pct", 0)): + pct = r.get("outlier_pct", 0) + indicator = "🔴" if pct > 5 else "🟡" + print(f"\n {indicator} {r['column']} ({r['method']})") + print(f" Outliers: {r['outlier_count']} / {r['total_numeric']} rows ({pct}%)") + if "lower_bound" in r: + print(f" Bounds: [{r['lower_bound']}, {r['upper_bound']}] | IQR: {r['iqr']}") + if "mean" in r: + print(f" Mean: {r['mean']} | Std: {r['std']} | Threshold: ±{r['threshold']}σ") + if "median" in r: + print(f" Median: {r['median']} | MAD: {r['mad']} | Threshold: {r['threshold']}") + if r.get("outlier_values"): + vals = ", ".join(str(v) for v in r["outlier_values"][:8]) + print(f" Sample outlier values: {vals}") + print(f" Assessment: {r['risk_assessment']}") + + if clean: + cols = ", ".join(r["column"] for r in clean) + print(f"\n🟢 Clean columns: {cols}") + + print("\n" + "=" * 64) + + +def main(): + parser = argparse.ArgumentParser(description="Detect outliers in numeric columns of a CSV dataset.") + parser.add_argument("--file", required=True, help="Path to CSV file") + parser.add_argument("--method", choices=["iqr", "zscore", "mzscore"], default="iqr", + help="Detection method (default: iqr)") + parser.add_argument("--threshold", type=float, default=None, + help="Method threshold (IQR multiplier default 1.5; Z-score default 3.0; mzscore default 3.5)") + parser.add_argument("--columns", help="Comma-separated columns to check (default: all numeric)") + parser.add_argument("--format", choices=["text", "json"], default="text") + args = parser.parse_args() + + # Set default thresholds per method + if args.threshold is None: + args.threshold = {"iqr": 1.5, "zscore": 3.0, "mzscore": 3.5}[args.method] + + try: + headers, rows = load_csv(args.file) + except FileNotFoundError: + print(f"Error: file not found: {args.file}", file=sys.stderr) + sys.exit(1) + except Exception as e: + print(f"Error reading file: {e}", file=sys.stderr) + sys.exit(1) + + if not rows: + print("Error: CSV file is empty.", file=sys.stderr) + sys.exit(1) + + selected = args.columns.split(",") if args.columns else headers + missing_cols = [c for c in selected if c not in headers] + if missing_cols: + print(f"Error: columns not found: {', '.join(missing_cols)}", file=sys.stderr) + sys.exit(1) + + results = [] + for col in selected: + raw = [row.get(col, "") for row in rows] + nums = [n for v in raw if not is_null(v) and (n := to_float(v)) is not None] + results.append(analyze_column(col, nums, args.method, args.threshold)) + + if args.format == "json": + print(json.dumps(results, indent=2)) + else: + print_report(results) + + +if __name__ == "__main__": + main() diff --git a/engineering/demo-video/.claude-plugin/plugin.json b/engineering/demo-video/.claude-plugin/plugin.json new file mode 100644 index 0000000..1989615 --- /dev/null +++ b/engineering/demo-video/.claude-plugin/plugin.json @@ -0,0 +1,13 @@ +{ + "name": "demo-video", + "description": "Create polished demo videos from screenshots and scene descriptions. 
Orchestrates playwright, ffmpeg, and edge-tts to produce product walkthroughs, feature showcases, and marketing teasers with story structure, scene design system, and narration guidance.", + "version": "2.2.0", + "author": { + "name": "Alireza Rezvani", + "url": "https://alirezarezvani.com" + }, + "homepage": "https://github.com/alirezarezvani/claude-skills/tree/main/engineering/demo-video", + "repository": "https://github.com/alirezarezvani/claude-skills", + "license": "MIT", + "skills": "./" +} diff --git a/engineering/demo-video/SKILL.md b/engineering/demo-video/SKILL.md new file mode 100644 index 0000000..70abbd0 --- /dev/null +++ b/engineering/demo-video/SKILL.md @@ -0,0 +1,110 @@ +--- +name: "demo-video" +description: "Use when the user asks to create a demo video, product walkthrough, feature showcase, animated presentation, marketing video, or GIF from screenshots or scene descriptions. Orchestrates playwright, ffmpeg, and edge-tts MCPs to produce polished video content." +--- + +# Demo Video + +You are a video producer. Not a slideshow maker. Every frame has a job. Every second earns the next. + +## Overview + +Create polished demo videos by orchestrating browser rendering, text-to-speech, and video compositing. Think like a video producer — story arc, pacing, emotion, visual hierarchy. Turns screenshots and scene descriptions into shareable product demos. + +## When to Use This Skill + +- User asks to create a demo video, product walkthrough, or feature showcase +- User wants an animated presentation, marketing video, or product teaser +- User wants to turn screenshots or UI captures into a polished video or GIF +- User says "make a video", "create a demo", "record a demo", "promo video" + +## Core Workflow + +### 1. Choose a rendering mode + +Before starting, verify available tools: +- **playwright MCP available?** — needed for automated screenshots. Fallback: ask user to screenshot the HTML files manually. +- **edge-tts available?** — needed for narration audio. Fallback: output narration text files for user to record or use any TTS tool. +- **ffmpeg available?** — needed for compositing. Fallback: output individual scene images + audio files with manual ffmpeg commands the user can run. + +If none are available, produce HTML scene files + `scenes.json` manifest + narration scripts. The user can composite manually or use any video editor. + +| Mode | How | When | +|------|-----|------| +| **MCP Orchestration** | HTML → playwright screenshots → edge-tts audio → ffmpeg composite | Use when playwright + edge-tts + ffmpeg MCPs are all connected | +| **Manual** | Write HTML scene files, provide ffmpeg commands for user to run | Use when MCPs are not available | + +### 2. Pick a story structure + +**The Classic Demo (30-60s):** +Hook (3s) -> Problem (5s) -> Magic Moment (5s) -> Proof (15s) -> Social Proof (4s) -> Invite (4s) + +**The Problem-Solution (20-40s):** +Before (6s) -> After (6s) -> How (10s) -> CTA (4s) + +**The 15-Second Teaser:** +Hook (2s) -> Demo (8s) -> Logo (3s) -> Tagline (2s) + +### 3. 
Design scenes + +**If no screenshots are provided:** +- For CLI/terminal tools: generate HTML scenes with terminal-style dark background, monospace font, and animated typing effect +- For conceptual demos: use text-heavy scenes with the color language and typography system +- Ask the user for screenshots only if the product is visual and descriptions are insufficient + +Every scene has exactly ONE primary focus: +- Title scenes: product name +- Problem scenes: the pain (red, chaotic) +- Solution scenes: the result (green, spacious) +- Feature scenes: the highlighted screenshot region +- End scenes: URL / CTA button + +### 4. Write narration + +- One idea per scene. If you need "and" you need two scenes. +- Lead with the verb. "Organize your tabs" not "Tab organization is provided." +- No jargon. "Your tabs organize themselves" not "AI-powered tab categorization." +- Use contrast. "24 tabs. One click. 5 groups." + +## Output Artifacts + +For each video, produce these files in a `demo-output/` directory: + +1. `scenes/` — one HTML file per scene (1920x1080 viewport) +2. `narration/` — one `.txt` file per scene (for edge-tts input) +3. `scenes.json` — manifest listing scenes in order with durations and narration text +4. `build.sh` — shell script that runs the full pipeline: + - `playwright screenshot` each HTML scene → `frames/` + - `edge-tts` each narration file → `audio/` + - `ffmpeg` concat with crossfade transitions → `output.mp4` + +If MCPs are unavailable, still produce items 1-3. Include the ffmpeg commands in `build.sh` for the user to run manually. + +## Scene Design System + +See [references/scene-design-system.md](references/scene-design-system.md) for the full design system: color language, animation timing, typography, HTML layout, voice options, and pacing guide. + +## Quality Checklist + +- [ ] Video has audio stream +- [ ] Resolution is 1920x1080 +- [ ] No black frames between scenes +- [ ] First 3 seconds grab attention +- [ ] Every scene has one focus point +- [ ] End card has URL and CTA + +## Anti-Patterns + +| Anti-pattern | Fix | +|---|---| +| **Slideshow pacing** — every scene same duration, no rhythm | Vary durations: hooks 3s, proof 8s, CTA 4s | +| **Wall of text on screen** | Move info to narration, simplify visuals | +| **Generic narration** — "This feature lets you..." | Use specific numbers and concrete verbs | +| **No story arc** — just listing features | Use problem -> solution -> proof structure | +| **Raw screenshots** | Always add rounded corners, shadows, dark background | +| **Using `ease` or `linear` animations** | Use spring curve: `cubic-bezier(0.16, 1, 0.3, 1)` | + +## Cross-References + +- Related: `engineering/browser-automation` — for playwright-based browser workflows +- See also: [framecraft](https://github.com/vaddisrinivas/framecraft) — open-source scene rendering pipeline diff --git a/engineering/demo-video/evals.json b/engineering/demo-video/evals.json new file mode 100644 index 0000000..88cc847 --- /dev/null +++ b/engineering/demo-video/evals.json @@ -0,0 +1,26 @@ +[ + { + "id": 1, + "prompt": "I just shipped a new tab management Chrome extension. I have 4 screenshots showing the before (messy tabs) and after (organized groups). 
Can you make a 30-second demo video I can post on Twitter?", + "expected_output": "Agent picks Classic Demo or Problem-Solution structure, designs 5-7 scenes using the color language and typography specs, writes narration following the pacing guide, produces demo-output/ with HTML scenes, narration files, scenes.json manifest, and build.sh.", + "scenario_type": "happy_path" + }, + { + "id": 2, + "prompt": "Create a 15-second teaser video for our SaaS dashboard. Here's one hero screenshot of the analytics view. Keep it minimal.", + "expected_output": "Agent selects 15-Second Teaser structure (Hook 2s, Demo 8s, Logo 3s, Tagline 2s), uses a single screenshot with dark background and proper styling, produces minimal scene set in demo-output/.", + "scenario_type": "happy_path" + }, + { + "id": 3, + "prompt": "Make a demo video for my CLI tool. I don't have any screenshots but I can describe what it does.", + "expected_output": "Agent generates terminal-style HTML scenes with dark background and monospace font from the user's descriptions. Does not ask for screenshots. Produces demo-output/ with all artifacts.", + "scenario_type": "edge_case" + }, + { + "id": 4, + "prompt": "I need a product demo video but I don't have ffmpeg or any MCP servers installed. Can you still help?", + "expected_output": "Agent acknowledges the constraint, produces HTML scene files + scenes.json + narration text files + build.sh with manual ffmpeg commands. Tells user how to install ffmpeg and run the script.", + "scenario_type": "edge_case" + } +] diff --git a/engineering/demo-video/references/scene-design-system.md b/engineering/demo-video/references/scene-design-system.md new file mode 100644 index 0000000..d3fcce1 --- /dev/null +++ b/engineering/demo-video/references/scene-design-system.md @@ -0,0 +1,61 @@ +# Scene Design System + +Reference material for demo video scene design — colors, typography, animation timing, voice options, and pacing. + +## Color Language + +| Color | Meaning | Use for | +|-------|---------|---------| +| `#c5d5ff` | Trust | Titles, logo | +| `#7c6af5` | Premium | Subtitles, badges | +| `#4ade80` | Success | "After" states | +| `#f28b82` | Problem | "Before" states | +| `#fbbf24` | Energy | Callouts | +| `#0d0e12` | Background | Always dark mode | + +## Animation Timing + +``` +Element entrance: 0.5-0.8s (cubic-bezier(0.16, 1, 0.3, 1)) +Between elements: 0.2-0.4s gap +Scene transition: 0.3-0.5s crossfade +Hold after last anim: 1.0-2.0s +``` + +## Typography + +``` +Title: 48-72px, weight 800 +Subtitle: 24-32px, weight 400, muted +Bullets: 18-22px, weight 600, pill background +Font: Inter (Google Fonts) +``` + +## HTML Scene Layout (1920x1080) + +```html + +

+<!DOCTYPE html>
+<html>
+<head>
+  <style>
+    /* 1920x1080 scene: dark background, Inter, spring-curve easing */
+    ...
+  </style>
+</head>
+<body>
+  <div class="scene">
+    ...
+  </div>
+</body>
+</html>
+```
+
+Background: dark with subtle purple-blue glow gradients. Screenshots: always `border-radius: 12px` with `box-shadow`. Easing: always `cubic-bezier(0.16, 1, 0.3, 1)` — never `ease` or `linear`.
+
+## Voice Options (edge-tts)
+
+| Voice | Best for |
+|-------|----------|
+| `andrew` | Product demos, launches |
+| `jenny` | Tutorials, onboarding |
+| `davis` | Enterprise, security |
+| `emma` | Consumer products |
+
+## Pacing Guide
+
+| Duration | Max words | Fill |
+|----------|-----------|------|
+| 3-4s | 8-12 | ~70% |
+| 5-6s | 15-22 | ~75% |
+| 7-8s | 22-30 | ~80% |
diff --git a/mkdocs.yml b/mkdocs.yml
index dd4975b..a40fa7e 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -1,6 +1,6 @@
 site_name: Claude Code Skills & Agent Plugins
 site_url: https://alirezarezvani.github.io/claude-skills/
-site_description: "223 production-ready skills, 23 agents, 3 personas, and an orchestration protocol for 11 AI coding tools. Reusable expertise for engineering, product, marketing, compliance, and more."
+site_description: "248 production-ready skills, 23 agents, 3 personas, and an orchestration protocol for 11 AI coding tools. Reusable expertise for engineering, product, marketing, compliance, and more."
 site_author: Alireza Rezvani
 repo_url: https://github.com/alirezarezvani/claude-skills
 repo_name: alirezarezvani/claude-skills
@@ -187,6 +187,9 @@ nav:
       - "BeHuman": skills/engineering/behuman.md
       - "Browser Automation": skills/engineering/browser-automation.md
       - "Changelog Generator": skills/engineering/changelog-generator.md
       - "CI/CD Pipeline Builder": skills/engineering/ci-cd-pipeline-builder.md
+      - "Code Tour": skills/engineering/code-tour.md
       - "Codebase Onboarding": skills/engineering/codebase-onboarding.md
+      - "Data Quality Auditor": skills/engineering/data-quality-auditor.md
       - "Database Designer": skills/engineering/database-designer.md
+      - "Demo Video": skills/engineering/demo-video.md
diff --git a/product-team/product-analytics/SKILL.md b/product-team/product-analytics/SKILL.md
index 8ea35ce..23e8e11 100644
--- a/product-team/product-analytics/SKILL.md
+++ b/product-team/product-analytics/SKILL.md
@@ -90,18 +90,58 @@ See:
 - Flattening at low level: product used occasionally, revisit value metric.
 - Improving newer cohorts: onboarding or positioning improvements are working.
 
+## Anti-Patterns
+
+| Anti-pattern | Fix |
+|---|---|
+| **Vanity metrics** — tracking pageviews or total signups without activation context | Always pair acquisition metrics with activation rate and retention |
+| **Single-point retention** — reporting "30-day retention is 20%" | Compare retention curves across cohorts, not isolated snapshots |
+| **Dashboard overload** — 30+ metrics on one screen | Executive layer: 5-7 metrics. Feature layer: per-feature only |
+| **No decision rule** — tracking a KPI with no threshold or action plan | Every KPI needs: target, threshold, owner, and "if below X, then Y" |
+| **Averaging across segments** — reporting blended metrics that hide segment differences | Always segment by cohort, plan tier, channel, or geography |
+| **Ignoring seasonality** — comparing this week to last week without adjusting | Use period-over-period with same-period-last-year context |
+
 ## Tooling
 
 ### `scripts/metrics_calculator.py`
 
-CLI utility for:
-- Retention rate calculations by cohort age
-- Cohort table generation
-- Basic funnel conversion analysis
+CLI utility for retention, cohort, and funnel analysis from CSV data. Supports text and JSON output.
diff --git a/product-team/product-analytics/SKILL.md b/product-team/product-analytics/SKILL.md
index 8ea35ce..23e8e11 100644
--- a/product-team/product-analytics/SKILL.md
+++ b/product-team/product-analytics/SKILL.md
@@ -90,18 +90,58 @@ See:
 - Flattening at low level: product used occasionally, revisit value metric.
 - Improving newer cohorts: onboarding or positioning improvements are working.
 
+## Anti-Patterns
+
+| Anti-pattern | Fix |
+|---|---|
+| **Vanity metrics** — tracking pageviews or total signups without activation context | Always pair acquisition metrics with activation rate and retention |
+| **Single-point retention** — reporting "30-day retention is 20%" | Compare retention curves across cohorts, not isolated snapshots |
+| **Dashboard overload** — 30+ metrics on one screen | Executive layer: 5-7 metrics. Feature layer: per-feature only |
+| **No decision rule** — tracking a KPI with no threshold or action plan | Every KPI needs: target, threshold, owner, and "if below X, then Y" |
+| **Averaging across segments** — reporting blended metrics that hide segment differences | Always segment by cohort, plan tier, channel, or geography |
+| **Ignoring seasonality** — comparing this week to last week without adjusting | Use period-over-period with same-period-last-year context |
+
 ## Tooling
 
 ### `scripts/metrics_calculator.py`
 
-CLI utility for:
-- Retention rate calculations by cohort age
-- Cohort table generation
-- Basic funnel conversion analysis
+CLI utility for retention, cohort, and funnel analysis from CSV data. Supports text and JSON output.
 
-Examples:
 ```bash
+# Retention analysis
 python3 scripts/metrics_calculator.py retention events.csv
+python3 scripts/metrics_calculator.py retention events.csv --format json
+
+# Cohort matrix
 python3 scripts/metrics_calculator.py cohort events.csv --cohort-grain month
+python3 scripts/metrics_calculator.py cohort events.csv --cohort-grain week --format json
+
+# Funnel conversion
 python3 scripts/metrics_calculator.py funnel funnel.csv --stages visit,signup,activate,pay
+python3 scripts/metrics_calculator.py funnel funnel.csv --stages visit,signup,activate,pay --format json
 ```
+
+**CSV format for retention/cohort:**
+```csv
+user_id,cohort_date,activity_date
+u001,2026-01-01,2026-01-01
+u001,2026-01-01,2026-01-03
+u002,2026-01-02,2026-01-02
+```
+
+**CSV format for funnel:**
+```csv
+user_id,stage
+u001,visit
+u001,signup
+u001,activate
+u002,visit
+u002,signup
+```
+
+## Cross-References
+
+- Related: `product-team/experiment-designer` — for A/B test planning after identifying metric opportunities
+- Related: `product-team/product-manager-toolkit` — for RICE prioritization of metric-driven features
+- Related: `product-team/product-discovery` — for assumption mapping when metrics reveal unknowns
+- Related: `finance/saas-metrics-coach` — for SaaS-specific metrics (ARR, MRR, churn, LTV)
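To make the CSV contract concrete, the sketch below re-derives the retention arithmetic from the sample rows: a row's period is the day offset between `activity_date` and `cohort_date`, and the denominator is the count of distinct users. This mirrors what the subcommand computes but is not the shipped script:

```python
# Standalone sketch of the retention math, using the documented CSV columns.
import csv
import datetime as dt
import io
from collections import defaultdict

SAMPLE = """user_id,cohort_date,activity_date
u001,2026-01-01,2026-01-01
u001,2026-01-01,2026-01-03
u002,2026-01-02,2026-01-02
"""

active_by_period = defaultdict(set)  # day offset -> set of active user_ids
users = set()

for row in csv.DictReader(io.StringIO(SAMPLE)):
    cohort = dt.date.fromisoformat(row["cohort_date"])
    activity = dt.date.fromisoformat(row["activity_date"])
    active_by_period[(activity - cohort).days].add(row["user_id"])
    users.add(row["user_id"])

for period in sorted(active_by_period):
    active = len(active_by_period[period])
    print(f"period {period}: {active}/{len(users)} = {active / len(users):.4f}")
# period 0: 2/2 = 1.0000
# period 2: 1/2 = 0.5000
```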
diff --git a/product-team/product-analytics/scripts/metrics_calculator.py b/product-team/product-analytics/scripts/metrics_calculator.py
index 5f9087d..497b30b 100755
--- a/product-team/product-analytics/scripts/metrics_calculator.py
+++ b/product-team/product-analytics/scripts/metrics_calculator.py
@@ -4,6 +4,8 @@
 import argparse
 import csv
 import datetime as dt
+import json
+import sys
 from collections import defaultdict
@@ -32,16 +34,22 @@ def retention(args: argparse.Namespace) -> int:
     base_users = len(cohorts)
     if base_users == 0:
-        print("No users found.")
+        print("No users found.", file=sys.stderr)
         return 1
 
-    print("Retention by period")
-    print("period,active_users,retention_rate")
-    max_period = args.max_period
-    for period in range(0, max_period + 1):
+    results = []
+    for period in range(0, args.max_period + 1):
         users = len(activity.get(period, set()))
         rate = users / base_users
-        print(f"{period},{users},{rate:.4f}")
+        results.append({"period": period, "active_users": users, "retention_rate": round(rate, 4)})
+
+    if getattr(args, "format", "text") == "json":
+        print(json.dumps({"base_users": base_users, "periods": results}, indent=2))
+    else:
+        print("Retention by period")
+        print("period,active_users,retention_rate")
+        for r in results:
+            print(f"{r['period']},{r['active_users']},{r['retention_rate']:.4f}")
     return 0
@@ -70,13 +78,21 @@
         cohort_sizes[cohort_key] += 1
 
     cohort_keys = sorted(cohort_sizes.keys())
-    print("cohort,age_days,active_users,cohort_size,retention_rate")
+    results = []
     for cohort_key in cohort_keys:
         size = cohort_sizes[cohort_key]
         for age in range(0, args.max_period + 1):
-            active_users = len(activity.get((cohort_key, age), set()))
-            rate = (active_users / size) if size else 0
-            print(f"{cohort_key},{age},{active_users},{size},{rate:.4f}")
+            active = len(activity.get((cohort_key, age), set()))
+            rate = (active / size) if size else 0
+            results.append({"cohort": cohort_key, "age_days": age, "active_users": active,
+                            "cohort_size": size, "retention_rate": round(rate, 4)})
+
+    if getattr(args, "format", "text") == "json":
+        print(json.dumps({"cohorts": dict(cohort_sizes), "rows": results}, indent=2))
+    else:
+        print("cohort,age_days,active_users,cohort_size,retention_rate")
+        for r in results:
+            print(f"{r['cohort']},{r['age_days']},{r['active_users']},{r['cohort_size']},{r['retention_rate']:.4f}")
     return 0
@@ -94,7 +110,7 @@ def funnel(args: argparse.Namespace) -> int:
             if stage in stage_users:
                 stage_users[stage].add(user)
 
-    print("stage,users,conversion_from_previous,conversion_from_first")
+    results = []
     previous_count = None
     first_count = None
     for stage in stages:
@@ -103,8 +119,17 @@
             first_count = count
         conv_prev = (count / previous_count) if previous_count else 1.0
         conv_first = (count / first_count) if first_count else 0
-        print(f"{stage},{count},{conv_prev:.4f},{conv_first:.4f}")
+        results.append({"stage": stage, "users": count,
+                        "conversion_from_previous": round(conv_prev, 4),
+                        "conversion_from_first": round(conv_first, 4)})
         previous_count = count
+
+    if getattr(args, "format", "text") == "json":
+        print(json.dumps({"stages": results}, indent=2))
+    else:
+        print("stage,users,conversion_from_previous,conversion_from_first")
+        for r in results:
+            print(f"{r['stage']},{r['users']},{r['conversion_from_previous']:.4f},{r['conversion_from_first']:.4f}")
     return 0
@@ -118,12 +143,15 @@ def build_parser() -> argparse.ArgumentParser:
         "help": "CSV input path",
     }
 
+    fmt_help = "Output format (default: text)"
+
     retention_parser = subparsers.add_parser("retention", help="Calculate retention by day.")
     retention_parser.add_argument("input", **common)
     retention_parser.add_argument("--user-column", default="user_id")
     retention_parser.add_argument("--cohort-column", default="cohort_date")
     retention_parser.add_argument("--activity-column", default="activity_date")
     retention_parser.add_argument("--max-period", type=int, default=30)
+    retention_parser.add_argument("--format", choices=["text", "json"], default="text", help=fmt_help)
     retention_parser.set_defaults(func=retention)
 
     cohort_parser = subparsers.add_parser("cohort", help="Build cohort retention matrix rows.")
@@ -133,6 +161,7 @@
     cohort_parser.add_argument("--activity-column", default="activity_date")
     cohort_parser.add_argument("--cohort-grain", choices=["week", "month"], default="week")
     cohort_parser.add_argument("--max-period", type=int, default=30)
+    cohort_parser.add_argument("--format", choices=["text", "json"], default="text", help=fmt_help)
     cohort_parser.set_defaults(func=cohort)
 
     funnel_parser = subparsers.add_parser("funnel", help="Calculate funnel conversion by stage.")
@@ -140,6 +169,7 @@
     funnel_parser.add_argument("--user-column", default="user_id")
     funnel_parser.add_argument("--stage-column", default="stage")
     funnel_parser.add_argument("--stages", required=True)
+    funnel_parser.add_argument("--format", choices=["text", "json"], default="text", help=fmt_help)
     funnel_parser.set_defaults(func=funnel)
 
     return parser
@@ -148,7 +178,17 @@ def build_parser() -> argparse.ArgumentParser:
 def main() -> int:
     parser = build_parser()
     args = parser.parse_args()
-    return args.func(args)
+    try:
+        return args.func(args)
+    except FileNotFoundError:
+        print(f"Error: file not found: {args.input}", file=sys.stderr)
+        return 1
+    except KeyError as e:
+        print(f"Error: column not found in CSV: {e}", file=sys.stderr)
+        return 1
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        return 1
 
 
 if __name__ == "__main__":
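With `--format json` in place, the calculator composes with downstream tooling instead of being screen-scraped. A hedged example of consuming it from Python, where `events.csv` is a placeholder path and the key names follow the JSON payloads introduced above:

```python
# Hypothetical consumer of the new --format json mode.
import json
import subprocess

proc = subprocess.run(
    ["python3", "scripts/metrics_calculator.py",
     "retention", "events.csv", "--format", "json"],  # placeholder input path
    capture_output=True, text=True, check=True,
)
report = json.loads(proc.stdout)

print(f"base users: {report['base_users']}")
for row in report["periods"]:
    if row["retention_rate"] < 0.2:  # illustrative alert threshold
        print(f"period {row['period']}: only {row['retention_rate']:.0%} retained")
```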