From 2896870061910cbd861d1a54f7b6c614059b425f Mon Sep 17 00:00:00 2001
From: daymade <daymadev89@gmail.com>
Date: Mon, 2 Mar 2026 19:40:52 +0800
Subject: [PATCH] feat: add financial-data-collector skill for US equity data
 collection

New skill that collects real financial data for any US publicly traded company
via yfinance. Outputs structured JSON with market data, historical financials,
WACC inputs, and analyst estimates. Includes 9-check validation script and
reference docs for yfinance pitfalls (NaN years, field aliases, FCF mismatch).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .claude-plugin/marketplace.json               |  70 +++-
 CHANGELOG.md                                  |  15 +
 CLAUDE.md                                     |   3 +-
 README.md                                     |  50 ++-
 README.zh-CN.md                               |  50 ++-
 financial-data-collector/.gitignore           |  10 +
 financial-data-collector/SKILL.md             | 152 ++++++++
 .../references/output-schema.md               |  84 ++++
 .../references/yfinance-pitfalls.md           |  99 +++++
 .../scripts/collect_data.py                   | 362 ++++++++++++++++++
 .../scripts/validate_data.py                  | 159 ++++++++
 11 files changed, 1045 insertions(+), 9 deletions(-)
 create mode 100644 financial-data-collector/.gitignore
 create mode 100644 financial-data-collector/SKILL.md
 create mode 100644 financial-data-collector/references/output-schema.md
 create mode 100644 financial-data-collector/references/yfinance-pitfalls.md
 create mode 100644 financial-data-collector/scripts/collect_data.py
 create mode 100644 financial-data-collector/scripts/validate_data.py

diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index 2f7b705..70dc64a 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -5,8 +5,8 @@
     "email": "daymadev89@gmail.com"
   },
   "metadata": {
-    "description": "Professional Claude Code skills for GitHub operations, document conversion, diagram generation, statusline customization, Teams communication, repomix utilities, skill creation, CLI demo generation, LLM icon access, Cloudflare troubleshooting, UI design system extraction, professional presentation creation, YouTube video downloading, secure repomix packaging, ASR transcription correction, video comparison quality analysis, comprehensive QA testing infrastructure, prompt optimization with EARS methodology, session history recovery, documentation cleanup, format-controlled deep research report generation with evidence tracking, PDF generation with Chinese font support, CLAUDE.md progressive disclosure optimization, CCPM skill registry search and management, Promptfoo LLM evaluation framework, iOS app development with XcodeGen and SwiftUI, fact-checking with automated corrections, Twitter/X content fetching, intelligent macOS disk space recovery, skill quality review and improvement, GitHub contribution strategy, complete internationalization/localization setup, plugin/skill troubleshooting with diagnostic tools, evidence-based competitor analysis with source citations, Windows Remote Desktop (AVD/W365) connection quality diagnosis with transport protocol analysis and log parsing, Tailscale+proxy conflict diagnosis with SSH tunnel SOP for remote development, and multi-path parallel product analysis with cross-model test-time compute scaling",
-    "version": "1.34.1",
+    "description": "Professional Claude Code skills for GitHub operations, document conversion, diagram generation, statusline customization, Teams communication, repomix utilities, skill creation, CLI demo generation, LLM icon access, Cloudflare troubleshooting, UI design system extraction, professional presentation creation, YouTube video downloading, secure repomix packaging, ASR transcription correction, video comparison quality analysis, comprehensive QA testing infrastructure, prompt optimization with EARS methodology, session history recovery, documentation cleanup, format-controlled deep research report generation with evidence tracking, PDF generation with Chinese font support, CLAUDE.md progressive disclosure optimization, CCPM skill registry search and management, Promptfoo LLM evaluation framework, iOS app development with XcodeGen and SwiftUI, fact-checking with automated corrections, Twitter/X content fetching, intelligent macOS disk space recovery, skill quality review and improvement, GitHub contribution strategy, complete internationalization/localization setup, plugin/skill troubleshooting with diagnostic tools, evidence-based competitor analysis with source citations, Windows Remote Desktop (AVD/W365) connection quality diagnosis with transport protocol analysis and log parsing, Tailscale+proxy conflict diagnosis with SSH tunnel SOP for remote development, multi-path parallel product analysis with cross-model test-time compute scaling, and real financial data collection for US equities with validation and yfinance pitfall handling",
+    "version": "1.36.0",
     "homepage": "https://github.com/daymade/claude-code-skills"
   },
   "plugins": [
@@ -809,6 +809,72 @@
       "skills": [
         "./product-analysis"
       ]
+    },
+    {
+      "name": "excel-automation",
+      "description": "Create, parse, and control Excel files on macOS. Professional formatting with openpyxl (font colors, fills, borders, conditional formatting), complex xlsm parsing with stdlib zipfile+xml for investment bank financial models, and Excel window control via AppleScript (zoom, scroll, select). Use when creating formatted Excel reports, parsing financial models, or automating Excel on macOS",
+      "source": "./",
+      "strict": false,
+      "version": "1.0.0",
+      "category": "productivity",
+      "keywords": [
+        "excel",
+        "openpyxl",
+        "xlsm",
+        "spreadsheet",
+        "formatting",
+        "financial-model",
+        "applescript",
+        "macos",
+        "dcf",
+        "investment-banking"
+      ],
+      "skills": [
+        "./excel-automation"
+      ]
+    },
+    {
+      "name": "capture-screen",
+      "description": "Programmatic screenshot capture on macOS. Get window IDs via Swift CGWindowListCopyWindowInfo, capture specific windows with screencapture -l, and control application windows via AppleScript. Supports multi-shot workflows for capturing different sections of the same window. Use when taking automated screenshots, capturing application windows, or creating visual documentation",
+      "source": "./",
+      "strict": false,
+      "version": "1.0.0",
+      "category": "utilities",
+      "keywords": [
+        "screenshot",
+        "screencapture",
+        "macos",
+        "window-capture",
+        "swift",
+        "applescript",
+        "automation",
+        "visual-documentation"
+      ],
+      "skills": [
+        "./capture-screen"
+      ]
+    },
+    {
+      "name": "financial-data-collector",
+      "description": "Collect real financial data for any US publicly traded company from free public sources (yfinance). Output structured JSON with market data, historical financials, WACC inputs, and analyst estimates. Handles NaN year detection, CapEx sign preservation, and FCF definition mismatches. Use when users request company financials, stock data, DCF inputs, or financial data collection for any US equity ticker",
+      "source": "./",
+      "strict": false,
+      "version": "1.0.0",
+      "category": "productivity",
+      "keywords": [
+        "finance",
+        "financial-data",
+        "yfinance",
+        "stock-data",
+        "dcf",
+        "wacc",
+        "market-data",
+        "investment-research",
+        "sec-filings"
+      ],
+      "skills": [
+        "./financial-data-collector"
+      ]
     }
   ]
 }
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0ec0b8e..c55f150 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 - None
 
+## [1.36.0] - 2026-03-02
+
+### Added
+- **New Skill**: financial-data-collector - Collect real financial data for US public companies via yfinance
+  - Structured JSON output with market data, income statement, cash flow, balance sheet, WACC inputs, analyst estimates
+  - Validation script with 9 checks (field completeness, cross-field consistency, sign conventions, NaN detection)
+  - Reference docs: output-schema.md, yfinance-pitfalls.md (NaN years, field aliases, FCF definition mismatch)
+  - NO FALLBACK principle: null for missing data, never default values
+
+### Changed
+- Updated marketplace skills count from 38 to 39
+- Updated marketplace version from 1.34.1 to 1.36.0
+- Updated README.md and README.zh-CN.md badges (skills count, version)
+- Updated CLAUDE.md skills count and list
+
 ## [1.34.1] - 2026-02-23
 
 ### Changed
diff --git a/CLAUDE.md b/CLAUDE.md
index e193006..7ba5e42 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -4,7 +4,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
 
 ## Repository Overview
 
-This is a Claude Code skills marketplace containing 38 production-ready skills organized in a plugin marketplace structure. Each skill is a self-contained package that extends Claude's capabilities with specialized knowledge, workflows, and bundled resources.
+This is a Claude Code skills marketplace containing 39 production-ready skills organized in a plugin marketplace structure. Each skill is a self-contained package that extends Claude's capabilities with specialized knowledge, workflows, and bundled resources.
 
 **Essential Skill**: `skill-creator` is the most important skill in this marketplace - it's a meta-skill that enables users to create their own skills. Always recommend it first for users interested in extending Claude Code.
 
@@ -215,6 +215,7 @@ This applies when you change ANY file under a skill directory:
  36. **tunnel-doctor** - Diagnose and fix Tailscale + proxy/VPN conflicts (four layers: route, HTTP env, system proxy, SSH ProxyCommand) on macOS with WSL SSH support
  37. **windows-remote-desktop-connection-doctor** - Diagnose AVD/W365 connection quality issues with transport protocol analysis and Windows App log parsing
   38. **product-analysis** - Perform structured product audits across UX, API, architecture, and compare mode to produce prioritized optimization recommendations
+  39. **financial-data-collector** - Collect real financial data for US public companies via yfinance with validation, NaN detection, and NO FALLBACK principle
 
 **Recommendation**: Always suggest `skill-creator` first for users interested in creating skills or extending Claude Code.
 
diff --git a/README.md b/README.md
index da3ec14..fb6b5bc 100644
--- a/README.md
+++ b/README.md
@@ -6,15 +6,15 @@
 [![简体中文](https://img.shields.io/badge/语言-简体中文-red)](./README.zh-CN.md)
 
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
-[![Skills](https://img.shields.io/badge/skills-38-blue.svg)](https://github.com/daymade/claude-code-skills)
-[![Version](https://img.shields.io/badge/version-1.34.1-green.svg)](https://github.com/daymade/claude-code-skills)
+[![Skills](https://img.shields.io/badge/skills-39-blue.svg)](https://github.com/daymade/claude-code-skills)
+[![Version](https://img.shields.io/badge/version-1.36.0-green.svg)](https://github.com/daymade/claude-code-skills)
 [![Claude Code](https://img.shields.io/badge/Claude%20Code-2.0.13+-purple.svg)](https://claude.com/code)
 [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](./CONTRIBUTING.md)
 [![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://github.com/daymade/claude-code-skills/graphs/commit-activity)
 
 </div>
 
-Professional Claude Code skills marketplace featuring 38 production-ready skills for enhanced development workflows.
+Professional Claude Code skills marketplace featuring 39 production-ready skills for enhanced development workflows.
 
 ## 📑 Table of Contents
 
@@ -228,6 +228,9 @@ claude plugin install windows-remote-desktop-connection-doctor@daymade-skills
 
 # Product analysis and optimization
 claude plugin install product-analysis@daymade-skills
+
+# Financial data collection for US equities
+claude plugin install financial-data-collector@daymade-skills
 ```
 
 Each skill can be installed independently - choose only what you need!
@@ -1626,6 +1629,44 @@ claude plugin install product-analysis@daymade-skills
 
 ---
 
+### 39. **financial-data-collector** - Financial Data Collection for US Equities
+
+Collect real financial data for any US publicly traded company from free public sources (yfinance). Output structured JSON with market data, historical financials (income statement, cash flow, balance sheet), WACC inputs, and analyst estimates — ready for downstream DCF modeling, comps analysis, or earnings review.
+
+**When to use:**
+- Collecting structured financial data before building DCF or valuation models
+- Pulling market data (price, shares, beta, market cap) for any US equity ticker
+- Gathering historical income statement, cash flow, and balance sheet data
+- Getting risk-free rate (10Y Treasury) and analyst consensus estimates
+
+**Key features:**
+- Robust yfinance field mapping with alias chains (handles API instability across versions)
+- NaN year detection and transparent reporting (never fills with estimates)
+- 9-check validation: field completeness, market cap cross-check, CapEx sign convention, net debt consistency
+- NO FALLBACK principle: missing data returns `null` with `_source` attribution, never default values
+- FCF definition mismatch flagging (yfinance FCF ≠ investment bank FCF due to SBC)
+
+**Example usage:**
+```bash
+# Install the skill
+claude plugin install financial-data-collector@daymade-skills
+
+# Then ask Claude to collect data
+"Collect financial data for META"
+"Get financials for AAPL --years 3"
+"Pull DCF inputs for NVDA"
+```
+
+**🎬 Live Demo**
+
+*Coming soon*
+
+📚 **Documentation**: See [financial-data-collector/SKILL.md](./financial-data-collector/SKILL.md), [output-schema.md](./financial-data-collector/references/output-schema.md), and [yfinance-pitfalls.md](./financial-data-collector/references/yfinance-pitfalls.md).
+
+**Requirements**: Python 3.11+, `yfinance`, `pandas` (declared as inline script dependencies; auto-installed when the scripts are run with `uv run`, otherwise install them with pip).
+
+---
+
 ## 🎬 Interactive Demo Gallery
 
 Want to see all demos in one place with click-to-enlarge functionality? Check out our [interactive demo gallery](./demos/index.html) or browse the [demos directory](./demos/).
@@ -1668,6 +1709,9 @@ Use **youtube-downloader** to download YouTube videos and extract audio from vid
 ### For Transcription & ASR Correction
 Use **transcript-fixer** to correct speech-to-text errors in meeting notes, lectures, and interviews through dictionary-based rules and AI-powered corrections with automatic learning.
 
+### For Financial Data & Investment Research
+Use **financial-data-collector** to pull structured financial data for any US public company, then feed the JSON output into DCF modeling, comps analysis, or earnings review workflows.
+
 ### For Meeting Documentation
 Use **meeting-minutes-taker** to transform raw meeting transcripts into structured, evidence-based minutes. Combine with **transcript-fixer** to clean up ASR errors before generating minutes. Features multi-pass generation with UNION merge to avoid content loss.
 
diff --git a/README.zh-CN.md b/README.zh-CN.md
index 7c2eb5c..b8090e3 100644
--- a/README.zh-CN.md
+++ b/README.zh-CN.md
@@ -6,15 +6,15 @@
 [![简体中文](https://img.shields.io/badge/语言-简体中文-red)](./README.zh-CN.md)
 
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
-[![Skills](https://img.shields.io/badge/skills-38-blue.svg)](https://github.com/daymade/claude-code-skills)
-[![Version](https://img.shields.io/badge/version-1.34.1-green.svg)](https://github.com/daymade/claude-code-skills)
+[![Skills](https://img.shields.io/badge/skills-39-blue.svg)](https://github.com/daymade/claude-code-skills)
+[![Version](https://img.shields.io/badge/version-1.36.0-green.svg)](https://github.com/daymade/claude-code-skills)
 [![Claude Code](https://img.shields.io/badge/Claude%20Code-2.0.13+-purple.svg)](https://claude.com/code)
 [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](./CONTRIBUTING.md)
 [![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://github.com/daymade/claude-code-skills/graphs/commit-activity)
 
 </div>
 
-专业的 Claude Code 技能市场，提供 38 个生产就绪的技能，用于增强开发工作流。
+专业的 Claude Code 技能市场，提供 39 个生产就绪的技能，用于增强开发工作流。
 
 ## 📑 目录
 
@@ -231,6 +231,9 @@ claude plugin install windows-remote-desktop-connection-doctor@daymade-skills
 
 # 产品审计与优化
 claude plugin install product-analysis@daymade-skills
+
+# 美股金融数据采集
+claude plugin install financial-data-collector@daymade-skills
 ```
 
 每个技能都可以独立安装 - 只选择你需要的！
@@ -1668,6 +1671,44 @@ claude plugin install product-analysis@daymade-skills
 
 ---
 
+### 39. **financial-data-collector** - 美股金融数据采集
+
+从免费公开数据源（yfinance）采集美股上市公司的真实金融数据，输出结构化 JSON，包含市场数据、历史财务报表（利润表、现金流量表、资产负债表）、WACC 输入参数和分析师一致预期——可直接用于下游 DCF 建模、可比公司分析或财报复盘。
+
+**使用场景：**
+- 构建 DCF 或估值模型前采集结构化金融数据
+- 拉取任意美股 ticker 的市场数据（股价、流通股、beta、市值）
+- 获取历史利润表、现金流量表、资产负债表数据
+- 获取无风险利率（10Y Treasury）和分析师一致预期
+
+**主要功能：**
+- 健壮的 yfinance 字段映射，使用别名链（应对 API 跨版本不稳定）
+- NaN 年份检测与透明报告（从不用估计值填充）
+- 9 项校验：字段完整性、市值交叉验证、资本支出符号约定、净负债一致性
+- NO FALLBACK 原则：缺失数据返回 `null` 并附 `_source` 溯源，绝不使用默认值
+- FCF 定义差异标记（yfinance FCF 不扣除 SBC，与投行 FCF 有 ~30% 差距）
+
+**示例用法：**
+```bash
+# 安装技能
+claude plugin install financial-data-collector@daymade-skills
+
+# 然后请求数据采集
+"采集 META 的金融数据"
+"获取 AAPL 最近 3 年的财务数据"
+"拉取 NVDA 的 DCF 输入数据"
+```
+
+**🎬 实时演示**
+
+*即将推出*
+
+📚 **文档**：参见 [financial-data-collector/SKILL.md](./financial-data-collector/SKILL.md)、[output-schema.md](./financial-data-collector/references/output-schema.md) 和 [yfinance-pitfalls.md](./financial-data-collector/references/yfinance-pitfalls.md)。
+
+**要求**：Python 3.11+、`yfinance`、`pandas`（已声明为脚本内联依赖；使用 `uv run` 运行脚本时自动安装，否则请用 pip 安装）。
+
+---
+
 ## 🎬 交互式演示画廊
 
 想要在一个地方查看所有演示并具有点击放大功能？访问我们的[交互式演示画廊](./demos/index.html)或浏览[演示目录](./demos/)。
@@ -1710,6 +1751,9 @@ claude plugin install product-analysis@daymade-skills
 ### 转录与 ASR 校正
 使用 **transcript-fixer** 通过基于字典的规则和 AI 驱动的校正自动学习，纠正会议记录、讲座和访谈中的语音转文本错误。
 
+### 金融数据与投研
+使用 **financial-data-collector** 采集任意美股上市公司的结构化金融数据，将 JSON 输出接入 DCF 建模、可比公司分析或财报复盘工作流。
+
 ### 会议文档
 使用 **meeting-minutes-taker** 将原始会议转写稿转换为结构化、基于证据的会议纪要。与 **transcript-fixer** 结合使用可在生成纪要前清理 ASR 错误。特点是多轮生成配合 UNION 合并以避免内容丢失。
 
diff --git a/financial-data-collector/.gitignore b/financial-data-collector/.gitignore
new file mode 100644
index 0000000..6d2f613
--- /dev/null
+++ b/financial-data-collector/.gitignore
@@ -0,0 +1,10 @@
+# Security scan marker file (generated by security_scan.py)
+.security-scan-passed
+
+# Python cache
+__pycache__/
+*.pyc
+*.pyo
+
+# Test outputs
+*_financial_data.json
diff --git a/financial-data-collector/SKILL.md b/financial-data-collector/SKILL.md
new file mode 100644
index 0000000..cdb6a83
--- /dev/null
+++ b/financial-data-collector/SKILL.md
@@ -0,0 +1,152 @@
+---
+name: financial-data-collector
+description: "Collect real financial data for any US publicly traded company from free public sources (yfinance). Output structured JSON consumable by downstream financial skills (DCF modeling, comps analysis, earnings review). Handles market data (price, shares, beta), historical financials (income statement, cash flow, balance sheet), WACC inputs, and analyst estimates. Use when users request collect data for ticker, get financials for company, pull market data, gather DCF inputs, or any task requiring structured financial data before analysis. Also triggers on financial data, company data, stock data."
+---
+
+# Financial Data Collector
+
+Collect and validate real financial data for US public companies using free data sources.
+Output is a standardized JSON file ready for consumption by other financial skills.
+
+## Critical Constraints
+
+**NO FALLBACK values.** If a field cannot be retrieved, set it to `null` with `_source: "missing"`.
+Never substitute defaults (e.g., `beta or 1.0`). The downstream skill decides how to handle missing data.
+
+**Data source attribution is mandatory.** Every data section must have a `_source` field.
+
+**CapEx sign convention:** yfinance returns CapEx as negative (cash outflow). Preserve the original sign. Document the convention in output metadata. Do NOT flip signs.
+
+**yfinance FCF ≠ Investment bank FCF.** yfinance FCF = Operating CF + CapEx (no SBC deduction). Flag this in output metadata so downstream DCF skills don't overstate FCF.
+
+## Workflow
+
+### Step 1: Collect Data
+
+Run the collection script:
+
+```bash
+python scripts/collect_data.py TICKER [--years 5] [--output path/to/output.json]
+```
+
+The script collects from these sources, in priority order:
+1. **yfinance** — market data, historical financials, beta, analyst estimates
+2. **yfinance ^TNX** — 10Y Treasury yield as risk-free rate proxy
+3. **User supplement** — for years where yfinance returns NaN (report to user, do not guess)
+
+### Step 2: Validate Data
+
+```bash
+python scripts/validate_data.py path/to/output.json
+```
+
+Checks: field completeness, cross-field consistency (Market Cap = Price × Shares), range sanity (WACC 5-20%, beta 0.3-3.0), sign conventions.
+
+### Step 3: Deliver JSON
+
+Single file: `{TICKER}_financial_data.json`. Schema in `references/output-schema.md`.
+
+**Do NOT create**: README, CSV, summary reports, or any auxiliary files.
+
+## Output Schema (Summary)
+
+```json
+{
+  "ticker": "META",
+  "company_name": "Meta Platforms, Inc.",
+  "data_date": "2026-03-02",
+  "currency": "USD",
+  "unit": "millions_usd",
+  "data_sources": { "market_data": "...", "2022_to_2024": "..." },
+  "market_data": { "current_price": 648.18, "shares_outstanding_millions": 2187, "market_cap_millions": 1639607, "beta_5y_monthly": 1.284 },
+  "income_statement": { "2024": { "revenue": 164501, "ebit": 69380, "tax_expense": ..., "net_income": ..., "_source": "yfinance" } },
+  "cash_flow": { "2024": { "operating_cash_flow": ..., "capex": -37256, "depreciation_amortization": 15498, "free_cash_flow": ..., "change_in_nwc": ..., "_source": "yfinance" } },
+  "balance_sheet": { "2024": { "total_debt": 30768, "cash_and_equivalents": 77815, "net_debt": -47047, "current_assets": ..., "current_liabilities": ..., "_source": "yfinance" } },
+  "wacc_inputs": { "risk_free_rate": 0.0396, "beta": 1.284, "credit_rating": null, "_source": "yfinance + ^TNX" },
+  "analyst_estimates": { "revenue_next_fy": 251113, "revenue_fy_after": 295558, "eps_next_fy": 29.59, "_source": "yfinance" },
+  "metadata": { "_capex_convention": "negative = cash outflow", "_fcf_note": "yfinance FCF = OperatingCF + CapEx. Does NOT deduct SBC." }
+}
+```
+
+Full schema with all field definitions: `references/output-schema.md`
+
+<correct_patterns>
+
+### Handling Missing Years
+
+```python
+if pd.isna(revenue):
+    result[year] = {"revenue": None, "_source": "yfinance returned NaN — supplement from 10-K"}
+# Report missing years to the user. Do NOT skip or fill with estimates.
+```
+
+### CapEx Sign Preservation
+
+```python
+capex = cash_flow.loc["Capital Expenditure", year_col]  # -37256.0
+result["capex"] = float(capex)  # Preserve negative
+```
+
+### Datetime Column Indexing
+
+```python
+year_col = [c for c in financials.columns if c.year == target_year][0]
+revenue = financials.loc["Total Revenue", year_col]
+```
+
+### Field Name Guards
+
+```python
+if "Total Revenue" in financials.index:
+    revenue = financials.loc["Total Revenue", year_col]
+elif "Revenue" in financials.index:
+    revenue = financials.loc["Revenue", year_col]
+else:
+    revenue = None
+```
+
+</correct_patterns>
+
+<common_mistakes>
+
+### Mistake 1: Default Values for Missing Data
+
+```python
+# ❌ WRONG
+beta = info.get("beta", 1.0)
+growth = data.get("growth") or 0.02
+
+# ✅ RIGHT
+beta = info.get("beta")  # May be None — that's OK
+```
+
+### Mistake 2: Assuming All Years Have Data
+
+```python
+# ❌ WRONG — 2020-2021 may be NaN
+revenue = float(financials.loc["Total Revenue", year_col])
+
+# ✅ RIGHT
+value = financials.loc["Total Revenue", year_col]
+revenue = float(value) if pd.notna(value) else None
+```
+
+### Mistake 3: Using yfinance FCF in DCF Models Directly
+
+yfinance FCF does NOT deduct SBC. For mega-caps like META, SBC can be $20-30B/yr, leaving investment-bank FCF roughly 30% below yfinance FCF (META 2024: ~$37.9B vs ~$54.1B). Always flag this in output.
+
+### Mistake 4: Flipping CapEx Sign
+
+```python
+# ❌ WRONG — double-negation risk downstream
+capex = abs(cash_flow.loc["Capital Expenditure", year_col])
+
+# ✅ RIGHT — preserve original, document convention
+capex = float(cash_flow.loc["Capital Expenditure", year_col])  # -37256.0
+```
+
+</common_mistakes>
+
+## Known yfinance Pitfalls
+
+See `references/yfinance-pitfalls.md` for detailed field mapping and workarounds.
diff --git a/financial-data-collector/references/output-schema.md b/financial-data-collector/references/output-schema.md
new file mode 100644
index 0000000..def0cda
--- /dev/null
+++ b/financial-data-collector/references/output-schema.md
@@ -0,0 +1,84 @@
+# Output Schema — financial-data-collector
+
+All monetary values in millions USD unless noted. Null means "not available from source."
+
+## Top-Level Fields
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `ticker` | string | Stock ticker symbol (e.g., "META") |
+| `company_name` | string | Full company name from yfinance |
+| `data_date` | string | ISO date when data was collected |
+| `currency` | string | Always "USD" for US equities |
+| `unit` | string | Always "millions_usd" |
+| `data_sources` | object | Attribution for each data section |
+
+## market_data
+
+| Field | Type | Source | Notes |
+|-------|------|--------|-------|
+| `current_price` | float | `yf.Ticker.info["currentPrice"]` | USD per share |
+| `shares_outstanding_millions` | float | `info["sharesOutstanding"] / 1e6` | Basic share count (not diluted) |
+| `market_cap_millions` | float | `info["marketCap"] / 1e6` | |
+| `beta_5y_monthly` | float\|null | `info["beta"]` | May be null for recent IPOs |
+
+## income_statement (keyed by year string)
+
+| Field | Type | Source Row Index |
+|-------|------|-----------------|
+| `revenue` | float\|null | "Total Revenue" or "Revenue" |
+| `ebit` | float\|null | "Operating Income" or "EBIT" |
+| `ebitda` | float\|null | "EBITDA" (if available) |
+| `tax_expense` | float\|null | "Tax Provision" or "Income Tax Expense" |
+| `net_income` | float\|null | "Net Income" |
+| `sbc` | float\|null | "Stock Based Compensation" (row read from the cash flow statement) |
+| `_source` | string | Data provenance |
+
+## cash_flow (keyed by year string)
+
+| Field | Type | Source Row Index | Sign |
+|-------|------|-----------------|------|
+| `operating_cash_flow` | float\|null | "Operating Cash Flow" | Positive = inflow |
+| `capex` | float\|null | "Capital Expenditure" | **Negative = outflow** |
+| `depreciation_amortization` | float\|null | "Depreciation And Amortization" | Positive |
+| `free_cash_flow` | float\|null | "Free Cash Flow" | yfinance definition (see metadata) |
+| `change_in_nwc` | float\|null | "Change In Working Capital" | Negative = use of cash |
+| `_source` | string | | |
+
+## balance_sheet (latest year only by default)
+
+| Field | Type | Source Row Index |
+|-------|------|-----------------|
+| `total_debt` | float\|null | "Total Debt" or "Long Term Debt" + "Short Long Term Debt" |
+| `cash_and_equivalents` | float\|null | "Cash And Cash Equivalents" + "Other Short Term Investments" |
+| `net_debt` | float\|null | Computed: total_debt - cash_and_equivalents |
+| `current_assets` | float\|null | "Current Assets" |
+| `current_liabilities` | float\|null | "Current Liabilities" |
+| `total_assets` | float\|null | "Total Assets" |
+| `total_equity` | float\|null | "Stockholders Equity" |
+| `_source` | string | |
+
+## wacc_inputs
+
+| Field | Type | Source |
+|-------|------|--------|
+| `risk_free_rate` | float\|null | yfinance ^TNX (10Y Treasury) |
+| `beta` | float\|null | Same as market_data.beta_5y_monthly |
+| `credit_rating` | string\|null | Not available from yfinance — null unless user provides |
+
+## analyst_estimates
+
+| Field | Type | Source |
+|-------|------|--------|
+| `revenue_next_fy` | float\|null | `yf.Ticker.revenue_estimate` |
+| `revenue_fy_after` | float\|null | Same |
+| `eps_next_fy` | float\|null | `yf.Ticker.eps_trend` |
+
+## metadata
+
+| Field | Value |
+|-------|-------|
+| `_capex_convention` | "negative = cash outflow (yfinance convention)" |
+| `_fcf_note` | "yfinance FCF = OperatingCF + CapEx. Does NOT deduct SBC." |
+| `_nan_years` | List of years where yfinance returned NaN |
+| `_collection_duration_seconds` | Time taken to collect all data |
diff --git a/financial-data-collector/references/yfinance-pitfalls.md b/financial-data-collector/references/yfinance-pitfalls.md
new file mode 100644
index 0000000..21b0aa5
--- /dev/null
+++ b/financial-data-collector/references/yfinance-pitfalls.md
@@ -0,0 +1,99 @@
+# yfinance Pitfalls & Field Mapping
+
+## NaN Year Patterns
+
+yfinance frequently returns NaN for older fiscal years. Observed patterns:
+
+| Ticker | NaN Years | Notes |
+|--------|-----------|-------|
+| META | 2020, 2021 | All fields NaN; must supplement from 10-K |
+| General | Varies | Older years (>3 years back) are less reliable |
+
+**Workaround**: Check every field with `pd.notna()`. Report NaN years to user. Never fill with estimates.
+
+## Field Name Variants
+
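+yfinance row index names are not fully stable across versions. Use alias chains that try each known row name in order (this is name matching only — a missing value still yields `None`, never a default):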
+
+```python
+FIELD_ALIASES = {
+    "revenue": ["Total Revenue", "Revenue", "Operating Revenue"],
+    "ebit": ["Operating Income", "EBIT"],
+    "ebitda": ["EBITDA", "Normalized EBITDA"],
+    "tax": ["Tax Provision", "Income Tax Expense", "Tax Effect Of Unusual Items"],
+    "net_income": ["Net Income", "Net Income Common Stockholders"],
+    "capex": ["Capital Expenditure", "Capital Expenditures"],
+    "ocf": ["Operating Cash Flow", "Cash Flow From Continuing Operating Activities"],
+    "da": ["Depreciation And Amortization", "Depreciation Amortization Depletion"],
+    "fcf": ["Free Cash Flow"],
+    "nwc": ["Change In Working Capital", "Changes In Working Capital"],
+    "total_debt": ["Total Debt"],
+    "cash": ["Cash And Cash Equivalents"],
+    "short_investments": ["Other Short Term Investments", "Short Term Investments"],
+    "sbc": ["Stock Based Compensation"],
+}
+
+def safe_get(df, aliases, col):
+    for alias in aliases:
+        if alias in df.index:
+            val = df.loc[alias, col]
+            return float(val) if pd.notna(val) else None
+    return None
+```
+
+## Datetime Column Index
+
+yfinance returns DataFrame columns as `pandas.Timestamp`, not integer years:
+
+```python
+# ❌ WRONG
+financials[2024]  # KeyError
+
+# ✅ RIGHT
+year_col = [c for c in financials.columns if c.year == 2024][0]
+financials.loc["Total Revenue", year_col]
+```
+
+## Shares Outstanding Variants
+
+```python
+# Default field — note this is the basic share count, not diluted
+shares = info.get("sharesOutstanding")  # Basic shares
+# Alternative
+shares = info.get("impliedSharesOutstanding")  # May be more accurate
+```
+
+## Risk-Free Rate via ^TNX
+
+```python
+tnx = yf.Ticker("^TNX")
+hist = tnx.history(period="1d")
+risk_free_rate = hist["Close"].iloc[-1] / 100  # Convert from percentage
+```
+
+**Pitfall**: ^TNX returns yield as percentage (e.g., 4.3), not decimal (0.043). Divide by 100.
+
+## Analyst Estimates
+
+```python
+ticker = yf.Ticker("META")
+
+# Revenue estimates
+rev_est = ticker.revenue_estimate  # DataFrame with columns: avg, low, high, ...
+# Rows: "0q" (current quarter), "+1q", "0y" (current year), "+1y"
+
+# EPS estimates
+eps_est = ticker.eps_trend  # Similar structure
+```
+
+**Pitfall**: These APIs change between yfinance versions. Always wrap in try/except.
+
+## FCF Definition Mismatch
+
+| Source | FCF Definition | META 2024 |
+|--------|---------------|-----------|
+| yfinance | Operating CF + CapEx | ~$54.1B |
+| Morgan Stanley DCF | EBITDA - Taxes - CapEx - NWC - SBC | ~$37.9B |
+| Difference | SBC (~$22B) + other adjustments | ~30% gap |
+
+**Always flag this in output metadata.** Downstream DCF skills need to decide whether to use yfinance FCF or reconstruct from components.
diff --git a/financial-data-collector/scripts/collect_data.py b/financial-data-collector/scripts/collect_data.py
new file mode 100644
index 0000000..82c62b1
--- /dev/null
+++ b/financial-data-collector/scripts/collect_data.py
@@ -0,0 +1,362 @@
+#!/usr/bin/env python3
+"""
+Collect real financial data for a US publicly traded company.
+Output: structured JSON consumable by downstream financial skills.
+
+Usage:
+    python collect_data.py TICKER [--years 5] [--output path/to/output.json]
+
+Examples:
+    python collect_data.py META
+    python collect_data.py AAPL --years 3 --output /tmp/aapl_data.json
+    python collect_data.py NVDA --years 5
+
+Dependencies: yfinance, pandas (via uv inline or pip)
+"""
+# /// script
+# requires-python = ">=3.11"
+# dependencies = ["yfinance>=0.2.0", "pandas>=2.0.0"]
+# ///
+
+import argparse
+import json
+import sys
+import time
+from datetime import date
+from pathlib import Path
+
+import pandas as pd
+import yfinance as yf
+
+# Output field key -> candidate yfinance row labels, tried in listed order by safe_get (labels vary across yfinance versions)
+FIELD_ALIASES = {
+    "revenue": ["Total Revenue", "Revenue", "Operating Revenue"],
+    "ebit": ["Operating Income", "EBIT"],
+    "ebitda": ["EBITDA", "Normalized EBITDA"],
+    "tax_expense": ["Tax Provision", "Income Tax Expense"],
+    "net_income": ["Net Income", "Net Income Common Stockholders"],
+    "capex": ["Capital Expenditure", "Capital Expenditures"],
+    "operating_cash_flow": ["Operating Cash Flow", "Cash Flow From Continuing Operating Activities"],
+    "depreciation_amortization": ["Depreciation And Amortization", "Depreciation Amortization Depletion"],
+    "free_cash_flow": ["Free Cash Flow"],
+    "change_in_nwc": ["Change In Working Capital", "Changes In Working Capital"],
+    "sbc": ["Stock Based Compensation"],
+    "total_debt": ["Total Debt"],
+    "long_term_debt": ["Long Term Debt"],
+    "short_term_debt": ["Short Long Term Debt", "Current Debt"],
+    "cash": ["Cash And Cash Equivalents"],
+    "short_investments": ["Other Short Term Investments", "Short Term Investments"],
+    "current_assets": ["Current Assets"],
+    "current_liabilities": ["Current Liabilities"],
+    "total_assets": ["Total Assets"],
+    "total_equity": ["Stockholders Equity", "Total Equity Gross Minority Interest"],
+}
+
+
+def safe_get(df: pd.DataFrame, field_key: str, col) -> float | None:
+    """Return the first non-NaN value in column `col` among the row labels aliased to `field_key`, else None."""
+    for row_label in FIELD_ALIASES.get(field_key, [field_key]):
+        if row_label not in df.index:
+            continue
+        cell = df.loc[row_label, col]
+        if pd.notna(cell):
+            return float(cell)
+    return None
+
+
+def get_year_col(df: pd.DataFrame, year: int):
+    """Return the first column label of `df` whose `.year` equals `year`, or None when no column matches."""
+    # Column labels must expose `.year` (the accompanying reference doc says yfinance uses pandas Timestamps).
+    return next((label for label in df.columns if label.year == year), None)
+
+
+def collect_market_data(ticker_obj: yf.Ticker) -> dict:
+    """Build the market-data section (price, shares and market cap in millions, beta) from `ticker_obj.info`."""
+    quote = ticker_obj.info
+    beta = quote.get("beta")
+    # Share count and market cap are scaled to millions; a missing or zero input stays None.
+    shares_m = quote.get("sharesOutstanding")
+    shares_m = shares_m / 1e6 if shares_m else None
+    mcap_m = quote.get("marketCap")
+    mcap_m = mcap_m / 1e6 if mcap_m else None
+
+    return {
+        "current_price": quote.get("currentPrice") or quote.get("regularMarketPrice"),
+        "shares_outstanding_millions": round(shares_m, 2) if shares_m else None,
+        "market_cap_millions": round(mcap_m, 2) if mcap_m else None,
+        "beta_5y_monthly": round(beta, 3) if beta else None,
+    }
+
+
+def collect_risk_free_rate() -> float | None:
+    """Return the latest non-NaN ^TNX close as a decimal (e.g. 4.3 -> 0.043), or None on any failure."""
+    try:
+        # Drop NaN closes so a NaN can never be returned and serialized into the output JSON.
+        closes = yf.Ticker("^TNX").history(period="5d")["Close"].dropna()
+        if closes.empty:
+            return None
+        # ^TNX quotes the yield in percent; convert to a decimal fraction.
+        return round(float(closes.iloc[-1]) / 100, 4)
+    except Exception:  # best-effort: any lookup failure just means "rate unavailable"
+        return None
+
+
+def collect_income_statement(ticker_obj: yf.Ticker, years: list[int]) -> tuple[dict, list[int]]:
+    """Collect income statement data; return (per-year dict keyed by str(year), list of years lacking data)."""
+    try:
+        financials = ticker_obj.financials
+    except Exception:
+        return {str(y): {"_source": "yfinance error"} for y in years}, list(years)
+    # Every exit returns the (data, nan_years) pair; with no statement at all, every year counts as missing.
+    if financials is None or financials.empty:
+        return {str(y): {"_source": "yfinance returned empty"} for y in years}, list(years)
+
+    result = {}
+    nan_years = []
+    for year in years:
+        col = get_year_col(financials, year)
+        if col is None:
+            result[str(year)] = {"_source": f"yfinance has no column for {year}"}
+            nan_years.append(year)
+            continue
+
+        revenue = safe_get(financials, "revenue", col)
+        ebit = safe_get(financials, "ebit", col)
+
+        if revenue is None and ebit is None:  # neither headline figure present: treat the year as unreported
+            nan_years.append(year)
+            result[str(year)] = {
+                "revenue": None, "ebit": None, "ebitda": None,
+                "tax_expense": None, "net_income": None,
+                "_source": f"yfinance returned NaN for {year} — supplement from 10-K filing",
+            }
+        else:
+            result[str(year)] = {
+                "revenue": revenue,
+                "ebit": ebit,
+                "ebitda": safe_get(financials, "ebitda", col),
+                "tax_expense": safe_get(financials, "tax_expense", col),
+                "net_income": safe_get(financials, "net_income", col),
+                "_source": "yfinance",
+            }
+
+    return result, nan_years
+
+
+def collect_cash_flow(ticker_obj: yf.Ticker, years: list[int]) -> tuple[dict, list[int]]:
+    """Collect cash flow data; return (per-year dict keyed by str(year), list of years lacking data)."""
+    try:
+        cf = ticker_obj.cashflow
+    except Exception:
+        return {str(y): {"_source": "yfinance error"} for y in years}, list(years)
+    # With no statement at all every year is missing; return a copy so the caller's list is never aliased.
+    if cf is None or cf.empty:
+        return {str(y): {"_source": "yfinance returned empty"} for y in years}, list(years)
+
+    result = {}
+    nan_years = []
+    for year in years:
+        col = get_year_col(cf, year)
+        if col is None:
+            result[str(year)] = {"_source": f"yfinance has no column for {year}"}
+            nan_years.append(year)
+            continue
+
+        ocf = safe_get(cf, "operating_cash_flow", col)
+        capex = safe_get(cf, "capex", col)
+
+        if ocf is None and capex is None:  # neither headline figure present: treat the year as unreported
+            nan_years.append(year)
+            result[str(year)] = {
+                "operating_cash_flow": None, "capex": None,
+                "depreciation_amortization": None, "free_cash_flow": None,
+                "change_in_nwc": None, "sbc": None,
+                "_source": f"yfinance returned NaN for {year}",
+            }
+        else:
+            result[str(year)] = {
+                "operating_cash_flow": ocf,
+                "capex": capex,  # Negative = outflow (preserved)
+                "depreciation_amortization": safe_get(cf, "depreciation_amortization", col),
+                "free_cash_flow": safe_get(cf, "free_cash_flow", col),
+                "change_in_nwc": safe_get(cf, "change_in_nwc", col),
+                "sbc": safe_get(cf, "sbc", col),
+                "_source": "yfinance",
+            }
+
+    return result, nan_years
+
+
+def collect_balance_sheet(ticker_obj: yf.Ticker, latest_year: int) -> dict:
+    """Collect balance sheet data for `latest_year`; return a dict with the single key str(latest_year)."""
+    try:
+        bs = ticker_obj.balance_sheet
+    except Exception:
+        return {str(latest_year): {"_source": "yfinance error"}}
+
+    if bs is None or bs.empty:
+        return {str(latest_year): {"_source": "yfinance returned empty"}}
+
+    col = get_year_col(bs, latest_year)
+    if col is None:
+        return {str(latest_year): {"_source": f"yfinance has no column for {latest_year}"}}
+    # Without a total-debt row, sum long- and short-term debt; a reported 0.0 counts as data, not as missing.
+    total_debt = safe_get(bs, "total_debt", col)
+    if total_debt is None:
+        lt = safe_get(bs, "long_term_debt", col)
+        st = safe_get(bs, "short_term_debt", col)
+        total_debt = (lt or 0) + (st or 0) if (lt is not None or st is not None) else None
+
+    cash = safe_get(bs, "cash", col)
+    short_inv = safe_get(bs, "short_investments", col)
+    cash_equiv = (cash or 0) + (short_inv or 0) if (cash is not None or short_inv is not None) else None
+    net_debt = (total_debt - cash_equiv) if (total_debt is not None and cash_equiv is not None) else None
+
+    return {
+        str(latest_year): {
+            "total_debt": total_debt,
+            "cash_and_equivalents": cash_equiv,
+            "net_debt": round(net_debt, 2) if net_debt is not None else None,
+            "current_assets": safe_get(bs, "current_assets", col),
+            "current_liabilities": safe_get(bs, "current_liabilities", col),
+            "total_assets": safe_get(bs, "total_assets", col),
+            "total_equity": safe_get(bs, "total_equity", col),
+            "_source": "yfinance",
+        }
+    }
+
+
+def collect_analyst_estimates(ticker_obj: yf.Ticker) -> dict:
+    """Gather consensus revenue (divided by 1e6) and EPS estimates; fields that cannot be read stay None."""
+    result = {"revenue_next_fy": None, "revenue_fy_after": None, "eps_next_fy": None, "_source": "missing"}
+
+    try:
+        revenue_frame = ticker_obj.revenue_estimate
+        if revenue_frame is not None and not revenue_frame.empty:
+            for period, key in (("0y", "revenue_next_fy"), ("+1y", "revenue_fy_after")):
+                if period in revenue_frame.index:
+                    avg = revenue_frame.loc[period, "avg"]
+                    result[key] = float(avg) / 1e6 if pd.notna(avg) else None
+            result["_source"] = "yfinance revenue_estimate"
+    except Exception:
+        pass  # best-effort: revenue estimates simply stay None
+
+    try:
+        eps_frame = ticker_obj.eps_trend
+        # Prefer the "current" column; otherwise fall back to the first column of the "0y" row.
+        if eps_frame is not None and not eps_frame.empty and "0y" in eps_frame.index:
+            val = eps_frame.loc["0y", "current"] if "current" in eps_frame.columns else eps_frame.iloc[eps_frame.index.get_loc("0y"), 0]
+            result["eps_next_fy"] = float(val) if pd.notna(val) else None
+    except Exception:
+        pass  # best-effort: the EPS estimate simply stays None
+
+    return result
+
+
+def main():
+    """CLI entry point: collect every data section for one ticker, write the JSON file, and print a summary."""
+    parser = argparse.ArgumentParser(description="Collect financial data for a US public company")
+    parser.add_argument("ticker", help="Stock ticker symbol (e.g., META, AAPL)")
+    parser.add_argument("--years", type=int, default=5, help="Number of historical years (default: 5)")
+    parser.add_argument("--output", help="Output JSON file path (default: {TICKER}_financial_data.json)")
+    args = parser.parse_args()
+    if args.years < 1:  # an empty year range would otherwise crash on target_years[0] below
+        parser.error("--years must be at least 1")
+
+    ticker_symbol = args.ticker.upper()
+    current_year = date.today().year
+    latest_full_year = current_year - 1  # Exclude current year: its annual report is typically not yet filed
+    target_years = list(range(latest_full_year - args.years + 1, latest_full_year + 1))
+    output_path = args.output or f"{ticker_symbol}_financial_data.json"
+
+    print(f"Collecting data for {ticker_symbol} ({target_years[0]}-{target_years[-1]})...")
+    start_time = time.time()
+    ticker_obj = yf.Ticker(ticker_symbol)
+
+    # Verify ticker is valid
+    info = ticker_obj.info
+    if not info or info.get("regularMarketPrice") is None:
+        print(f"ERROR: {ticker_symbol} not found or no market data available", file=sys.stderr)
+        sys.exit(1)
+
+    company_name = info.get("longName") or info.get("shortName") or ticker_symbol
+
+    # Collect all data
+    print("  Market data...", end=" ", flush=True)
+    market_data = collect_market_data(ticker_obj)
+    print("OK")
+
+    print("  Risk-free rate...", end=" ", flush=True)
+    rfr = collect_risk_free_rate()
+    print(f"{rfr:.4f}" if rfr is not None else "MISSING")
+
+    print("  Income statement...", end=" ", flush=True)
+    income_data, is_nan_years = collect_income_statement(ticker_obj, target_years)
+    print(f"OK (NaN years: {is_nan_years or 'none'})")
+
+    print("  Cash flow...", end=" ", flush=True)
+    cf_data, cf_nan_years = collect_cash_flow(ticker_obj, target_years)
+    print(f"OK (NaN years: {cf_nan_years or 'none'})")
+
+    print("  Balance sheet...", end=" ", flush=True)
+    bs_data = collect_balance_sheet(ticker_obj, latest_full_year)
+    print("OK")
+
+    print("  Analyst estimates...", end=" ", flush=True)
+    estimates = collect_analyst_estimates(ticker_obj)
+    print("OK" if estimates["_source"] != "missing" else "PARTIAL/MISSING")
+
+    elapsed = round(time.time() - start_time, 1)
+    all_nan_years = sorted(set(is_nan_years + cf_nan_years))
+
+    # Assemble output
+    output = {
+        "ticker": ticker_symbol,
+        "company_name": company_name,
+        "data_date": date.today().isoformat(),
+        "currency": "USD",
+        "unit": "millions_usd",  # NOTE(review): statement values come from safe_get unscaled while market data is divided by 1e6 — confirm intended unit
+        "data_sources": {
+            "market_data": "yfinance (live)",
+            "historical_financials": "yfinance annual financials",
+            "risk_free_rate": "yfinance ^TNX (10Y Treasury)",
+        },
+        "market_data": market_data,
+        "income_statement": income_data,
+        "cash_flow": cf_data,
+        "balance_sheet": bs_data,
+        "wacc_inputs": {
+            "risk_free_rate": rfr,
+            "beta": market_data.get("beta_5y_monthly"),
+            "credit_rating": None,
+            "_source": "yfinance + ^TNX",
+        },
+        "analyst_estimates": estimates,
+        "metadata": {
+            "_capex_convention": "negative = cash outflow (yfinance convention)",
+            "_fcf_note": "yfinance FCF = OperatingCF + CapEx. Does NOT deduct SBC.",
+            "_nan_years": all_nan_years,
+            "_collection_duration_seconds": elapsed,
+            "_target_years": target_years,
+        },
+    }
+
+    # Write output
+    Path(output_path).write_text(json.dumps(output, indent=2, ensure_ascii=False))
+    print(f"\nSaved to {output_path} ({elapsed}s)")
+
+    # Summary
+    if all_nan_years:
+        print(f"\n⚠️  WARNING: Years {all_nan_years} have missing data (NaN from yfinance).")
+        print("   Supplement from 10-K filings or SEC EDGAR before using in models.")
+
+    # Quick sanity check
+    if market_data["current_price"] and market_data["shares_outstanding_millions"]:
+        computed_mcap = market_data["current_price"] * market_data["shares_outstanding_millions"]
+        reported_mcap = market_data["market_cap_millions"]
+        if reported_mcap and abs(computed_mcap - reported_mcap) / reported_mcap > 0.05:
+            print(f"⚠️  Market cap mismatch: Price×Shares={computed_mcap:.0f}M vs Reported={reported_mcap:.0f}M")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/financial-data-collector/scripts/validate_data.py b/financial-data-collector/scripts/validate_data.py
new file mode 100644
index 0000000..6a1b11a
--- /dev/null
+++ b/financial-data-collector/scripts/validate_data.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python3
+"""
+Validate financial data JSON output from collect_data.py.
+Checks completeness, consistency, and sanity of collected data.
+
+Usage:
+    python validate_data.py path/to/output.json
+
+Returns JSON validation report to stdout.
+"""
+# /// script
+# requires-python = ">=3.11"
+# dependencies = []
+# ///
+
+import json
+import sys
+from pathlib import Path
+
+
+def validate(data: dict) -> dict:
+    """Validate collected financial data; return a report with status, ticker, years_with_data, errors, warnings and counts."""
+    errors = []
+    warnings = []
+
+    # 1. Required top-level fields
+    for field in ["ticker", "company_name", "data_date", "market_data",
+                  "income_statement", "cash_flow", "balance_sheet", "wacc_inputs"]:
+        if field not in data:
+            errors.append(f"Missing required field: {field}")
+
+    if errors:
+        return {
+            "status": "error", "ticker": data.get("ticker") if isinstance(data, dict) else None, "years_with_data": 0,
+            "errors": errors, "warnings": warnings,
+            "error_count": len(errors), "warning_count": len(warnings),
+        }
+
+    md = data["market_data"] or {}  # a section present as JSON null is treated as empty
+
+    # 2. Market data sanity
+    if md.get("current_price") is not None:
+        if md["current_price"] <= 0:
+            errors.append(f"Invalid stock price: {md['current_price']}")
+        if md["current_price"] > 10000:
+            warnings.append(f"Unusually high stock price: ${md['current_price']}")
+
+    if md.get("shares_outstanding_millions") is not None:
+        if md["shares_outstanding_millions"] <= 0:
+            errors.append(f"Invalid shares outstanding: {md['shares_outstanding_millions']}")
+
+    if md.get("beta_5y_monthly") is not None:
+        beta = md["beta_5y_monthly"]
+        if beta < 0.1 or beta > 5.0:
+            warnings.append(f"Unusual beta: {beta} (expected 0.3-3.0)")
+
+    # 3. Market cap cross-check
+    if md.get("current_price") and md.get("shares_outstanding_millions") and md.get("market_cap_millions"):
+        computed = md["current_price"] * md["shares_outstanding_millions"]
+        reported = md["market_cap_millions"]
+        pct_diff = abs(computed - reported) / reported
+        if pct_diff > 0.05:
+            # yfinance sharesOutstanding is basic; marketCap may use diluted. Known discrepancy.
+            warnings.append(f"Market cap mismatch ({pct_diff:.1%}): Price×Shares(basic)={computed:.0f}M vs Reported={reported:.0f}M. Likely basic vs diluted shares.")
+
+    # 4. Income statement completeness
+    is_data = data.get("income_statement") or {}
+    years_with_data = 0
+    for year, vals in is_data.items():
+        if isinstance(vals, dict) and vals.get("revenue") is not None:
+            years_with_data += 1
+            # Revenue should be positive
+            if vals["revenue"] <= 0:
+                warnings.append(f"Non-positive revenue in {year}: {vals['revenue']}")
+            # EBIT margin sanity
+            if vals.get("ebit") is not None and vals["revenue"] > 0:
+                margin = vals["ebit"] / vals["revenue"]
+                if margin < -1.0 or margin > 0.8:
+                    warnings.append(f"Unusual EBIT margin in {year}: {margin:.1%}")
+
+    if years_with_data == 0:
+        errors.append("No income statement data available for any year")
+    elif years_with_data < 3:
+        warnings.append(f"Only {years_with_data} years of income statement data (recommend ≥3)")
+
+    # 5. Cash flow: CapEx sign convention
+    cf_data = data.get("cash_flow") or {}
+    for year, vals in cf_data.items():
+        if isinstance(vals, dict) and vals.get("capex") is not None:
+            if vals["capex"] > 0:
+                warnings.append(f"CapEx is positive in {year} ({vals['capex']}). Expected negative (outflow).")
+
+    # 6. Balance sheet: Net debt consistency
+    bs_data = data.get("balance_sheet") or {}
+    for year, vals in bs_data.items():
+        if isinstance(vals, dict):
+            td = vals.get("total_debt")
+            ce = vals.get("cash_and_equivalents")
+            nd = vals.get("net_debt")
+            if td is not None and ce is not None and nd is not None:
+                expected_nd = td - ce
+                if abs(expected_nd - nd) > 1.0:  # Allow $1M rounding
+                    errors.append(f"Net debt inconsistency in {year}: total_debt({td}) - cash({ce}) = {expected_nd} ≠ {nd}")
+
+    # 7. WACC inputs
+    wacc = data.get("wacc_inputs") or {}
+    rfr = wacc.get("risk_free_rate")
+    if rfr is not None:
+        if rfr < 0 or rfr > 0.15:
+            warnings.append(f"Unusual risk-free rate: {rfr:.2%} (expected 1-8%)")
+    else:
+        warnings.append("Risk-free rate is missing")
+
+    # 8. NaN years tracking
+    meta = data.get("metadata") or {}
+    nan_years = meta.get("_nan_years", [])
+    if nan_years:
+        warnings.append(f"NaN years detected: {nan_years}. Supplement from 10-K before using in models.")
+
+    # 9. Data source attribution
+    for section in ["income_statement", "cash_flow", "balance_sheet"]:
+        section_data = data.get(section) or {}
+        for year, vals in section_data.items():
+            if isinstance(vals, dict) and "_source" not in vals:
+                warnings.append(f"Missing _source attribution in {section}.{year}")
+
+    status = "error" if errors else ("warning" if warnings else "success")
+    return {
+        "status": status, "ticker": data.get("ticker"), "years_with_data": years_with_data,
+        "errors": errors, "warnings": warnings,
+        "error_count": len(errors), "warning_count": len(warnings),
+    }
+
+
+def main():
+    """Validate the JSON file named by argv[1]; print a JSON report; exit 1 on errors, otherwise 0."""
+    if len(sys.argv) < 2:
+        print("Usage: python validate_data.py <json_file>", file=sys.stderr)
+        sys.exit(1)
+
+    json_path = sys.argv[1]
+    if not Path(json_path).exists():
+        print(json.dumps({"status": "error", "errors": [f"File not found: {json_path}"]}))
+        sys.exit(1)
+
+    try:
+        data = json.loads(Path(json_path).read_text())
+    except json.JSONDecodeError as exc:
+        # Report malformed input in the same JSON shape as the other failures instead of a traceback.
+        print(json.dumps({"status": "error", "errors": [f"Invalid JSON in {json_path}: {exc}"]}))
+        sys.exit(1)
+
+    report = validate(data)
+    print(json.dumps(report, indent=2))
+    sys.exit(1 if report["status"] == "error" else 0)  # Warnings are OK, just informational
+
+
+if __name__ == "__main__":
+    main()