feat: add financial-data-collector skill for US equity data collection

New skill that collects real financial data for any US publicly traded company
via yfinance. Outputs structured JSON with market data, historical financials,
WACC inputs, and analyst estimates. Includes 9-check validation script and
reference docs for yfinance pitfalls (NaN years, field aliases, FCF mismatch).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
daymade
2026-03-02 19:40:52 +08:00
parent 11b7539f10
commit 2896870061
11 changed files with 1045 additions and 9 deletions

View File

@@ -0,0 +1,159 @@
#!/usr/bin/env python3
"""
Validate financial data JSON output from collect_data.py.
Checks completeness, consistency, and sanity of collected data.
Usage:
python validate_data.py path/to/output.json
Returns JSON validation report to stdout.
"""
# /// script
# requires-python = ">=3.11"
# dependencies = []
# ///
import json
import sys
from pathlib import Path
def validate(data: dict) -> dict:
"""Validate financial data JSON. Returns validation report."""
errors = []
warnings = []
# 1. Required top-level fields
for field in ["ticker", "company_name", "data_date", "market_data",
"income_statement", "cash_flow", "balance_sheet", "wacc_inputs"]:
if field not in data:
errors.append(f"Missing required field: {field}")
if errors:
return {"status": "error", "errors": errors, "warnings": warnings}
md = data["market_data"]
# 2. Market data sanity
if md.get("current_price") is not None:
if md["current_price"] <= 0:
errors.append(f"Invalid stock price: {md['current_price']}")
if md["current_price"] > 10000:
warnings.append(f"Unusually high stock price: ${md['current_price']}")
if md.get("shares_outstanding_millions") is not None:
if md["shares_outstanding_millions"] <= 0:
errors.append(f"Invalid shares outstanding: {md['shares_outstanding_millions']}")
if md.get("beta_5y_monthly") is not None:
beta = md["beta_5y_monthly"]
if beta < 0.1 or beta > 5.0:
warnings.append(f"Unusual beta: {beta} (expected 0.3-3.0)")
# 3. Market cap cross-check
if md.get("current_price") and md.get("shares_outstanding_millions") and md.get("market_cap_millions"):
computed = md["current_price"] * md["shares_outstanding_millions"]
reported = md["market_cap_millions"]
pct_diff = abs(computed - reported) / reported
if pct_diff > 0.05:
# yfinance sharesOutstanding is basic; marketCap may use diluted. Known discrepancy.
warnings.append(f"Market cap mismatch ({pct_diff:.1%}): Price×Shares(basic)={computed:.0f}M vs Reported={reported:.0f}M. Likely basic vs diluted shares.")
# 4. Income statement completeness
is_data = data.get("income_statement", {})
years_with_data = 0
for year, vals in is_data.items():
if isinstance(vals, dict) and vals.get("revenue") is not None:
years_with_data += 1
# Revenue should be positive
if vals["revenue"] <= 0:
warnings.append(f"Non-positive revenue in {year}: {vals['revenue']}")
# EBIT margin sanity
if vals.get("ebit") is not None and vals["revenue"] > 0:
margin = vals["ebit"] / vals["revenue"]
if margin < -1.0 or margin > 0.8:
warnings.append(f"Unusual EBIT margin in {year}: {margin:.1%}")
if years_with_data == 0:
errors.append("No income statement data available for any year")
elif years_with_data < 3:
warnings.append(f"Only {years_with_data} years of income statement data (recommend ≥3)")
# 5. Cash flow: CapEx sign convention
cf_data = data.get("cash_flow", {})
for year, vals in cf_data.items():
if isinstance(vals, dict) and vals.get("capex") is not None:
if vals["capex"] > 0:
warnings.append(f"CapEx is positive in {year} ({vals['capex']}). Expected negative (outflow).")
# 6. Balance sheet: Net debt consistency
bs_data = data.get("balance_sheet", {})
for year, vals in bs_data.items():
if isinstance(vals, dict):
td = vals.get("total_debt")
ce = vals.get("cash_and_equivalents")
nd = vals.get("net_debt")
if td is not None and ce is not None and nd is not None:
expected_nd = td - ce
if abs(expected_nd - nd) > 1.0: # Allow $1M rounding
errors.append(f"Net debt inconsistency in {year}: total_debt({td}) - cash({ce}) = {expected_nd}{nd}")
# 7. WACC inputs
wacc = data.get("wacc_inputs", {})
rfr = wacc.get("risk_free_rate")
if rfr is not None:
if rfr < 0 or rfr > 0.15:
warnings.append(f"Unusual risk-free rate: {rfr:.2%} (expected 1-8%)")
else:
warnings.append("Risk-free rate is missing")
# 8. NaN years tracking
meta = data.get("metadata", {})
nan_years = meta.get("_nan_years", [])
if nan_years:
warnings.append(f"NaN years detected: {nan_years}. Supplement from 10-K before using in models.")
# 9. Data source attribution
for section in ["income_statement", "cash_flow", "balance_sheet"]:
section_data = data.get(section, {})
for year, vals in section_data.items():
if isinstance(vals, dict) and "_source" not in vals:
warnings.append(f"Missing _source attribution in {section}.{year}")
status = "error" if errors else ("warning" if warnings else "success")
return {
"status": status,
"ticker": data.get("ticker"),
"years_with_data": years_with_data,
"errors": errors,
"warnings": warnings,
"error_count": len(errors),
"warning_count": len(warnings),
}
def main():
if len(sys.argv) < 2:
print("Usage: python validate_data.py <json_file>", file=sys.stderr)
sys.exit(1)
json_path = sys.argv[1]
if not Path(json_path).exists():
print(json.dumps({"status": "error", "errors": [f"File not found: {json_path}"]}))
sys.exit(1)
data = json.loads(Path(json_path).read_text())
report = validate(data)
print(json.dumps(report, indent=2))
if report["status"] == "error":
sys.exit(1)
elif report["status"] == "warning":
sys.exit(0) # Warnings are OK, just informational
else:
sys.exit(0)
if __name__ == "__main__":
main()