Files
claude-code-skills-reference/financial-data-collector/scripts/validate_data.py
daymade 2896870061 feat: add financial-data-collector skill for US equity data collection
New skill that collects real financial data for any US publicly traded company
via yfinance. Outputs structured JSON with market data, historical financials,
WACC inputs, and analyst estimates. Includes 9-check validation script and
reference docs for yfinance pitfalls (NaN years, field aliases, FCF mismatch).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-02 19:40:52 +08:00

160 lines
5.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Validate financial data JSON output from collect_data.py.
Checks completeness, consistency, and sanity of collected data.
Usage:
python validate_data.py path/to/output.json
Returns JSON validation report to stdout.
"""
# /// script
# requires-python = ">=3.11"
# dependencies = []
# ///
import json
import sys
from pathlib import Path
# Top-level keys that must be present in the payload.
_REQUIRED_FIELDS = (
    "ticker", "company_name", "data_date", "market_data",
    "income_statement", "cash_flow", "balance_sheet", "wacc_inputs",
)
# Required keys whose values are traversed with dict methods below.
_OBJECT_FIELDS = (
    "market_data", "income_statement", "cash_flow", "balance_sheet", "wacc_inputs",
)


def _build_report(ticker, years_with_data: int,
                  errors: list[str], warnings: list[str]) -> dict:
    """Assemble the validation report; status precedence is error > warning > success."""
    if errors:
        status = "error"
    elif warnings:
        status = "warning"
    else:
        status = "success"
    return {
        "status": status,
        "ticker": ticker,
        "years_with_data": years_with_data,
        "errors": errors,
        "warnings": warnings,
        "error_count": len(errors),
        "warning_count": len(warnings),
    }


def validate(data: dict) -> dict:
    """Validate a financial-data payload and return a validation report.

    Runs nine checks: required fields, market-data sanity, market-cap
    cross-check, income-statement completeness, CapEx sign, net-debt
    consistency, WACC inputs, NaN-year tracking, and ``_source`` attribution.

    Args:
        data: Parsed JSON payload. Must be a dict containing the required
            top-level fields; the statement/market/WACC sections must
            themselves be dicts.

    Returns:
        A dict with keys ``status`` ("error", "warning" or "success"),
        ``ticker``, ``years_with_data``, ``errors``, ``warnings``,
        ``error_count`` and ``warning_count``. The same shape is returned
        on every path, including early structural failures.
    """
    errors: list[str] = []
    warnings: list[str] = []

    # Structural guard: everything below assumes a JSON object.
    if not isinstance(data, dict):
        errors.append(
            f"Top-level JSON value must be an object, got {type(data).__name__}"
        )
        return _build_report(None, 0, errors, warnings)

    # 1. Required top-level fields
    for field in _REQUIRED_FIELDS:
        if field not in data:
            errors.append(f"Missing required field: {field}")
    if errors:
        # Nothing else can be checked reliably without the required sections.
        return _build_report(data.get("ticker"), 0, errors, warnings)

    # A section that is present but null/non-object would otherwise crash
    # with AttributeError on .get()/.items(); report it instead.
    for field in _OBJECT_FIELDS:
        if not isinstance(data[field], dict):
            errors.append(
                f"Field {field} must be an object, got {type(data[field]).__name__}"
            )
    if errors:
        return _build_report(data.get("ticker"), 0, errors, warnings)

    md = data["market_data"]

    # 2. Market data sanity
    price = md.get("current_price")
    if price is not None:
        if price <= 0:
            errors.append(f"Invalid stock price: {price}")
        if price > 10000:
            warnings.append(f"Unusually high stock price: ${price}")

    shares = md.get("shares_outstanding_millions")
    if shares is not None and shares <= 0:
        errors.append(f"Invalid shares outstanding: {shares}")

    beta = md.get("beta_5y_monthly")
    if beta is not None:
        # Warn only outside [0.1, 5.0]; the message quotes a narrower range.
        if beta < 0.1 or beta > 5.0:
            warnings.append(f"Unusual beta: {beta} (expected 0.3-3.0)")

    # 3. Market cap cross-check (skipped when any input is missing or zero)
    reported = md.get("market_cap_millions")
    if price and shares and reported:
        computed = price * shares
        # abs() in the denominator so a negative reported value cannot turn
        # the relative difference negative and hide the mismatch.
        pct_diff = abs(computed - reported) / abs(reported)
        if pct_diff > 0.05:
            # yfinance sharesOutstanding is basic; marketCap may use diluted. Known discrepancy.
            warnings.append(f"Market cap mismatch ({pct_diff:.1%}): Price×Shares(basic)={computed:.0f}M vs Reported={reported:.0f}M. Likely basic vs diluted shares.")

    # 4. Income statement completeness
    years_with_data = 0
    for year, vals in data["income_statement"].items():
        if not isinstance(vals, dict) or vals.get("revenue") is None:
            continue
        years_with_data += 1
        revenue = vals["revenue"]
        # Revenue should be positive
        if revenue <= 0:
            warnings.append(f"Non-positive revenue in {year}: {revenue}")
        # EBIT margin sanity (only meaningful with positive revenue)
        ebit = vals.get("ebit")
        if ebit is not None and revenue > 0:
            margin = ebit / revenue
            if margin < -1.0 or margin > 0.8:
                warnings.append(f"Unusual EBIT margin in {year}: {margin:.1%}")
    if years_with_data == 0:
        errors.append("No income statement data available for any year")
    elif years_with_data < 3:
        warnings.append(f"Only {years_with_data} years of income statement data (recommend ≥3)")

    # 5. Cash flow: CapEx sign convention
    for year, vals in data["cash_flow"].items():
        if isinstance(vals, dict) and vals.get("capex") is not None:
            if vals["capex"] > 0:
                warnings.append(f"CapEx is positive in {year} ({vals['capex']}). Expected negative (outflow).")

    # 6. Balance sheet: Net debt consistency
    for year, vals in data["balance_sheet"].items():
        if not isinstance(vals, dict):
            continue
        td = vals.get("total_debt")
        ce = vals.get("cash_and_equivalents")
        nd = vals.get("net_debt")
        if td is not None and ce is not None and nd is not None:
            expected_nd = td - ce
            if abs(expected_nd - nd) > 1.0:  # Allow $1M rounding
                # Keep computed and reported values clearly separated in the message.
                errors.append(
                    f"Net debt inconsistency in {year}: total_debt({td}) - cash({ce}) "
                    f"= {expected_nd}, but reported net_debt is {nd}"
                )

    # 7. WACC inputs
    rfr = data["wacc_inputs"].get("risk_free_rate")
    if rfr is not None:
        # Warn only outside [0, 0.15]; the message quotes a narrower range.
        if rfr < 0 or rfr > 0.15:
            warnings.append(f"Unusual risk-free rate: {rfr:.2%} (expected 1-8%)")
    else:
        warnings.append("Risk-free rate is missing")

    # 8. NaN years tracking ("metadata" is optional and may be null)
    meta = data.get("metadata")
    nan_years = meta.get("_nan_years", []) if isinstance(meta, dict) else []
    if nan_years:
        warnings.append(f"NaN years detected: {nan_years}. Supplement from 10-K before using in models.")

    # 9. Data source attribution
    for section in ("income_statement", "cash_flow", "balance_sheet"):
        for year, vals in data[section].items():
            if isinstance(vals, dict) and "_source" not in vals:
                warnings.append(f"Missing _source attribution in {section}.{year}")

    return _build_report(data.get("ticker"), years_with_data, errors, warnings)
def main():
    """Command-line entry point: validate the JSON file named in ``sys.argv[1]``.

    Prints a JSON report to stdout. Exits with status 1 when no path is
    given, the file is missing or unreadable, the content is not a JSON
    object, or validation reports errors; exits with 0 otherwise (warnings
    are informational only).
    """
    if len(sys.argv) < 2:
        print("Usage: python validate_data.py <json_file>", file=sys.stderr)
        sys.exit(1)

    json_path = sys.argv[1]
    if not Path(json_path).exists():
        print(json.dumps({"status": "error", "errors": [f"File not found: {json_path}"]}))
        sys.exit(1)

    # Read explicitly as UTF-8 rather than the locale default, and turn
    # read/parse failures into the same JSON error shape instead of a traceback.
    try:
        data = json.loads(Path(json_path).read_text(encoding="utf-8"))
    except (OSError, UnicodeDecodeError, json.JSONDecodeError) as exc:
        print(json.dumps({
            "status": "error",
            "errors": [f"Could not read JSON from {json_path}: {exc}"],
        }))
        sys.exit(1)

    # validate() works on a JSON object; reject arrays, scalars and null here.
    if not isinstance(data, dict):
        print(json.dumps({
            "status": "error",
            "errors": [f"Top-level JSON value must be an object, got {type(data).__name__}"],
        }))
        sys.exit(1)

    report = validate(data)
    print(json.dumps(report, indent=2))

    # Only errors fail the run; warnings and success both exit 0.
    sys.exit(1 if report["status"] == "error" else 0)
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()