Files
daymade 2896870061 feat: add financial-data-collector skill for US equity data collection
New skill that collects real financial data for any US publicly traded company
via yfinance. Outputs structured JSON with market data, historical financials,
WACC inputs, and analyst estimates. Includes 9-check validation script and
reference docs for yfinance pitfalls (NaN years, field aliases, FCF mismatch).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-02 19:40:52 +08:00

363 lines
14 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Collect real financial data for a US publicly traded company.
Output: structured JSON consumable by downstream financial skills.
Usage:
python collect_data.py TICKER [--years 5] [--output path/to/output.json]
Examples:
python collect_data.py META
python collect_data.py AAPL --years 3 --output /tmp/aapl_data.json
python collect_data.py NVDA --years 5
Dependencies: yfinance, pandas (via uv inline or pip)
"""
# /// script
# requires-python = ">=3.11"
# dependencies = ["yfinance>=0.2.0", "pandas>=2.0.0"]
# ///
import argparse
import json
import sys
import time
from datetime import date
from pathlib import Path
import pandas as pd
import yfinance as yf
# Field name aliases — yfinance row indices vary across versions
FIELD_ALIASES = {
"revenue": ["Total Revenue", "Revenue", "Operating Revenue"],
"ebit": ["Operating Income", "EBIT"],
"ebitda": ["EBITDA", "Normalized EBITDA"],
"tax_expense": ["Tax Provision", "Income Tax Expense"],
"net_income": ["Net Income", "Net Income Common Stockholders"],
"capex": ["Capital Expenditure", "Capital Expenditures"],
"operating_cash_flow": ["Operating Cash Flow", "Cash Flow From Continuing Operating Activities"],
"depreciation_amortization": ["Depreciation And Amortization", "Depreciation Amortization Depletion"],
"free_cash_flow": ["Free Cash Flow"],
"change_in_nwc": ["Change In Working Capital", "Changes In Working Capital"],
"sbc": ["Stock Based Compensation"],
"total_debt": ["Total Debt"],
"long_term_debt": ["Long Term Debt"],
"short_term_debt": ["Short Long Term Debt", "Current Debt"],
"cash": ["Cash And Cash Equivalents"],
"short_investments": ["Other Short Term Investments", "Short Term Investments"],
"current_assets": ["Current Assets"],
"current_liabilities": ["Current Liabilities"],
"total_assets": ["Total Assets"],
"total_equity": ["Stockholders Equity", "Total Equity Gross Minority Interest"],
}
def safe_get(df: pd.DataFrame, field_key: str, col) -> float | None:
"""Safely extract a value from a DataFrame using alias chain."""
aliases = FIELD_ALIASES.get(field_key, [field_key])
for alias in aliases:
if alias in df.index:
val = df.loc[alias, col]
if pd.notna(val):
return float(val)
return None
def get_year_col(df: pd.DataFrame, year: int):
"""Find the column matching a target year in a yfinance DataFrame."""
matches = [c for c in df.columns if c.year == year]
return matches[0] if matches else None
def collect_market_data(ticker_obj: yf.Ticker) -> dict:
"""Collect real-time market data."""
info = ticker_obj.info
price = info.get("currentPrice") or info.get("regularMarketPrice")
shares_raw = info.get("sharesOutstanding")
shares = shares_raw / 1e6 if shares_raw else None
mcap_raw = info.get("marketCap")
mcap = mcap_raw / 1e6 if mcap_raw else None
beta = info.get("beta")
return {
"current_price": price,
"shares_outstanding_millions": round(shares, 2) if shares else None,
"market_cap_millions": round(mcap, 2) if mcap else None,
"beta_5y_monthly": round(beta, 3) if beta else None,
}
def collect_risk_free_rate() -> float | None:
"""Get 10Y Treasury yield as risk-free rate proxy."""
try:
tnx = yf.Ticker("^TNX")
hist = tnx.history(period="5d")
if hist.empty:
return None
yield_pct = hist["Close"].iloc[-1]
return round(float(yield_pct) / 100, 4) # Convert percentage to decimal
except Exception:
return None
def collect_income_statement(ticker_obj: yf.Ticker, years: list[int]) -> dict:
"""Collect income statement data for specified years."""
try:
financials = ticker_obj.financials
except Exception:
return {str(y): {"_source": "yfinance error"} for y in years}
if financials is None or financials.empty:
return {str(y): {"_source": "yfinance returned empty"} for y in years}
result = {}
nan_years = []
for year in years:
col = get_year_col(financials, year)
if col is None:
result[str(year)] = {"_source": f"yfinance has no column for {year}"}
nan_years.append(year)
continue
revenue = safe_get(financials, "revenue", col)
ebit = safe_get(financials, "ebit", col)
if revenue is None and ebit is None:
nan_years.append(year)
result[str(year)] = {
"revenue": None, "ebit": None, "ebitda": None,
"tax_expense": None, "net_income": None,
"_source": f"yfinance returned NaN for {year} — supplement from 10-K filing",
}
else:
result[str(year)] = {
"revenue": revenue,
"ebit": ebit,
"ebitda": safe_get(financials, "ebitda", col),
"tax_expense": safe_get(financials, "tax_expense", col),
"net_income": safe_get(financials, "net_income", col),
"_source": "yfinance",
}
return result, nan_years
def collect_cash_flow(ticker_obj: yf.Ticker, years: list[int]) -> dict:
"""Collect cash flow data."""
try:
cf = ticker_obj.cashflow
except Exception:
return {str(y): {"_source": "yfinance error"} for y in years}, []
if cf is None or cf.empty:
return {str(y): {"_source": "yfinance returned empty"} for y in years}, years
result = {}
nan_years = []
for year in years:
col = get_year_col(cf, year)
if col is None:
result[str(year)] = {"_source": f"yfinance has no column for {year}"}
nan_years.append(year)
continue
ocf = safe_get(cf, "operating_cash_flow", col)
capex = safe_get(cf, "capex", col)
if ocf is None and capex is None:
nan_years.append(year)
result[str(year)] = {
"operating_cash_flow": None, "capex": None,
"depreciation_amortization": None, "free_cash_flow": None,
"change_in_nwc": None, "sbc": None,
"_source": f"yfinance returned NaN for {year}",
}
else:
result[str(year)] = {
"operating_cash_flow": ocf,
"capex": capex, # Negative = outflow (preserved)
"depreciation_amortization": safe_get(cf, "depreciation_amortization", col),
"free_cash_flow": safe_get(cf, "free_cash_flow", col),
"change_in_nwc": safe_get(cf, "change_in_nwc", col),
"sbc": safe_get(cf, "sbc", col),
"_source": "yfinance",
}
return result, nan_years
def collect_balance_sheet(ticker_obj: yf.Ticker, latest_year: int) -> dict:
"""Collect balance sheet data for the latest year."""
try:
bs = ticker_obj.balance_sheet
except Exception:
return {str(latest_year): {"_source": "yfinance error"}}
if bs is None or bs.empty:
return {str(latest_year): {"_source": "yfinance returned empty"}}
col = get_year_col(bs, latest_year)
if col is None:
return {str(latest_year): {"_source": f"yfinance has no column for {latest_year}"}}
total_debt = safe_get(bs, "total_debt", col)
if total_debt is None:
lt = safe_get(bs, "long_term_debt", col) or 0
st = safe_get(bs, "short_term_debt", col) or 0
total_debt = lt + st if (lt or st) else None
cash = safe_get(bs, "cash", col)
short_inv = safe_get(bs, "short_investments", col)
cash_equiv = (cash or 0) + (short_inv or 0) if (cash is not None or short_inv is not None) else None
net_debt = (total_debt - cash_equiv) if (total_debt is not None and cash_equiv is not None) else None
return {
str(latest_year): {
"total_debt": total_debt,
"cash_and_equivalents": cash_equiv,
"net_debt": round(net_debt, 2) if net_debt is not None else None,
"current_assets": safe_get(bs, "current_assets", col),
"current_liabilities": safe_get(bs, "current_liabilities", col),
"total_assets": safe_get(bs, "total_assets", col),
"total_equity": safe_get(bs, "total_equity", col),
"_source": "yfinance",
}
}
def collect_analyst_estimates(ticker_obj: yf.Ticker) -> dict:
"""Collect analyst consensus estimates."""
result = {"revenue_next_fy": None, "revenue_fy_after": None, "eps_next_fy": None, "_source": "missing"}
try:
rev_est = ticker_obj.revenue_estimate
if rev_est is not None and not rev_est.empty:
if "0y" in rev_est.index:
result["revenue_next_fy"] = float(rev_est.loc["0y", "avg"]) / 1e6 if pd.notna(rev_est.loc["0y", "avg"]) else None
if "+1y" in rev_est.index:
result["revenue_fy_after"] = float(rev_est.loc["+1y", "avg"]) / 1e6 if pd.notna(rev_est.loc["+1y", "avg"]) else None
result["_source"] = "yfinance revenue_estimate"
except Exception:
pass
try:
eps = ticker_obj.eps_trend
if eps is not None and not eps.empty:
if "0y" in eps.index:
val = eps.loc["0y", "current"] if "current" in eps.columns else eps.iloc[eps.index.get_loc("0y"), 0]
result["eps_next_fy"] = float(val) if pd.notna(val) else None
except Exception:
pass
return result
def main():
parser = argparse.ArgumentParser(description="Collect financial data for a US public company")
parser.add_argument("ticker", help="Stock ticker symbol (e.g., META, AAPL)")
parser.add_argument("--years", type=int, default=5, help="Number of historical years (default: 5)")
parser.add_argument("--output", help="Output JSON file path (default: {TICKER}_financial_data.json)")
args = parser.parse_args()
ticker_symbol = args.ticker.upper()
current_year = date.today().year
# Exclude current year if annual report not yet filed (typically before Q4 earnings)
latest_full_year = current_year - 1
target_years = list(range(latest_full_year - args.years + 1, latest_full_year + 1))
output_path = args.output or f"{ticker_symbol}_financial_data.json"
print(f"Collecting data for {ticker_symbol} ({target_years[0]}-{target_years[-1]})...")
start_time = time.time()
ticker_obj = yf.Ticker(ticker_symbol)
# Verify ticker is valid
info = ticker_obj.info
if not info or info.get("regularMarketPrice") is None:
print(f"ERROR: {ticker_symbol} not found or no market data available", file=sys.stderr)
sys.exit(1)
company_name = info.get("longName") or info.get("shortName") or ticker_symbol
# Collect all data
print(" Market data...", end=" ", flush=True)
market_data = collect_market_data(ticker_obj)
print("OK")
print(" Risk-free rate...", end=" ", flush=True)
rfr = collect_risk_free_rate()
print(f"{rfr:.4f}" if rfr else "MISSING")
print(" Income statement...", end=" ", flush=True)
income_data, is_nan_years = collect_income_statement(ticker_obj, target_years)
print(f"OK (NaN years: {is_nan_years or 'none'})")
print(" Cash flow...", end=" ", flush=True)
cf_data, cf_nan_years = collect_cash_flow(ticker_obj, target_years)
print(f"OK (NaN years: {cf_nan_years or 'none'})")
print(" Balance sheet...", end=" ", flush=True)
bs_data = collect_balance_sheet(ticker_obj, latest_full_year)
print("OK")
print(" Analyst estimates...", end=" ", flush=True)
estimates = collect_analyst_estimates(ticker_obj)
print("OK" if estimates["_source"] != "missing" else "PARTIAL/MISSING")
elapsed = round(time.time() - start_time, 1)
all_nan_years = sorted(set(is_nan_years + cf_nan_years))
# Assemble output
output = {
"ticker": ticker_symbol,
"company_name": company_name,
"data_date": date.today().isoformat(),
"currency": "USD",
"unit": "millions_usd",
"data_sources": {
"market_data": "yfinance (live)",
"historical_financials": "yfinance annual financials",
"risk_free_rate": "yfinance ^TNX (10Y Treasury)",
},
"market_data": market_data,
"income_statement": income_data,
"cash_flow": cf_data,
"balance_sheet": bs_data,
"wacc_inputs": {
"risk_free_rate": rfr,
"beta": market_data.get("beta_5y_monthly"),
"credit_rating": None,
"_source": "yfinance + ^TNX",
},
"analyst_estimates": estimates,
"metadata": {
"_capex_convention": "negative = cash outflow (yfinance convention)",
"_fcf_note": "yfinance FCF = OperatingCF + CapEx. Does NOT deduct SBC.",
"_nan_years": all_nan_years,
"_collection_duration_seconds": elapsed,
"_target_years": target_years,
},
}
# Write output
Path(output_path).write_text(json.dumps(output, indent=2, ensure_ascii=False))
print(f"\nSaved to {output_path} ({elapsed}s)")
# Summary
if all_nan_years:
print(f"\n⚠️ WARNING: Years {all_nan_years} have missing data (NaN from yfinance).")
print(" Supplement from 10-K filings or SEC EDGAR before using in models.")
# Quick sanity check
if market_data["current_price"] and market_data["shares_outstanding_millions"]:
computed_mcap = market_data["current_price"] * market_data["shares_outstanding_millions"]
reported_mcap = market_data["market_cap_millions"]
if reported_mcap and abs(computed_mcap - reported_mcap) / reported_mcap > 0.05:
print(f"⚠️ Market cap mismatch: Price×Shares={computed_mcap:.0f}M vs Reported={reported_mcap:.0f}M")
if __name__ == "__main__":
main()