Files
claude-skills-reference/engineering/statistical-analyst/scripts/hypothesis_tester.py
Reza Rezvani 7c2564845a refactor(engineering): move statistical-analyst to engineering/, fix cross-refs
- Move from data-analysis/ to engineering/
- Fix 5 cross-references to use correct domain paths
- Fix Python 3.9 compat in sample_size_calculator.py

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 11:18:12 +02:00

472 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
hypothesis_tester.py — Z-test (proportions), Welch's t-test (means), Chi-square (categorical).
All math uses Python stdlib (math module only). No scipy, numpy, or pandas required.
Usage:
python3 hypothesis_tester.py --test ztest \
--control-n 5000 --control-x 250 \
--treatment-n 5000 --treatment-x 310
python3 hypothesis_tester.py --test ttest \
--control-mean 42.3 --control-std 18.1 --control-n 800 \
--treatment-mean 46.1 --treatment-std 19.4 --treatment-n 820
python3 hypothesis_tester.py --test chi2 \
--observed "120,80,50" --expected "100,100,50"
"""
import argparse
import json
import math
import sys
# ---------------------------------------------------------------------------
# Normal / t-distribution approximations (stdlib only)
# ---------------------------------------------------------------------------
def normal_cdf(z: float) -> float:
"""Cumulative distribution function of standard normal using math.erfc."""
return 0.5 * math.erfc(-z / math.sqrt(2))
def normal_ppf(p: float) -> float:
"""Percent-point function (inverse CDF) of standard normal via bisection."""
lo, hi = -10.0, 10.0
for _ in range(100):
mid = (lo + hi) / 2
if normal_cdf(mid) < p:
lo = mid
else:
hi = mid
return (lo + hi) / 2
def t_cdf(t: float, df: float) -> float:
"""
CDF of t-distribution via regularized incomplete beta function approximation.
Uses the relation: P(T ≤ t) = I_{x}(df/2, 1/2) where x = df/(df+t^2).
Falls back to normal CDF for large df (> 1000).
"""
if df > 1000:
return normal_cdf(t)
x = df / (df + t * t)
# Regularized incomplete beta via continued fraction (Lentz)
ib = _regularized_incomplete_beta(x, df / 2, 0.5)
p = ib / 2
return p if t <= 0 else 1 - p
def _regularized_incomplete_beta(x: float, a: float, b: float) -> float:
"""Regularized incomplete beta I_x(a,b) via continued fraction expansion."""
if x < 0 or x > 1:
return 0.0
if x == 0:
return 0.0
if x == 1:
return 1.0
lbeta = math.lgamma(a) + math.lgamma(b) - math.lgamma(a + b)
front = math.exp(math.log(x) * a + math.log(1 - x) * b - lbeta) / a
# Use symmetry for better convergence
if x > (a + 1) / (a + b + 2):
return 1 - _regularized_incomplete_beta(1 - x, b, a)
# Lentz continued fraction
TINY = 1e-30
f = TINY
C = f
D = 0.0
for m in range(200):
for s in (0, 1):
if m == 0 and s == 0:
num = 1.0
elif s == 0:
num = m * (b - m) * x / ((a + 2 * m - 1) * (a + 2 * m))
else:
num = -(a + m) * (a + b + m) * x / ((a + 2 * m) * (a + 2 * m + 1))
D = 1 + num * D
if abs(D) < TINY:
D = TINY
D = 1 / D
C = 1 + num / C
if abs(C) < TINY:
C = TINY
f *= C * D
if abs(C * D - 1) < 1e-10:
break
return front * f
def two_tail_p_normal(z: float) -> float:
return 2 * (1 - normal_cdf(abs(z)))
def two_tail_p_t(t: float, df: float) -> float:
return 2 * (1 - t_cdf(abs(t), df))
# ---------------------------------------------------------------------------
# Effect sizes
# ---------------------------------------------------------------------------
def cohens_h(p1: float, p2: float) -> float:
"""Cohen's h for two proportions."""
return 2 * math.asin(math.sqrt(p1)) - 2 * math.asin(math.sqrt(p2))
def cohens_d(mean1: float, std1: float, n1: int, mean2: float, std2: float, n2: int) -> float:
"""Cohen's d using pooled standard deviation."""
pooled = math.sqrt(((n1 - 1) * std1 ** 2 + (n2 - 1) * std2 ** 2) / (n1 + n2 - 2))
return (mean1 - mean2) / pooled if pooled else 0.0
def cramers_v(chi2: float, n: int, k: int) -> float:
"""Cramér's V effect size for chi-square test."""
return math.sqrt(chi2 / (n * (k - 1))) if n and k > 1 else 0.0
def effect_label(val: float, metric: str) -> str:
thresholds = {"h": [0.2, 0.5, 0.8], "d": [0.2, 0.5, 0.8], "v": [0.1, 0.3, 0.5]}
t = thresholds.get(metric, [0.2, 0.5, 0.8])
v = abs(val)
if v < t[0]:
return "negligible"
if v < t[1]:
return "small"
if v < t[2]:
return "medium"
return "large"
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
def ztest_proportions(cn: int, cx: int, tn: int, tx: int, alpha: float) -> dict:
"""Two-proportion Z-test."""
if cn <= 0 or tn <= 0:
return {"error": "Sample sizes must be positive."}
p_c = cx / cn
p_t = tx / tn
p_pool = (cx + tx) / (cn + tn)
se = math.sqrt(p_pool * (1 - p_pool) * (1 / cn + 1 / tn))
if se == 0:
return {"error": "Standard error is zero — check input values."}
z = (p_t - p_c) / se
p_value = two_tail_p_normal(z)
# Confidence interval for difference (unpooled SE)
se_diff = math.sqrt(p_c * (1 - p_c) / cn + p_t * (1 - p_t) / tn)
z_crit = normal_ppf(1 - alpha / 2)
diff = p_t - p_c
ci_lo = diff - z_crit * se_diff
ci_hi = diff + z_crit * se_diff
h = cohens_h(p_t, p_c)
lift = (p_t - p_c) / p_c * 100 if p_c else 0
return {
"test": "Two-proportion Z-test",
"control": {"n": cn, "conversions": cx, "rate": round(p_c, 6)},
"treatment": {"n": tn, "conversions": tx, "rate": round(p_t, 6)},
"difference": round(diff, 6),
"relative_lift_pct": round(lift, 2),
"z_statistic": round(z, 4),
"p_value": round(p_value, 6),
"significant": p_value < alpha,
"alpha": alpha,
"confidence_interval": {
"level": f"{int((1 - alpha) * 100)}%",
"lower": round(ci_lo, 6),
"upper": round(ci_hi, 6),
},
"effect_size": {
"cohens_h": round(abs(h), 4),
"interpretation": effect_label(h, "h"),
},
}
def ttest_means(cm: float, cs: float, cn: int, tm: float, ts: float, tn: int, alpha: float) -> dict:
"""Welch's two-sample t-test (unequal variances)."""
if cn < 2 or tn < 2:
return {"error": "Each group needs at least 2 observations."}
se = math.sqrt(cs ** 2 / cn + ts ** 2 / tn)
if se == 0:
return {"error": "Standard error is zero — check std values."}
t = (tm - cm) / se
# WelchSatterthwaite degrees of freedom
num = (cs ** 2 / cn + ts ** 2 / tn) ** 2
denom = (cs ** 2 / cn) ** 2 / (cn - 1) + (ts ** 2 / tn) ** 2 / (tn - 1)
df = num / denom if denom else cn + tn - 2
p_value = two_tail_p_t(t, df)
z_crit = normal_ppf(1 - alpha / 2) if df > 1000 else normal_ppf(1 - alpha / 2)
# Use t critical value approximation
from_t = abs(t) / (p_value / 2) if p_value > 0 else z_crit # rough
t_crit = normal_ppf(1 - alpha / 2) # normal approx for CI
diff = tm - cm
ci_lo = diff - t_crit * se
ci_hi = diff + t_crit * se
d = cohens_d(tm, ts, tn, cm, cs, cn)
lift = (tm - cm) / cm * 100 if cm else 0
return {
"test": "Welch's two-sample t-test",
"control": {"n": cn, "mean": round(cm, 4), "std": round(cs, 4)},
"treatment": {"n": tn, "mean": round(tm, 4), "std": round(ts, 4)},
"difference": round(diff, 4),
"relative_lift_pct": round(lift, 2),
"t_statistic": round(t, 4),
"degrees_of_freedom": round(df, 1),
"p_value": round(p_value, 6),
"significant": p_value < alpha,
"alpha": alpha,
"confidence_interval": {
"level": f"{int((1 - alpha) * 100)}%",
"lower": round(ci_lo, 4),
"upper": round(ci_hi, 4),
},
"effect_size": {
"cohens_d": round(abs(d), 4),
"interpretation": effect_label(d, "d"),
},
}
def chi2_test(observed: list[float], expected: list[float], alpha: float) -> dict:
"""Chi-square goodness-of-fit test."""
if len(observed) != len(expected):
return {"error": "Observed and expected must have the same number of categories."}
if any(e <= 0 for e in expected):
return {"error": "Expected values must all be positive."}
if any(e < 5 for e in expected):
return {"warning": "Some expected values < 5 — chi-square approximation may be unreliable.",
"suggestion": "Consider combining categories or using Fisher's exact test."}
chi2 = sum((o - e) ** 2 / e for o, e in zip(observed, expected))
k = len(observed)
df = k - 1
n = sum(observed)
# Chi-square CDF via regularized gamma function approximation
p_value = 1 - _chi2_cdf(chi2, df)
v = cramers_v(chi2, int(n), k)
return {
"test": "Chi-square goodness-of-fit",
"categories": k,
"observed": observed,
"expected": expected,
"chi2_statistic": round(chi2, 4),
"degrees_of_freedom": df,
"p_value": round(p_value, 6),
"significant": p_value < alpha,
"alpha": alpha,
"effect_size": {
"cramers_v": round(v, 4),
"interpretation": effect_label(v, "v"),
},
}
def _chi2_cdf(x: float, k: float) -> float:
"""CDF of chi-square via regularized lower incomplete gamma."""
if x <= 0:
return 0.0
return _regularized_gamma(k / 2, x / 2)
def _regularized_gamma(a: float, x: float) -> float:
"""Lower regularized incomplete gamma P(a, x) via series expansion."""
if x < 0:
return 0.0
if x == 0:
return 0.0
if x < a + 1:
# Series expansion
ap = a
delta = 1.0 / a
total = delta
for _ in range(300):
ap += 1
delta *= x / ap
total += delta
if abs(delta) < abs(total) * 1e-10:
break
return total * math.exp(-x + a * math.log(x) - math.lgamma(a))
else:
# Continued fraction (Lentz)
b = x + 1 - a
c = 1e30
d = 1 / b
f = d
for i in range(1, 300):
an = -i * (i - a)
b += 2
d = an * d + b
if abs(d) < 1e-30:
d = 1e-30
c = b + an / c
if abs(c) < 1e-30:
c = 1e-30
d = 1 / d
delta = d * c
f *= delta
if abs(delta - 1) < 1e-10:
break
return 1 - math.exp(-x + a * math.log(x) - math.lgamma(a)) * f
# ---------------------------------------------------------------------------
# Reporting
# ---------------------------------------------------------------------------
DIRECTION = {True: "statistically significant", False: "NOT statistically significant"}
def verdict(result: dict) -> str:
if "error" in result:
return f"ERROR: {result['error']}"
sig = result.get("significant", False)
p = result.get("p_value", 1.0)
alpha = result.get("alpha", 0.05)
diff = result.get("difference", 0)
lift = result.get("relative_lift_pct")
ci = result.get("confidence_interval", {})
es = result.get("effect_size", {})
es_name = "Cohen's h" if "cohens_h" in es else ("Cohen's d" if "cohens_d" in es else "Cramér's V")
es_val = es.get("cohens_h") or es.get("cohens_d") or es.get("cramers_v", 0)
es_interp = es.get("interpretation", "")
lines = [
"",
"=" * 60,
f" {result.get('test', 'Hypothesis Test')}",
"=" * 60,
]
if "control" in result and "rate" in result["control"]:
c = result["control"]
t = result["treatment"]
lines += [
f" Control: {c['rate']:.4%} (n={c['n']}, conversions={c['conversions']})",
f" Treatment: {t['rate']:.4%} (n={t['n']}, conversions={t['conversions']})",
f" Difference: {diff:+.4%} ({'+' if lift >= 0 else ''}{lift:.1f}% relative lift)",
]
elif "control" in result and "mean" in result["control"]:
c = result["control"]
t = result["treatment"]
lines += [
f" Control: mean={c['mean']} std={c['std']} n={c['n']}",
f" Treatment: mean={t['mean']} std={t['std']} n={t['n']}",
f" Difference: {diff:+.4f} ({'+' if lift >= 0 else ''}{lift:.1f}% relative lift)",
]
elif "observed" in result:
lines += [
f" Observed: {result['observed']}",
f" Expected: {result['expected']}",
]
lines += [
"",
f" p-value: {p:.6f} (α={alpha})",
f" Result: {DIRECTION[sig].upper()}",
]
if ci:
lines.append(f" {ci['level']} CI: [{ci['lower']}, {ci['upper']}]")
lines += [
f" Effect: {es_name} = {es_val} ({es_interp})",
"",
]
# Plain English verdict
if sig:
lines.append(f" ✅ VERDICT: The difference is real (p={p:.4f} < α={alpha}).")
if es_interp in ("negligible", "small"):
lines.append(" ⚠️ BUT: Effect is small — confirm practical significance before shipping.")
else:
lines.append(" Effect size is meaningful. Recommend shipping if no negative guardrails.")
else:
lines.append(f" ❌ VERDICT: Insufficient evidence to conclude a difference exists (p={p:.4f}α={alpha}).")
lines.append(" Options: extend the test, increase MDE, or kill if underpowered.")
lines.append("=" * 60)
return "\n".join(lines)
def main():
parser = argparse.ArgumentParser(description="Run hypothesis tests on experiment results.")
parser.add_argument("--test", choices=["ztest", "ttest", "chi2"], required=True)
parser.add_argument("--alpha", type=float, default=0.05, help="Significance level (default: 0.05)")
parser.add_argument("--format", choices=["text", "json"], default="text")
# Z-test / t-test shared
parser.add_argument("--control-n", type=int)
parser.add_argument("--treatment-n", type=int)
# Z-test
parser.add_argument("--control-x", type=int, help="Conversions in control group")
parser.add_argument("--treatment-x", type=int, help="Conversions in treatment group")
# t-test
parser.add_argument("--control-mean", type=float)
parser.add_argument("--control-std", type=float)
parser.add_argument("--treatment-mean", type=float)
parser.add_argument("--treatment-std", type=float)
# chi2
parser.add_argument("--observed", help="Comma-separated observed counts")
parser.add_argument("--expected", help="Comma-separated expected counts")
args = parser.parse_args()
if args.test == "ztest":
for req in ["control_n", "control_x", "treatment_n", "treatment_x"]:
if getattr(args, req) is None:
print(f"Error: --{req.replace('_', '-')} is required for ztest", file=sys.stderr)
sys.exit(1)
result = ztest_proportions(args.control_n, args.control_x, args.treatment_n, args.treatment_x, args.alpha)
elif args.test == "ttest":
for req in ["control_n", "control_mean", "control_std", "treatment_n", "treatment_mean", "treatment_std"]:
if getattr(args, req) is None:
print(f"Error: --{req.replace('_', '-')} is required for ttest", file=sys.stderr)
sys.exit(1)
result = ttest_means(
args.control_mean, args.control_std, args.control_n,
args.treatment_mean, args.treatment_std, args.treatment_n,
args.alpha
)
elif args.test == "chi2":
if not args.observed or not args.expected:
print("Error: --observed and --expected are required for chi2", file=sys.stderr)
sys.exit(1)
observed = [float(x.strip()) for x in args.observed.split(",")]
expected = [float(x.strip()) for x in args.expected.split(",")]
result = chi2_test(observed, expected, args.alpha)
if args.format == "json":
print(json.dumps(result, indent=2))
else:
if "error" in result:
print(f"Error: {result['error']}", file=sys.stderr)
sys.exit(1)
print(verdict(result))
if __name__ == "__main__":
main()