- Move from data-analysis/ to engineering/ - Fix 5 cross-references to use correct domain paths - Fix Python 3.9 compat in sample_size_calculator.py Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
472 lines
16 KiB
Python
472 lines
16 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
hypothesis_tester.py — Z-test (proportions), Welch's t-test (means), Chi-square (categorical).
|
||
|
||
All math uses Python stdlib (math module only). No scipy, numpy, or pandas required.
|
||
|
||
Usage:
|
||
python3 hypothesis_tester.py --test ztest \
|
||
--control-n 5000 --control-x 250 \
|
||
--treatment-n 5000 --treatment-x 310
|
||
|
||
python3 hypothesis_tester.py --test ttest \
|
||
--control-mean 42.3 --control-std 18.1 --control-n 800 \
|
||
--treatment-mean 46.1 --treatment-std 19.4 --treatment-n 820
|
||
|
||
python3 hypothesis_tester.py --test chi2 \
|
||
--observed "120,80,50" --expected "100,100,50"
|
||
"""
|
||
|
||
import argparse
|
||
import json
|
||
import math
|
||
import sys
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Normal / t-distribution approximations (stdlib only)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def normal_cdf(z: float) -> float:
|
||
"""Cumulative distribution function of standard normal using math.erfc."""
|
||
return 0.5 * math.erfc(-z / math.sqrt(2))
|
||
|
||
|
||
def normal_ppf(p: float) -> float:
|
||
"""Percent-point function (inverse CDF) of standard normal via bisection."""
|
||
lo, hi = -10.0, 10.0
|
||
for _ in range(100):
|
||
mid = (lo + hi) / 2
|
||
if normal_cdf(mid) < p:
|
||
lo = mid
|
||
else:
|
||
hi = mid
|
||
return (lo + hi) / 2
|
||
|
||
|
||
def t_cdf(t: float, df: float) -> float:
|
||
"""
|
||
CDF of t-distribution via regularized incomplete beta function approximation.
|
||
Uses the relation: P(T ≤ t) = I_{x}(df/2, 1/2) where x = df/(df+t^2).
|
||
Falls back to normal CDF for large df (> 1000).
|
||
"""
|
||
if df > 1000:
|
||
return normal_cdf(t)
|
||
x = df / (df + t * t)
|
||
# Regularized incomplete beta via continued fraction (Lentz)
|
||
ib = _regularized_incomplete_beta(x, df / 2, 0.5)
|
||
p = ib / 2
|
||
return p if t <= 0 else 1 - p
|
||
|
||
|
||
def _regularized_incomplete_beta(x: float, a: float, b: float) -> float:
|
||
"""Regularized incomplete beta I_x(a,b) via continued fraction expansion."""
|
||
if x < 0 or x > 1:
|
||
return 0.0
|
||
if x == 0:
|
||
return 0.0
|
||
if x == 1:
|
||
return 1.0
|
||
lbeta = math.lgamma(a) + math.lgamma(b) - math.lgamma(a + b)
|
||
front = math.exp(math.log(x) * a + math.log(1 - x) * b - lbeta) / a
|
||
# Use symmetry for better convergence
|
||
if x > (a + 1) / (a + b + 2):
|
||
return 1 - _regularized_incomplete_beta(1 - x, b, a)
|
||
# Lentz continued fraction
|
||
TINY = 1e-30
|
||
f = TINY
|
||
C = f
|
||
D = 0.0
|
||
for m in range(200):
|
||
for s in (0, 1):
|
||
if m == 0 and s == 0:
|
||
num = 1.0
|
||
elif s == 0:
|
||
num = m * (b - m) * x / ((a + 2 * m - 1) * (a + 2 * m))
|
||
else:
|
||
num = -(a + m) * (a + b + m) * x / ((a + 2 * m) * (a + 2 * m + 1))
|
||
D = 1 + num * D
|
||
if abs(D) < TINY:
|
||
D = TINY
|
||
D = 1 / D
|
||
C = 1 + num / C
|
||
if abs(C) < TINY:
|
||
C = TINY
|
||
f *= C * D
|
||
if abs(C * D - 1) < 1e-10:
|
||
break
|
||
return front * f
|
||
|
||
|
||
def two_tail_p_normal(z: float) -> float:
|
||
return 2 * (1 - normal_cdf(abs(z)))
|
||
|
||
|
||
def two_tail_p_t(t: float, df: float) -> float:
|
||
return 2 * (1 - t_cdf(abs(t), df))
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Effect sizes
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def cohens_h(p1: float, p2: float) -> float:
|
||
"""Cohen's h for two proportions."""
|
||
return 2 * math.asin(math.sqrt(p1)) - 2 * math.asin(math.sqrt(p2))
|
||
|
||
|
||
def cohens_d(mean1: float, std1: float, n1: int, mean2: float, std2: float, n2: int) -> float:
|
||
"""Cohen's d using pooled standard deviation."""
|
||
pooled = math.sqrt(((n1 - 1) * std1 ** 2 + (n2 - 1) * std2 ** 2) / (n1 + n2 - 2))
|
||
return (mean1 - mean2) / pooled if pooled else 0.0
|
||
|
||
|
||
def cramers_v(chi2: float, n: int, k: int) -> float:
|
||
"""Cramér's V effect size for chi-square test."""
|
||
return math.sqrt(chi2 / (n * (k - 1))) if n and k > 1 else 0.0
|
||
|
||
|
||
def effect_label(val: float, metric: str) -> str:
|
||
thresholds = {"h": [0.2, 0.5, 0.8], "d": [0.2, 0.5, 0.8], "v": [0.1, 0.3, 0.5]}
|
||
t = thresholds.get(metric, [0.2, 0.5, 0.8])
|
||
v = abs(val)
|
||
if v < t[0]:
|
||
return "negligible"
|
||
if v < t[1]:
|
||
return "small"
|
||
if v < t[2]:
|
||
return "medium"
|
||
return "large"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Tests
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def ztest_proportions(cn: int, cx: int, tn: int, tx: int, alpha: float) -> dict:
|
||
"""Two-proportion Z-test."""
|
||
if cn <= 0 or tn <= 0:
|
||
return {"error": "Sample sizes must be positive."}
|
||
|
||
p_c = cx / cn
|
||
p_t = tx / tn
|
||
p_pool = (cx + tx) / (cn + tn)
|
||
|
||
se = math.sqrt(p_pool * (1 - p_pool) * (1 / cn + 1 / tn))
|
||
if se == 0:
|
||
return {"error": "Standard error is zero — check input values."}
|
||
|
||
z = (p_t - p_c) / se
|
||
p_value = two_tail_p_normal(z)
|
||
|
||
# Confidence interval for difference (unpooled SE)
|
||
se_diff = math.sqrt(p_c * (1 - p_c) / cn + p_t * (1 - p_t) / tn)
|
||
z_crit = normal_ppf(1 - alpha / 2)
|
||
diff = p_t - p_c
|
||
ci_lo = diff - z_crit * se_diff
|
||
ci_hi = diff + z_crit * se_diff
|
||
|
||
h = cohens_h(p_t, p_c)
|
||
lift = (p_t - p_c) / p_c * 100 if p_c else 0
|
||
|
||
return {
|
||
"test": "Two-proportion Z-test",
|
||
"control": {"n": cn, "conversions": cx, "rate": round(p_c, 6)},
|
||
"treatment": {"n": tn, "conversions": tx, "rate": round(p_t, 6)},
|
||
"difference": round(diff, 6),
|
||
"relative_lift_pct": round(lift, 2),
|
||
"z_statistic": round(z, 4),
|
||
"p_value": round(p_value, 6),
|
||
"significant": p_value < alpha,
|
||
"alpha": alpha,
|
||
"confidence_interval": {
|
||
"level": f"{int((1 - alpha) * 100)}%",
|
||
"lower": round(ci_lo, 6),
|
||
"upper": round(ci_hi, 6),
|
||
},
|
||
"effect_size": {
|
||
"cohens_h": round(abs(h), 4),
|
||
"interpretation": effect_label(h, "h"),
|
||
},
|
||
}
|
||
|
||
|
||
def ttest_means(cm: float, cs: float, cn: int, tm: float, ts: float, tn: int, alpha: float) -> dict:
|
||
"""Welch's two-sample t-test (unequal variances)."""
|
||
if cn < 2 or tn < 2:
|
||
return {"error": "Each group needs at least 2 observations."}
|
||
|
||
se = math.sqrt(cs ** 2 / cn + ts ** 2 / tn)
|
||
if se == 0:
|
||
return {"error": "Standard error is zero — check std values."}
|
||
|
||
t = (tm - cm) / se
|
||
|
||
# Welch–Satterthwaite degrees of freedom
|
||
num = (cs ** 2 / cn + ts ** 2 / tn) ** 2
|
||
denom = (cs ** 2 / cn) ** 2 / (cn - 1) + (ts ** 2 / tn) ** 2 / (tn - 1)
|
||
df = num / denom if denom else cn + tn - 2
|
||
|
||
p_value = two_tail_p_t(t, df)
|
||
|
||
z_crit = normal_ppf(1 - alpha / 2) if df > 1000 else normal_ppf(1 - alpha / 2)
|
||
# Use t critical value approximation
|
||
from_t = abs(t) / (p_value / 2) if p_value > 0 else z_crit # rough
|
||
t_crit = normal_ppf(1 - alpha / 2) # normal approx for CI
|
||
|
||
diff = tm - cm
|
||
ci_lo = diff - t_crit * se
|
||
ci_hi = diff + t_crit * se
|
||
|
||
d = cohens_d(tm, ts, tn, cm, cs, cn)
|
||
lift = (tm - cm) / cm * 100 if cm else 0
|
||
|
||
return {
|
||
"test": "Welch's two-sample t-test",
|
||
"control": {"n": cn, "mean": round(cm, 4), "std": round(cs, 4)},
|
||
"treatment": {"n": tn, "mean": round(tm, 4), "std": round(ts, 4)},
|
||
"difference": round(diff, 4),
|
||
"relative_lift_pct": round(lift, 2),
|
||
"t_statistic": round(t, 4),
|
||
"degrees_of_freedom": round(df, 1),
|
||
"p_value": round(p_value, 6),
|
||
"significant": p_value < alpha,
|
||
"alpha": alpha,
|
||
"confidence_interval": {
|
||
"level": f"{int((1 - alpha) * 100)}%",
|
||
"lower": round(ci_lo, 4),
|
||
"upper": round(ci_hi, 4),
|
||
},
|
||
"effect_size": {
|
||
"cohens_d": round(abs(d), 4),
|
||
"interpretation": effect_label(d, "d"),
|
||
},
|
||
}
|
||
|
||
|
||
def chi2_test(observed: list[float], expected: list[float], alpha: float) -> dict:
|
||
"""Chi-square goodness-of-fit test."""
|
||
if len(observed) != len(expected):
|
||
return {"error": "Observed and expected must have the same number of categories."}
|
||
if any(e <= 0 for e in expected):
|
||
return {"error": "Expected values must all be positive."}
|
||
if any(e < 5 for e in expected):
|
||
return {"warning": "Some expected values < 5 — chi-square approximation may be unreliable.",
|
||
"suggestion": "Consider combining categories or using Fisher's exact test."}
|
||
|
||
chi2 = sum((o - e) ** 2 / e for o, e in zip(observed, expected))
|
||
k = len(observed)
|
||
df = k - 1
|
||
n = sum(observed)
|
||
|
||
# Chi-square CDF via regularized gamma function approximation
|
||
p_value = 1 - _chi2_cdf(chi2, df)
|
||
|
||
v = cramers_v(chi2, int(n), k)
|
||
|
||
return {
|
||
"test": "Chi-square goodness-of-fit",
|
||
"categories": k,
|
||
"observed": observed,
|
||
"expected": expected,
|
||
"chi2_statistic": round(chi2, 4),
|
||
"degrees_of_freedom": df,
|
||
"p_value": round(p_value, 6),
|
||
"significant": p_value < alpha,
|
||
"alpha": alpha,
|
||
"effect_size": {
|
||
"cramers_v": round(v, 4),
|
||
"interpretation": effect_label(v, "v"),
|
||
},
|
||
}
|
||
|
||
|
||
def _chi2_cdf(x: float, k: float) -> float:
|
||
"""CDF of chi-square via regularized lower incomplete gamma."""
|
||
if x <= 0:
|
||
return 0.0
|
||
return _regularized_gamma(k / 2, x / 2)
|
||
|
||
|
||
def _regularized_gamma(a: float, x: float) -> float:
|
||
"""Lower regularized incomplete gamma P(a, x) via series expansion."""
|
||
if x < 0:
|
||
return 0.0
|
||
if x == 0:
|
||
return 0.0
|
||
if x < a + 1:
|
||
# Series expansion
|
||
ap = a
|
||
delta = 1.0 / a
|
||
total = delta
|
||
for _ in range(300):
|
||
ap += 1
|
||
delta *= x / ap
|
||
total += delta
|
||
if abs(delta) < abs(total) * 1e-10:
|
||
break
|
||
return total * math.exp(-x + a * math.log(x) - math.lgamma(a))
|
||
else:
|
||
# Continued fraction (Lentz)
|
||
b = x + 1 - a
|
||
c = 1e30
|
||
d = 1 / b
|
||
f = d
|
||
for i in range(1, 300):
|
||
an = -i * (i - a)
|
||
b += 2
|
||
d = an * d + b
|
||
if abs(d) < 1e-30:
|
||
d = 1e-30
|
||
c = b + an / c
|
||
if abs(c) < 1e-30:
|
||
c = 1e-30
|
||
d = 1 / d
|
||
delta = d * c
|
||
f *= delta
|
||
if abs(delta - 1) < 1e-10:
|
||
break
|
||
return 1 - math.exp(-x + a * math.log(x) - math.lgamma(a)) * f
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Reporting
|
||
# ---------------------------------------------------------------------------
|
||
|
||
DIRECTION = {True: "statistically significant", False: "NOT statistically significant"}
|
||
|
||
|
||
def verdict(result: dict) -> str:
|
||
if "error" in result:
|
||
return f"ERROR: {result['error']}"
|
||
sig = result.get("significant", False)
|
||
p = result.get("p_value", 1.0)
|
||
alpha = result.get("alpha", 0.05)
|
||
diff = result.get("difference", 0)
|
||
lift = result.get("relative_lift_pct")
|
||
ci = result.get("confidence_interval", {})
|
||
es = result.get("effect_size", {})
|
||
es_name = "Cohen's h" if "cohens_h" in es else ("Cohen's d" if "cohens_d" in es else "Cramér's V")
|
||
es_val = es.get("cohens_h") or es.get("cohens_d") or es.get("cramers_v", 0)
|
||
es_interp = es.get("interpretation", "")
|
||
|
||
lines = [
|
||
"",
|
||
"=" * 60,
|
||
f" {result.get('test', 'Hypothesis Test')}",
|
||
"=" * 60,
|
||
]
|
||
|
||
if "control" in result and "rate" in result["control"]:
|
||
c = result["control"]
|
||
t = result["treatment"]
|
||
lines += [
|
||
f" Control: {c['rate']:.4%} (n={c['n']}, conversions={c['conversions']})",
|
||
f" Treatment: {t['rate']:.4%} (n={t['n']}, conversions={t['conversions']})",
|
||
f" Difference: {diff:+.4%} ({'+' if lift >= 0 else ''}{lift:.1f}% relative lift)",
|
||
]
|
||
elif "control" in result and "mean" in result["control"]:
|
||
c = result["control"]
|
||
t = result["treatment"]
|
||
lines += [
|
||
f" Control: mean={c['mean']} std={c['std']} n={c['n']}",
|
||
f" Treatment: mean={t['mean']} std={t['std']} n={t['n']}",
|
||
f" Difference: {diff:+.4f} ({'+' if lift >= 0 else ''}{lift:.1f}% relative lift)",
|
||
]
|
||
elif "observed" in result:
|
||
lines += [
|
||
f" Observed: {result['observed']}",
|
||
f" Expected: {result['expected']}",
|
||
]
|
||
|
||
lines += [
|
||
"",
|
||
f" p-value: {p:.6f} (α={alpha})",
|
||
f" Result: {DIRECTION[sig].upper()}",
|
||
]
|
||
if ci:
|
||
lines.append(f" {ci['level']} CI: [{ci['lower']}, {ci['upper']}]")
|
||
lines += [
|
||
f" Effect: {es_name} = {es_val} ({es_interp})",
|
||
"",
|
||
]
|
||
|
||
# Plain English verdict
|
||
if sig:
|
||
lines.append(f" ✅ VERDICT: The difference is real (p={p:.4f} < α={alpha}).")
|
||
if es_interp in ("negligible", "small"):
|
||
lines.append(" ⚠️ BUT: Effect is small — confirm practical significance before shipping.")
|
||
else:
|
||
lines.append(" Effect size is meaningful. Recommend shipping if no negative guardrails.")
|
||
else:
|
||
lines.append(f" ❌ VERDICT: Insufficient evidence to conclude a difference exists (p={p:.4f} ≥ α={alpha}).")
|
||
lines.append(" Options: extend the test, increase MDE, or kill if underpowered.")
|
||
|
||
lines.append("=" * 60)
|
||
return "\n".join(lines)
|
||
|
||
|
||
def main():
|
||
parser = argparse.ArgumentParser(description="Run hypothesis tests on experiment results.")
|
||
parser.add_argument("--test", choices=["ztest", "ttest", "chi2"], required=True)
|
||
parser.add_argument("--alpha", type=float, default=0.05, help="Significance level (default: 0.05)")
|
||
parser.add_argument("--format", choices=["text", "json"], default="text")
|
||
|
||
# Z-test / t-test shared
|
||
parser.add_argument("--control-n", type=int)
|
||
parser.add_argument("--treatment-n", type=int)
|
||
|
||
# Z-test
|
||
parser.add_argument("--control-x", type=int, help="Conversions in control group")
|
||
parser.add_argument("--treatment-x", type=int, help="Conversions in treatment group")
|
||
|
||
# t-test
|
||
parser.add_argument("--control-mean", type=float)
|
||
parser.add_argument("--control-std", type=float)
|
||
parser.add_argument("--treatment-mean", type=float)
|
||
parser.add_argument("--treatment-std", type=float)
|
||
|
||
# chi2
|
||
parser.add_argument("--observed", help="Comma-separated observed counts")
|
||
parser.add_argument("--expected", help="Comma-separated expected counts")
|
||
|
||
args = parser.parse_args()
|
||
|
||
if args.test == "ztest":
|
||
for req in ["control_n", "control_x", "treatment_n", "treatment_x"]:
|
||
if getattr(args, req) is None:
|
||
print(f"Error: --{req.replace('_', '-')} is required for ztest", file=sys.stderr)
|
||
sys.exit(1)
|
||
result = ztest_proportions(args.control_n, args.control_x, args.treatment_n, args.treatment_x, args.alpha)
|
||
|
||
elif args.test == "ttest":
|
||
for req in ["control_n", "control_mean", "control_std", "treatment_n", "treatment_mean", "treatment_std"]:
|
||
if getattr(args, req) is None:
|
||
print(f"Error: --{req.replace('_', '-')} is required for ttest", file=sys.stderr)
|
||
sys.exit(1)
|
||
result = ttest_means(
|
||
args.control_mean, args.control_std, args.control_n,
|
||
args.treatment_mean, args.treatment_std, args.treatment_n,
|
||
args.alpha
|
||
)
|
||
|
||
elif args.test == "chi2":
|
||
if not args.observed or not args.expected:
|
||
print("Error: --observed and --expected are required for chi2", file=sys.stderr)
|
||
sys.exit(1)
|
||
observed = [float(x.strip()) for x in args.observed.split(",")]
|
||
expected = [float(x.strip()) for x in args.expected.split(",")]
|
||
result = chi2_test(observed, expected, args.alpha)
|
||
|
||
if args.format == "json":
|
||
print(json.dumps(result, indent=2))
|
||
else:
|
||
if "error" in result:
|
||
print(f"Error: {result['error']}", file=sys.stderr)
|
||
sys.exit(1)
|
||
print(verdict(result))
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|