claude-skills-reference/engineering/statistical-analyst/scripts/hypothesis_tester.py

#!/usr/bin/env python3
"""
hypothesis_tester.py — Z-test (proportions), Welch's t-test (means), Chi-square (categorical).

All math uses Python stdlib (math module only). No scipy, numpy, or pandas required.

Usage:
    python3 hypothesis_tester.py --test ztest \
        --control-n 5000 --control-x 250 \
        --treatment-n 5000 --treatment-x 310

    python3 hypothesis_tester.py --test ttest \
        --control-mean 42.3 --control-std 18.1 --control-n 800 \
        --treatment-mean 46.1 --treatment-std 19.4 --treatment-n 820

    python3 hypothesis_tester.py --test chi2 \
        --observed "120,80,50" --expected "100,100,50"
"""

import argparse
import json
import math
import sys


# ---------------------------------------------------------------------------
# Normal / t-distribution approximations (stdlib only)
# ---------------------------------------------------------------------------

def normal_cdf(z: float) -> float:
    """Cumulative distribution function of standard normal using math.erfc."""
    return 0.5 * math.erfc(-z / math.sqrt(2))


def normal_ppf(p: float) -> float:
    """Percent-point function (inverse CDF) of standard normal via bisection."""
    lo, hi = -10.0, 10.0
    for _ in range(100):
        mid = (lo + hi) / 2
        if normal_cdf(mid) < p:
            lo = mid
        else:
            hi = mid
    return (lo + hi) / 2


def t_cdf(t: float, df: float) -> float:
    """
    CDF of t-distribution via regularized incomplete beta function approximation.
    Uses the relation: P(T ≤ t) = I_{x}(df/2, 1/2) where x = df/(df+t^2).
    Falls back to normal CDF for large df (> 1000).
    """
    if df > 1000:
        return normal_cdf(t)
    x = df / (df + t * t)
    # Regularized incomplete beta via continued fraction (Lentz)
    ib = _regularized_incomplete_beta(x, df / 2, 0.5)
    p = ib / 2
    return p if t <= 0 else 1 - p


def _regularized_incomplete_beta(x: float, a: float, b: float) -> float:
    """Regularized incomplete beta I_x(a,b) via continued fraction expansion."""
    if x < 0 or x > 1:
        return 0.0
    if x == 0:
        return 0.0
    if x == 1:
        return 1.0
    lbeta = math.lgamma(a) + math.lgamma(b) - math.lgamma(a + b)
    front = math.exp(math.log(x) * a + math.log(1 - x) * b - lbeta) / a
    # Use symmetry for better convergence
    if x > (a + 1) / (a + b + 2):
        return 1 - _regularized_incomplete_beta(1 - x, b, a)
    # Lentz continued fraction
    TINY = 1e-30
    f = TINY
    C = f
    D = 0.0
    for m in range(200):
        for s in (0, 1):
            if m == 0 and s == 0:
                num = 1.0
            elif s == 0:
                num = m * (b - m) * x / ((a + 2 * m - 1) * (a + 2 * m))
            else:
                num = -(a + m) * (a + b + m) * x / ((a + 2 * m) * (a + 2 * m + 1))
            D = 1 + num * D
            if abs(D) < TINY:
                D = TINY
            D = 1 / D
            C = 1 + num / C
            if abs(C) < TINY:
                C = TINY
            f *= C * D
            if abs(C * D - 1) < 1e-10:
                break
    return front * f


def two_tail_p_normal(z: float) -> float:
    return 2 * (1 - normal_cdf(abs(z)))


def two_tail_p_t(t: float, df: float) -> float:
    return 2 * (1 - t_cdf(abs(t), df))


# ---------------------------------------------------------------------------
# Effect sizes
# ---------------------------------------------------------------------------

def cohens_h(p1: float, p2: float) -> float:
    """Cohen's h for two proportions."""
    return 2 * math.asin(math.sqrt(p1)) - 2 * math.asin(math.sqrt(p2))


def cohens_d(mean1: float, std1: float, n1: int, mean2: float, std2: float, n2: int) -> float:
    """Cohen's d using pooled standard deviation."""
    pooled = math.sqrt(((n1 - 1) * std1 ** 2 + (n2 - 1) * std2 ** 2) / (n1 + n2 - 2))
    return (mean1 - mean2) / pooled if pooled else 0.0


def cramers_v(chi2: float, n: int, k: int) -> float:
    """Cramér's V effect size for chi-square test."""
    return math.sqrt(chi2 / (n * (k - 1))) if n and k > 1 else 0.0


def effect_label(val: float, metric: str) -> str:
    thresholds = {"h": [0.2, 0.5, 0.8], "d": [0.2, 0.5, 0.8], "v": [0.1, 0.3, 0.5]}
    t = thresholds.get(metric, [0.2, 0.5, 0.8])
    v = abs(val)
    if v < t[0]:
        return "negligible"
    if v < t[1]:
        return "small"
    if v < t[2]:
        return "medium"
    return "large"


# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------

def ztest_proportions(cn: int, cx: int, tn: int, tx: int, alpha: float) -> dict:
    """Two-proportion Z-test."""
    if cn <= 0 or tn <= 0:
        return {"error": "Sample sizes must be positive."}

    p_c = cx / cn
    p_t = tx / tn
    p_pool = (cx + tx) / (cn + tn)

    se = math.sqrt(p_pool * (1 - p_pool) * (1 / cn + 1 / tn))
    if se == 0:
        return {"error": "Standard error is zero — check input values."}

    z = (p_t - p_c) / se
    p_value = two_tail_p_normal(z)

    # Confidence interval for difference (unpooled SE)
    se_diff = math.sqrt(p_c * (1 - p_c) / cn + p_t * (1 - p_t) / tn)
    z_crit = normal_ppf(1 - alpha / 2)
    diff = p_t - p_c
    ci_lo = diff - z_crit * se_diff
    ci_hi = diff + z_crit * se_diff

    h = cohens_h(p_t, p_c)
    lift = (p_t - p_c) / p_c * 100 if p_c else 0

    return {
        "test": "Two-proportion Z-test",
        "control": {"n": cn, "conversions": cx, "rate": round(p_c, 6)},
        "treatment": {"n": tn, "conversions": tx, "rate": round(p_t, 6)},
        "difference": round(diff, 6),
        "relative_lift_pct": round(lift, 2),
        "z_statistic": round(z, 4),
        "p_value": round(p_value, 6),
        "significant": p_value < alpha,
        "alpha": alpha,
        "confidence_interval": {
            "level": f"{int((1 - alpha) * 100)}%",
            "lower": round(ci_lo, 6),
            "upper": round(ci_hi, 6),
        },
        "effect_size": {
            "cohens_h": round(abs(h), 4),
            "interpretation": effect_label(h, "h"),
        },
    }


def ttest_means(cm: float, cs: float, cn: int, tm: float, ts: float, tn: int, alpha: float) -> dict:
    """Welch's two-sample t-test (unequal variances)."""
    if cn < 2 or tn < 2:
        return {"error": "Each group needs at least 2 observations."}

    se = math.sqrt(cs ** 2 / cn + ts ** 2 / tn)
    if se == 0:
        return {"error": "Standard error is zero — check std values."}

    t = (tm - cm) / se

    # Welch–Satterthwaite degrees of freedom
    num = (cs ** 2 / cn + ts ** 2 / tn) ** 2
    denom = (cs ** 2 / cn) ** 2 / (cn - 1) + (ts ** 2 / tn) ** 2 / (tn - 1)
    df = num / denom if denom else cn + tn - 2

    p_value = two_tail_p_t(t, df)

    z_crit = normal_ppf(1 - alpha / 2) if df > 1000 else normal_ppf(1 - alpha / 2)
    # Use t critical value approximation
    from_t = abs(t) / (p_value / 2) if p_value > 0 else z_crit  # rough
    t_crit = normal_ppf(1 - alpha / 2)  # normal approx for CI

    diff = tm - cm
    ci_lo = diff - t_crit * se
    ci_hi = diff + t_crit * se

    d = cohens_d(tm, ts, tn, cm, cs, cn)
    lift = (tm - cm) / cm * 100 if cm else 0

    return {
        "test": "Welch's two-sample t-test",
        "control": {"n": cn, "mean": round(cm, 4), "std": round(cs, 4)},
        "treatment": {"n": tn, "mean": round(tm, 4), "std": round(ts, 4)},
        "difference": round(diff, 4),
        "relative_lift_pct": round(lift, 2),
        "t_statistic": round(t, 4),
        "degrees_of_freedom": round(df, 1),
        "p_value": round(p_value, 6),
        "significant": p_value < alpha,
        "alpha": alpha,
        "confidence_interval": {
            "level": f"{int((1 - alpha) * 100)}%",
            "lower": round(ci_lo, 4),
            "upper": round(ci_hi, 4),
        },
        "effect_size": {
            "cohens_d": round(abs(d), 4),
            "interpretation": effect_label(d, "d"),
        },
    }


def chi2_test(observed: list[float], expected: list[float], alpha: float) -> dict:
    """Chi-square goodness-of-fit test."""
    if len(observed) != len(expected):
        return {"error": "Observed and expected must have the same number of categories."}
    if any(e <= 0 for e in expected):
        return {"error": "Expected values must all be positive."}
    if any(e < 5 for e in expected):
        return {"warning": "Some expected values < 5 — chi-square approximation may be unreliable.",
                "suggestion": "Consider combining categories or using Fisher's exact test."}

    chi2 = sum((o - e) ** 2 / e for o, e in zip(observed, expected))
    k = len(observed)
    df = k - 1
    n = sum(observed)

    # Chi-square CDF via regularized gamma function approximation
    p_value = 1 - _chi2_cdf(chi2, df)

    v = cramers_v(chi2, int(n), k)

    return {
        "test": "Chi-square goodness-of-fit",
        "categories": k,
        "observed": observed,
        "expected": expected,
        "chi2_statistic": round(chi2, 4),
        "degrees_of_freedom": df,
        "p_value": round(p_value, 6),
        "significant": p_value < alpha,
        "alpha": alpha,
        "effect_size": {
            "cramers_v": round(v, 4),
            "interpretation": effect_label(v, "v"),
        },
    }


def _chi2_cdf(x: float, k: float) -> float:
    """CDF of chi-square via regularized lower incomplete gamma."""
    if x <= 0:
        return 0.0
    return _regularized_gamma(k / 2, x / 2)


def _regularized_gamma(a: float, x: float) -> float:
    """Lower regularized incomplete gamma P(a, x) via series expansion."""
    if x < 0:
        return 0.0
    if x == 0:
        return 0.0
    if x < a + 1:
        # Series expansion
        ap = a
        delta = 1.0 / a
        total = delta
        for _ in range(300):
            ap += 1
            delta *= x / ap
            total += delta
            if abs(delta) < abs(total) * 1e-10:
                break
        return total * math.exp(-x + a * math.log(x) - math.lgamma(a))
    else:
        # Continued fraction (Lentz)
        b = x + 1 - a
        c = 1e30
        d = 1 / b
        f = d
        for i in range(1, 300):
            an = -i * (i - a)
            b += 2
            d = an * d + b
            if abs(d) < 1e-30:
                d = 1e-30
            c = b + an / c
            if abs(c) < 1e-30:
                c = 1e-30
            d = 1 / d
            delta = d * c
            f *= delta
            if abs(delta - 1) < 1e-10:
                break
        return 1 - math.exp(-x + a * math.log(x) - math.lgamma(a)) * f


# ---------------------------------------------------------------------------
# Reporting
# ---------------------------------------------------------------------------

DIRECTION = {True: "statistically significant", False: "NOT statistically significant"}


def verdict(result: dict) -> str:
    if "error" in result:
        return f"ERROR: {result['error']}"
    sig = result.get("significant", False)
    p = result.get("p_value", 1.0)
    alpha = result.get("alpha", 0.05)
    diff = result.get("difference", 0)
    lift = result.get("relative_lift_pct")
    ci = result.get("confidence_interval", {})
    es = result.get("effect_size", {})
    es_name = "Cohen's h" if "cohens_h" in es else ("Cohen's d" if "cohens_d" in es else "Cramér's V")
    es_val = es.get("cohens_h") or es.get("cohens_d") or es.get("cramers_v", 0)
    es_interp = es.get("interpretation", "")

    lines = [
        "",
        "=" * 60,
        f"  {result.get('test', 'Hypothesis Test')}",
        "=" * 60,
    ]

    if "control" in result and "rate" in result["control"]:
        c = result["control"]
        t = result["treatment"]
        lines += [
            f"  Control:   {c['rate']:.4%}  (n={c['n']}, conversions={c['conversions']})",
            f"  Treatment: {t['rate']:.4%}  (n={t['n']}, conversions={t['conversions']})",
            f"  Difference: {diff:+.4%}  ({'+' if lift >= 0 else ''}{lift:.1f}% relative lift)",
        ]
    elif "control" in result and "mean" in result["control"]:
        c = result["control"]
        t = result["treatment"]
        lines += [
            f"  Control:   mean={c['mean']}  std={c['std']}  n={c['n']}",
            f"  Treatment: mean={t['mean']}  std={t['std']}  n={t['n']}",
            f"  Difference: {diff:+.4f}  ({'+' if lift >= 0 else ''}{lift:.1f}% relative lift)",
        ]
    elif "observed" in result:
        lines += [
            f"  Observed: {result['observed']}",
            f"  Expected: {result['expected']}",
        ]

    lines += [
        "",
        f"  p-value:    {p:.6f}  (α={alpha})",
        f"  Result:     {DIRECTION[sig].upper()}",
    ]
    if ci:
        lines.append(f"  {ci['level']} CI: [{ci['lower']}, {ci['upper']}]")
    lines += [
        f"  Effect:     {es_name} = {es_val}  ({es_interp})",
        "",
    ]

    # Plain English verdict
    if sig:
        lines.append(f"  ✅ VERDICT: The difference is real (p={p:.4f} < α={alpha}).")
        if es_interp in ("negligible", "small"):
            lines.append("  ⚠️  BUT: Effect is small — confirm practical significance before shipping.")
        else:
            lines.append("  Effect size is meaningful. Recommend shipping if no negative guardrails.")
    else:
        lines.append(f"  ❌ VERDICT: Insufficient evidence to conclude a difference exists (p={p:.4f} ≥ α={alpha}).")
        lines.append("  Options: extend the test, increase MDE, or kill if underpowered.")

    lines.append("=" * 60)
    return "\n".join(lines)


def main():
    parser = argparse.ArgumentParser(description="Run hypothesis tests on experiment results.")
    parser.add_argument("--test", choices=["ztest", "ttest", "chi2"], required=True)
    parser.add_argument("--alpha", type=float, default=0.05, help="Significance level (default: 0.05)")
    parser.add_argument("--format", choices=["text", "json"], default="text")

    # Z-test / t-test shared
    parser.add_argument("--control-n", type=int)
    parser.add_argument("--treatment-n", type=int)

    # Z-test
    parser.add_argument("--control-x", type=int, help="Conversions in control group")
    parser.add_argument("--treatment-x", type=int, help="Conversions in treatment group")

    # t-test
    parser.add_argument("--control-mean", type=float)
    parser.add_argument("--control-std", type=float)
    parser.add_argument("--treatment-mean", type=float)
    parser.add_argument("--treatment-std", type=float)

    # chi2
    parser.add_argument("--observed", help="Comma-separated observed counts")
    parser.add_argument("--expected", help="Comma-separated expected counts")

    args = parser.parse_args()

    if args.test == "ztest":
        for req in ["control_n", "control_x", "treatment_n", "treatment_x"]:
            if getattr(args, req) is None:
                print(f"Error: --{req.replace('_', '-')} is required for ztest", file=sys.stderr)
                sys.exit(1)
        result = ztest_proportions(args.control_n, args.control_x, args.treatment_n, args.treatment_x, args.alpha)

    elif args.test == "ttest":
        for req in ["control_n", "control_mean", "control_std", "treatment_n", "treatment_mean", "treatment_std"]:
            if getattr(args, req) is None:
                print(f"Error: --{req.replace('_', '-')} is required for ttest", file=sys.stderr)
                sys.exit(1)
        result = ttest_means(
            args.control_mean, args.control_std, args.control_n,
            args.treatment_mean, args.treatment_std, args.treatment_n,
            args.alpha
        )

    elif args.test == "chi2":
        if not args.observed or not args.expected:
            print("Error: --observed and --expected are required for chi2", file=sys.stderr)
            sys.exit(1)
        observed = [float(x.strip()) for x in args.observed.split(",")]
        expected = [float(x.strip()) for x in args.expected.split(",")]
        result = chi2_test(observed, expected, args.alpha)

    if args.format == "json":
        print(json.dumps(result, indent=2))
    else:
        if "error" in result:
            print(f"Error: {result['error']}", file=sys.stderr)
            sys.exit(1)
        print(verdict(result))


if __name__ == "__main__":
    main()