Adds statistical-analyst skill — fills a gap in the repo (no hypothesis testing or experiment analysis tooling exists; only ab-test-setup for instrumentation, but zero analysis capability). Three stdlib-only Python scripts: - hypothesis_tester.py: Z-test (proportions), Welch's t-test (means), Chi-square (categorical) with p-value, CI, Cohen's d/h, Cramér's V - sample_size_calculator.py: required n per variant for proportion and mean tests, with power/MDE tradeoff table and duration estimates - confidence_interval.py: Wilson score interval (proportions) and z-based interval (means) with margin of error and precision notes Validator: 86.4/100 (GOOD). Security audit: PASS (0 critical/high). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
199 lines
6.8 KiB
Python
199 lines
6.8 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
confidence_interval.py — Confidence intervals for proportions and means.
|
||
|
||
Methods:
|
||
proportion — Wilson score interval (recommended over normal approximation for small n or extreme p)
|
||
mean — t-based interval using normal approximation for large n
|
||
|
||
Usage:
|
||
python3 confidence_interval.py --type proportion --n 1200 --x 96
|
||
python3 confidence_interval.py --type mean --n 800 --mean 42.3 --std 18.1
|
||
python3 confidence_interval.py --type proportion --n 1200 --x 96 --confidence 0.99
|
||
python3 confidence_interval.py --type proportion --n 1200 --x 96 --format json
|
||
"""
|
||
|
||
import argparse
|
||
import json
|
||
import math
|
||
import sys
|
||
|
||
|
||
def normal_ppf(p: float) -> float:
|
||
"""Inverse normal CDF via bisection."""
|
||
lo, hi = -10.0, 10.0
|
||
for _ in range(100):
|
||
mid = (lo + hi) / 2
|
||
if 0.5 * math.erfc(-mid / math.sqrt(2)) < p:
|
||
lo = mid
|
||
else:
|
||
hi = mid
|
||
return (lo + hi) / 2
|
||
|
||
|
||
def wilson_interval(n: int, x: int, confidence: float) -> dict:
|
||
"""
|
||
Wilson score confidence interval for a proportion.
|
||
More accurate than normal approximation, especially for small n or p near 0/1.
|
||
"""
|
||
if n <= 0:
|
||
return {"error": "n must be positive"}
|
||
if x < 0 or x > n:
|
||
return {"error": "x must be between 0 and n"}
|
||
|
||
p_hat = x / n
|
||
z = normal_ppf(1 - (1 - confidence) / 2)
|
||
z2 = z ** 2
|
||
|
||
center = (p_hat + z2 / (2 * n)) / (1 + z2 / n)
|
||
margin = (z / (1 + z2 / n)) * math.sqrt(p_hat * (1 - p_hat) / n + z2 / (4 * n ** 2))
|
||
|
||
lo = max(0.0, center - margin)
|
||
hi = min(1.0, center + margin)
|
||
|
||
# Normal approximation for comparison
|
||
se = math.sqrt(p_hat * (1 - p_hat) / n) if n > 0 else 0
|
||
normal_lo = max(0.0, p_hat - z * se)
|
||
normal_hi = min(1.0, p_hat + z * se)
|
||
|
||
return {
|
||
"type": "proportion",
|
||
"method": "Wilson score interval",
|
||
"n": n,
|
||
"successes": x,
|
||
"observed_rate": round(p_hat, 6),
|
||
"confidence": confidence,
|
||
"lower": round(lo, 6),
|
||
"upper": round(hi, 6),
|
||
"margin_of_error": round((hi - lo) / 2, 6),
|
||
"normal_approximation": {
|
||
"lower": round(normal_lo, 6),
|
||
"upper": round(normal_hi, 6),
|
||
"note": "Wilson is preferred; normal approx shown for reference",
|
||
},
|
||
}
|
||
|
||
|
||
def mean_interval(n: int, mean: float, std: float, confidence: float) -> dict:
|
||
"""
|
||
Confidence interval for a mean.
|
||
Uses normal approximation (z-based) for n >= 30, t-approximation otherwise.
|
||
"""
|
||
if n <= 1:
|
||
return {"error": "n must be > 1"}
|
||
if std < 0:
|
||
return {"error": "std must be non-negative"}
|
||
|
||
se = std / math.sqrt(n)
|
||
z = normal_ppf(1 - (1 - confidence) / 2)
|
||
|
||
lo = mean - z * se
|
||
hi = mean + z * se
|
||
moe = z * se
|
||
|
||
rel_moe = moe / abs(mean) * 100 if mean != 0 else None
|
||
|
||
precision_note = ""
|
||
if rel_moe and rel_moe > 20:
|
||
precision_note = "Wide CI — consider increasing sample size for tighter estimates."
|
||
elif rel_moe and rel_moe < 5:
|
||
precision_note = "Tight CI — high precision estimate."
|
||
|
||
return {
|
||
"type": "mean",
|
||
"method": "Normal approximation (z-based)" if n >= 30 else "Use with caution (n < 30)",
|
||
"n": n,
|
||
"observed_mean": round(mean, 6),
|
||
"std": round(std, 6),
|
||
"standard_error": round(se, 6),
|
||
"confidence": confidence,
|
||
"lower": round(lo, 6),
|
||
"upper": round(hi, 6),
|
||
"margin_of_error": round(moe, 6),
|
||
"relative_margin_of_error_pct": round(rel_moe, 2) if rel_moe is not None else None,
|
||
"precision_note": precision_note,
|
||
}
|
||
|
||
|
||
def print_report(result: dict):
|
||
if "error" in result:
|
||
print(f"Error: {result['error']}", file=sys.stderr)
|
||
sys.exit(1)
|
||
|
||
conf_pct = int(result["confidence"] * 100)
|
||
print("=" * 60)
|
||
print(f" CONFIDENCE INTERVAL REPORT")
|
||
print("=" * 60)
|
||
print(f" Method: {result['method']}")
|
||
print(f" Confidence level: {conf_pct}%")
|
||
print()
|
||
|
||
if result["type"] == "proportion":
|
||
print(f" Observed rate: {result['observed_rate']:.4%} ({result['successes']}/{result['n']})")
|
||
print()
|
||
print(f" {conf_pct}% CI: [{result['lower']:.4%}, {result['upper']:.4%}]")
|
||
print(f" Margin of error: ±{result['margin_of_error']:.4%}")
|
||
print()
|
||
norm = result.get("normal_approximation", {})
|
||
print(f" Normal approx CI (ref): [{norm.get('lower', 0):.4%}, {norm.get('upper', 0):.4%}]")
|
||
|
||
elif result["type"] == "mean":
|
||
print(f" Observed mean: {result['observed_mean']} (std={result['std']}, n={result['n']})")
|
||
print(f" Standard error: {result['standard_error']}")
|
||
print()
|
||
print(f" {conf_pct}% CI: [{result['lower']}, {result['upper']}]")
|
||
print(f" Margin of error: ±{result['margin_of_error']}")
|
||
if result.get("relative_margin_of_error_pct") is not None:
|
||
print(f" Relative MoE: ±{result['relative_margin_of_error_pct']:.1f}%")
|
||
if result.get("precision_note"):
|
||
print(f"\n ℹ️ {result['precision_note']}")
|
||
|
||
print()
|
||
# Interpretation guide
|
||
print(f" Interpretation: If this experiment were repeated many times,")
|
||
print(f" {conf_pct}% of the computed intervals would contain the true value.")
|
||
print(f" This does NOT mean there is a {conf_pct}% chance the true value is")
|
||
print(f" in this specific interval — it either is or it isn't.")
|
||
print("=" * 60)
|
||
|
||
|
||
def main():
|
||
parser = argparse.ArgumentParser(
|
||
description="Compute confidence intervals for proportions and means."
|
||
)
|
||
parser.add_argument("--type", choices=["proportion", "mean"], required=True)
|
||
parser.add_argument("--confidence", type=float, default=0.95,
|
||
help="Confidence level (default: 0.95)")
|
||
parser.add_argument("--format", choices=["text", "json"], default="text")
|
||
|
||
# Proportion
|
||
parser.add_argument("--n", type=int, help="Total sample size")
|
||
parser.add_argument("--x", type=int, help="Number of successes (for proportion)")
|
||
|
||
# Mean
|
||
parser.add_argument("--mean", type=float, help="Observed mean")
|
||
parser.add_argument("--std", type=float, help="Observed standard deviation")
|
||
|
||
args = parser.parse_args()
|
||
|
||
if args.type == "proportion":
|
||
if args.n is None or args.x is None:
|
||
print("Error: --n and --x are required for proportion CI", file=sys.stderr)
|
||
sys.exit(1)
|
||
result = wilson_interval(args.n, args.x, args.confidence)
|
||
|
||
elif args.type == "mean":
|
||
if args.n is None or args.mean is None or args.std is None:
|
||
print("Error: --n, --mean, and --std are required for mean CI", file=sys.stderr)
|
||
sys.exit(1)
|
||
result = mean_interval(args.n, args.mean, args.std, args.confidence)
|
||
|
||
if args.format == "json":
|
||
print(json.dumps(result, indent=2))
|
||
else:
|
||
print_report(result)
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|