diff --git a/product-team/experiment-designer/SKILL.md b/product-team/experiment-designer/SKILL.md new file mode 100644 index 0000000..b74e59b --- /dev/null +++ b/product-team/experiment-designer/SKILL.md @@ -0,0 +1,104 @@ +--- +name: experiment-designer +description: Use when planning product experiments, writing testable hypotheses, estimating sample size, prioritizing tests, or interpreting A/B outcomes with practical statistical rigor. +--- + +# Experiment Designer + +Design, prioritize, and evaluate product experiments with clear hypotheses and defensible decisions. + +## When To Use + +Use this skill for: +- A/B and multivariate experiment planning +- Hypothesis writing and success criteria definition +- Sample size and minimum detectable effect planning +- Experiment prioritization with ICE scoring +- Reading statistical output for product decisions + +## Core Workflow + +1. Write hypothesis in If/Then/Because format +- If we change `[intervention]` +- Then `[metric]` will change by `[expected direction/magnitude]` +- Because `[behavioral mechanism]` + +2. Define metrics before running test +- Primary metric: single decision metric +- Guardrail metrics: quality/risk protection +- Secondary metrics: diagnostics only + +3. Estimate sample size +- Baseline conversion or baseline mean +- Minimum detectable effect (MDE) +- Significance level (alpha) and power + +Use: +```bash +python3 scripts/sample_size_calculator.py --baseline-rate 0.12 --mde 0.02 --mde-type absolute +``` + +4. Prioritize experiments with ICE +- Impact: potential upside +- Confidence: evidence quality +- Ease: cost/speed/complexity + +ICE Score = (Impact * Confidence * Ease) / 10 + +5. Launch with stopping rules +- Decide fixed sample size or fixed duration in advance +- Avoid repeated peeking without proper method +- Monitor guardrails continuously + +6. 
Interpret results +- Statistical significance is not business significance +- Compare point estimate + confidence interval to decision threshold +- Investigate novelty effects and segment heterogeneity + +## Hypothesis Quality Checklist + +- [ ] Contains explicit intervention and audience +- [ ] Specifies measurable metric change +- [ ] States plausible causal reason +- [ ] Includes expected minimum effect +- [ ] Defines failure condition + +## Common Experiment Pitfalls + +- Underpowered tests leading to false negatives +- Running too many simultaneous changes without isolation +- Changing targeting or implementation mid-test +- Stopping early on random spikes +- Ignoring sample ratio mismatch and instrumentation drift +- Declaring success from p-value without effect-size context + +## Statistical Interpretation Guardrails + +- p-value < alpha indicates evidence against null, not guaranteed truth. +- Confidence interval crossing zero/no-effect means uncertain directional claim. +- Wide intervals imply low precision even when significant. +- Use practical significance thresholds tied to business impact. + +See: +- `references/experiment-playbook.md` +- `references/statistics-reference.md` + +## Tooling + +### `scripts/sample_size_calculator.py` + +Computes required sample size (per variant and total) from: +- baseline rate +- MDE (absolute or relative) +- significance level (alpha) +- statistical power + +Example: +```bash +python3 scripts/sample_size_calculator.py \ + --baseline-rate 0.10 \ + --mde 0.015 \ + --mde-type absolute \ + --alpha 0.05 \ + --power 0.8 +``` diff --git a/product-team/experiment-designer/references/experiment-playbook.md b/product-team/experiment-designer/references/experiment-playbook.md new file mode 100644 index 0000000..07b2cc9 --- /dev/null +++ b/product-team/experiment-designer/references/experiment-playbook.md @@ -0,0 +1,70 @@ +# Experiment Playbook + +## Experiment Types + +### A/B Test +- Compare one control versus one variant. 
+- Best for high-confidence directional decisions. + +### Multivariate Test +- Test combinations of multiple factors. +- Useful for interaction effects, requires larger traffic. + +### Holdout Test +- Keep a percentage unexposed to intervention. +- Useful for measuring incremental lift over broader changes. + +## Metric Design + +### Primary Metric +- One metric that decides ship/no-ship. +- Must align with user value and business objective. + +### Guardrail Metrics +- Prevent local optimization damage. +- Examples: error rate, latency, churn proxy, support contacts. + +### Diagnostic Metrics +- Explain why change happened. +- Do not use as decision gate unless pre-specified. + +## Stopping Rules + +Define before launch: +- Fixed sample size per group +- Minimum run duration (to capture weekday/weekend behavior) +- Guardrail breach thresholds (pause criteria) + +Avoid: +- Continuous peeking with fixed-horizon inference +- Changing success metric mid-test +- Retroactive segmentation without correction + +## Novelty and Primacy Effects + +- Novelty effect: short-term spike due to newness, not durable value. +- Primacy effect: early exposure creates bias in user behavior. + +Mitigation: +- Run long enough for behavior stabilization. +- Check returning users and delayed cohorts separately. +- Re-run key tests when stakes are high. + +## Pre-Launch Checklist + +- [ ] Hypothesis complete (If/Then/Because) +- [ ] Metric definitions frozen +- [ ] Instrumentation validated +- [ ] Randomization and assignment verified +- [ ] Sample size and duration approved +- [ ] Rollback plan documented + +## Post-Test Readout Template + +1. Hypothesis and scope +2. Experiment setup and quality checks +3. Primary metric effect size + confidence interval +4. Guardrail status +5. Segment-level observations (pre-registered only) +6. Decision: ship, iterate, or reject +7. 
Follow-up experiments diff --git a/product-team/experiment-designer/references/statistics-reference.md b/product-team/experiment-designer/references/statistics-reference.md new file mode 100644 index 0000000..b933863 --- /dev/null +++ b/product-team/experiment-designer/references/statistics-reference.md @@ -0,0 +1,56 @@ +# Statistics Reference for Product Managers + +## p-value + +The p-value is the probability of observing data at least as extreme as yours if there were no true effect. + +- Small p-value means data is less consistent with "no effect". +- It does not tell you the probability that the variant is best. + +## Confidence Interval (CI) + +A CI gives a plausible range for the true effect size. + +- Narrow interval: more precise estimate. +- Wide interval: uncertain estimate. +- If CI includes zero (or no-effect), directional confidence is weak. + +## Minimum Detectable Effect (MDE) + +The smallest effect worth detecting. + +- Set MDE by business value threshold, not wishful optimism. +- Smaller MDE requires larger sample size. + +## Statistical Power + +Power is the probability of detecting a true effect of at least MDE. + +- Common target: 80% (0.8) +- Higher power increases sample requirements. + +## Type I and Type II Errors + +- Type I (false positive): claim effect when none exists (controlled by alpha). +- Type II (false negative): miss a real effect (controlled by power). + +## Practical Significance + +An effect can be statistically significant but too small to matter. + +Always ask: +- Does the effect clear implementation cost? +- Does it move strategic KPIs materially? 
#!/usr/bin/env python3
"""Calculate sample size for two-proportion A/B tests.

Uses the pooled-variance normal approximation for a two-sided
two-proportion z-test at a chosen significance level (alpha) and
statistical power.
"""

import argparse
import math
import statistics
import sys

# Single shared standard-normal distribution; avoids re-constructing
# NormalDist for every z-score lookup.
_STD_NORMAL = statistics.NormalDist()


def clamp_rate(value: float, name: str) -> float:
    """Validate that *value* is a proportion strictly inside (0, 1).

    Returns the value unchanged on success.  NOTE: despite the name, an
    out-of-range value is rejected (ValueError), not clamped; the name is
    kept for backward compatibility with existing callers.
    """
    if value <= 0 or value >= 1:
        raise ValueError(f"{name} must be between 0 and 1 (exclusive).")
    return value


def required_sample_size_per_group(
    baseline_rate: float,
    target_rate: float,
    alpha: float,
    power: float,
) -> int:
    """Return the per-variant sample size for a two-proportion z-test.

    Pooled-variance approximation:

        n = 2 * p(1 - p) * (z_(1 - alpha/2) + z_power)^2 / delta^2

    where p is the mean of the two rates and delta is their absolute
    difference.

    Raises:
        ValueError: if the two rates are equal (zero detectable effect).
    """
    delta = abs(target_rate - baseline_rate)
    if delta <= 0:
        raise ValueError("MDE resolves to zero; target and baseline must differ.")

    # Two-sided test: alpha is split across both tails.
    z_alpha = _STD_NORMAL.inv_cdf(1 - alpha / 2)
    z_beta = _STD_NORMAL.inv_cdf(power)
    pooled = (baseline_rate + target_rate) / 2

    numerator = 2 * pooled * (1 - pooled) * (z_alpha + z_beta) ** 2
    # Round up: sample sizes are whole subjects.
    return math.ceil(numerator / (delta ** 2))


def parse_args() -> argparse.Namespace:
    """Build and parse the command-line interface."""
    parser = argparse.ArgumentParser(
        description="Compute sample size for two-proportion product experiments."
    )
    parser.add_argument("--baseline-rate", type=float, required=True)
    parser.add_argument(
        "--mde",
        type=float,
        required=True,
        help="Minimum detectable effect. Absolute points when --mde-type absolute, otherwise relative uplift.",
    )
    parser.add_argument("--mde-type", choices=["absolute", "relative"], default="relative")
    parser.add_argument("--alpha", type=float, default=0.05)
    parser.add_argument("--power", type=float, default=0.8)
    parser.add_argument(
        "--daily-samples",
        type=int,
        default=0,
        help="Optional total daily samples to estimate runtime in days.",
    )
    return parser.parse_args()


def main() -> int:
    """CLI entry point: validate inputs, compute, and print the estimate.

    Raises ValueError on out-of-range inputs; the __main__ guard converts
    that into a clean CLI error message.
    """
    args = parse_args()
    baseline = clamp_rate(args.baseline_rate, "baseline-rate")

    if args.mde <= 0:
        raise ValueError("mde must be > 0")
    if args.alpha <= 0 or args.alpha >= 1:
        raise ValueError("alpha must be between 0 and 1")
    if args.power <= 0 or args.power >= 1:
        raise ValueError("power must be between 0 and 1")

    # Translate the MDE into an explicit target rate.
    if args.mde_type == "absolute":
        target = baseline + args.mde
    else:
        target = baseline * (1 + args.mde)

    # Reject targets that fall outside (0, 1), e.g. baseline 0.9 with +0.2.
    target = clamp_rate(target, "target-rate")

    n_per_group = required_sample_size_per_group(
        baseline_rate=baseline,
        target_rate=target,
        alpha=args.alpha,
        power=args.power,
    )
    total_n = n_per_group * 2

    print("A/B Test Sample Size Estimate")
    print(f"baseline_rate: {baseline:.6f}")
    print(f"target_rate: {target:.6f}")
    print(f"mde_type: {args.mde_type}")
    print(f"alpha: {args.alpha}")
    print(f"power: {args.power}")
    print(f"n_per_group: {n_per_group}")
    print(f"n_total: {total_n}")

    if args.daily_samples > 0:
        days = math.ceil(total_n / args.daily_samples)
        print(f"estimated_days_at_daily_samples_{args.daily_samples}: {days}")

    return 0


if __name__ == "__main__":
    # Fix: invalid inputs previously surfaced as raw tracebacks. Report
    # them as clean CLI errors with argparse-style exit code 2 instead.
    try:
        raise SystemExit(main())
    except ValueError as err:
        print(f"error: {err}", file=sys.stderr)
        raise SystemExit(2)