Files
claude-skills-reference/engineering/data-quality-auditor/scripts/outlier_detector.py
Reza Rezvani 5710a7b763 chore: post-merge sync — plugins, audits, docs, cross-platform indexes
New skills integrated:
- engineering/behuman, code-tour, demo-video, data-quality-auditor

Plugins & marketplace:
- Add plugin.json for code-tour, demo-video, data-quality-auditor
- Add all 3 to marketplace.json (31 total plugins)
- Update marketplace counts to 248 skills, 332 tools, 460 refs

Skill fixes:
- Move data-quality-auditor from data-analysis/ to engineering/
- Fix cross-refs: code-tour, demo-video, data-quality-auditor
- Add evals.json for code-tour (5 scenarios) and demo-video (4 scenarios)
- demo-video: add output artifacts, prereqs check, references extraction
- code-tour: add default persona, parallel discovery, trivial repo guidance
- Fix Python 3.9 compat (from __future__ import annotations)

product-analytics audit fixes:
- Expand SKILL.md from 82 to 147 lines (anti-patterns, cross-refs, examples)
- Add --format json to all metrics_calculator.py subcommands
- Add error handling (FileNotFoundError, KeyError)

Docs & indexes:
- Update CLAUDE.md, README.md, docs/index.md, docs/getting-started.md counts
- Sync Codex (192 skills) and Gemini (280 items) indexes
- Regenerate MkDocs pages (279 pages, 311 HTML)
- Add 3 new nav entries to mkdocs.yml
- Update mkdocs.yml site_description

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-04 02:05:19 +02:00

264 lines
9.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
outlier_detector.py — Multi-method outlier detection for numeric columns.

Methods:
    iqr      — Interquartile Range (robust, non-parametric, default)
    zscore   — Standard Z-score (assumes normal distribution)
    mzscore  — Modified Z-score via Median Absolute Deviation (robust to skew)

Usage:
    python3 outlier_detector.py --file data.csv
    python3 outlier_detector.py --file data.csv --method iqr
    python3 outlier_detector.py --file data.csv --method zscore --threshold 2.5
    python3 outlier_detector.py --file data.csv --columns col1,col2
    python3 outlier_detector.py --file data.csv --format json
"""
# The docstring has to be the first statement to become the module's __doc__;
# a __future__ import is still valid directly after it, so it lives here.
from __future__ import annotations
import argparse
import csv
import json
import math
import sys
# Cell texts that count as "missing". is_null() compares against this set
# after stripping surrounding whitespace and lower-casing the cell, so every
# entry here is written in lower case.
NULL_STRINGS = {"", "null", "none", "n/a", "na", "nan", "nil", "undefined", "missing"}
def load_csv(filepath: str) -> tuple[list[str], list[dict]]:
    """Read a CSV file and return its header names and data rows.

    Args:
        filepath: Path to the CSV file to read.

    Returns:
        A ``(headers, rows)`` pair. ``headers`` is the list of column names
        taken from the first line (empty when the file has no lines at all);
        ``rows`` holds one dict per data line, keyed by column name. A line
        with fewer fields than the header gets ``None`` for the missing ones
        (the ``csv.DictReader`` default).

    Raises:
        OSError: If the file cannot be opened (``FileNotFoundError`` included).
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # "utf-8-sig" drops a leading byte-order mark when one is present (typical
    # for spreadsheet exports); otherwise the first header would be read as
    # "\ufeff<name>" and never match the column the user asked for. Files
    # without a BOM decode exactly as they did with plain "utf-8".
    with open(filepath, newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        rows = list(reader)
        # fieldnames is None for a completely empty file.
        headers = list(reader.fieldnames or [])
    return headers, rows
def is_null(val: str | None) -> bool:
    """Return True when a CSV cell should be treated as a missing value.

    Args:
        val: Raw cell text, or ``None`` (which ``csv.DictReader`` produces for
            fields absent from a row shorter than the header).

    Returns:
        True for ``None`` and for any text that, once stripped and
        lower-cased, is one of ``NULL_STRINGS``; False otherwise.
    """
    # A short row yields None rather than ""; without this guard the
    # .strip() call below raises AttributeError and aborts the whole run.
    if val is None:
        return True
    return val.strip().lower() in NULL_STRINGS
def to_float(val: str) -> float | None:
    """Parse a CSV cell into a finite float.

    Args:
        val: Raw cell text; surrounding whitespace is ignored.

    Returns:
        The parsed number, or ``None`` when the cell is not numeric text, is
        not a string at all (e.g. ``None``), or parses to a non-finite value.
    """
    try:
        number = float(val.strip())
    except (ValueError, AttributeError):
        # ValueError: not numeric text. AttributeError: val has no .strip(),
        # e.g. None from a short CSV row.
        return None
    # float() also accepts "inf", "-infinity", "nan" and overflowing literals
    # such as "1e999". One such value turns mean/std/percentile into inf/nan
    # and json.dumps would emit non-standard tokens for it, so it is treated
    # as non-numeric.
    if not math.isfinite(number):
        return None
    return number
def median(nums: list[float]) -> float:
    """Return the median of ``nums``.

    The input is sorted into a copy first. For an even number of values the
    result is the average of the two middle ones.
    """
    ordered = sorted(nums)
    half, is_odd = divmod(len(ordered), 2)
    if is_odd:
        return ordered[half]
    return (ordered[half - 1] + ordered[half]) / 2
def percentile(nums: list[float], p: float) -> float:
    """Return the ``p``-th percentile of ``nums`` by linear interpolation.

    The values are sorted into a copy, the fractional rank
    ``p / 100 * (n - 1)`` is located, and the result is interpolated between
    the two neighbouring values. A rank at or beyond the last element yields
    the largest value.
    """
    ordered = sorted(nums)
    count = len(ordered)
    if count == 1:
        return ordered[0]

    rank = p / 100 * (count - 1)
    below = int(rank)
    above = below + 1
    if above >= count:
        return ordered[-1]

    weight = rank - below
    return ordered[below] + weight * (ordered[above] - ordered[below])
def mean(nums: list[float]) -> float:
    """Return the arithmetic mean of ``nums`` (sum divided by count)."""
    total = sum(nums)
    return total / len(nums)
def std(nums: list[float], mu: float) -> float:
    """Return the sample standard deviation of ``nums`` around ``mu``.

    Uses the ``n - 1`` denominator. With fewer than two values the result is
    ``0.0``.
    """
    count = len(nums)
    if count < 2:
        return 0.0
    squared_deviations = [(value - mu) ** 2 for value in nums]
    return math.sqrt(sum(squared_deviations) / (count - 1))
# --- Detection methods ---
def detect_iqr(nums: list[float], multiplier: float = 1.5) -> dict:
    """Flag values lying outside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``,
    with the quartiles taken from ``percentile``. Values strictly below the
    lower fence or strictly above the upper fence count as outliers.

    Returns:
        A dict with the method name, quartiles, IQR and fences (rounded to 4
        decimals), the outlier count and percentage of ``nums``, and up to 10
        distinct rounded outlier values in ascending order.
    """
    first_quartile = percentile(nums, 25)
    third_quartile = percentile(nums, 75)
    spread = third_quartile - first_quartile
    low_fence = first_quartile - multiplier * spread
    high_fence = third_quartile + multiplier * spread

    beyond_fences = []
    for value in nums:
        if value < low_fence or value > high_fence:
            beyond_fences.append(value)

    distinct_rounded = {round(value, 4) for value in beyond_fences}
    share = len(beyond_fences) / len(nums) * 100
    return {
        "method": "IQR",
        "q1": round(first_quartile, 4),
        "q3": round(third_quartile, 4),
        "iqr": round(spread, 4),
        "lower_bound": round(low_fence, 4),
        "upper_bound": round(high_fence, 4),
        "outlier_count": len(beyond_fences),
        "outlier_pct": round(share, 2),
        "outlier_values": sorted(distinct_rounded)[:10],
    }
def detect_zscore(nums: list[float], threshold: float = 3.0) -> dict:
    """Flag values whose absolute Z-score exceeds ``threshold``.

    The Z-score is ``(x - mean) / std`` using the sample standard deviation.
    When the standard deviation is zero, no scoring is attempted and a short
    result carrying an explanatory ``note`` is returned instead.

    Returns:
        A dict with the method name, mean and std (rounded to 4 decimals),
        the threshold, the outlier count and percentage of ``nums``, and up
        to 10 distinct rounded outlier values in ascending order.
    """
    mu = mean(nums)
    sigma = std(nums, mu)
    if sigma == 0:
        return {
            "method": "Z-score",
            "outlier_count": 0,
            "outlier_pct": 0.0,
            "note": "Zero variance — all values identical",
        }

    extreme = [value for value in nums if abs((value - mu) / sigma) > threshold]
    distinct_rounded = {round(value, 4) for value in extreme}
    share = len(extreme) / len(nums) * 100
    return {
        "method": "Z-score",
        "mean": round(mu, 4),
        "std": round(sigma, 4),
        "threshold": threshold,
        "outlier_count": len(extreme),
        "outlier_pct": round(share, 2),
        "outlier_values": sorted(distinct_rounded)[:10],
    }
def detect_modified_zscore(nums: list[float], threshold: float = 3.5) -> dict:
    """Flag values by the Iglewicz-Hoaglin modified Z-score.

    The score is ``0.6745 * |x - median| / MAD``, where MAD is the median of
    the absolute deviations from the median. When MAD is zero, no scoring is
    attempted and a short result carrying an explanatory ``note`` is returned.

    Returns:
        A dict with the method name, median and MAD (rounded to 4 decimals),
        the threshold, the outlier count and percentage of ``nums``, and up
        to 10 distinct rounded outlier values in ascending order.
    """
    centre = median(nums)
    mad = median([abs(value - centre) for value in nums])
    if mad == 0:
        return {
            "method": "Modified Z-score (MAD)",
            "outlier_count": 0,
            "outlier_pct": 0.0,
            "note": "MAD is zero — consider Z-score instead",
        }

    extreme = []
    for value in nums:
        score = 0.6745 * abs(value - centre) / mad
        if score > threshold:
            extreme.append(value)

    distinct_rounded = {round(value, 4) for value in extreme}
    share = len(extreme) / len(nums) * 100
    return {
        "method": "Modified Z-score (MAD)",
        "median": round(centre, 4),
        "mad": round(mad, 4),
        "threshold": threshold,
        "outlier_count": len(extreme),
        "outlier_pct": round(share, 2),
        "outlier_values": sorted(distinct_rounded)[:10],
    }
def classify_outlier_risk(pct: float, col: str) -> str:
    """Map an outlier percentage to a heuristic, human-readable assessment.

    Args:
        pct: Percentage of values flagged as outliers.
        col: Column name; accepted for the caller's convenience but not used
            in the decision.

    Returns:
        The message of the first tier whose floor ``pct`` strictly exceeds,
        or the "Clean" message when ``pct`` is not above zero.
    """
    # Checked from the highest floor down; the first floor exceeded wins.
    tiers = (
        (10, "High outlier rate — likely systematic data quality issue or wrong data type"),
        (5, "Elevated outlier rate — investigate source; may be mixed populations"),
        (1, "Moderate — review individually; could be legitimate extremes or entry errors"),
        (0, "Low — verify extreme values against source; likely legitimate but worth checking"),
    )
    for floor, verdict in tiers:
        if pct > floor:
            return verdict
    return "Clean — no outliers detected"
def analyze_column(col: str, nums: list[float], method: str, threshold: float) -> dict:
    """Run one detection method on a column and annotate the result.

    Args:
        col: Column name, copied into the result.
        nums: Numeric values of the column.
        method: ``"iqr"``, ``"zscore"`` or ``"mzscore"``; any other value
            falls back to IQR with its default multiplier.
        threshold: IQR multiplier for ``"iqr"``, score cut-off for the two
            Z-score methods. It is passed through unchanged.

    Returns:
        ``{"column", "status"}`` when fewer than 4 values are available;
        otherwise the detector's result dict extended with ``column``,
        ``total_numeric`` and ``risk_assessment``.
    """
    if len(nums) < 4:
        return {"column": col, "status": "Skipped — fewer than 4 numeric values"}

    if method == "iqr":
        # The multiplier is used exactly as given. An earlier special case
        # rewrote 3.0 to 1.5, which silently discarded an explicit request
        # for the 3 x IQR "outer fence"; per-method defaults are the caller's
        # responsibility.
        result = detect_iqr(nums, multiplier=threshold)
    elif method == "zscore":
        result = detect_zscore(nums, threshold=threshold)
    elif method == "mzscore":
        result = detect_modified_zscore(nums, threshold=threshold)
    else:
        result = detect_iqr(nums)

    result["column"] = col
    result["total_numeric"] = len(nums)
    result["risk_assessment"] = classify_outlier_risk(result.get("outlier_pct", 0), col)
    return result
def print_report(results: list[dict]):
    """Print the text report for a list of per-column results to stdout.

    Results are grouped three ways: skipped (carry a ``status`` key), flagged
    (``outlier_count`` above zero) and clean (``outlier_count`` of zero and no
    ``status``). Flagged columns are listed in detail, highest outlier
    percentage first, followed by a one-line list of clean columns.
    """
    heavy_rule = "=" * 64
    light_rule = "-" * 64

    print(heavy_rule)
    print("OUTLIER DETECTION REPORT")
    print(heavy_rule)

    clean, flagged, skipped = [], [], []
    for entry in results:
        has_status = "status" in entry
        if entry.get("outlier_count", 0) == 0 and not has_status:
            clean.append(entry)
        if entry.get("outlier_count", 0) > 0:
            flagged.append(entry)
        if has_status:
            skipped.append(entry)

    print(f"\nColumns analyzed: {len(results) - len(skipped)}")
    print(f"Clean: {len(clean)}")
    print(f"Flagged: {len(flagged)}")
    if skipped:
        skipped_names = ", ".join(entry["column"] for entry in skipped)
        print(f"Skipped: {len(skipped)} ({skipped_names})")

    if flagged:
        print("\n" + light_rule)
        print("FLAGGED COLUMNS")
        print(light_rule)

    def negated_rate(entry):
        # Negating the percentage sorts the worst column first while keeping
        # the original order among ties.
        return -entry.get("outlier_pct", 0)

    for entry in sorted(flagged, key=negated_rate):
        rate = entry.get("outlier_pct", 0)
        marker = "🔴" if rate > 5 else "🟡"
        print(f"\n {marker} {entry['column']} ({entry['method']})")
        print(f" Outliers: {entry['outlier_count']} / {entry['total_numeric']} rows ({rate}%)")
        # Each detector contributes its own keys, so the detail line printed
        # depends on which of them are present.
        if "lower_bound" in entry:
            print(f" Bounds: [{entry['lower_bound']}, {entry['upper_bound']}] | IQR: {entry['iqr']}")
        if "mean" in entry:
            print(f" Mean: {entry['mean']} | Std: {entry['std']} | Threshold: ±{entry['threshold']}σ")
        if "median" in entry:
            print(f" Median: {entry['median']} | MAD: {entry['mad']} | Threshold: {entry['threshold']}")
        if entry.get("outlier_values"):
            sample = ", ".join(str(value) for value in entry["outlier_values"][:8])
            print(f" Sample outlier values: {sample}")
        print(f" Assessment: {entry['risk_assessment']}")

    if clean:
        clean_names = ", ".join(entry["column"] for entry in clean)
        print(f"\n🟢 Clean columns: {clean_names}")

    print("\n" + heavy_rule)
def main():
    """Command-line entry point: parse arguments, analyze columns, report.

    Reads the CSV named by ``--file``, runs the chosen detection method on
    each selected column and prints either a text report or JSON. Exits with
    status 1 (message on stderr) when the file cannot be read, contains no
    data rows, or a requested column does not exist.
    """
    parser = argparse.ArgumentParser(description="Detect outliers in numeric columns of a CSV dataset.")
    parser.add_argument("--file", required=True, help="Path to CSV file")
    parser.add_argument("--method", choices=["iqr", "zscore", "mzscore"], default="iqr",
                        help="Detection method (default: iqr)")
    parser.add_argument("--threshold", type=float, default=None,
                        help="Method threshold (IQR multiplier default 1.5; Z-score default 3.0; mzscore default 3.5)")
    parser.add_argument("--columns", help="Comma-separated columns to check (default: all numeric)")
    parser.add_argument("--format", choices=["text", "json"], default="text")
    args = parser.parse_args()

    # Set default thresholds per method
    if args.threshold is None:
        args.threshold = {"iqr": 1.5, "zscore": 3.0, "mzscore": 3.5}[args.method]

    try:
        headers, rows = load_csv(args.file)
    except FileNotFoundError:
        print(f"Error: file not found: {args.file}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: any other read/parse failure becomes a clean
        # one-line error instead of a traceback.
        print(f"Error reading file: {e}", file=sys.stderr)
        sys.exit(1)

    if not rows:
        print("Error: CSV file is empty.", file=sys.stderr)
        sys.exit(1)

    if args.columns:
        # An exact header match always wins; otherwise surrounding whitespace
        # is dropped so that `--columns "a, b"` finds column "b".
        selected = [name if name in headers else name.strip()
                    for name in args.columns.split(",")]
    else:
        selected = headers

    missing_cols = [c for c in selected if c not in headers]
    if missing_cols:
        print(f"Error: columns not found: {', '.join(missing_cols)}", file=sys.stderr)
        sys.exit(1)

    results = []
    for col in selected:
        # A row shorter than the header stores None for its missing fields;
        # normalise that to "" so it is handled as an ordinary empty cell
        # instead of crashing the null check.
        raw = [row.get(col) or "" for row in rows]
        nums = [n for v in raw if not is_null(v) and (n := to_float(v)) is not None]
        results.append(analyze_column(col, nums, args.method, args.threshold))

    if args.format == "json":
        print(json.dumps(results, indent=2))
    else:
        print_report(results)


if __name__ == "__main__":
    main()