feat(data-analysis): data-quality-auditor
Adds a new data-quality-auditor skill with three stdlib-only Python tools: - data_profiler.py: full dataset profile with DQS (0-100) across 5 dimensions - missing_value_analyzer.py: MCAR/MAR/MNAR classification + imputation strategies - outlier_detector.py: IQR, Z-score, and Modified Z-score (MAD) outlier detection Validator: 86.4/100 (GOOD). Security audit: PASS (0 critical/high). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,242 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
missing_value_analyzer.py — Classify missingness patterns and recommend imputation strategies.
|
||||
|
||||
Usage:
|
||||
python3 missing_value_analyzer.py --file data.csv
|
||||
python3 missing_value_analyzer.py --file data.csv --threshold 0.05
|
||||
python3 missing_value_analyzer.py --file data.csv --format json
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
# Sentinel strings (compared case-insensitively, after stripping) that are
# treated as missing values throughout the analyzer.
NULL_STRINGS = {"", "null", "none", "n/a", "na", "nan", "nil", "undefined", "missing"}
|
||||
|
||||
|
||||
def load_csv(filepath: str) -> tuple[list[str], list[dict]]:
    """Read a CSV file and return (header names, rows-as-dicts).

    The header list is empty when the file has no header row. The file is
    opened as UTF-8 with universal-newline handling appropriate for csv.
    """
    with open(filepath, newline="", encoding="utf-8") as handle:
        parsed = csv.DictReader(handle)
        records = list(parsed)
        column_names = parsed.fieldnames or []
    return column_names, records
|
||||
|
||||
|
||||
def is_null(val: str) -> bool:
    """Return True when *val* is a null-like sentinel (whitespace/case insensitive)."""
    normalized = val.strip().lower()
    return normalized in NULL_STRINGS
|
||||
|
||||
|
||||
def compute_null_mask(headers: list[str], rows: list[dict]) -> dict[str, list[bool]]:
    """Map each column name to a per-row boolean list (True == value is null).

    Rows missing a key entirely are treated as null for that column.
    """
    masks: dict[str, list[bool]] = {}
    for column in headers:
        masks[column] = [is_null(record.get(column, "")) for record in rows]
    return masks
|
||||
|
||||
|
||||
def null_stats(mask: list[bool]) -> dict:
    """Summarize a null mask as {"count": <null rows>, "pct": <percent, 2 dp>}.

    An empty mask yields pct 0 (avoids division by zero).
    """
    size = len(mask)
    nulls = sum(1 for flag in mask if flag)
    pct = round(nulls / size * 100, 2) if size else 0
    return {"count": nulls, "pct": pct}
|
||||
|
||||
|
||||
def classify_mechanism(col: str, mask: list[bool], all_masks: dict[str, list[bool]]) -> str:
    """
    Heuristically label the missingness mechanism of *col*:

    - MCAR: nulls appear randomly, no correlation with other columns
    - MAR: nulls co-occur with nulls in other observed columns
    - MNAR: nulls correlate with the missing column's own unobserved value
      (cannot be fully detected; positional clustering is used as a proxy)

    Returns "None" when the column has no nulls, "Insufficient data" for
    fewer than 10 rows, otherwise one of the MCAR/MAR/MNAR labels.
    """
    missing_at = {idx for idx, flag in enumerate(mask) if flag}
    if not missing_at:
        return "None"

    row_count = len(mask)
    if row_count < 10:
        return "Insufficient data"

    # Columns whose null rows heavily overlap with this column's
    # (Jaccard similarity of the null-index sets above 0.5).
    companions = []
    for name, other in all_masks.items():
        if name == col:
            continue
        other_missing = {idx for idx, flag in enumerate(other) if flag}
        if not other_missing:
            continue
        union_size = len(missing_at | other_missing)
        similarity = len(missing_at & other_missing) / union_size if union_size else 0
        if similarity > 0.5:
            companions.append(name)

    # Positional clustering: mean gap between consecutive null rows much
    # smaller than the gap expected if nulls were spread uniformly.
    ordered = sorted(missing_at)
    clustered = False
    if len(ordered) > 2:
        spacings = [later - earlier for earlier, later in zip(ordered, ordered[1:])]
        mean_spacing = sum(spacings) / len(spacings)
        clustered = mean_spacing < row_count / len(missing_at) * 0.5

    if companions:
        return f"MAR (likely) — co-occurs with nulls in: {', '.join(companions[:3])}"
    if clustered:
        return "MNAR (possible) — nulls are spatially clustered, may reflect a systematic gap"
    return "MCAR (likely) — nulls appear random, no strong correlation detected"
|
||||
|
||||
|
||||
def recommend_strategy(pct: float, col_type: str) -> str:
    """Suggest how to handle a column given its null percentage and inferred type.

    Tiers: 0% → nothing; (0,1)% → drop rows; [1,10)% → type-specific
    imputation; [10,30)% → cautious imputation; >= 30% → escalate.
    """
    if pct == 0:
        return "No action needed"
    if pct < 1:
        return "Drop rows — impact is negligible"
    if pct >= 30:
        return "Do NOT impute blindly — > 30% missing. Escalate to domain owner or consider dropping column"
    if pct >= 10:
        return "Impute cautiously; investigate root cause; document assumption; add indicator"
    by_type = {
        "int": "Impute with median + add binary indicator column",
        "float": "Impute with median + add binary indicator column",
        "string": "Impute with mode or 'Unknown' category + add indicator",
        "bool": "Impute with mode",
    }
    return by_type.get(col_type, "Impute with median/mode + add indicator")
|
||||
|
||||
|
||||
def infer_type(values: list[str]) -> str:
    """Guess a column's type ("int", "float", "bool", or "string") by sampling.

    Only the first 200 non-null values are inspected for speed. Ties resolve
    in the order int > float > bool > string (dict insertion order); an
    all-null column defaults to "string".
    """
    tallies = {"int": 0, "float": 0, "bool": 0, "string": 0}
    sample = [v for v in values if not is_null(v)][:200]
    for raw in sample:
        token = raw.strip()
        if token.lower() in ("true", "false"):
            tallies["bool"] += 1
            continue
        try:
            int(token)
        except ValueError:
            pass
        else:
            tallies["int"] += 1
            continue
        try:
            float(token)
        except ValueError:
            tallies["string"] += 1
        else:
            tallies["float"] += 1
    if not any(tallies.values()):
        return "string"
    return max(tallies, key=lambda kind: tallies[kind])
|
||||
|
||||
|
||||
def compute_cooccurrence(headers: list[str], masks: dict[str, list[bool]], top_n: int = 5) -> list[dict]:
    """Return the top_n column pairs ranked by rows where both columns are null.

    Pairs with zero co-null rows are omitted; ties keep header order.
    """
    names = list(headers)
    pairs: list[dict] = []
    for position, first in enumerate(names):
        for second in names[position + 1:]:
            both_null = sum(1 for x, y in zip(masks[first], masks[second]) if x and y)
            if both_null:
                pairs.append({"col_a": first, "col_b": second, "co_null_rows": both_null})
    pairs.sort(key=lambda item: item["co_null_rows"], reverse=True)
    return pairs[:top_n]
|
||||
|
||||
|
||||
def print_report(headers: list[str], rows: list[dict], masks: dict, threshold: float):
    """Print a human-readable missing-value report to stdout.

    Args:
        headers: Column names in original CSV order.
        rows: Parsed CSV rows (dicts keyed by column name).
        masks: Per-column null masks, as produced by compute_null_mask().
        threshold: Minimum null fraction (0.0-1.0) for a column to appear
            in the per-column breakdown.
    """
    total = len(rows)
    print("=" * 64)
    print("MISSING VALUE ANALYSIS REPORT")
    print("=" * 64)
    print(f"Rows: {total} | Columns: {len(headers)}")

    results = []
    for col in headers:
        mask = masks[col]
        stats = null_stats(mask)
        # Fix: the previous condition (`pct/100 < threshold and count > 0`)
        # never skipped fully complete columns, so zero-null columns were
        # listed under "Columns with missing values", and the threshold
        # filter only applied when nulls existed. Skip a column when it has
        # no nulls at all or its null fraction is below the threshold.
        if stats["count"] == 0 or stats["pct"] / 100 < threshold:
            continue
        raw_vals = [row.get(col, "") for row in rows]
        col_type = infer_type(raw_vals)
        mechanism = classify_mechanism(col, mask, masks)
        strategy = recommend_strategy(stats["pct"], col_type)
        results.append({
            "column": col,
            "null_count": stats["count"],
            "null_pct": stats["pct"],
            "col_type": col_type,
            "mechanism": mechanism,
            "strategy": strategy,
        })

    fully_complete = [col for col in headers if null_stats(masks[col])["count"] == 0]
    print(f"\nFully complete columns: {len(fully_complete)}/{len(headers)}")

    if not results:
        print(f"\nNo columns exceed the null threshold ({threshold * 100:.1f}%).")
    else:
        print(f"\nColumns with missing values (threshold >= {threshold * 100:.1f}%):\n")
        # Worst columns first; marker encodes severity (red > 30%, yellow > 10%).
        for r in sorted(results, key=lambda x: -x["null_pct"]):
            indicator = "🔴" if r["null_pct"] > 30 else ("🟡" if r["null_pct"] > 10 else "🟢")
            print(f" {indicator} {r['column']}")
            print(f" Nulls: {r['null_count']} ({r['null_pct']}%) | Type: {r['col_type']}")
            print(f" Mechanism: {r['mechanism']}")
            print(f" Strategy: {r['strategy']}")
            print()

    cooccur = compute_cooccurrence(headers, masks)
    if cooccur:
        print("-" * 64)
        print("NULL CO-OCCURRENCE (top pairs)")
        print("-" * 64)
        for pair in cooccur:
            print(f" {pair['col_a']} + {pair['col_b']} → {pair['co_null_rows']} rows both null")

    print("\n" + "=" * 64)
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, load the CSV, emit text or JSON output."""
    parser = argparse.ArgumentParser(description="Analyze missing values in a CSV dataset.")
    parser.add_argument("--file", required=True, help="Path to CSV file")
    parser.add_argument("--threshold", type=float, default=0.0,
                        help="Only show columns with null fraction above this (e.g. 0.05 = 5%%)")
    parser.add_argument("--format", choices=["text", "json"], default="text")
    args = parser.parse_args()

    # Load the dataset, turning I/O failures into a clean exit status.
    try:
        headers, rows = load_csv(args.file)
    except FileNotFoundError:
        print(f"Error: file not found: {args.file}", file=sys.stderr)
        sys.exit(1)
    except Exception as exc:
        print(f"Error reading file: {exc}", file=sys.stderr)
        sys.exit(1)

    if not rows:
        print("Error: CSV file is empty.", file=sys.stderr)
        sys.exit(1)

    masks = compute_null_mask(headers, rows)

    if args.format != "json":
        print_report(headers, rows, masks, args.threshold)
        return

    # JSON mode: full per-column analysis for every column (no threshold filter).
    columns = []
    for col in headers:
        stats = null_stats(masks[col])
        col_type = infer_type([row.get(col, "") for row in rows])
        columns.append({
            "column": col,
            "null_count": stats["count"],
            "null_pct": stats["pct"],
            "col_type": col_type,
            "mechanism": classify_mechanism(col, masks[col], masks),
            "strategy": recommend_strategy(stats["pct"], col_type),
        })
    print(json.dumps({"total_rows": len(rows), "columns": columns}, indent=2))
|
||||
|
||||
|
||||
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user