Adds a new data-quality-auditor skill with three stdlib-only Python tools:
- data_profiler.py: full dataset profile with DQS (0-100) across 5 dimensions
- missing_value_analyzer.py: MCAR/MAR/MNAR classification + imputation strategies
- outlier_detector.py: IQR, Z-score, and Modified Z-score (MAD) outlier detection
Validator: 86.4/100 (GOOD). Security audit: PASS (0 critical/high).
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

missing_value_analyzer.py — 243 lines, 8.8 KiB, Python
#!/usr/bin/env python3
"""
missing_value_analyzer.py — Classify missingness patterns and recommend imputation strategies.

Usage:
    python3 missing_value_analyzer.py --file data.csv
    python3 missing_value_analyzer.py --file data.csv --threshold 0.05
    python3 missing_value_analyzer.py --file data.csv --format json
"""

import argparse
import csv
import json
import sys
from collections import defaultdict  # NOTE(review): appears unused in this module — confirm before removing

# Case-insensitive sentinel strings (compared after strip/lower) treated as null/missing.
NULL_STRINGS = {"", "null", "none", "n/a", "na", "nan", "nil", "undefined", "missing"}
def load_csv(filepath: str) -> tuple[list[str], list[dict]]:
    """Read a CSV file and return (header names, rows as dicts).

    Rows come from csv.DictReader; headers fall back to an empty list
    when the file has no header row.
    """
    with open(filepath, newline="", encoding="utf-8") as fh:
        reader = csv.DictReader(fh)
        parsed_rows = list(reader)
        header_names = list(reader.fieldnames) if reader.fieldnames else []
    return header_names, parsed_rows
def is_null(val: "str | None") -> bool:
    """Return True if *val* represents a missing value.

    Fix: treat None as null. csv.DictReader fills missing trailing fields
    of short rows with None, which previously crashed `.strip()` with an
    AttributeError when such a value reached this function.
    """
    return val is None or val.strip().lower() in NULL_STRINGS
def compute_null_mask(headers: list[str], rows: list[dict]) -> dict[str, list[bool]]:
    """Map each column name to a per-row boolean mask (True = value is null)."""
    mask_by_col: dict[str, list[bool]] = {}
    for column in headers:
        # Missing keys default to "", which is_null() classifies as null.
        mask_by_col[column] = [is_null(row.get(column, "")) for row in rows]
    return mask_by_col
def null_stats(mask: list[bool]) -> dict:
    """Summarize a null mask as {"count": n_nulls, "pct": percent rounded to 2 dp}.

    An empty mask yields pct 0 to avoid division by zero.
    """
    n_total = len(mask)
    n_null = sum(1 for flag in mask if flag)
    pct = round(n_null / n_total * 100, 2) if n_total else 0
    return {"count": n_null, "pct": pct}
def classify_mechanism(col: str, mask: list[bool], all_masks: dict[str, list[bool]]) -> str:
    """Heuristically classify the missingness mechanism for *col*.

    - MCAR: nulls look random — no strong overlap with other columns' nulls
    - MAR: nulls co-occur with nulls in other observed columns (Jaccard > 0.5)
    - MNAR: nulls are positionally clustered (a weak proxy; true MNAR cannot
      be detected from observed data alone)

    Returns "None" for a fully observed column, "Insufficient data" when
    there are fewer than 10 rows, otherwise one of the labelled strings
    "MCAR (likely) …", "MAR (likely) …", "MNAR (possible) …".
    """
    missing_rows = {idx for idx, flag in enumerate(mask) if flag}
    if not missing_rows:
        return "None"

    row_count = len(mask)
    if row_count < 10:
        return "Insufficient data"

    # Columns whose null pattern strongly overlaps this one (Jaccard > 0.5).
    linked = []
    for name, other_mask in all_masks.items():
        if name == col:
            continue
        other_missing = {idx for idx, flag in enumerate(other_mask) if flag}
        if not other_missing:
            continue
        union_size = len(missing_rows | other_missing)
        similarity = len(missing_rows & other_missing) / union_size if union_size else 0
        if similarity > 0.5:
            linked.append(name)

    # Positional clustering: mean gap between nulls vs. half the expected gap
    # under a uniform spread — used as a weak MNAR signal.
    ordered = sorted(missing_rows)
    clustered = False
    if len(ordered) > 2:
        gaps = [nxt - cur for cur, nxt in zip(ordered, ordered[1:])]
        mean_gap = sum(gaps) / len(gaps)
        clustered = mean_gap < row_count / len(missing_rows) * 0.5

    if linked:
        return f"MAR (likely) — co-occurs with nulls in: {', '.join(linked[:3])}"
    if clustered:
        return "MNAR (possible) — nulls are spatially clustered, may reflect a systematic gap"
    return "MCAR (likely) — nulls appear random, no strong correlation detected"
def recommend_strategy(pct: float, col_type: str) -> str:
    """Recommend an imputation strategy for a column.

    Args:
        pct: null percentage (0-100, as produced by null_stats()["pct"]).
        col_type: inferred type name ("int", "float", "string", "bool", ...).
    """
    if pct == 0:
        return "No action needed"
    if pct < 1:
        # Under 1% missing: row deletion barely affects the dataset.
        return "Drop rows — impact is negligible"
    if pct < 10:
        per_type = {
            "int": "Impute with median + add binary indicator column",
            "float": "Impute with median + add binary indicator column",
            "string": "Impute with mode or 'Unknown' category + add indicator",
            "bool": "Impute with mode",
        }
        return per_type.get(col_type, "Impute with median/mode + add indicator")
    if pct < 30:
        return "Impute cautiously; investigate root cause; document assumption; add indicator"
    # 30%+ missing: imputation would fabricate too much data.
    return "Do NOT impute blindly — > 30% missing. Escalate to domain owner or consider dropping column"
def infer_type(values: list[str]) -> str:
    """Infer a column's dominant type: "int", "float", "bool" or "string".

    Votes over at most the first 200 non-null values (sampled for speed).
    Ties resolve in the order int, float, bool, string; a column with no
    non-null values defaults to "string".
    """
    votes = {"int": 0, "float": 0, "bool": 0, "string": 0}
    sample = [v for v in values if not is_null(v)][:200]
    for raw in sample:
        token = raw.strip()
        if token.lower() in ("true", "false"):
            votes["bool"] += 1
            continue
        try:
            int(token)
        except ValueError:
            pass
        else:
            votes["int"] += 1
            continue
        try:
            float(token)
        except ValueError:
            votes["string"] += 1
        else:
            votes["float"] += 1
    if not any(votes.values()):
        return "string"
    # max() returns the first key with the highest vote count (dict order).
    return max(votes, key=lambda t: votes[t])
def compute_cooccurrence(headers: list[str], masks: dict[str, list[bool]], top_n: int = 5) -> list[dict]:
    """Return up to *top_n* column pairs ranked by rows where both are null.

    Pairs with zero co-occurring nulls are omitted; ordering among ties
    follows the header order (stable sort).
    """
    results = []
    names = list(headers)
    for idx, first in enumerate(names):
        for second in names[idx + 1:]:
            both_null = sum(1 for x, y in zip(masks[first], masks[second]) if x and y)
            if both_null:
                results.append({"col_a": first, "col_b": second, "co_null_rows": both_null})
    results.sort(key=lambda entry: entry["co_null_rows"], reverse=True)
    return results[:top_n]
def print_report(headers: list[str], rows: list[dict], masks: dict, threshold: float):
    """Print a human-readable missing-value analysis report to stdout.

    Args:
        headers: Column names in display order.
        rows: Parsed CSV rows (dicts as produced by csv.DictReader).
        masks: Per-column null masks, as returned by compute_null_mask().
        threshold: Minimum null fraction (0.0-1.0) a column must reach to
            appear in the detail section.
    """
    total = len(rows)
    print("=" * 64)
    print("MISSING VALUE ANALYSIS REPORT")
    print("=" * 64)
    print(f"Rows: {total} | Columns: {len(headers)}")

    results = []
    for col in headers:
        mask = masks[col]
        stats = null_stats(mask)
        # Fix: skip fully complete columns as well as below-threshold ones.
        # The previous filter (`pct < threshold and count > 0`) let zero-null
        # columns fall through and be listed under "Columns with missing values".
        if stats["count"] == 0 or stats["pct"] / 100 < threshold:
            continue
        raw_vals = [row.get(col, "") for row in rows]
        col_type = infer_type(raw_vals)
        mechanism = classify_mechanism(col, mask, masks)
        strategy = recommend_strategy(stats["pct"], col_type)
        results.append({
            "column": col,
            "null_count": stats["count"],
            "null_pct": stats["pct"],
            "col_type": col_type,
            "mechanism": mechanism,
            "strategy": strategy,
        })

    fully_complete = [col for col in headers if null_stats(masks[col])["count"] == 0]
    print(f"\nFully complete columns: {len(fully_complete)}/{len(headers)}")

    if not results:
        print(f"\nNo columns exceed the null threshold ({threshold * 100:.1f}%).")
    else:
        print(f"\nColumns with missing values (threshold >= {threshold * 100:.1f}%):\n")
        # Worst columns first; severity marker: red > 30%, yellow > 10%, else green.
        for r in sorted(results, key=lambda x: -x["null_pct"]):
            indicator = "🔴" if r["null_pct"] > 30 else ("🟡" if r["null_pct"] > 10 else "🟢")
            print(f" {indicator} {r['column']}")
            print(f" Nulls: {r['null_count']} ({r['null_pct']}%) | Type: {r['col_type']}")
            print(f" Mechanism: {r['mechanism']}")
            print(f" Strategy: {r['strategy']}")
            print()

    cooccur = compute_cooccurrence(headers, masks)
    if cooccur:
        print("-" * 64)
        print("NULL CO-OCCURRENCE (top pairs)")
        print("-" * 64)
        for pair in cooccur:
            print(f" {pair['col_a']} + {pair['col_b']} → {pair['co_null_rows']} rows both null")

    print("\n" + "=" * 64)
def main():
    """CLI entry point: parse arguments, load the CSV, emit text or JSON output."""
    parser = argparse.ArgumentParser(description="Analyze missing values in a CSV dataset.")
    parser.add_argument("--file", required=True, help="Path to CSV file")
    parser.add_argument("--threshold", type=float, default=0.0,
                        help="Only show columns with null fraction above this (e.g. 0.05 = 5%%)")
    parser.add_argument("--format", choices=["text", "json"], default="text")
    args = parser.parse_args()

    # Load the dataset; exit with status 1 on any read failure.
    try:
        headers, rows = load_csv(args.file)
    except FileNotFoundError:
        print(f"Error: file not found: {args.file}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Error reading file: {e}", file=sys.stderr)
        sys.exit(1)

    if not rows:
        print("Error: CSV file is empty.", file=sys.stderr)
        sys.exit(1)

    masks = compute_null_mask(headers, rows)

    if args.format == "json":
        # JSON mode always includes every column; --threshold applies to text mode only.
        summaries = []
        for col in headers:
            stats = null_stats(masks[col])
            col_type = infer_type([row.get(col, "") for row in rows])
            summaries.append({
                "column": col,
                "null_count": stats["count"],
                "null_pct": stats["pct"],
                "col_type": col_type,
                "mechanism": classify_mechanism(col, masks[col], masks),
                "strategy": recommend_strategy(stats["pct"], col_type),
            })
        print(json.dumps({"total_rows": len(rows), "columns": summaries}, indent=2))
    else:
        print_report(headers, rows, masks, args.threshold)
# Run the CLI only when executed as a script (not on import).
if __name__ == "__main__":
    main()