Files
claude-skills-reference/data-analysis/data-quality-auditor/scripts/missing_value_analyzer.py
amitdhanda48 a6e4cdbbeb feat(data-analysis): data-quality-auditor
Adds a new data-quality-auditor skill with three stdlib-only Python tools:
- data_profiler.py: full dataset profile with DQS (0-100) across 5 dimensions
- missing_value_analyzer.py: MCAR/MAR/MNAR classification + imputation strategies
- outlier_detector.py: IQR, Z-score, and Modified Z-score (MAD) outlier detection

Validator: 86.4/100 (GOOD). Security audit: PASS (0 critical/high).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-31 23:14:13 -07:00

243 lines
8.8 KiB
Python

#!/usr/bin/env python3
"""
missing_value_analyzer.py — Classify missingness patterns and recommend imputation strategies.
Usage:
python3 missing_value_analyzer.py --file data.csv
python3 missing_value_analyzer.py --file data.csv --threshold 0.05
python3 missing_value_analyzer.py --file data.csv --format json
"""
import argparse
import csv
import json
import sys
from collections import defaultdict
NULL_STRINGS = {"", "null", "none", "n/a", "na", "nan", "nil", "undefined", "missing"}
def load_csv(filepath: str) -> tuple[list[str], list[dict]]:
with open(filepath, newline="", encoding="utf-8") as f:
reader = csv.DictReader(f)
rows = list(reader)
headers = reader.fieldnames or []
return headers, rows
def is_null(val: str) -> bool:
return val.strip().lower() in NULL_STRINGS
def compute_null_mask(headers: list[str], rows: list[dict]) -> dict[str, list[bool]]:
return {col: [is_null(row.get(col, "")) for row in rows] for col in headers}
def null_stats(mask: list[bool]) -> dict:
total = len(mask)
count = sum(mask)
return {"count": count, "pct": round(count / total * 100, 2) if total else 0}
def classify_mechanism(col: str, mask: list[bool], all_masks: dict[str, list[bool]]) -> str:
"""
Heuristic classification of missingness mechanism:
- MCAR: nulls appear randomly, no correlation with other columns
- MAR: nulls correlate with values in other observed columns
- MNAR: nulls correlate with the missing column's own unobserved value (can't fully detect)
Returns one of: "MCAR (likely)", "MAR (likely)", "MNAR (possible)", "Insufficient data"
"""
null_indices = {i for i, v in enumerate(mask) if v}
if not null_indices:
return "None"
n = len(mask)
if n < 10:
return "Insufficient data"
# Check correlation with other columns' nulls
correlated_cols = []
for other_col, other_mask in all_masks.items():
if other_col == col:
continue
other_null_indices = {i for i, v in enumerate(other_mask) if v}
if not other_null_indices:
continue
overlap = len(null_indices & other_null_indices)
union = len(null_indices | other_null_indices)
jaccard = overlap / union if union else 0
if jaccard > 0.5:
correlated_cols.append(other_col)
# Check if nulls are clustered (time/positional pattern) — proxy for MNAR
sorted_indices = sorted(null_indices)
if len(sorted_indices) > 2:
gaps = [sorted_indices[i + 1] - sorted_indices[i] for i in range(len(sorted_indices) - 1)]
avg_gap = sum(gaps) / len(gaps)
clustered = avg_gap < n / len(null_indices) * 0.5 # nulls appear closer together than random
else:
clustered = False
if correlated_cols:
return f"MAR (likely) — co-occurs with nulls in: {', '.join(correlated_cols[:3])}"
elif clustered:
return "MNAR (possible) — nulls are spatially clustered, may reflect a systematic gap"
else:
return "MCAR (likely) — nulls appear random, no strong correlation detected"
def recommend_strategy(pct: float, col_type: str) -> str:
if pct == 0:
return "No action needed"
if pct < 1:
return "Drop rows — impact is negligible"
if pct < 10:
strategies = {
"int": "Impute with median + add binary indicator column",
"float": "Impute with median + add binary indicator column",
"string": "Impute with mode or 'Unknown' category + add indicator",
"bool": "Impute with mode",
}
return strategies.get(col_type, "Impute with median/mode + add indicator")
if pct < 30:
return "Impute cautiously; investigate root cause; document assumption; add indicator"
return "Do NOT impute blindly — > 30% missing. Escalate to domain owner or consider dropping column"
def infer_type(values: list[str]) -> str:
non_null = [v for v in values if not is_null(v)]
counts = {"int": 0, "float": 0, "bool": 0, "string": 0}
for v in non_null[:200]: # sample for speed
v = v.strip()
if v.lower() in ("true", "false"):
counts["bool"] += 1
else:
try:
int(v)
counts["int"] += 1
except ValueError:
try:
float(v)
counts["float"] += 1
except ValueError:
counts["string"] += 1
return max(counts, key=lambda k: counts[k]) if any(counts.values()) else "string"
def compute_cooccurrence(headers: list[str], masks: dict[str, list[bool]], top_n: int = 5) -> list[dict]:
"""Find column pairs where nulls most frequently co-occur."""
pairs = []
cols = list(headers)
for i in range(len(cols)):
for j in range(i + 1, len(cols)):
a, b = cols[i], cols[j]
mask_a, mask_b = masks[a], masks[b]
overlap = sum(1 for x, y in zip(mask_a, mask_b) if x and y)
if overlap > 0:
pairs.append({"col_a": a, "col_b": b, "co_null_rows": overlap})
pairs.sort(key=lambda x: -x["co_null_rows"])
return pairs[:top_n]
def print_report(headers: list[str], rows: list[dict], masks: dict, threshold: float):
total = len(rows)
print("=" * 64)
print("MISSING VALUE ANALYSIS REPORT")
print("=" * 64)
print(f"Rows: {total} | Columns: {len(headers)}")
results = []
for col in headers:
mask = masks[col]
stats = null_stats(mask)
if stats["pct"] / 100 < threshold and stats["count"] > 0:
continue
raw_vals = [row.get(col, "") for row in rows]
col_type = infer_type(raw_vals)
mechanism = classify_mechanism(col, mask, masks)
strategy = recommend_strategy(stats["pct"], col_type)
results.append({
"column": col,
"null_count": stats["count"],
"null_pct": stats["pct"],
"col_type": col_type,
"mechanism": mechanism,
"strategy": strategy,
})
fully_complete = [col for col in headers if null_stats(masks[col])["count"] == 0]
print(f"\nFully complete columns: {len(fully_complete)}/{len(headers)}")
if not results:
print(f"\nNo columns exceed the null threshold ({threshold * 100:.1f}%).")
else:
print(f"\nColumns with missing values (threshold >= {threshold * 100:.1f}%):\n")
for r in sorted(results, key=lambda x: -x["null_pct"]):
indicator = "🔴" if r["null_pct"] > 30 else ("🟡" if r["null_pct"] > 10 else "🟢")
print(f" {indicator} {r['column']}")
print(f" Nulls: {r['null_count']} ({r['null_pct']}%) | Type: {r['col_type']}")
print(f" Mechanism: {r['mechanism']}")
print(f" Strategy: {r['strategy']}")
print()
cooccur = compute_cooccurrence(headers, masks)
if cooccur:
print("-" * 64)
print("NULL CO-OCCURRENCE (top pairs)")
print("-" * 64)
for pair in cooccur:
print(f" {pair['col_a']} + {pair['col_b']}{pair['co_null_rows']} rows both null")
print("\n" + "=" * 64)
def main():
parser = argparse.ArgumentParser(description="Analyze missing values in a CSV dataset.")
parser.add_argument("--file", required=True, help="Path to CSV file")
parser.add_argument("--threshold", type=float, default=0.0,
help="Only show columns with null fraction above this (e.g. 0.05 = 5%%)")
parser.add_argument("--format", choices=["text", "json"], default="text")
args = parser.parse_args()
try:
headers, rows = load_csv(args.file)
except FileNotFoundError:
print(f"Error: file not found: {args.file}", file=sys.stderr)
sys.exit(1)
except Exception as e:
print(f"Error reading file: {e}", file=sys.stderr)
sys.exit(1)
if not rows:
print("Error: CSV file is empty.", file=sys.stderr)
sys.exit(1)
masks = compute_null_mask(headers, rows)
if args.format == "json":
output = []
for col in headers:
mask = masks[col]
stats = null_stats(mask)
raw_vals = [row.get(col, "") for row in rows]
col_type = infer_type(raw_vals)
mechanism = classify_mechanism(col, mask, masks)
strategy = recommend_strategy(stats["pct"], col_type)
output.append({
"column": col,
"null_count": stats["count"],
"null_pct": stats["pct"],
"col_type": col_type,
"mechanism": mechanism,
"strategy": strategy,
})
print(json.dumps({"total_rows": len(rows), "columns": output}, indent=2))
else:
print_report(headers, rows, masks, args.threshold)
if __name__ == "__main__":
main()