Adds a new data-quality-auditor skill with three stdlib-only Python tools:
- data_profiler.py: full dataset profile with DQS (0-100) across 5 dimensions
- missing_value_analyzer.py: MCAR/MAR/MNAR classification + imputation strategies
- outlier_detector.py: IQR, Z-score, and Modified Z-score (MAD) outlier detection
Validator: 86.4/100 (GOOD). Security audit: PASS (0 critical/high).
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

missing_value_analyzer.py — 243 lines, 8.8 KiB, Python
#!/usr/bin/env python3
"""
missing_value_analyzer.py — Classify missingness patterns and recommend imputation strategies.

Usage:
    python3 missing_value_analyzer.py --file data.csv
    python3 missing_value_analyzer.py --file data.csv --threshold 0.05
    python3 missing_value_analyzer.py --file data.csv --format json
"""

import argparse
import csv
import json
import sys
from collections import defaultdict  # NOTE(review): appears unused in this module — confirm before removing

# Case-insensitive sentinel strings (compared after strip/lower) treated as null/missing.
NULL_STRINGS = {"", "null", "none", "n/a", "na", "nan", "nil", "undefined", "missing"}
def load_csv(filepath: str) -> tuple[list[str], list[dict]]:
    """Read a CSV file and return (header names, rows as dicts).

    Rows come from csv.DictReader; headers fall back to an empty list
    when the file has no header row.
    """
    with open(filepath, newline="", encoding="utf-8") as fh:
        reader = csv.DictReader(fh)
        parsed_rows = list(reader)
        header_names = list(reader.fieldnames) if reader.fieldnames else []
    return header_names, parsed_rows
def is_null(val: "str | None") -> bool:
    """Return True if *val* represents a missing value.

    Fix: treat None as null. csv.DictReader fills missing trailing fields
    of short rows with None, which previously crashed `.strip()` with an
    AttributeError when such a value reached this function.
    """
    return val is None or val.strip().lower() in NULL_STRINGS
def compute_null_mask(headers: list[str], rows: list[dict]) -> dict[str, list[bool]]:
    """Map each column name to a per-row boolean mask (True = value is null)."""
    mask_by_col: dict[str, list[bool]] = {}
    for column in headers:
        # Missing keys default to "", which is_null() classifies as null.
        mask_by_col[column] = [is_null(row.get(column, "")) for row in rows]
    return mask_by_col
def null_stats(mask: list[bool]) -> dict:
    """Summarize a null mask as {"count": n_nulls, "pct": percent rounded to 2 dp}.

    An empty mask yields pct 0 to avoid division by zero.
    """
    n_total = len(mask)
    n_null = sum(1 for flag in mask if flag)
    pct = round(n_null / n_total * 100, 2) if n_total else 0
    return {"count": n_null, "pct": pct}
def classify_mechanism(col: str, mask: list[bool], all_masks: dict[str, list[bool]]) -> str:
    """Heuristically classify the missingness mechanism for *col*.

    - MCAR: nulls look random — no strong overlap with other columns' nulls
    - MAR: nulls co-occur with nulls in other observed columns (Jaccard > 0.5)
    - MNAR: nulls are positionally clustered (a weak proxy; true MNAR cannot
      be detected from observed data alone)

    Returns "None" for a fully observed column, "Insufficient data" when
    there are fewer than 10 rows, otherwise one of the labelled strings
    "MCAR (likely) …", "MAR (likely) …", "MNAR (possible) …".
    """
    missing_rows = {idx for idx, flag in enumerate(mask) if flag}
    if not missing_rows:
        return "None"

    row_count = len(mask)
    if row_count < 10:
        return "Insufficient data"

    # Columns whose null pattern strongly overlaps this one (Jaccard > 0.5).
    linked = []
    for name, other_mask in all_masks.items():
        if name == col:
            continue
        other_missing = {idx for idx, flag in enumerate(other_mask) if flag}
        if not other_missing:
            continue
        union_size = len(missing_rows | other_missing)
        similarity = len(missing_rows & other_missing) / union_size if union_size else 0
        if similarity > 0.5:
            linked.append(name)

    # Positional clustering: mean gap between nulls vs. half the expected gap
    # under a uniform spread — used as a weak MNAR signal.
    ordered = sorted(missing_rows)
    clustered = False
    if len(ordered) > 2:
        gaps = [nxt - cur for cur, nxt in zip(ordered, ordered[1:])]
        mean_gap = sum(gaps) / len(gaps)
        clustered = mean_gap < row_count / len(missing_rows) * 0.5

    if linked:
        return f"MAR (likely) — co-occurs with nulls in: {', '.join(linked[:3])}"
    if clustered:
        return "MNAR (possible) — nulls are spatially clustered, may reflect a systematic gap"
    return "MCAR (likely) — nulls appear random, no strong correlation detected"
def recommend_strategy(pct: float, col_type: str) -> str:
    """Recommend an imputation strategy for a column.

    Args:
        pct: null percentage (0-100, as produced by null_stats()["pct"]).
        col_type: inferred type name ("int", "float", "string", "bool", ...).
    """
    if pct == 0:
        return "No action needed"
    if pct < 1:
        # Under 1% missing: row deletion barely affects the dataset.
        return "Drop rows — impact is negligible"
    if pct < 10:
        per_type = {
            "int": "Impute with median + add binary indicator column",
            "float": "Impute with median + add binary indicator column",
            "string": "Impute with mode or 'Unknown' category + add indicator",
            "bool": "Impute with mode",
        }
        return per_type.get(col_type, "Impute with median/mode + add indicator")
    if pct < 30:
        return "Impute cautiously; investigate root cause; document assumption; add indicator"
    # 30%+ missing: imputation would fabricate too much data.
    return "Do NOT impute blindly — > 30% missing. Escalate to domain owner or consider dropping column"
def infer_type(values: list[str]) -> str:
    """Infer a column's dominant type: "int", "float", "bool" or "string".

    Votes over at most the first 200 non-null values (sampled for speed).
    Ties resolve in the order int, float, bool, string; a column with no
    non-null values defaults to "string".
    """
    votes = {"int": 0, "float": 0, "bool": 0, "string": 0}
    sample = [v for v in values if not is_null(v)][:200]
    for raw in sample:
        token = raw.strip()
        if token.lower() in ("true", "false"):
            votes["bool"] += 1
            continue
        try:
            int(token)
        except ValueError:
            pass
        else:
            votes["int"] += 1
            continue
        try:
            float(token)
        except ValueError:
            votes["string"] += 1
        else:
            votes["float"] += 1
    if not any(votes.values()):
        return "string"
    # max() returns the first key with the highest vote count (dict order).
    return max(votes, key=lambda t: votes[t])
def compute_cooccurrence(headers: list[str], masks: dict[str, list[bool]], top_n: int = 5) -> list[dict]:
    """Return up to *top_n* column pairs ranked by rows where both are null.

    Pairs with zero co-occurring nulls are omitted; ordering among ties
    follows the header order (stable sort).
    """
    results = []
    names = list(headers)
    for idx, first in enumerate(names):
        for second in names[idx + 1:]:
            both_null = sum(1 for x, y in zip(masks[first], masks[second]) if x and y)
            if both_null:
                results.append({"col_a": first, "col_b": second, "co_null_rows": both_null})
    results.sort(key=lambda entry: entry["co_null_rows"], reverse=True)
    return results[:top_n]
def print_report(headers: list[str], rows: list[dict], masks: dict, threshold: float):
    """Print a human-readable missing-value analysis report to stdout.

    Args:
        headers: Column names in display order.
        rows: Parsed CSV rows (dicts as produced by csv.DictReader).
        masks: Per-column null masks, as returned by compute_null_mask().
        threshold: Minimum null fraction (0.0-1.0) a column must reach to
            appear in the detail section.
    """
    total = len(rows)
    print("=" * 64)
    print("MISSING VALUE ANALYSIS REPORT")
    print("=" * 64)
    print(f"Rows: {total} | Columns: {len(headers)}")

    results = []
    for col in headers:
        mask = masks[col]
        stats = null_stats(mask)
        # Fix: skip fully complete columns as well as below-threshold ones.
        # The previous filter (`pct < threshold and count > 0`) let zero-null
        # columns fall through and be listed under "Columns with missing values".
        if stats["count"] == 0 or stats["pct"] / 100 < threshold:
            continue
        raw_vals = [row.get(col, "") for row in rows]
        col_type = infer_type(raw_vals)
        mechanism = classify_mechanism(col, mask, masks)
        strategy = recommend_strategy(stats["pct"], col_type)
        results.append({
            "column": col,
            "null_count": stats["count"],
            "null_pct": stats["pct"],
            "col_type": col_type,
            "mechanism": mechanism,
            "strategy": strategy,
        })

    fully_complete = [col for col in headers if null_stats(masks[col])["count"] == 0]
    print(f"\nFully complete columns: {len(fully_complete)}/{len(headers)}")

    if not results:
        print(f"\nNo columns exceed the null threshold ({threshold * 100:.1f}%).")
    else:
        print(f"\nColumns with missing values (threshold >= {threshold * 100:.1f}%):\n")
        # Worst columns first; severity marker: red > 30%, yellow > 10%, else green.
        for r in sorted(results, key=lambda x: -x["null_pct"]):
            indicator = "🔴" if r["null_pct"] > 30 else ("🟡" if r["null_pct"] > 10 else "🟢")
            print(f" {indicator} {r['column']}")
            print(f" Nulls: {r['null_count']} ({r['null_pct']}%) | Type: {r['col_type']}")
            print(f" Mechanism: {r['mechanism']}")
            print(f" Strategy: {r['strategy']}")
            print()

    cooccur = compute_cooccurrence(headers, masks)
    if cooccur:
        print("-" * 64)
        print("NULL CO-OCCURRENCE (top pairs)")
        print("-" * 64)
        for pair in cooccur:
            print(f" {pair['col_a']} + {pair['col_b']} → {pair['co_null_rows']} rows both null")

    print("\n" + "=" * 64)
def main():
    """CLI entry point: parse arguments, load the CSV, emit text or JSON output."""
    parser = argparse.ArgumentParser(description="Analyze missing values in a CSV dataset.")
    parser.add_argument("--file", required=True, help="Path to CSV file")
    parser.add_argument("--threshold", type=float, default=0.0,
                        help="Only show columns with null fraction above this (e.g. 0.05 = 5%%)")
    parser.add_argument("--format", choices=["text", "json"], default="text")
    args = parser.parse_args()

    # Load the dataset; exit with status 1 on any read failure.
    try:
        headers, rows = load_csv(args.file)
    except FileNotFoundError:
        print(f"Error: file not found: {args.file}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Error reading file: {e}", file=sys.stderr)
        sys.exit(1)

    if not rows:
        print("Error: CSV file is empty.", file=sys.stderr)
        sys.exit(1)

    masks = compute_null_mask(headers, rows)

    if args.format == "json":
        # JSON mode always includes every column; --threshold applies to text mode only.
        summaries = []
        for col in headers:
            stats = null_stats(masks[col])
            col_type = infer_type([row.get(col, "") for row in rows])
            summaries.append({
                "column": col,
                "null_count": stats["count"],
                "null_pct": stats["pct"],
                "col_type": col_type,
                "mechanism": classify_mechanism(col, masks[col], masks),
                "strategy": recommend_strategy(stats["pct"], col_type),
            })
        print(json.dumps({"total_rows": len(rows), "columns": summaries}, indent=2))
    else:
        print_report(headers, rows, masks, args.threshold)
# Run the CLI only when executed as a script (not on import).
if __name__ == "__main__":
    main()