feat(data-analysis): data-quality-auditor
Adds a new data-quality-auditor skill with three stdlib-only Python tools: - data_profiler.py: full dataset profile with DQS (0-100) across 5 dimensions - missing_value_analyzer.py: MCAR/MAR/MNAR classification + imputation strategies - outlier_detector.py: IQR, Z-score, and Modified Z-score (MAD) outlier detection Validator: 86.4/100 (GOOD). Security audit: PASS (0 critical/high). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,242 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
missing_value_analyzer.py — Classify missingness patterns and recommend imputation strategies.
|
||||
|
||||
Usage:
|
||||
python3 missing_value_analyzer.py --file data.csv
|
||||
python3 missing_value_analyzer.py --file data.csv --threshold 0.05
|
||||
python3 missing_value_analyzer.py --file data.csv --format json
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
# Sentinel strings (compared case-insensitively, after stripping) that are
# treated as missing values throughout the analyzer.
NULL_STRINGS = {"", "null", "none", "n/a", "na", "nan", "nil", "undefined", "missing"}
|
||||
|
||||
|
||||
def load_csv(filepath: str) -> tuple[list[str], list[dict]]:
    """Read a CSV file and return (header names, rows-as-dicts).

    The header list is empty when the file has no header row. The file is
    opened as UTF-8 with universal-newline handling appropriate for csv.
    """
    with open(filepath, newline="", encoding="utf-8") as handle:
        parsed = csv.DictReader(handle)
        records = list(parsed)
        column_names = parsed.fieldnames or []
    return column_names, records
|
||||
|
||||
|
||||
def is_null(val: str) -> bool:
    """Return True when *val* is a null-like sentinel (whitespace/case insensitive)."""
    normalized = val.strip().lower()
    return normalized in NULL_STRINGS
|
||||
|
||||
|
||||
def compute_null_mask(headers: list[str], rows: list[dict]) -> dict[str, list[bool]]:
    """Map each column name to a per-row boolean list (True == value is null).

    Rows missing a key entirely are treated as null for that column.
    """
    masks: dict[str, list[bool]] = {}
    for column in headers:
        masks[column] = [is_null(record.get(column, "")) for record in rows]
    return masks
|
||||
|
||||
|
||||
def null_stats(mask: list[bool]) -> dict:
    """Summarize a null mask as {"count": <null rows>, "pct": <percent, 2 dp>}.

    An empty mask yields pct 0 (avoids division by zero).
    """
    size = len(mask)
    nulls = sum(1 for flag in mask if flag)
    pct = round(nulls / size * 100, 2) if size else 0
    return {"count": nulls, "pct": pct}
|
||||
|
||||
|
||||
def classify_mechanism(col: str, mask: list[bool], all_masks: dict[str, list[bool]]) -> str:
    """
    Heuristically label the missingness mechanism of *col*:

    - MCAR: nulls appear randomly, no correlation with other columns
    - MAR: nulls co-occur with nulls in other observed columns
    - MNAR: nulls correlate with the missing column's own unobserved value
      (cannot be fully detected; positional clustering is used as a proxy)

    Returns "None" when the column has no nulls, "Insufficient data" for
    fewer than 10 rows, otherwise one of the MCAR/MAR/MNAR labels.
    """
    missing_at = {idx for idx, flag in enumerate(mask) if flag}
    if not missing_at:
        return "None"

    row_count = len(mask)
    if row_count < 10:
        return "Insufficient data"

    # Columns whose null rows heavily overlap with this column's
    # (Jaccard similarity of the null-index sets above 0.5).
    companions = []
    for name, other in all_masks.items():
        if name == col:
            continue
        other_missing = {idx for idx, flag in enumerate(other) if flag}
        if not other_missing:
            continue
        union_size = len(missing_at | other_missing)
        similarity = len(missing_at & other_missing) / union_size if union_size else 0
        if similarity > 0.5:
            companions.append(name)

    # Positional clustering: mean gap between consecutive null rows much
    # smaller than the gap expected if nulls were spread uniformly.
    ordered = sorted(missing_at)
    clustered = False
    if len(ordered) > 2:
        spacings = [later - earlier for earlier, later in zip(ordered, ordered[1:])]
        mean_spacing = sum(spacings) / len(spacings)
        clustered = mean_spacing < row_count / len(missing_at) * 0.5

    if companions:
        return f"MAR (likely) — co-occurs with nulls in: {', '.join(companions[:3])}"
    if clustered:
        return "MNAR (possible) — nulls are spatially clustered, may reflect a systematic gap"
    return "MCAR (likely) — nulls appear random, no strong correlation detected"
|
||||
|
||||
|
||||
def recommend_strategy(pct: float, col_type: str) -> str:
    """Suggest how to handle a column given its null percentage and inferred type.

    Tiers: 0% → nothing; (0,1)% → drop rows; [1,10)% → type-specific
    imputation; [10,30)% → cautious imputation; >= 30% → escalate.
    """
    if pct == 0:
        return "No action needed"
    if pct < 1:
        return "Drop rows — impact is negligible"
    if pct >= 30:
        return "Do NOT impute blindly — > 30% missing. Escalate to domain owner or consider dropping column"
    if pct >= 10:
        return "Impute cautiously; investigate root cause; document assumption; add indicator"
    by_type = {
        "int": "Impute with median + add binary indicator column",
        "float": "Impute with median + add binary indicator column",
        "string": "Impute with mode or 'Unknown' category + add indicator",
        "bool": "Impute with mode",
    }
    return by_type.get(col_type, "Impute with median/mode + add indicator")
|
||||
|
||||
|
||||
def infer_type(values: list[str]) -> str:
    """Guess a column's type ("int", "float", "bool", or "string") by sampling.

    Only the first 200 non-null values are inspected for speed. Ties resolve
    in the order int > float > bool > string (dict insertion order); an
    all-null column defaults to "string".
    """
    tallies = {"int": 0, "float": 0, "bool": 0, "string": 0}
    sample = [v for v in values if not is_null(v)][:200]
    for raw in sample:
        token = raw.strip()
        if token.lower() in ("true", "false"):
            tallies["bool"] += 1
            continue
        try:
            int(token)
        except ValueError:
            pass
        else:
            tallies["int"] += 1
            continue
        try:
            float(token)
        except ValueError:
            tallies["string"] += 1
        else:
            tallies["float"] += 1
    if not any(tallies.values()):
        return "string"
    return max(tallies, key=lambda kind: tallies[kind])
|
||||
|
||||
|
||||
def compute_cooccurrence(headers: list[str], masks: dict[str, list[bool]], top_n: int = 5) -> list[dict]:
    """Return the top_n column pairs ranked by rows where both columns are null.

    Pairs with zero co-null rows are omitted; ties keep header order.
    """
    names = list(headers)
    pairs: list[dict] = []
    for position, first in enumerate(names):
        for second in names[position + 1:]:
            both_null = sum(1 for x, y in zip(masks[first], masks[second]) if x and y)
            if both_null:
                pairs.append({"col_a": first, "col_b": second, "co_null_rows": both_null})
    pairs.sort(key=lambda item: item["co_null_rows"], reverse=True)
    return pairs[:top_n]
|
||||
|
||||
|
||||
def print_report(headers: list[str], rows: list[dict], masks: dict, threshold: float):
    """Print a human-readable missing-value report to stdout.

    Args:
        headers: Column names in original CSV order.
        rows: Parsed CSV rows (dicts keyed by column name).
        masks: Per-column null masks, as produced by compute_null_mask().
        threshold: Minimum null fraction (0.0-1.0) for a column to appear
            in the per-column breakdown.
    """
    total = len(rows)
    print("=" * 64)
    print("MISSING VALUE ANALYSIS REPORT")
    print("=" * 64)
    print(f"Rows: {total} | Columns: {len(headers)}")

    results = []
    for col in headers:
        mask = masks[col]
        stats = null_stats(mask)
        # Fix: the previous condition (`pct/100 < threshold and count > 0`)
        # never skipped fully complete columns, so zero-null columns were
        # listed under "Columns with missing values", and the threshold
        # filter only applied when nulls existed. Skip a column when it has
        # no nulls at all or its null fraction is below the threshold.
        if stats["count"] == 0 or stats["pct"] / 100 < threshold:
            continue
        raw_vals = [row.get(col, "") for row in rows]
        col_type = infer_type(raw_vals)
        mechanism = classify_mechanism(col, mask, masks)
        strategy = recommend_strategy(stats["pct"], col_type)
        results.append({
            "column": col,
            "null_count": stats["count"],
            "null_pct": stats["pct"],
            "col_type": col_type,
            "mechanism": mechanism,
            "strategy": strategy,
        })

    fully_complete = [col for col in headers if null_stats(masks[col])["count"] == 0]
    print(f"\nFully complete columns: {len(fully_complete)}/{len(headers)}")

    if not results:
        print(f"\nNo columns exceed the null threshold ({threshold * 100:.1f}%).")
    else:
        print(f"\nColumns with missing values (threshold >= {threshold * 100:.1f}%):\n")
        # Worst columns first; marker encodes severity (red > 30%, yellow > 10%).
        for r in sorted(results, key=lambda x: -x["null_pct"]):
            indicator = "🔴" if r["null_pct"] > 30 else ("🟡" if r["null_pct"] > 10 else "🟢")
            print(f" {indicator} {r['column']}")
            print(f" Nulls: {r['null_count']} ({r['null_pct']}%) | Type: {r['col_type']}")
            print(f" Mechanism: {r['mechanism']}")
            print(f" Strategy: {r['strategy']}")
            print()

    cooccur = compute_cooccurrence(headers, masks)
    if cooccur:
        print("-" * 64)
        print("NULL CO-OCCURRENCE (top pairs)")
        print("-" * 64)
        for pair in cooccur:
            print(f" {pair['col_a']} + {pair['col_b']} → {pair['co_null_rows']} rows both null")

    print("\n" + "=" * 64)
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, load the CSV, emit text or JSON output."""
    parser = argparse.ArgumentParser(description="Analyze missing values in a CSV dataset.")
    parser.add_argument("--file", required=True, help="Path to CSV file")
    parser.add_argument("--threshold", type=float, default=0.0,
                        help="Only show columns with null fraction above this (e.g. 0.05 = 5%%)")
    parser.add_argument("--format", choices=["text", "json"], default="text")
    args = parser.parse_args()

    # Load the dataset, turning I/O failures into a clean exit status.
    try:
        headers, rows = load_csv(args.file)
    except FileNotFoundError:
        print(f"Error: file not found: {args.file}", file=sys.stderr)
        sys.exit(1)
    except Exception as exc:
        print(f"Error reading file: {exc}", file=sys.stderr)
        sys.exit(1)

    if not rows:
        print("Error: CSV file is empty.", file=sys.stderr)
        sys.exit(1)

    masks = compute_null_mask(headers, rows)

    if args.format != "json":
        print_report(headers, rows, masks, args.threshold)
        return

    # JSON mode: full per-column analysis for every column (no threshold filter).
    columns = []
    for col in headers:
        stats = null_stats(masks[col])
        col_type = infer_type([row.get(col, "") for row in rows])
        columns.append({
            "column": col,
            "null_count": stats["count"],
            "null_pct": stats["pct"],
            "col_type": col_type,
            "mechanism": classify_mechanism(col, masks[col], masks),
            "strategy": recommend_strategy(stats["pct"], col_type),
        })
    print(json.dumps({"total_rows": len(rows), "columns": columns}, indent=2))
|
||||
|
||||
|
||||
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user