Files
claude-code-skills-reference/claude-export-txt-better/scripts/validate-claude-export-fix.py
daymade 042c837db6 feat(claude-export-txt-better): add Claude Code export file fixer
Add skill to fix broken line wrapping in Claude Code exported .txt files.
Reconstructs tables, paragraphs, paths, and tool calls that were hard-wrapped
at fixed column widths.

Features:
- State-machine parser with next-line look-ahead
- Handles 10 content types (user prompts, Claude responses, tables, tool calls, etc.)
- Pangu spacing for CJK/ASCII mixed text
- 53 automated validation checks
- Safety: never modifies original files, verifies marker counts

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-11 14:02:26 +08:00

313 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""Automated validation for fix-claude-export.py output.
Runs a comprehensive suite of checks against a fixed file and its original,
reporting PASS/FAIL for each with evidence. Designed to be run after every
fix iteration as a quality gate.
Usage:
uv run scripts/validate-claude-export-fix.py <original.txt> <fixed.txt>
uv run scripts/validate-claude-export-fix.py <original.txt> <fixed.txt> --verbose
"""
from __future__ import annotations
import argparse
import re
import sys
import unicodedata
from dataclasses import dataclass, field
from pathlib import Path
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def display_width(s: str) -> int:
    """Return the display width of *s* in columns.

    A character whose East Asian Width property is Wide ("W") or
    Fullwidth ("F") counts as two columns; every other character counts
    as one.
    """
    columns = 0
    for char in s:
        if unicodedata.east_asian_width(char) in ("W", "F"):
            columns += 2
        else:
            columns += 1
    return columns
def is_cjk_ideograph(ch: str) -> bool:
    """Return True when *ch* is a wide/fullwidth "other letter".

    The test is: Unicode general category ``Lo`` and East Asian Width
    ``W`` or ``F``. *ch* must be a single character.
    """
    if unicodedata.category(ch) != "Lo":
        return False
    return unicodedata.east_asian_width(ch) in ("W", "F")
# ---------------------------------------------------------------------------
# Check infrastructure
# ---------------------------------------------------------------------------
@dataclass
class CheckResult:
    """Outcome of a single validation check."""

    # Human-readable title shown in the report.
    name: str
    # True when the check succeeded (or is purely informational).
    passed: bool
    # Evidence text; may span several lines separated by "\n".
    detail: str
    # Report section the check is grouped under; empty means ungrouped.
    category: str = ""
@dataclass
class ValidationReport:
    """Accumulates check results and prints them grouped by category."""

    # Results in the order the checks were recorded.
    results: list[CheckResult] = field(default_factory=list)

    def add(self, name: str, passed: bool, detail: str, category: str = "") -> None:
        """Record the outcome of one check."""
        self.results.append(CheckResult(name, passed, detail, category))

    @property
    def passed(self) -> int:
        """Number of recorded checks that passed."""
        return sum(1 for r in self.results if r.passed)

    @property
    def failed(self) -> int:
        """Number of recorded checks that failed."""
        return sum(1 for r in self.results if not r.passed)

    def print_report(self, verbose: bool = False) -> None:
        """Print every result to stdout, one section per category.

        Results with an empty category are listed under "General". The
        detail text of a check is printed when it failed, or for every
        check when *verbose* is true. A totals line closes the report.
        """
        grouped: dict[str, list[CheckResult]] = {}
        for r in self.results:
            grouped.setdefault(r.category or "General", []).append(r)

        rule = "=" * 60
        for cat, checks in grouped.items():
            print(f"\n{rule}")
            print(f" {cat}")
            print(rule)
            for r in checks:
                # Both status icons were empty strings, which made passing
                # and failing checks indistinguishable in the output. Use
                # explicit text labels, matching the PASS/FAIL wording of
                # the module docstring.
                icon = "[PASS]" if r.passed else "[FAIL]"
                print(f" {icon} {r.name}")
                if verbose or not r.passed:
                    for line in r.detail.split("\n"):
                        print(f" {line}")

        total = len(self.results)
        print(f"\n{rule}")
        print(f" TOTAL: {self.passed}/{total} passed, {self.failed} failed")
        print(rule)
# ---------------------------------------------------------------------------
# Check implementations
# ---------------------------------------------------------------------------
def check_marker_counts(
    orig: str,
    fixed: str,
    report: ValidationReport,
    markers: list[tuple[str, str]] | None = None,
):
    """Verify structural markers are preserved exactly.

    For each ``(marker, name)`` pair, the number of occurrences of
    *marker* in *orig* must equal the number in *fixed*. One result per
    marker is added to *report* under "Structural Integrity".

    Args:
        orig: Text of the original export.
        fixed: Text of the fixed output.
        report: Collector receiving one result per marker.
        markers: Optional override for the marker table.

    Raises:
        ValueError: If a marker is the empty string. ``str.count("")``
            returns ``len(text) + 1``, so such a check would compare
            text lengths rather than marker counts.
    """
    if markers is None:
        # NOTE(review): these glyphs were reconstructed; the literals had
        # degraded to empty strings. Confirm each one against a real
        # export. In particular the action bullet may be rendered as "●"
        # instead of "⏺" on some platforms; pass ``markers=`` to override.
        markers = [
            ("❯", "User prompts"),
            ("⏺", "Claude actions"),
            ("✻", "Stars/crunched"),
            ("⎿", "Tool results"),
            ("…", "Expansion indicators"),
        ]
    for marker, name in markers:
        if not marker:
            raise ValueError(f"empty marker string for {name!r}")
        orig_count = orig.count(marker)
        fixed_count = fixed.count(marker)
        report.add(
            f"Marker {marker} ({name}): {orig_count}",
            orig_count == fixed_count,
            f"orig={orig_count} fixed={fixed_count}",
            "Structural Integrity",
        )
def check_table_borders(fixed: str, report: ValidationReport):
    """Verify table border corners are balanced.

    Adds one informational (always passing) result per corner glyph with
    its occurrence count in *fixed*, followed by one result that passes
    only when the counts of ┌, ┐ and ┘ are all equal.

    The bottom-left corner └ is not counted.
    NOTE(review): presumably because └ also shows up in tree-style
    listings; confirm before adding it to the balance check.
    """
    corners = [("┌", "top-left"), ("┐", "top-right"), ("┘", "bottom-right")]
    # Count each glyph once and reuse the number for both report entries.
    counts = {ch: fixed.count(ch) for ch, _ in corners}
    for ch, name in corners:
        report.add(
            f"Table corner {ch} ({name}): {counts[ch]}",
            True,  # Just record the count
            f"count={counts[ch]}",
            "Structural Integrity",
        )
    tl = counts["┌"]
    tr = counts["┐"]
    br = counts["┘"]
    report.add(
        "Table corners balanced (┌ = ┐ = ┘)",
        tl == tr == br,
        f"┌={tl} ┐={tr} ┘={br}",
        "Structural Integrity",
    )
def check_line_reduction(orig: str, fixed: str, report: ValidationReport):
    """Output should have fewer lines than input (joins happened).

    Compares the newline counts of *orig* and *fixed* and adds a single
    result to *report*; the check passes only when *fixed* has strictly
    fewer newlines. The detail shows the absolute and relative drop.
    """
    orig_lines = orig.count("\n")
    fixed_lines = fixed.count("\n")
    delta = orig_lines - fixed_lines
    # An original without any newline used to raise ZeroDivisionError
    # here; report a 0.0% reduction instead so the check simply fails.
    percent = delta / orig_lines * 100 if orig_lines else 0.0
    report.add(
        # Separate the two counts; fused together they read as one number.
        f"Line reduction: {orig_lines} → {fixed_lines}",
        fixed_lines < orig_lines,
        f"delta={delta} ({percent:.1f}% reduction)",
        "Structural Integrity",
    )
def check_table_border_completeness(fixed: str, report: ValidationReport):
    """Verify table border lines have matching left and right ends.

    A non-blank line whose first non-whitespace character is a left
    border glyph (┌, ├ or └) must contain the corresponding right glyph
    (┐, ┤ or ┘) somewhere on the same line. One result is added to
    *report*; on failure its detail lists up to five offending lines.
    """
    # Left border glyph -> right glyph expected on the same line. The
    # comparison literals had degraded to empty strings, so the first-
    # character test could never match and the check could never fail.
    border_pairs = {"┌": "┐", "├": "┤", "└": "┘"}
    broken = []
    for line_no, line in enumerate(fixed.split("\n"), start=1):
        stripped = line.strip()
        if not stripped:
            continue
        left = stripped[0]
        right = border_pairs.get(left)
        if right is not None and right not in stripped:
            broken.append((line_no, f"{left} without {right}", stripped[:80]))
    report.add(
        f"Table borders complete: {len(broken)} broken",
        len(broken) == 0,
        "\n".join(f" L{ln}: {desc}" for ln, desc, _ in broken[:5])
        if broken else "all borders have matching ends",
        "Structural Integrity",
    )
def check_phase_separation(fixed: str, report: ValidationReport):
    """Verify Phase N: items are on separate lines.

    A line containing two or more "Phase <digits>:" labels counts as a
    violation unless it is exempt: lines containing an arrow (pipeline
    diagrams) and lines that start with an action bullet (status
    updates) are skipped. One result is added to *report*; on failure
    its detail lists up to five offending lines.
    """
    phase_re = re.compile(r"Phase \d+:")
    # NOTE(review): both exemption literals had degraded to empty strings,
    # which made `"" in line` and `startswith("")` always true, so every
    # candidate line was skipped. The arrow follows the comment below and
    # the arrow used elsewhere in this file. The bullet glyphs are a best
    # guess (both common renderings are accepted); confirm against a real
    # export.
    arrow = "→"
    status_bullets = ("⏺", "●")

    multi_phase_lines = []
    for line_no, line in enumerate(fixed.split("\n"), start=1):
        matches = phase_re.findall(line)
        if len(matches) < 2:
            continue
        # Allow pipeline diagrams with arrows (legitimate multi-phase)
        if arrow in line:
            continue
        # Allow status updates like "Phase 3 进度: 3/5"
        if line.strip().startswith(status_bullets):
            continue
        multi_phase_lines.append((line_no, matches, line[:80]))
    report.add(
        "Phase items on separate lines",
        len(multi_phase_lines) == 0,
        f"{len(multi_phase_lines)} violations" + (
            "\n" + "\n".join(f" L{ln}: {m}" for ln, m, _ in multi_phase_lines[:5])
            if multi_phase_lines else ""
        ),
        "Over-Join Prevention",
    )
def check_runaway_joins(fixed: str, report: ValidationReport, max_width: int = 500):
    """Flag lines with very high display width that might be runaway joins.

    Every line of *fixed* whose display width exceeds *max_width* is
    reported, except lines that begin with the user-prompt prefix. One
    result is added to *report*; on failure its detail lists up to five
    offending lines with their width and a 60-character preview.

    Args:
        fixed: Text of the fixed output.
        report: Collector receiving the result.
        max_width: Display-width threshold; wider lines are suspicious.
    """
    # NOTE(review): the prompt glyph was reconstructed. The literal had
    # degraded to a bare space, which exempted every indented line instead
    # of user prompts only. Confirm against a real export.
    user_prompt_prefix = "❯ "
    runaways = []
    for line_no, line in enumerate(fixed.split("\n"), start=1):
        dw = display_width(line)
        if dw <= max_width:
            continue
        # Check if it's a legitimate long line (user prompt)
        if line.startswith(user_prompt_prefix):
            continue
        runaways.append((line_no, dw, line[:60]))
    report.add(
        f"No runaway joins (dw > {max_width}): {len(runaways)} found",
        len(runaways) == 0,
        "\n".join(f" L{ln}: dw={dw} [{preview}...]" for ln, dw, preview in runaways[:5])
        if runaways else "none",
        "Over-Join Prevention",
    )
def check_en_cjk_no_space(fixed: str, report: ValidationReport):
    """Count ASCII-alphanumeric/CJK adjacencies that have no space between.

    The whole of *fixed* is scanned, not just joined lines, so the total
    also covers adjacency already present in the original content, code,
    and so on. The result is therefore informational and always recorded
    as passing.
    """
    ascii_then_cjk = re.compile(r"[a-zA-Z0-9][一-龥]")
    cjk_then_ascii = re.compile(r"[一-龥][a-zA-Z0-9]")

    ac_hits = sum(1 for _ in ascii_then_cjk.finditer(fixed))
    ca_hits = sum(1 for _ in cjk_then_ascii.finditer(fixed))

    title = f"EN-CJK adjacency count: {ac_hits + ca_hits}"
    evidence = f"ASCII→CJK: {ac_hits}, CJK→ASCII: {ca_hits} (includes original content)"
    # Always passes: the number is reported for information only.
    report.add(title, True, evidence, "Pangu Spacing")
def check_diff_lines_separate(fixed: str, report: ValidationReport):
    """Verify diff output lines aren't merged onto one line.

    A line is a violation when it contains two or more three-digit
    numbers that are each followed by at least two whitespace characters
    and an optional ``+``/``-`` sign, i.e. it looks like two numbered diff
    rows joined together. One result is added to *report*; on failure its
    detail lists up to five offending lines.
    """
    diff_number_re = re.compile(r"\b(\d{3})\s{2,}[-+]?\s")

    offenders = []
    line_no = 0
    for text in fixed.split("\n"):
        line_no += 1
        numbers = diff_number_re.findall(text)
        if len(numbers) < 2:
            continue
        offenders.append((line_no, numbers))

    summary = f"{len(offenders)} violations"
    if offenders:
        listing = "\n".join(f" L{ln}: numbers={nums}" for ln, nums in offenders[:5])
        summary += "\n" + listing

    report.add("Diff lines separate", not offenders, summary, "Over-Join Prevention")
def check_cjk_label_separation(fixed: str, report: ValidationReport):
    """Verify CJK label fields (模块:, 输出文件:, 状态:) are on separate lines.

    A label is one to four CJK characters followed by a colon and
    whitespace, located at the start of the stripped line or right after
    whitespace (so mid-sentence colons are not counted). A line holding
    three or more such labels is treated as over-joined fields. One
    result is added to *report*; on failure its detail lists up to five
    offending lines.
    """
    label_re = re.compile(r"(?:^|\s)([\u4e00-\u9fff]{1,4}[:])\s")

    offenders = []
    line_no = 0
    for raw in fixed.split("\n"):
        line_no += 1
        labels = label_re.findall(raw.strip())
        # One or two labels on a line is considered normal.
        if len(labels) < 3:
            continue
        offenders.append((line_no, labels))

    summary = f"{len(offenders)} violations"
    if offenders:
        listing = "\n".join(f" L{ln}: labels={found}" for ln, found in offenders[:5])
        summary += "\n" + listing

    report.add("CJK labels on separate lines", not offenders, summary, "Over-Join Prevention")
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """Command-line entry point.

    Reads the original and fixed files as UTF-8, runs every check, prints
    the report, and exits with status 0 when no check failed, 1 otherwise.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("original", type=Path, help="Original exported file")
    cli.add_argument("fixed", type=Path, help="Fixed output file")
    cli.add_argument("--verbose", "-v", action="store_true", help="Show all details")
    options = cli.parse_args()

    original_text = options.original.read_text(encoding="utf-8")
    fixed_text = options.fixed.read_text(encoding="utf-8")
    report = ValidationReport()

    # Structural integrity
    check_marker_counts(original_text, fixed_text, report)
    check_table_borders(fixed_text, report)
    check_table_border_completeness(fixed_text, report)
    check_line_reduction(original_text, fixed_text, report)

    # Over-join prevention
    check_phase_separation(fixed_text, report)
    check_runaway_joins(fixed_text, report)
    check_diff_lines_separate(fixed_text, report)
    check_cjk_label_separation(fixed_text, report)

    # Informational
    check_en_cjk_no_space(fixed_text, report)

    report.print_report(verbose=options.verbose)
    exit_code = 1 if report.failed else 0
    sys.exit(exit_code)


if __name__ == "__main__":
    main()