#!/usr/bin/env python3
"""
Quality validator for document-to-markdown conversion.

Compares the original document with the converted markdown to assess
conversion quality, and generates an HTML quality report with detailed metrics.

Usage:
    uv run --with pymupdf scripts/validate_output.py document.pdf output.md
    uv run --with pymupdf scripts/validate_output.py document.pdf output.md --report report.html
    uv run --with pymupdf scripts/validate_output.py document.pdf output.md --json
"""
import argparse
import html
import re
import subprocess
import sys
from dataclasses import dataclass, field
from pathlib import Path

@dataclass
class ValidationMetrics:
    """Quality metrics for conversion validation."""
    # Text metrics
    source_char_count: int = 0
    output_char_count: int = 0
    text_retention: float = 0.0

    # Table metrics
    source_table_count: int = 0
    output_table_count: int = 0
    table_retention: float = 0.0

    # Image metrics
    source_image_count: int = 0
    output_image_count: int = 0
    image_retention: float = 0.0

    # Structure metrics
    heading_count: int = 0
    list_count: int = 0
    code_block_count: int = 0

    # Quality scores
    overall_score: float = 0.0
    status: str = "unknown"  # pass, warn, fail

    # Details
    warnings: list[str] = field(default_factory=list)
    errors: list[str] = field(default_factory=list)

def extract_text_from_pdf(pdf_path: Path) -> tuple[str, int, int]:
    """Extract text, table count, and image count from a PDF."""
    try:
        import fitz  # PyMuPDF

        doc = fitz.open(str(pdf_path))
        text_parts = []
        table_count = 0
        image_count = 0

        for page in doc:
            page_text = page.get_text()
            text_parts.append(page_text)
            # Count images
            image_count += len(page.get_images())
            # Estimate tables by looking for grid-like structures.
            # This is approximate - tables are hard to detect in PDFs.
            if re.search(r'(\t.*){2,}', page_text) or '│' in page_text:
                table_count += 1

        doc.close()
        return '\n'.join(text_parts), table_count, image_count

    except ImportError:
        # Fall back to pdftotext if available
        try:
            result = subprocess.run(
                ['pdftotext', '-layout', str(pdf_path), '-'],
                capture_output=True,
                text=True,
                timeout=60
            )
            if result.returncode != 0:
                return "", 0, 0
            return result.stdout, 0, 0  # Can't count tables/images
        except Exception:
            return "", 0, 0

def extract_text_from_docx(docx_path: Path) -> tuple[str, int, int]:
    """Extract text, table count, and image count from a DOCX."""
    try:
        import zipfile
        from xml.etree import ElementTree as ET

        with zipfile.ZipFile(docx_path, 'r') as z:
            # The main document body lives in word/document.xml
            if 'word/document.xml' not in z.namelist():
                return "", 0, 0

            with z.open('word/document.xml') as f:
                tree = ET.parse(f)
                root = tree.getroot()

            # Extract text from all w:t (text run) elements
            ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
            text_parts = []
            for t in root.iter(f"{{{ns['w']}}}t"):
                if t.text:
                    text_parts.append(t.text)

            # Count tables
            table_count = len(root.findall('.//w:tbl', ns))

            # Count images: every file under word/media/ (an approximation)
            image_count = sum(1 for name in z.namelist()
                              if name.startswith('word/media/'))

            return ' '.join(text_parts), table_count, image_count

    except Exception:
        return "", 0, 0

def analyze_markdown(md_path: Path) -> dict:
    """Analyze markdown file structure and content."""
    content = md_path.read_text(encoding='utf-8')

    # Count tables: consecutive pipe-delimited lines group into one table
    table_count = 0
    in_table = False
    for line in content.split('\n'):
        if re.match(r'^\s*\|.*\|', line):
            if not in_table:
                table_count += 1
            in_table = True
        else:
            in_table = False

    # Count images
    images = re.findall(r'!\[.*?\]\(.*?\)', content)

    # Count headings
    headings = re.findall(r'^#{1,6}\s+.+$', content, re.MULTILINE)

    # Count list items (unordered and ordered)
    list_items = re.findall(r'^\s*[-*+]\s+', content, re.MULTILINE)
    list_items += re.findall(r'^\s*\d+\.\s+', content, re.MULTILINE)

    # Count code fences (each block has an opening and a closing fence)
    fence_count = len(re.findall(r'```', content))

    # Strip markdown syntax so the character count reflects prose only
    clean_text = re.sub(r'```.*?```', '', content, flags=re.DOTALL)
    clean_text = re.sub(r'!\[.*?\]\(.*?\)', '', clean_text)
    clean_text = re.sub(r'\[.*?\]\(.*?\)', '', clean_text)
    clean_text = re.sub(r'[#*_`|>-]', '', clean_text)
    clean_text = re.sub(r'\s+', ' ', clean_text).strip()

    return {
        # Whitespace is excluded so this count is comparable with the
        # whitespace-stripped source counts in validate_conversion()
        'char_count': len(re.sub(r'\s', '', clean_text)),
        'table_count': table_count,
        'image_count': len(images),
        'heading_count': len(headings),
        'list_count': len(list_items),
        'code_block_count': fence_count // 2,
        'raw_content': content,
        'clean_text': clean_text
    }
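
# Illustration of the table-grouping rule above (made-up snippet): these three
# consecutive pipe-delimited lines count as a single table, because only the
# first line of a pipe run increments table_count.
#
#   | Name | Qty |
#   |------|-----|
#   | Foo  | 2   |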

def validate_conversion(
    source_path: Path,
    output_path: Path
) -> ValidationMetrics:
    """Validate conversion quality by comparing source and output."""
    metrics = ValidationMetrics()

    # Analyze output markdown
    md_analysis = analyze_markdown(output_path)
    metrics.output_char_count = md_analysis['char_count']
    metrics.output_table_count = md_analysis['table_count']
    metrics.output_image_count = md_analysis['image_count']
    metrics.heading_count = md_analysis['heading_count']
    metrics.list_count = md_analysis['list_count']
    metrics.code_block_count = md_analysis['code_block_count']

    # Extract source content based on file type
    ext = source_path.suffix.lower()
    if ext == '.pdf':
        source_text, source_tables, source_images = extract_text_from_pdf(source_path)
    elif ext in ['.docx', '.doc']:
        source_text, source_tables, source_images = extract_text_from_docx(source_path)
    else:
        # Other source formats are not analyzed; record a warning instead
        source_text = ""
        source_tables = 0
        source_images = 0
        metrics.warnings.append(f"Cannot analyze source format: {ext}")

    metrics.source_char_count = len(re.sub(r'\s', '', source_text))
    metrics.source_table_count = source_tables
    metrics.source_image_count = source_images

    # Calculate retention rates as the ratio of actual to expected, capped at 1.0
    if metrics.source_char_count > 0:
        metrics.text_retention = min(
            metrics.output_char_count / metrics.source_char_count,
            1.0
        )
    else:
        metrics.text_retention = 1.0 if metrics.output_char_count > 0 else 0.0

    if metrics.source_table_count > 0:
        metrics.table_retention = min(
            metrics.output_table_count / metrics.source_table_count,
            1.0
        )
    else:
        metrics.table_retention = 1.0  # No tables expected

    if metrics.source_image_count > 0:
        metrics.image_retention = min(
            metrics.output_image_count / metrics.source_image_count,
            1.0
        )
    else:
        metrics.image_retention = 1.0  # No images expected

    # Flag issues based on thresholds
    if metrics.text_retention < 0.85:
        metrics.errors.append(f"Low text retention: {metrics.text_retention:.1%}")
    elif metrics.text_retention < 0.95:
        metrics.warnings.append(f"Text retention below optimal: {metrics.text_retention:.1%}")

    if metrics.source_table_count > 0 and metrics.table_retention < 0.9:
        metrics.errors.append(f"Tables missing: {metrics.table_retention:.1%} retained")
    elif metrics.source_table_count > 0 and metrics.table_retention < 1.0:
        metrics.warnings.append(f"Some tables may be incomplete: {metrics.table_retention:.1%}")

    if metrics.source_image_count > 0 and metrics.image_retention < 0.8:
        metrics.errors.append(f"Images missing: {metrics.image_retention:.1%} retained")
    elif metrics.source_image_count > 0 and metrics.image_retention < 1.0:
        metrics.warnings.append(f"Some images missing: {metrics.image_retention:.1%}")
    # Calculate overall score (0-100): text weighted 50%, tables and images 25% each.
    # Retention values are already in [0, 1], so no further scaling is needed.
    metrics.overall_score = (
        metrics.text_retention * 50 +
        metrics.table_retention * 25 +
        metrics.image_retention * 25
    )
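    # Worked example of the weighting (illustrative values): with text 0.96,
    # tables 1.00, and images 0.50 retained,
    #   score = 0.96*50 + 1.00*25 + 0.50*25 = 48 + 25 + 12.5 = 85.5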

    # Determine status
    if metrics.errors:
        metrics.status = "fail"
    elif metrics.warnings:
        metrics.status = "warn"
    else:
        metrics.status = "pass"

    return metrics

def generate_html_report(
    metrics: ValidationMetrics,
    source_path: Path,
    output_path: Path
) -> str:
    """Generate an HTML quality report."""
    status_colors = {
        "pass": "#28a745",
        "warn": "#ffc107",
        "fail": "#dc3545"
    }
    status_color = status_colors.get(metrics.status, "#6c757d")

    def metric_bar(value: float, thresholds: tuple[float, float]) -> str:
        """Generate a colored progress bar."""
        pct = int(value * 100)
        if value >= thresholds[0]:
            color = "#28a745"  # green
        elif value >= thresholds[1]:
            color = "#ffc107"  # yellow
        else:
            color = "#dc3545"  # red
        return f'''
        <div style="background: #e9ecef; border-radius: 4px; overflow: hidden; height: 20px;">
            <div style="background: {color}; width: {pct}%; height: 100%; transition: width 0.3s;"></div>
        </div>
        <span style="font-size: 14px; color: #666;">{pct}%</span>
        '''
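
    # Threshold semantics (illustrative): metric_bar(0.90, (0.95, 0.85)) renders
    # a yellow bar, since 0.85 <= 0.90 < 0.95; green at >= 0.95, red below 0.85.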

    report = f'''<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Conversion Quality Report</title>
<style>
    body {{ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; margin: 40px; background: #f5f5f5; }}
    .container {{ max-width: 800px; margin: 0 auto; background: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }}
    h1 {{ color: #333; border-bottom: 2px solid #eee; padding-bottom: 15px; }}
    .status {{ display: inline-block; padding: 8px 16px; border-radius: 4px; color: white; font-weight: bold; }}
    .metric {{ margin: 20px 0; padding: 15px; background: #f8f9fa; border-radius: 4px; }}
    .metric-label {{ font-weight: bold; color: #333; margin-bottom: 8px; }}
    .metric-value {{ font-size: 24px; color: #333; }}
    .issues {{ margin-top: 20px; }}
    .error {{ background: #f8d7da; color: #721c24; padding: 10px; margin: 5px 0; border-radius: 4px; }}
    .warning {{ background: #fff3cd; color: #856404; padding: 10px; margin: 5px 0; border-radius: 4px; }}
    table {{ width: 100%; border-collapse: collapse; margin: 15px 0; }}
    th, td {{ padding: 10px; text-align: left; border-bottom: 1px solid #eee; }}
    th {{ background: #f8f9fa; }}
    .score {{ font-size: 48px; font-weight: bold; color: {status_color}; }}
</style>
</head>
<body>
<div class="container">
    <h1>📊 Conversion Quality Report</h1>

    <div style="text-align: center; margin: 30px 0;">
        <div class="score">{metrics.overall_score:.0f}</div>
        <div style="color: #666;">Overall Score</div>
        <div class="status" style="background: {status_color}; margin-top: 10px;">
            {metrics.status.upper()}
        </div>
    </div>

    <h2>📄 File Information</h2>
    <table>
        <tr><th>Source</th><td>{html.escape(str(source_path))}</td></tr>
        <tr><th>Output</th><td>{html.escape(str(output_path))}</td></tr>
    </table>

    <h2>📏 Retention Metrics</h2>

    <div class="metric">
        <div class="metric-label">Text Retention (target: >95%)</div>
        {metric_bar(metrics.text_retention, (0.95, 0.85))}
        <div style="font-size: 12px; color: #666; margin-top: 5px;">
            Source: ~{metrics.source_char_count:,} chars | Output: {metrics.output_char_count:,} chars
        </div>
    </div>

    <div class="metric">
        <div class="metric-label">Table Retention (target: 100%)</div>
        {metric_bar(metrics.table_retention, (1.0, 0.9))}
        <div style="font-size: 12px; color: #666; margin-top: 5px;">
            Source: {metrics.source_table_count} tables | Output: {metrics.output_table_count} tables
        </div>
    </div>

    <div class="metric">
        <div class="metric-label">Image Retention (target: 100%)</div>
        {metric_bar(metrics.image_retention, (1.0, 0.8))}
        <div style="font-size: 12px; color: #666; margin-top: 5px;">
            Source: {metrics.source_image_count} images | Output: {metrics.output_image_count} images
        </div>
    </div>

    <h2>📊 Structure Analysis</h2>
    <table>
        <tr><th>Headings</th><td>{metrics.heading_count}</td></tr>
        <tr><th>List Items</th><td>{metrics.list_count}</td></tr>
        <tr><th>Code Blocks</th><td>{metrics.code_block_count}</td></tr>
    </table>

    {'<h2>⚠️ Issues</h2><div class="issues">' + ''.join(f'<div class="error">❌ {html.escape(e)}</div>' for e in metrics.errors) + ''.join(f'<div class="warning">⚠️ {html.escape(w)}</div>' for w in metrics.warnings) + '</div>' if metrics.errors or metrics.warnings else ''}

    <div style="margin-top: 30px; padding-top: 20px; border-top: 1px solid #eee; color: #666; font-size: 12px;">
        Generated by markdown-tools validate_output.py
    </div>
</div>
</body>
</html>
'''
    return report

def main():
    parser = argparse.ArgumentParser(
        description="Validate document-to-markdown conversion quality"
    )
    parser.add_argument(
        "source",
        type=Path,
        help="Original document (PDF, DOCX, etc.)"
    )
    parser.add_argument(
        "output",
        type=Path,
        help="Converted markdown file"
    )
    parser.add_argument(
        "--report",
        type=Path,
        help="Generate HTML report at this path"
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="Output metrics as JSON"
    )

    args = parser.parse_args()

    # Validate inputs
    if not args.source.exists():
        print(f"Error: Source file not found: {args.source}", file=sys.stderr)
        sys.exit(1)
    if not args.output.exists():
        print(f"Error: Output file not found: {args.output}", file=sys.stderr)
        sys.exit(1)

    # Run validation
    metrics = validate_conversion(args.source, args.output)

    # Output results
    if args.json:
        import json
        print(json.dumps({
            'text_retention': metrics.text_retention,
            'table_retention': metrics.table_retention,
            'image_retention': metrics.image_retention,
            'overall_score': metrics.overall_score,
            'status': metrics.status,
            'warnings': metrics.warnings,
            'errors': metrics.errors
        }, indent=2))
    else:
        # Console output
        status_emoji = {"pass": "✅", "warn": "⚠️", "fail": "❌"}.get(metrics.status, "❓")
        print(f"\n{status_emoji} Conversion Quality: {metrics.status.upper()}")
        print(f"   Overall Score: {metrics.overall_score:.0f}/100")
        print(f"\n   Text Retention:  {metrics.text_retention:.1%}")
        print(f"   Table Retention: {metrics.table_retention:.1%}")
        print(f"   Image Retention: {metrics.image_retention:.1%}")

        if metrics.errors:
            print("\n   Errors:")
            for e in metrics.errors:
                print(f"   ❌ {e}")

        if metrics.warnings:
            print("\n   Warnings:")
            for w in metrics.warnings:
                print(f"   ⚠️ {w}")

    # Generate HTML report
    if args.report:
        report_html = generate_html_report(metrics, args.source, args.output)
        args.report.parent.mkdir(parents=True, exist_ok=True)
        args.report.write_text(report_html)
        print(f"\n📊 HTML report: {args.report}")

    # Exit with appropriate code
    sys.exit(0 if metrics.status != "fail" else 1)
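
# Exit code contract: 0 for "pass" or "warn", 1 for "fail", so the script can
# gate CI pipelines. Illustrative invocation (paths are placeholders):
#
#   uv run --with pymupdf scripts/validate_output.py report.pdf report.md --json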

if __name__ == "__main__":
    main()