Add multi-tool orchestration for best-quality document conversion: - Dual mode: Quick (fast) and Heavy (best quality, multi-tool merge) - New convert.py - main orchestrator with tool selection matrix - New merge_outputs.py - segment-level multi-tool output merger - New validate_output.py - quality validation with HTML reports - Enhanced extract_pdf_images.py - metadata (page, position, dimensions) - PyMuPDF4LLM integration for LLM-optimized PDF conversion - pandoc integration for DOCX/PPTX structure preservation - Quality metrics: text/table/image retention with pass/warn/fail - New references: heavy-mode-guide.md, tool-comparison.md Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
435 lines
13 KiB
Python
Executable File
435 lines
13 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Multi-tool document to markdown converter with intelligent orchestration.
|
|
|
|
Supports Quick Mode (fast, single tool) and Heavy Mode (best quality, multi-tool merge).
|
|
|
|
Usage:
|
|
# Quick Mode (default) - fast, single best tool
|
|
uv run --with pymupdf4llm --with markitdown scripts/convert.py document.pdf -o output.md
|
|
|
|
# Heavy Mode - multi-tool parallel execution with merge
|
|
uv run --with pymupdf4llm --with markitdown scripts/convert.py document.pdf -o output.md --heavy
|
|
|
|
# With image extraction
|
|
uv run --with pymupdf4llm scripts/convert.py document.pdf -o output.md --assets-dir ./images
|
|
|
|
Dependencies:
|
|
- pymupdf4llm: PDF conversion (LLM-optimized)
|
|
- markitdown: PDF/DOCX/PPTX conversion
|
|
- pandoc: DOCX/PPTX conversion (system install: brew install pandoc)
|
|
"""
|
|
|
|
import argparse
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
import shutil
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
|
|
@dataclass
|
|
class ConversionResult:
|
|
"""Result from a single tool conversion."""
|
|
markdown: str
|
|
tool: str
|
|
images: list[str] = field(default_factory=list)
|
|
success: bool = True
|
|
error: str = ""
|
|
|
|
|
|
def check_tool_available(tool: str) -> bool:
|
|
"""Check if a conversion tool is available."""
|
|
if tool == "pymupdf4llm":
|
|
try:
|
|
import pymupdf4llm
|
|
return True
|
|
except ImportError:
|
|
return False
|
|
elif tool == "markitdown":
|
|
try:
|
|
import markitdown
|
|
return True
|
|
except ImportError:
|
|
return False
|
|
elif tool == "pandoc":
|
|
return shutil.which("pandoc") is not None
|
|
return False
|
|
|
|
|
|
def select_tools(file_path: Path, mode: str) -> list[str]:
|
|
"""Select conversion tools based on file type and mode."""
|
|
ext = file_path.suffix.lower()
|
|
|
|
# Tool preferences by format
|
|
tool_map = {
|
|
".pdf": {
|
|
"quick": ["pymupdf4llm", "markitdown"], # fallback order
|
|
"heavy": ["pymupdf4llm", "markitdown"],
|
|
},
|
|
".docx": {
|
|
"quick": ["pandoc", "markitdown"],
|
|
"heavy": ["pandoc", "markitdown"],
|
|
},
|
|
".doc": {
|
|
"quick": ["pandoc", "markitdown"],
|
|
"heavy": ["pandoc", "markitdown"],
|
|
},
|
|
".pptx": {
|
|
"quick": ["markitdown", "pandoc"],
|
|
"heavy": ["markitdown", "pandoc"],
|
|
},
|
|
".xlsx": {
|
|
"quick": ["markitdown"],
|
|
"heavy": ["markitdown"],
|
|
},
|
|
}
|
|
|
|
tools = tool_map.get(ext, {"quick": ["markitdown"], "heavy": ["markitdown"]})
|
|
|
|
if mode == "quick":
|
|
# Return first available tool
|
|
for tool in tools["quick"]:
|
|
if check_tool_available(tool):
|
|
return [tool]
|
|
return []
|
|
else: # heavy
|
|
# Return all available tools
|
|
return [t for t in tools["heavy"] if check_tool_available(t)]
|
|
|
|
|
|
def convert_with_pymupdf4llm(
|
|
file_path: Path, assets_dir: Optional[Path] = None
|
|
) -> ConversionResult:
|
|
"""Convert using PyMuPDF4LLM (best for PDFs)."""
|
|
try:
|
|
import pymupdf4llm
|
|
|
|
kwargs = {}
|
|
images = []
|
|
|
|
if assets_dir:
|
|
assets_dir.mkdir(parents=True, exist_ok=True)
|
|
kwargs["write_images"] = True
|
|
kwargs["image_path"] = str(assets_dir)
|
|
kwargs["dpi"] = 150
|
|
|
|
# Use best table detection strategy
|
|
kwargs["table_strategy"] = "lines_strict"
|
|
|
|
md_text = pymupdf4llm.to_markdown(str(file_path), **kwargs)
|
|
|
|
# Collect extracted images
|
|
if assets_dir and assets_dir.exists():
|
|
images = [str(p) for p in assets_dir.glob("*.png")]
|
|
images.extend([str(p) for p in assets_dir.glob("*.jpg")])
|
|
|
|
return ConversionResult(
|
|
markdown=md_text, tool="pymupdf4llm", images=images, success=True
|
|
)
|
|
except Exception as e:
|
|
return ConversionResult(
|
|
markdown="", tool="pymupdf4llm", success=False, error=str(e)
|
|
)
|
|
|
|
|
|
def convert_with_markitdown(
|
|
file_path: Path, assets_dir: Optional[Path] = None
|
|
) -> ConversionResult:
|
|
"""Convert using markitdown."""
|
|
try:
|
|
# markitdown CLI approach
|
|
result = subprocess.run(
|
|
["markitdown", str(file_path)],
|
|
capture_output=True,
|
|
text=True,
|
|
timeout=120,
|
|
)
|
|
|
|
if result.returncode != 0:
|
|
return ConversionResult(
|
|
markdown="",
|
|
tool="markitdown",
|
|
success=False,
|
|
error=result.stderr,
|
|
)
|
|
|
|
return ConversionResult(
|
|
markdown=result.stdout, tool="markitdown", success=True
|
|
)
|
|
except FileNotFoundError:
|
|
# Try Python API
|
|
try:
|
|
from markitdown import MarkItDown
|
|
|
|
md = MarkItDown()
|
|
result = md.convert(str(file_path))
|
|
return ConversionResult(
|
|
markdown=result.text_content, tool="markitdown", success=True
|
|
)
|
|
except Exception as e:
|
|
return ConversionResult(
|
|
markdown="", tool="markitdown", success=False, error=str(e)
|
|
)
|
|
except Exception as e:
|
|
return ConversionResult(
|
|
markdown="", tool="markitdown", success=False, error=str(e)
|
|
)
|
|
|
|
|
|
def convert_with_pandoc(
|
|
file_path: Path, assets_dir: Optional[Path] = None
|
|
) -> ConversionResult:
|
|
"""Convert using pandoc."""
|
|
try:
|
|
cmd = ["pandoc", str(file_path), "-t", "markdown", "--wrap=none"]
|
|
|
|
if assets_dir:
|
|
assets_dir.mkdir(parents=True, exist_ok=True)
|
|
cmd.extend(["--extract-media", str(assets_dir)])
|
|
|
|
result = subprocess.run(
|
|
cmd, capture_output=True, text=True, timeout=120
|
|
)
|
|
|
|
if result.returncode != 0:
|
|
return ConversionResult(
|
|
markdown="", tool="pandoc", success=False, error=result.stderr
|
|
)
|
|
|
|
images = []
|
|
if assets_dir and assets_dir.exists():
|
|
images = [str(p) for p in assets_dir.rglob("*.png")]
|
|
images.extend([str(p) for p in assets_dir.rglob("*.jpg")])
|
|
|
|
return ConversionResult(
|
|
markdown=result.stdout, tool="pandoc", images=images, success=True
|
|
)
|
|
except Exception as e:
|
|
return ConversionResult(
|
|
markdown="", tool="pandoc", success=False, error=str(e)
|
|
)
|
|
|
|
|
|
def convert_single(
|
|
file_path: Path, tool: str, assets_dir: Optional[Path] = None
|
|
) -> ConversionResult:
|
|
"""Run a single conversion tool."""
|
|
converters = {
|
|
"pymupdf4llm": convert_with_pymupdf4llm,
|
|
"markitdown": convert_with_markitdown,
|
|
"pandoc": convert_with_pandoc,
|
|
}
|
|
|
|
converter = converters.get(tool)
|
|
if not converter:
|
|
return ConversionResult(
|
|
markdown="", tool=tool, success=False, error=f"Unknown tool: {tool}"
|
|
)
|
|
|
|
return converter(file_path, assets_dir)
|
|
|
|
|
|
def merge_results(results: list[ConversionResult]) -> ConversionResult:
|
|
"""Merge results from multiple tools, selecting best segments."""
|
|
if not results:
|
|
return ConversionResult(markdown="", tool="none", success=False)
|
|
|
|
# Filter successful results
|
|
successful = [r for r in results if r.success and r.markdown.strip()]
|
|
if not successful:
|
|
# Return first error
|
|
return results[0] if results else ConversionResult(
|
|
markdown="", tool="none", success=False
|
|
)
|
|
|
|
if len(successful) == 1:
|
|
return successful[0]
|
|
|
|
# Multiple successful results - merge them
|
|
# Strategy: Compare key metrics and select best
|
|
best = successful[0]
|
|
best_score = score_markdown(best.markdown)
|
|
|
|
for result in successful[1:]:
|
|
score = score_markdown(result.markdown)
|
|
if score > best_score:
|
|
best = result
|
|
best_score = score
|
|
|
|
# Merge images from all results
|
|
all_images = []
|
|
seen = set()
|
|
for result in successful:
|
|
for img in result.images:
|
|
if img not in seen:
|
|
all_images.append(img)
|
|
seen.add(img)
|
|
|
|
best.images = all_images
|
|
best.tool = f"merged({','.join(r.tool for r in successful)})"
|
|
|
|
return best
|
|
|
|
|
|
def score_markdown(md: str) -> float:
|
|
"""Score markdown quality for comparison."""
|
|
score = 0.0
|
|
|
|
# Length (more content is generally better)
|
|
score += min(len(md) / 10000, 5.0) # Cap at 5 points
|
|
|
|
# Tables (proper markdown tables)
|
|
table_count = md.count("|---|") + md.count("| ---")
|
|
score += min(table_count * 0.5, 3.0)
|
|
|
|
# Images (referenced images)
|
|
image_count = md.count("![")
|
|
score += min(image_count * 0.3, 2.0)
|
|
|
|
# Headings (proper hierarchy)
|
|
h1_count = md.count("\n# ")
|
|
h2_count = md.count("\n## ")
|
|
h3_count = md.count("\n### ")
|
|
if h1_count > 0 and h2_count >= h1_count:
|
|
score += 1.0 # Good hierarchy
|
|
|
|
# Lists (structured content)
|
|
list_count = md.count("\n- ") + md.count("\n* ") + md.count("\n1. ")
|
|
score += min(list_count * 0.1, 2.0)
|
|
|
|
return score
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(
|
|
description="Convert documents to markdown with multi-tool orchestration",
|
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
epilog="""
|
|
Examples:
|
|
# Quick mode (default)
|
|
python convert.py document.pdf -o output.md
|
|
|
|
# Heavy mode (best quality)
|
|
python convert.py document.pdf -o output.md --heavy
|
|
|
|
# With custom assets directory
|
|
python convert.py document.pdf -o output.md --assets-dir ./images
|
|
""",
|
|
)
|
|
parser.add_argument("input", type=Path, help="Input document path")
|
|
parser.add_argument(
|
|
"-o", "--output", type=Path, help="Output markdown file"
|
|
)
|
|
parser.add_argument(
|
|
"--heavy",
|
|
action="store_true",
|
|
help="Enable Heavy Mode (multi-tool, best quality)",
|
|
)
|
|
parser.add_argument(
|
|
"--assets-dir",
|
|
type=Path,
|
|
default=None,
|
|
help="Directory for extracted images (default: <output>_assets/)",
|
|
)
|
|
parser.add_argument(
|
|
"--tool",
|
|
choices=["pymupdf4llm", "markitdown", "pandoc"],
|
|
help="Force specific tool (overrides auto-selection)",
|
|
)
|
|
parser.add_argument(
|
|
"--list-tools",
|
|
action="store_true",
|
|
help="List available tools and exit",
|
|
)
|
|
|
|
args = parser.parse_args()
|
|
|
|
# List tools mode
|
|
if args.list_tools:
|
|
tools = ["pymupdf4llm", "markitdown", "pandoc"]
|
|
print("Available conversion tools:")
|
|
for tool in tools:
|
|
status = "✓" if check_tool_available(tool) else "✗"
|
|
print(f" {status} {tool}")
|
|
sys.exit(0)
|
|
|
|
# Validate input
|
|
if not args.input.exists():
|
|
print(f"Error: Input file not found: {args.input}", file=sys.stderr)
|
|
sys.exit(1)
|
|
|
|
# Determine output path
|
|
output_path = args.output or args.input.with_suffix(".md")
|
|
|
|
# Determine assets directory
|
|
assets_dir = args.assets_dir
|
|
if assets_dir is None and args.heavy:
|
|
assets_dir = output_path.parent / f"{output_path.stem}_assets"
|
|
|
|
# Select tools
|
|
mode = "heavy" if args.heavy else "quick"
|
|
if args.tool:
|
|
tools = [args.tool] if check_tool_available(args.tool) else []
|
|
else:
|
|
tools = select_tools(args.input, mode)
|
|
|
|
if not tools:
|
|
print("Error: No conversion tools available.", file=sys.stderr)
|
|
print("Install with:", file=sys.stderr)
|
|
print(" pip install pymupdf4llm", file=sys.stderr)
|
|
print(" uv tool install markitdown[pdf]", file=sys.stderr)
|
|
print(" brew install pandoc", file=sys.stderr)
|
|
sys.exit(1)
|
|
|
|
print(f"Converting: {args.input}")
|
|
print(f"Mode: {mode.upper()}")
|
|
print(f"Tools: {', '.join(tools)}")
|
|
|
|
# Run conversions
|
|
results = []
|
|
for tool in tools:
|
|
print(f" Running {tool}...", end=" ", flush=True)
|
|
|
|
# Use separate assets dirs for each tool in heavy mode
|
|
tool_assets = None
|
|
if assets_dir and mode == "heavy" and len(tools) > 1:
|
|
tool_assets = assets_dir / tool
|
|
elif assets_dir:
|
|
tool_assets = assets_dir
|
|
|
|
result = convert_single(args.input, tool, tool_assets)
|
|
results.append(result)
|
|
|
|
if result.success:
|
|
print(f"✓ ({len(result.markdown):,} chars, {len(result.images)} images)")
|
|
else:
|
|
print(f"✗ ({result.error[:50]}...)")
|
|
|
|
# Merge results if heavy mode
|
|
if mode == "heavy" and len(results) > 1:
|
|
print(" Merging results...", end=" ", flush=True)
|
|
final = merge_results(results)
|
|
print(f"✓ (using {final.tool})")
|
|
else:
|
|
final = merge_results(results)
|
|
|
|
if not final.success:
|
|
print(f"Error: Conversion failed: {final.error}", file=sys.stderr)
|
|
sys.exit(1)
|
|
|
|
# Write output
|
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
output_path.write_text(final.markdown)
|
|
|
|
print(f"\nOutput: {output_path}")
|
|
print(f" Size: {len(final.markdown):,} characters")
|
|
if final.images:
|
|
print(f" Images: {len(final.images)} extracted")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|