Release v1.27.0: Enhance markdown-tools with Heavy Mode

Add multi-tool orchestration for best-quality document conversion: - Dual mode: Quick (fast) and Heavy (best quality, multi-tool merge) - New convert.py - main orchestrator with tool selection matrix - New merge_outputs.py - segment-level multi-tool output merger - New validate_output.py - quality validation with HTML reports - Enhanced extract_pdf_images.py - metadata (page, position, dimensions) - PyMuPDF4LLM integration for LLM-optimized PDF conversion - pandoc integration for DOCX/PPTX structure preservation - Quality metrics: text/table/image retention with pass/warn/fail - New references: heavy-mode-guide.md, tool-comparison.md Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-25 21:36:08 +08:00
parent 114c355aa8
commit 3f15b8942c
10 changed files with 2009 additions and 89 deletions
--- a/markdown-tools/scripts/convert.py
+++ b/markdown-tools/scripts/convert.py
@@ -0,0 +1,434 @@
+#!/usr/bin/env python3
+"""
+Multi-tool document to markdown converter with intelligent orchestration.
+
+Supports Quick Mode (fast, single tool) and Heavy Mode (best quality, multi-tool merge).
+
+Usage:
+    # Quick Mode (default) - fast, single best tool
+    uv run --with pymupdf4llm --with markitdown scripts/convert.py document.pdf -o output.md
+
+    # Heavy Mode - runs every available tool in turn, then merges the results
+    uv run --with pymupdf4llm --with markitdown scripts/convert.py document.pdf -o output.md --heavy
+
+    # With image extraction
+    uv run --with pymupdf4llm scripts/convert.py document.pdf -o output.md --assets-dir ./images
+
+Dependencies:
+    - pymupdf4llm: PDF conversion (LLM-optimized)
+    - markitdown: PDF/DOCX/PPTX conversion
+    - pandoc: DOCX/PPTX conversion (system install: brew install pandoc)
+"""
+
+import argparse
+import subprocess
+import sys
+import tempfile
+import shutil
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+
+
+@dataclass
+class ConversionResult:
+    """Result from a single tool conversion.
+
+    The converter functions in this module return one of these rather than
+    raising; a failed conversion is signalled with ``success=False``.
+    """
+    # Markdown text produced by the tool ("" when the conversion failed).
+    markdown: str
+    # Name of the tool that produced this result (e.g. "pandoc");
+    # merge_results() may replace it with a "merged(...)" label.
+    tool: str
+    # Paths (as strings) of image files collected from the assets directory.
+    images: list[str] = field(default_factory=list)
+    # False when the tool raised or exited with a non-zero status.
+    success: bool = True
+    # Exception message or the tool's stderr when success is False.
+    error: str = ""
+
+
+def check_tool_available(tool: str) -> bool:
+    """Check if a conversion tool is available.
+
+    Args:
+        tool: Tool name: "pymupdf4llm", "markitdown" or "pandoc".
+
+    Returns:
+        True if the tool can be used from this process, False otherwise
+        (including for unknown tool names).
+    """
+    if tool == "pandoc":
+        # pandoc is only ever invoked as an external executable.
+        return shutil.which("pandoc") is not None
+
+    if tool == "markitdown" and shutil.which("markitdown") is not None:
+        # convert_with_markitdown() tries the CLI first, so a CLI-only
+        # install (e.g. `uv tool install markitdown`) is usable even when
+        # the module cannot be imported from this interpreter.
+        return True
+
+    if tool in ("pymupdf4llm", "markitdown"):
+        # Library-backed tools: available when the module imports cleanly.
+        try:
+            __import__(tool)
+        except ImportError:
+            return False
+        return True
+
+    return False
+
+
+def select_tools(file_path: Path, mode: str) -> list[str]:
+    """Choose which conversion tools to run for a file.
+
+    Args:
+        file_path: Input document; only its (case-insensitive) suffix is used.
+        mode: "quick" to get at most one tool; any other value is treated as
+            heavy mode and yields every available tool.
+
+    Returns:
+        Tool names in preference order, limited to tools for which
+        check_tool_available() is true. May be empty.
+    """
+    pdf_order = ["pymupdf4llm", "markitdown"]
+    word_order = ["pandoc", "markitdown"]
+
+    # Preference order per extension; quick and heavy mode share the same
+    # ordering and differ only in how many tools are returned.
+    preferences = {
+        ".pdf": pdf_order,
+        ".docx": word_order,
+        ".doc": word_order,
+        ".pptx": ["markitdown", "pandoc"],
+        ".xlsx": ["markitdown"],
+    }
+
+    # Unknown extensions fall back to markitdown only.
+    candidates = preferences.get(file_path.suffix.lower(), ["markitdown"])
+
+    if mode == "quick":
+        # Stop probing at the first tool that is installed.
+        return next(
+            ([name] for name in candidates if check_tool_available(name)), []
+        )
+
+    return [name for name in candidates if check_tool_available(name)]
+
+
+def convert_with_pymupdf4llm(
+    file_path: Path, assets_dir: Optional[Path] = None
+) -> ConversionResult:
+    """Convert a document to markdown with PyMuPDF4LLM.
+
+    Args:
+        file_path: Document to convert.
+        assets_dir: If given, the directory is created and image writing is
+            enabled with this directory as the image path.
+
+    Returns:
+        A ConversionResult. Any exception (including pymupdf4llm not being
+        importable) is captured as ``success=False`` with the message in
+        ``error``.
+    """
+    try:
+        import pymupdf4llm
+
+        # Use best table detection strategy
+        options = {"table_strategy": "lines_strict"}
+
+        if assets_dir:
+            assets_dir.mkdir(parents=True, exist_ok=True)
+            options.update(
+                write_images=True, image_path=str(assets_dir), dpi=150
+            )
+
+        text = pymupdf4llm.to_markdown(str(file_path), **options)
+
+        # Every PNG/JPG directly inside assets_dir is reported, which also
+        # includes files that were already there before this run.
+        extracted = []
+        if assets_dir and assets_dir.exists():
+            for pattern in ("*.png", "*.jpg"):
+                extracted.extend(str(p) for p in assets_dir.glob(pattern))
+
+        return ConversionResult(
+            markdown=text, tool="pymupdf4llm", images=extracted, success=True
+        )
+    except Exception as exc:
+        return ConversionResult(
+            markdown="", tool="pymupdf4llm", success=False, error=str(exc)
+        )
+
+
+def convert_with_markitdown(
+    file_path: Path, assets_dir: Optional[Path] = None
+) -> ConversionResult:
+    """Convert a document to markdown with markitdown.
+
+    The ``markitdown`` command-line program is tried first (120 s timeout).
+    If that executable cannot be found, the ``markitdown`` Python API is
+    used instead.
+
+    Args:
+        file_path: Document to convert.
+        assets_dir: Accepted for signature parity with the other converters;
+            not used here.
+
+    Returns:
+        A ConversionResult; failures are reported via ``success=False``.
+    """
+
+    def _failure(message: str) -> ConversionResult:
+        # All error paths share the same shape apart from the message.
+        return ConversionResult(
+            markdown="", tool="markitdown", success=False, error=message
+        )
+
+    try:
+        # markitdown CLI approach
+        proc = subprocess.run(
+            ["markitdown", str(file_path)],
+            capture_output=True,
+            text=True,
+            timeout=120,
+        )
+    except FileNotFoundError:
+        # No CLI on PATH: fall back to the Python API.
+        try:
+            from markitdown import MarkItDown
+
+            converted = MarkItDown().convert(str(file_path))
+            return ConversionResult(
+                markdown=converted.text_content,
+                tool="markitdown",
+                success=True,
+            )
+        except Exception as exc:
+            return _failure(str(exc))
+    except Exception as exc:
+        return _failure(str(exc))
+
+    if proc.returncode != 0:
+        return _failure(proc.stderr)
+
+    return ConversionResult(
+        markdown=proc.stdout, tool="markitdown", success=True
+    )
+
+
+def convert_with_pandoc(
+    file_path: Path, assets_dir: Optional[Path] = None
+) -> ConversionResult:
+    """Convert a document to markdown by running the pandoc executable.
+
+    Args:
+        file_path: Document to convert.
+        assets_dir: If given, the directory is created and passed to pandoc
+            via ``--extract-media``.
+
+    Returns:
+        A ConversionResult. A non-zero exit status yields ``success=False``
+        with pandoc's stderr as the error; any exception is captured the
+        same way with its message.
+    """
+    try:
+        media_args = []
+        if assets_dir:
+            assets_dir.mkdir(parents=True, exist_ok=True)
+            media_args = ["--extract-media", str(assets_dir)]
+
+        command = [
+            "pandoc", str(file_path), "-t", "markdown", "--wrap=none"
+        ] + media_args
+
+        proc = subprocess.run(
+            command, capture_output=True, text=True, timeout=120
+        )
+
+        if proc.returncode != 0:
+            return ConversionResult(
+                markdown="", tool="pandoc", success=False, error=proc.stderr
+            )
+
+        # Search recursively for PNG/JPG files anywhere under assets_dir.
+        extracted = []
+        if assets_dir and assets_dir.exists():
+            for pattern in ("*.png", "*.jpg"):
+                extracted.extend(str(p) for p in assets_dir.rglob(pattern))
+
+        return ConversionResult(
+            markdown=proc.stdout, tool="pandoc", images=extracted, success=True
+        )
+    except Exception as exc:
+        return ConversionResult(
+            markdown="", tool="pandoc", success=False, error=str(exc)
+        )
+
+
+def convert_single(
+    file_path: Path, tool: str, assets_dir: Optional[Path] = None
+) -> ConversionResult:
+    """Dispatch a conversion to the function that implements ``tool``.
+
+    Args:
+        file_path: Document to convert.
+        tool: "pymupdf4llm", "markitdown" or "pandoc".
+        assets_dir: Optional image directory, forwarded unchanged.
+
+    Returns:
+        The chosen converter's ConversionResult, or a failed result with an
+        "Unknown tool" error when ``tool`` is not one of the names above.
+    """
+    if tool == "pymupdf4llm":
+        handler = convert_with_pymupdf4llm
+    elif tool == "markitdown":
+        handler = convert_with_markitdown
+    elif tool == "pandoc":
+        handler = convert_with_pandoc
+    else:
+        return ConversionResult(
+            markdown="", tool=tool, success=False, error=f"Unknown tool: {tool}"
+        )
+
+    return handler(file_path, assets_dir)
+
+
+def merge_results(results: list[ConversionResult]) -> ConversionResult:
+    """Pick the best conversion result and attach images from all tools.
+
+    Selection is per document, not per segment: each usable result is
+    scored with score_markdown() and the highest-scoring one wins (the
+    earliest result wins ties).
+
+    Args:
+        results: Results from the individual tools, in the order they ran.
+
+    Returns:
+        - a failed result with tool "none" when ``results`` is empty;
+        - ``results[0]`` when no result is both successful and non-blank
+          (this may be a failure or a successful-but-empty conversion);
+        - the single usable result, unchanged, when there is exactly one;
+        - otherwise a new ConversionResult copied from the best-scoring
+          one, with ``images`` set to the de-duplicated images of all usable
+          results and ``tool`` set to ``"merged(<tool>,<tool>,...)"``.
+          The input objects are not modified.
+    """
+    # Local import: the module header only imports dataclass and field.
+    from dataclasses import replace
+
+    if not results:
+        return ConversionResult(markdown="", tool="none", success=False)
+
+    # A result is usable only if the tool succeeded and produced some text.
+    usable = [r for r in results if r.success and r.markdown.strip()]
+    if not usable:
+        return results[0]
+
+    if len(usable) == 1:
+        return usable[0]
+
+    # max() keeps the first of equally scored results, so earlier
+    # (preferred) tools win ties.
+    best = max(usable, key=lambda r: score_markdown(r.markdown))
+
+    # Order-preserving de-duplication of image paths across all tools.
+    all_images = list(
+        dict.fromkeys(img for r in usable for img in r.images)
+    )
+
+    # Return a copy so the caller's per-tool results keep their own
+    # tool name and image list.
+    return replace(
+        best,
+        images=all_images,
+        tool=f"merged({','.join(r.tool for r in usable)})",
+    )
+
+
+def score_markdown(md: str) -> float:
+    """Score markdown quality for comparison.
+
+    The score is a sum of capped heuristics, so it is only meaningful when
+    comparing conversions of the same document:
+
+    - length: 1 point per 10,000 characters, capped at 5;
+    - tables: 0.5 per table-separator match, capped at 3;
+    - images: 0.3 per ``![`` occurrence, capped at 2;
+    - headings: 1 point if there is at least one ``#`` heading and at
+      least as many ``##`` headings;
+    - lists: 0.1 per list item (``- ``, ``* `` or ``1. ``), capped at 2.
+
+    Args:
+        md: Markdown text to score.
+
+    Returns:
+        The heuristic score; 0.0 for an empty string.
+    """
+    score = 0.0
+
+    # Line-start markers are searched as "\n<marker>"; prefixing a newline
+    # lets a heading or list item on the very first line match as well.
+    lines_view = "\n" + md
+
+    # Length (more content is generally better)
+    score += min(len(md) / 10000, 5.0)  # Cap at 5 points
+
+    # Tables (proper markdown tables)
+    table_count = md.count("|---|") + md.count("| ---")
+    score += min(table_count * 0.5, 3.0)
+
+    # Images (referenced images)
+    image_count = md.count("![")
+    score += min(image_count * 0.3, 2.0)
+
+    # Headings (proper hierarchy)
+    h1_count = lines_view.count("\n# ")
+    h2_count = lines_view.count("\n## ")
+    if h1_count > 0 and h2_count >= h1_count:
+        score += 1.0  # Good hierarchy
+
+    # Lists (structured content)
+    list_count = (
+        lines_view.count("\n- ")
+        + lines_view.count("\n* ")
+        + lines_view.count("\n1. ")
+    )
+    score += min(list_count * 0.1, 2.0)
+
+    return score
+
+
+def main():
+    """Command-line entry point: parse arguments, convert, write the output.
+
+    Exits with status 0 after ``--list-tools``, and with status 1 when the
+    input file is missing, no conversion tool is available, or the final
+    conversion result is a failure.
+    """
+    parser = argparse.ArgumentParser(
+        description="Convert documents to markdown with multi-tool orchestration",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+    # Quick mode (default)
+    python convert.py document.pdf -o output.md
+
+    # Heavy mode (best quality)
+    python convert.py document.pdf -o output.md --heavy
+
+    # With custom assets directory
+    python convert.py document.pdf -o output.md --assets-dir ./images
+        """,
+    )
+    parser.add_argument("input", type=Path, help="Input document path")
+    parser.add_argument(
+        "-o", "--output", type=Path, help="Output markdown file"
+    )
+    parser.add_argument(
+        "--heavy",
+        action="store_true",
+        help="Enable Heavy Mode (multi-tool, best quality)",
+    )
+    parser.add_argument(
+        "--assets-dir",
+        type=Path,
+        default=None,
+        # The automatic default below is only applied when --heavy is set.
+        help=(
+            "Directory for extracted images "
+            "(default: <output>_assets/ in Heavy Mode, none in Quick Mode)"
+        ),
+    )
+    parser.add_argument(
+        "--tool",
+        choices=["pymupdf4llm", "markitdown", "pandoc"],
+        help="Force specific tool (overrides auto-selection)",
+    )
+    parser.add_argument(
+        "--list-tools",
+        action="store_true",
+        help="List available tools and exit",
+    )
+
+    args = parser.parse_args()
+
+    # List tools mode
+    if args.list_tools:
+        tools = ["pymupdf4llm", "markitdown", "pandoc"]
+        print("Available conversion tools:")
+        for tool in tools:
+            status = "✓" if check_tool_available(tool) else "✗"
+            print(f"  {status} {tool}")
+        sys.exit(0)
+
+    # Validate input
+    if not args.input.exists():
+        print(f"Error: Input file not found: {args.input}", file=sys.stderr)
+        sys.exit(1)
+
+    # Determine output path
+    output_path = args.output or args.input.with_suffix(".md")
+
+    # Determine assets directory
+    assets_dir = args.assets_dir
+    if assets_dir is None and args.heavy:
+        assets_dir = output_path.parent / f"{output_path.stem}_assets"
+
+    # Select tools
+    mode = "heavy" if args.heavy else "quick"
+    if args.tool:
+        tools = [args.tool] if check_tool_available(args.tool) else []
+    else:
+        tools = select_tools(args.input, mode)
+
+    if not tools:
+        print("Error: No conversion tools available.", file=sys.stderr)
+        print("Install with:", file=sys.stderr)
+        print("  pip install pymupdf4llm", file=sys.stderr)
+        print("  uv tool install markitdown[pdf]", file=sys.stderr)
+        print("  brew install pandoc", file=sys.stderr)
+        sys.exit(1)
+
+    print(f"Converting: {args.input}")
+    print(f"Mode: {mode.upper()}")
+    print(f"Tools: {', '.join(tools)}")
+
+    # Run conversions (one tool after another)
+    results = []
+    for tool in tools:
+        print(f"  Running {tool}...", end=" ", flush=True)
+
+        # Use separate assets dirs for each tool in heavy mode
+        tool_assets = None
+        if assets_dir and mode == "heavy" and len(tools) > 1:
+            tool_assets = assets_dir / tool
+        elif assets_dir:
+            tool_assets = assets_dir
+
+        result = convert_single(args.input, tool, tool_assets)
+        results.append(result)
+
+        if result.success:
+            print(f"✓ ({len(result.markdown):,} chars, {len(result.images)} images)")
+        else:
+            print(f"✗ ({result.error[:50]}...)")
+
+    # merge_results() also handles the single-result case; the progress
+    # messages are only shown when there is something to merge.
+    merging = mode == "heavy" and len(results) > 1
+    if merging:
+        print("  Merging results...", end=" ", flush=True)
+    final = merge_results(results)
+    if merging:
+        print(f"✓ (using {final.tool})")
+
+    if not final.success:
+        print(f"Error: Conversion failed: {final.error}", file=sys.stderr)
+        sys.exit(1)
+
+    # Write output. The encoding is pinned to UTF-8 so non-ASCII text does
+    # not depend on (or fail under) the platform's default locale encoding.
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    output_path.write_text(final.markdown, encoding="utf-8")
+
+    print(f"\nOutput: {output_path}")
+    print(f"  Size: {len(final.markdown):,} characters")
+    if final.images:
+        print(f"  Images: {len(final.images)} extracted")
+
+
+# Run the CLI only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()