Release v1.27.0: Enhance markdown-tools with Heavy Mode

Add multi-tool orchestration for best-quality document conversion:

- Dual mode: Quick (fast) and Heavy (best quality, multi-tool merge)
- New convert.py - main orchestrator with tool selection matrix
- New merge_outputs.py - segment-level multi-tool output merger
- New validate_output.py - quality validation with HTML reports
- Enhanced extract_pdf_images.py - metadata (page, position, dimensions)
- PyMuPDF4LLM integration for LLM-optimized PDF conversion
- pandoc integration for DOCX/PPTX structure preservation
- Quality metrics: text/table/image retention with pass/warn/fail
- New references: heavy-mode-guide.md, tool-comparison.md

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-25 21:36:08 +08:00
parent 114c355aa8
commit 3f15b8942c
10 changed files with 2009 additions and 89 deletions
--- a/markdown-tools/scripts/extract_pdf_images.py
+++ b/markdown-tools/scripts/extract_pdf_images.py
@@ -1,94 +1,242 @@
 #!/usr/bin/env python3
 """
-Extract images from PDF files using PyMuPDF.
+Extract images from PDF files with metadata using PyMuPDF.
+
+Features:
+- Extracts all images with page and position metadata
+- Generates JSON metadata file for each image
+- Supports markdown reference generation
+- Optional DPI control for quality

 Usage:
-    uv run --with pymupdf python extract_pdf_images.py <pdf_path> [output_dir]
+    uv run --with pymupdf scripts/extract_pdf_images.py document.pdf
+    uv run --with pymupdf scripts/extract_pdf_images.py document.pdf -o ./images
+    uv run --with pymupdf scripts/extract_pdf_images.py document.pdf --markdown refs.md

 Examples:
-    uv run --with pymupdf python extract_pdf_images.py document.pdf
-    uv run --with pymupdf python extract_pdf_images.py document.pdf ./assets
+    # Basic extraction
+    uv run --with pymupdf scripts/extract_pdf_images.py document.pdf

-Output:
-    Images are saved to output_dir (default: ./assets) with names like:
-    - img_page1_1.png
-    - img_page2_1.png
+    # With custom output and markdown references
+    uv run --with pymupdf scripts/extract_pdf_images.py doc.pdf -o assets --markdown images.md
 """

+import argparse
+import json
 import sys
-import os
+from dataclasses import dataclass, asdict
+from pathlib import Path
+from typing import Optional

-def extract_images(pdf_path: str, output_dir: str = "assets") -> list[str]:
+
+@dataclass
+class ImageMetadata:
+    """Metadata for an extracted image."""
+    filename: str
+    page: int  # 1-indexed
+    index: int  # Image index on page (1-indexed)
+    width: int  # Original width in pixels
+    height: int  # Original height in pixels
+    x: float  # X position on page (points)
+    y: float  # Y position on page (points)
+    bbox_width: float  # Width on page (points)
+    bbox_height: float  # Height on page (points)
+    size_bytes: int
+    format: str  # png, jpg, etc.
+    colorspace: str  # RGB, CMYK, Gray
+    bits_per_component: int
+
+
+def extract_images(
+    pdf_path: Path,
+    output_dir: Path,
+    markdown_file: Optional[Path] = None
+) -> list[ImageMetadata]:
    """
-    Extract all images from a PDF file.
+    Extract all images from a PDF file with metadata.

    Args:
        pdf_path: Path to the PDF file
        output_dir: Directory to save extracted images
+        markdown_file: Optional path to write markdown references

    Returns:
-        List of extracted image file paths
+        List of ImageMetadata for each extracted image
    """
    try:
        import fitz  # PyMuPDF
    except ImportError:
        print("Error: PyMuPDF not installed. Run with:")
-        print('  uv run --with pymupdf python extract_pdf_images.py <pdf_path>')
+        print('  uv run --with pymupdf scripts/extract_pdf_images.py <pdf_path>')
        sys.exit(1)

-    os.makedirs(output_dir, exist_ok=True)
+    output_dir.mkdir(parents=True, exist_ok=True)

-    doc = fitz.open(pdf_path)
-    extracted_files = []
+    doc = fitz.open(str(pdf_path))
+    extracted: list[ImageMetadata] = []
+    markdown_refs: list[str] = []

    for page_num in range(len(doc)):
        page = doc[page_num]
-        image_list = page.get_images()
+        image_list = page.get_images(full=True)
+
+        for img_index, img_info in enumerate(image_list):
+            xref = img_info[0]
+
+            try:
+                base_image = doc.extract_image(xref)
+            except Exception as e:
+                print(f"  Warning: Could not extract image xref={xref}: {e}")
+                continue

-        for img_index, img in enumerate(image_list):
-            xref = img[0]
-            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
+            width = base_image.get("width", 0)
+            height = base_image.get("height", 0)
+            colorspace = base_image.get("colorspace", 0)
+            bpc = base_image.get("bpc", 8)
+
+            # Map colorspace number to name
+            cs_names = {1: "Gray", 3: "RGB", 4: "CMYK"}
+            cs_name = cs_names.get(colorspace, f"Unknown({colorspace})")
+
+            # Get image position on page
+            # img_info: (xref, smask, width, height, bpc, colorspace, alt, name, filter, referencer)
+            # We need to find the image rect on page
+            bbox_x, bbox_y, bbox_w, bbox_h = 0.0, 0.0, 0.0, 0.0
+
+            # Search for image instances on page
+            for img_block in page.get_images():
+                if img_block[0] == xref:
+                    # Found matching image, try to get its rect
+                    rects = page.get_image_rects(img_block)
+                    if rects:
+                        rect = rects[0]  # Use first occurrence
+                        bbox_x = rect.x0
+                        bbox_y = rect.y0
+                        bbox_w = rect.width
+                        bbox_h = rect.height
+                    break

            # Create descriptive filename
            img_filename = f"img_page{page_num + 1}_{img_index + 1}.{image_ext}"
-            img_path = os.path.join(output_dir, img_filename)
+            img_path = output_dir / img_filename

+            # Save image
            with open(img_path, "wb") as f:
                f.write(image_bytes)

-            extracted_files.append(img_path)
-            print(f"Extracted: {img_filename} ({len(image_bytes):,} bytes)")
+            # Create metadata
+            metadata = ImageMetadata(
+                filename=img_filename,
+                page=page_num + 1,
+                index=img_index + 1,
+                width=width,
+                height=height,
+                x=round(bbox_x, 2),
+                y=round(bbox_y, 2),
+                bbox_width=round(bbox_w, 2),
+                bbox_height=round(bbox_h, 2),
+                size_bytes=len(image_bytes),
+                format=image_ext,
+                colorspace=cs_name,
+                bits_per_component=bpc
+            )
+            extracted.append(metadata)
+
+            # Generate markdown reference
+            alt_text = f"Image from page {page_num + 1}"
+            md_ref = f"![{alt_text}]({img_path.name})"
+            markdown_refs.append(f"<!-- Page {page_num + 1}, Position: ({bbox_x:.0f}, {bbox_y:.0f}) -->\n{md_ref}")
+
+            print(f"  ✓ {img_filename} ({width}x{height}, {len(image_bytes):,} bytes)")

    doc.close()

-    print(f"\nTotal: {len(extracted_files)} images extracted to {output_dir}/")
-    return extracted_files
+    # Write metadata JSON
+    metadata_path = output_dir / "images_metadata.json"
+    with open(metadata_path, "w") as f:
+        json.dump(
+            {
+                "source": str(pdf_path),
+                "image_count": len(extracted),
+                "images": [asdict(m) for m in extracted]
+            },
+            f,
+            indent=2
+        )
+    print(f"\n📋 Metadata: {metadata_path}")
+
+    # Write markdown references if requested
+    if markdown_file and markdown_refs:
+        markdown_content = f"# Images from {pdf_path.name}\n\n"
+        markdown_content += "\n\n".join(markdown_refs)
+        markdown_file.parent.mkdir(parents=True, exist_ok=True)
+        markdown_file.write_text(markdown_content)
+        print(f"📝 Markdown refs: {markdown_file}")
+
+    print(f"\n✅ Total: {len(extracted)} images extracted to {output_dir}/")
+    return extracted


 def main():
-    if len(sys.argv) < 2 or sys.argv[1] in ("-h", "--help"):
-        print("Extract images from PDF files using PyMuPDF.")
-        print()
-        print("Usage: python extract_pdf_images.py <pdf_path> [output_dir]")
-        print()
-        print("Arguments:")
-        print("  pdf_path    Path to the PDF file")
-        print("  output_dir  Directory to save images (default: ./assets)")
-        print()
-        print("Example:")
-        print("  uv run --with pymupdf python extract_pdf_images.py document.pdf ./assets")
-        sys.exit(0 if "--help" in sys.argv or "-h" in sys.argv else 1)
+    parser = argparse.ArgumentParser(
+        description="Extract images from PDF files with metadata",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+    # Basic extraction
+    uv run --with pymupdf scripts/extract_pdf_images.py document.pdf

-    pdf_path = sys.argv[1]
-    output_dir = sys.argv[2] if len(sys.argv) > 2 else "assets"
+    # Custom output directory
+    uv run --with pymupdf scripts/extract_pdf_images.py doc.pdf -o ./images

-    if not os.path.exists(pdf_path):
-        print(f"Error: File not found: {pdf_path}")
+    # With markdown references
+    uv run --with pymupdf scripts/extract_pdf_images.py doc.pdf --markdown refs.md
+
+Output:
+    Images are saved with descriptive names: img_page1_1.png, img_page2_1.jpg
+    Metadata is saved to: images_metadata.json
+        """
+    )
+    parser.add_argument(
+        "pdf_path",
+        type=Path,
+        help="Path to the PDF file"
+    )
+    parser.add_argument(
+        "-o", "--output",
+        type=Path,
+        default=Path("assets"),
+        help="Directory to save images (default: ./assets)"
+    )
+    parser.add_argument(
+        "--markdown",
+        type=Path,
+        help="Generate markdown file with image references"
+    )
+    parser.add_argument(
+        "--json",
+        action="store_true",
+        help="Output metadata as JSON to stdout"
+    )
+
+    args = parser.parse_args()
+
+    if not args.pdf_path.exists():
+        print(f"Error: File not found: {args.pdf_path}", file=sys.stderr)
        sys.exit(1)

-    extract_images(pdf_path, output_dir)
+    print(f"📄 Extracting images from: {args.pdf_path}")
+
+    extracted = extract_images(
+        args.pdf_path,
+        args.output,
+        args.markdown
+    )
+
+    if args.json:
+        print(json.dumps([asdict(m) for m in extracted], indent=2))


 if __name__ == "__main__":