Add multi-tool orchestration for best-quality document conversion: - Dual mode: Quick (fast) and Heavy (best quality, multi-tool merge) - New convert.py - main orchestrator with tool selection matrix - New merge_outputs.py - segment-level multi-tool output merger - New validate_output.py - quality validation with HTML reports - Enhanced extract_pdf_images.py - metadata (page, position, dimensions) - PyMuPDF4LLM integration for LLM-optimized PDF conversion - pandoc integration for DOCX/PPTX structure preservation - Quality metrics: text/table/image retention with pass/warn/fail - New references: heavy-mode-guide.md, tool-comparison.md Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
244 lines
7.6 KiB
Python
Executable File
244 lines
7.6 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Extract images from PDF files with metadata using PyMuPDF.
|
|
|
|
Features:
|
|
- Extracts all images with page and position metadata
|
|
- Generates JSON metadata file for each image
|
|
- Supports markdown reference generation
|
|
- Optional DPI control for quality
|
|
|
|
Usage:
|
|
uv run --with pymupdf scripts/extract_pdf_images.py document.pdf
|
|
uv run --with pymupdf scripts/extract_pdf_images.py document.pdf -o ./images
|
|
uv run --with pymupdf scripts/extract_pdf_images.py document.pdf --markdown refs.md
|
|
|
|
Examples:
|
|
# Basic extraction
|
|
uv run --with pymupdf scripts/extract_pdf_images.py document.pdf
|
|
|
|
# With custom output and markdown references
|
|
uv run --with pymupdf scripts/extract_pdf_images.py doc.pdf -o assets --markdown images.md
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import sys
|
|
from dataclasses import dataclass, asdict
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
|
|
@dataclass
|
|
class ImageMetadata:
|
|
"""Metadata for an extracted image."""
|
|
filename: str
|
|
page: int # 1-indexed
|
|
index: int # Image index on page (1-indexed)
|
|
width: int # Original width in pixels
|
|
height: int # Original height in pixels
|
|
x: float # X position on page (points)
|
|
y: float # Y position on page (points)
|
|
bbox_width: float # Width on page (points)
|
|
bbox_height: float # Height on page (points)
|
|
size_bytes: int
|
|
format: str # png, jpg, etc.
|
|
colorspace: str # RGB, CMYK, Gray
|
|
bits_per_component: int
|
|
|
|
|
|
def extract_images(
|
|
pdf_path: Path,
|
|
output_dir: Path,
|
|
markdown_file: Optional[Path] = None
|
|
) -> list[ImageMetadata]:
|
|
"""
|
|
Extract all images from a PDF file with metadata.
|
|
|
|
Args:
|
|
pdf_path: Path to the PDF file
|
|
output_dir: Directory to save extracted images
|
|
markdown_file: Optional path to write markdown references
|
|
|
|
Returns:
|
|
List of ImageMetadata for each extracted image
|
|
"""
|
|
try:
|
|
import fitz # PyMuPDF
|
|
except ImportError:
|
|
print("Error: PyMuPDF not installed. Run with:")
|
|
print(' uv run --with pymupdf scripts/extract_pdf_images.py <pdf_path>')
|
|
sys.exit(1)
|
|
|
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
doc = fitz.open(str(pdf_path))
|
|
extracted: list[ImageMetadata] = []
|
|
markdown_refs: list[str] = []
|
|
|
|
for page_num in range(len(doc)):
|
|
page = doc[page_num]
|
|
image_list = page.get_images(full=True)
|
|
|
|
for img_index, img_info in enumerate(image_list):
|
|
xref = img_info[0]
|
|
|
|
try:
|
|
base_image = doc.extract_image(xref)
|
|
except Exception as e:
|
|
print(f" Warning: Could not extract image xref={xref}: {e}")
|
|
continue
|
|
|
|
image_bytes = base_image["image"]
|
|
image_ext = base_image["ext"]
|
|
width = base_image.get("width", 0)
|
|
height = base_image.get("height", 0)
|
|
colorspace = base_image.get("colorspace", 0)
|
|
bpc = base_image.get("bpc", 8)
|
|
|
|
# Map colorspace number to name
|
|
cs_names = {1: "Gray", 3: "RGB", 4: "CMYK"}
|
|
cs_name = cs_names.get(colorspace, f"Unknown({colorspace})")
|
|
|
|
# Get image position on page
|
|
# img_info: (xref, smask, width, height, bpc, colorspace, alt, name, filter, referencer)
|
|
# We need to find the image rect on page
|
|
bbox_x, bbox_y, bbox_w, bbox_h = 0.0, 0.0, 0.0, 0.0
|
|
|
|
# Search for image instances on page
|
|
for img_block in page.get_images():
|
|
if img_block[0] == xref:
|
|
# Found matching image, try to get its rect
|
|
rects = page.get_image_rects(img_block)
|
|
if rects:
|
|
rect = rects[0] # Use first occurrence
|
|
bbox_x = rect.x0
|
|
bbox_y = rect.y0
|
|
bbox_w = rect.width
|
|
bbox_h = rect.height
|
|
break
|
|
|
|
# Create descriptive filename
|
|
img_filename = f"img_page{page_num + 1}_{img_index + 1}.{image_ext}"
|
|
img_path = output_dir / img_filename
|
|
|
|
# Save image
|
|
with open(img_path, "wb") as f:
|
|
f.write(image_bytes)
|
|
|
|
# Create metadata
|
|
metadata = ImageMetadata(
|
|
filename=img_filename,
|
|
page=page_num + 1,
|
|
index=img_index + 1,
|
|
width=width,
|
|
height=height,
|
|
x=round(bbox_x, 2),
|
|
y=round(bbox_y, 2),
|
|
bbox_width=round(bbox_w, 2),
|
|
bbox_height=round(bbox_h, 2),
|
|
size_bytes=len(image_bytes),
|
|
format=image_ext,
|
|
colorspace=cs_name,
|
|
bits_per_component=bpc
|
|
)
|
|
extracted.append(metadata)
|
|
|
|
# Generate markdown reference
|
|
alt_text = f"Image from page {page_num + 1}"
|
|
md_ref = f""
|
|
markdown_refs.append(f"<!-- Page {page_num + 1}, Position: ({bbox_x:.0f}, {bbox_y:.0f}) -->\n{md_ref}")
|
|
|
|
print(f" ✓ {img_filename} ({width}x{height}, {len(image_bytes):,} bytes)")
|
|
|
|
doc.close()
|
|
|
|
# Write metadata JSON
|
|
metadata_path = output_dir / "images_metadata.json"
|
|
with open(metadata_path, "w") as f:
|
|
json.dump(
|
|
{
|
|
"source": str(pdf_path),
|
|
"image_count": len(extracted),
|
|
"images": [asdict(m) for m in extracted]
|
|
},
|
|
f,
|
|
indent=2
|
|
)
|
|
print(f"\n📋 Metadata: {metadata_path}")
|
|
|
|
# Write markdown references if requested
|
|
if markdown_file and markdown_refs:
|
|
markdown_content = f"# Images from {pdf_path.name}\n\n"
|
|
markdown_content += "\n\n".join(markdown_refs)
|
|
markdown_file.parent.mkdir(parents=True, exist_ok=True)
|
|
markdown_file.write_text(markdown_content)
|
|
print(f"📝 Markdown refs: {markdown_file}")
|
|
|
|
print(f"\n✅ Total: {len(extracted)} images extracted to {output_dir}/")
|
|
return extracted
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(
|
|
description="Extract images from PDF files with metadata",
|
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
epilog="""
|
|
Examples:
|
|
# Basic extraction
|
|
uv run --with pymupdf scripts/extract_pdf_images.py document.pdf
|
|
|
|
# Custom output directory
|
|
uv run --with pymupdf scripts/extract_pdf_images.py doc.pdf -o ./images
|
|
|
|
# With markdown references
|
|
uv run --with pymupdf scripts/extract_pdf_images.py doc.pdf --markdown refs.md
|
|
|
|
Output:
|
|
Images are saved with descriptive names: img_page1_1.png, img_page2_1.jpg
|
|
Metadata is saved to: images_metadata.json
|
|
"""
|
|
)
|
|
parser.add_argument(
|
|
"pdf_path",
|
|
type=Path,
|
|
help="Path to the PDF file"
|
|
)
|
|
parser.add_argument(
|
|
"-o", "--output",
|
|
type=Path,
|
|
default=Path("assets"),
|
|
help="Directory to save images (default: ./assets)"
|
|
)
|
|
parser.add_argument(
|
|
"--markdown",
|
|
type=Path,
|
|
help="Generate markdown file with image references"
|
|
)
|
|
parser.add_argument(
|
|
"--json",
|
|
action="store_true",
|
|
help="Output metadata as JSON to stdout"
|
|
)
|
|
|
|
args = parser.parse_args()
|
|
|
|
if not args.pdf_path.exists():
|
|
print(f"Error: File not found: {args.pdf_path}", file=sys.stderr)
|
|
sys.exit(1)
|
|
|
|
print(f"📄 Extracting images from: {args.pdf_path}")
|
|
|
|
extracted = extract_images(
|
|
args.pdf_path,
|
|
args.output,
|
|
args.markdown
|
|
)
|
|
|
|
if args.json:
|
|
print(json.dumps([asdict(m) for m in extracted], indent=2))
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|