Release v1.27.0: Enhance markdown-tools with Heavy Mode
Add multi-tool orchestration for best-quality document conversion:

- Dual mode: Quick (fast) and Heavy (best quality, multi-tool merge)
- New convert.py - main orchestrator with tool selection matrix
- New merge_outputs.py - segment-level multi-tool output merger
- New validate_output.py - quality validation with HTML reports
- Enhanced extract_pdf_images.py - metadata (page, position, dimensions)
- PyMuPDF4LLM integration for LLM-optimized PDF conversion
- pandoc integration for DOCX/PPTX structure preservation
- Quality metrics: text/table/image retention with pass/warn/fail
- New references: heavy-mode-guide.md, tool-comparison.md

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
232
markdown-tools/scripts/extract_pdf_images.py
Normal file → Executable file
232
markdown-tools/scripts/extract_pdf_images.py
Normal file → Executable file
@@ -1,94 +1,242 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Extract images from PDF files using PyMuPDF.
|
||||
Extract images from PDF files with metadata using PyMuPDF.
|
||||
|
||||
Features:
|
||||
- Extracts all images with page and position metadata
|
||||
- Generates JSON metadata file for each image
|
||||
- Supports markdown reference generation
|
||||
- Optional DPI control for quality
|
||||
|
||||
Usage:
|
||||
uv run --with pymupdf python extract_pdf_images.py <pdf_path> [output_dir]
|
||||
uv run --with pymupdf scripts/extract_pdf_images.py document.pdf
|
||||
uv run --with pymupdf scripts/extract_pdf_images.py document.pdf -o ./images
|
||||
uv run --with pymupdf scripts/extract_pdf_images.py document.pdf --markdown refs.md
|
||||
|
||||
Examples:
|
||||
uv run --with pymupdf python extract_pdf_images.py document.pdf
|
||||
uv run --with pymupdf python extract_pdf_images.py document.pdf ./assets
|
||||
# Basic extraction
|
||||
uv run --with pymupdf scripts/extract_pdf_images.py document.pdf
|
||||
|
||||
Output:
|
||||
Images are saved to output_dir (default: ./assets) with names like:
|
||||
- img_page1_1.png
|
||||
- img_page2_1.png
|
||||
# With custom output and markdown references
|
||||
uv run --with pymupdf scripts/extract_pdf_images.py doc.pdf -o assets --markdown images.md
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import os
|
||||
from dataclasses import dataclass, asdict
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
def extract_images(pdf_path: str, output_dir: str = "assets") -> list[str]:
|
||||
|
||||
@dataclass
|
||||
class ImageMetadata:
|
||||
"""Metadata for an extracted image."""
|
||||
filename: str
|
||||
page: int # 1-indexed
|
||||
index: int # Image index on page (1-indexed)
|
||||
width: int # Original width in pixels
|
||||
height: int # Original height in pixels
|
||||
x: float # X position on page (points)
|
||||
y: float # Y position on page (points)
|
||||
bbox_width: float # Width on page (points)
|
||||
bbox_height: float # Height on page (points)
|
||||
size_bytes: int
|
||||
format: str # png, jpg, etc.
|
||||
colorspace: str # RGB, CMYK, Gray
|
||||
bits_per_component: int
|
||||
|
||||
|
||||
def extract_images(
|
||||
pdf_path: Path,
|
||||
output_dir: Path,
|
||||
markdown_file: Optional[Path] = None
|
||||
) -> list[ImageMetadata]:
|
||||
"""
|
||||
Extract all images from a PDF file.
|
||||
Extract all images from a PDF file with metadata.
|
||||
|
||||
Args:
|
||||
pdf_path: Path to the PDF file
|
||||
output_dir: Directory to save extracted images
|
||||
markdown_file: Optional path to write markdown references
|
||||
|
||||
Returns:
|
||||
List of extracted image file paths
|
||||
List of ImageMetadata for each extracted image
|
||||
"""
|
||||
try:
|
||||
import fitz # PyMuPDF
|
||||
except ImportError:
|
||||
print("Error: PyMuPDF not installed. Run with:")
|
||||
print(' uv run --with pymupdf python extract_pdf_images.py <pdf_path>')
|
||||
print(' uv run --with pymupdf scripts/extract_pdf_images.py <pdf_path>')
|
||||
sys.exit(1)
|
||||
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
doc = fitz.open(pdf_path)
|
||||
extracted_files = []
|
||||
doc = fitz.open(str(pdf_path))
|
||||
extracted: list[ImageMetadata] = []
|
||||
markdown_refs: list[str] = []
|
||||
|
||||
for page_num in range(len(doc)):
|
||||
page = doc[page_num]
|
||||
image_list = page.get_images()
|
||||
image_list = page.get_images(full=True)
|
||||
|
||||
for img_index, img_info in enumerate(image_list):
|
||||
xref = img_info[0]
|
||||
|
||||
try:
|
||||
base_image = doc.extract_image(xref)
|
||||
except Exception as e:
|
||||
print(f" Warning: Could not extract image xref={xref}: {e}")
|
||||
continue
|
||||
|
||||
for img_index, img in enumerate(image_list):
|
||||
xref = img[0]
|
||||
base_image = doc.extract_image(xref)
|
||||
image_bytes = base_image["image"]
|
||||
image_ext = base_image["ext"]
|
||||
width = base_image.get("width", 0)
|
||||
height = base_image.get("height", 0)
|
||||
colorspace = base_image.get("colorspace", 0)
|
||||
bpc = base_image.get("bpc", 8)
|
||||
|
||||
# Map colorspace number to name
|
||||
cs_names = {1: "Gray", 3: "RGB", 4: "CMYK"}
|
||||
cs_name = cs_names.get(colorspace, f"Unknown({colorspace})")
|
||||
|
||||
# Get image position on page
|
||||
# img_info: (xref, smask, width, height, bpc, colorspace, alt, name, filter, referencer)
|
||||
# We need to find the image rect on page
|
||||
bbox_x, bbox_y, bbox_w, bbox_h = 0.0, 0.0, 0.0, 0.0
|
||||
|
||||
# Search for image instances on page
|
||||
for img_block in page.get_images():
|
||||
if img_block[0] == xref:
|
||||
# Found matching image, try to get its rect
|
||||
rects = page.get_image_rects(img_block)
|
||||
if rects:
|
||||
rect = rects[0] # Use first occurrence
|
||||
bbox_x = rect.x0
|
||||
bbox_y = rect.y0
|
||||
bbox_w = rect.width
|
||||
bbox_h = rect.height
|
||||
break
|
||||
|
||||
# Create descriptive filename
|
||||
img_filename = f"img_page{page_num + 1}_{img_index + 1}.{image_ext}"
|
||||
img_path = os.path.join(output_dir, img_filename)
|
||||
img_path = output_dir / img_filename
|
||||
|
||||
# Save image
|
||||
with open(img_path, "wb") as f:
|
||||
f.write(image_bytes)
|
||||
|
||||
extracted_files.append(img_path)
|
||||
print(f"Extracted: {img_filename} ({len(image_bytes):,} bytes)")
|
||||
# Create metadata
|
||||
metadata = ImageMetadata(
|
||||
filename=img_filename,
|
||||
page=page_num + 1,
|
||||
index=img_index + 1,
|
||||
width=width,
|
||||
height=height,
|
||||
x=round(bbox_x, 2),
|
||||
y=round(bbox_y, 2),
|
||||
bbox_width=round(bbox_w, 2),
|
||||
bbox_height=round(bbox_h, 2),
|
||||
size_bytes=len(image_bytes),
|
||||
format=image_ext,
|
||||
colorspace=cs_name,
|
||||
bits_per_component=bpc
|
||||
)
|
||||
extracted.append(metadata)
|
||||
|
||||
# Generate markdown reference
|
||||
alt_text = f"Image from page {page_num + 1}"
|
||||
md_ref = f"![{alt_text}]({img_filename})"
|
||||
markdown_refs.append(f"<!-- Page {page_num + 1}, Position: ({bbox_x:.0f}, {bbox_y:.0f}) -->\n{md_ref}")
|
||||
|
||||
print(f" ✓ {img_filename} ({width}x{height}, {len(image_bytes):,} bytes)")
|
||||
|
||||
doc.close()
|
||||
|
||||
print(f"\nTotal: {len(extracted_files)} images extracted to {output_dir}/")
|
||||
return extracted_files
|
||||
# Write metadata JSON
|
||||
metadata_path = output_dir / "images_metadata.json"
|
||||
with open(metadata_path, "w") as f:
|
||||
json.dump(
|
||||
{
|
||||
"source": str(pdf_path),
|
||||
"image_count": len(extracted),
|
||||
"images": [asdict(m) for m in extracted]
|
||||
},
|
||||
f,
|
||||
indent=2
|
||||
)
|
||||
print(f"\n📋 Metadata: {metadata_path}")
|
||||
|
||||
# Write markdown references if requested
|
||||
if markdown_file and markdown_refs:
|
||||
markdown_content = f"# Images from {pdf_path.name}\n\n"
|
||||
markdown_content += "\n\n".join(markdown_refs)
|
||||
markdown_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
markdown_file.write_text(markdown_content)
|
||||
print(f"📝 Markdown refs: {markdown_file}")
|
||||
|
||||
print(f"\n✅ Total: {len(extracted)} images extracted to {output_dir}/")
|
||||
return extracted
|
||||
|
||||
|
||||
def main():
|
||||
if len(sys.argv) < 2 or sys.argv[1] in ("-h", "--help"):
|
||||
print("Extract images from PDF files using PyMuPDF.")
|
||||
print()
|
||||
print("Usage: python extract_pdf_images.py <pdf_path> [output_dir]")
|
||||
print()
|
||||
print("Arguments:")
|
||||
print(" pdf_path Path to the PDF file")
|
||||
print(" output_dir Directory to save images (default: ./assets)")
|
||||
print()
|
||||
print("Example:")
|
||||
print(" uv run --with pymupdf python extract_pdf_images.py document.pdf ./assets")
|
||||
sys.exit(0 if "--help" in sys.argv or "-h" in sys.argv else 1)
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Extract images from PDF files with metadata",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
# Basic extraction
|
||||
uv run --with pymupdf scripts/extract_pdf_images.py document.pdf
|
||||
|
||||
pdf_path = sys.argv[1]
|
||||
output_dir = sys.argv[2] if len(sys.argv) > 2 else "assets"
|
||||
# Custom output directory
|
||||
uv run --with pymupdf scripts/extract_pdf_images.py doc.pdf -o ./images
|
||||
|
||||
if not os.path.exists(pdf_path):
|
||||
print(f"Error: File not found: {pdf_path}")
|
||||
# With markdown references
|
||||
uv run --with pymupdf scripts/extract_pdf_images.py doc.pdf --markdown refs.md
|
||||
|
||||
Output:
|
||||
Images are saved with descriptive names: img_page1_1.png, img_page2_1.jpg
|
||||
Metadata is saved to: images_metadata.json
|
||||
"""
|
||||
)
|
||||
parser.add_argument(
|
||||
"pdf_path",
|
||||
type=Path,
|
||||
help="Path to the PDF file"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-o", "--output",
|
||||
type=Path,
|
||||
default=Path("assets"),
|
||||
help="Directory to save images (default: ./assets)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--markdown",
|
||||
type=Path,
|
||||
help="Generate markdown file with image references"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--json",
|
||||
action="store_true",
|
||||
help="Output metadata as JSON to stdout"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if not args.pdf_path.exists():
|
||||
print(f"Error: File not found: {args.pdf_path}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
extract_images(pdf_path, output_dir)
|
||||
print(f"📄 Extracting images from: {args.pdf_path}")
|
||||
|
||||
extracted = extract_images(
|
||||
args.pdf_path,
|
||||
args.output,
|
||||
args.markdown
|
||||
)
|
||||
|
||||
if args.json:
|
||||
print(json.dumps([asdict(m) for m in extracted], indent=2))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user