Files
claude-code-skills-reference/markdown-tools/scripts/extract_pdf_images.py
daymade 8233430cf2 Release v1.18.1: Enhance markdown-tools with PDF image extraction
- Add extract_pdf_images.py script using PyMuPDF
- Refactor SKILL.md for clearer workflow documentation
- Update installation to use markitdown[pdf] extra
- Update marketplace version to 1.18.1
- Update markdown-tools version to 1.1.0
- Update README/README.zh-CN with new features
- Update QUICKSTART docs with in-app install instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-28 18:46:15 +08:00

96 lines
2.7 KiB
Python

#!/usr/bin/env python3
"""
Extract images from PDF files using PyMuPDF.
Usage:
uv run --with pymupdf python extract_pdf_images.py <pdf_path> [output_dir]
Examples:
uv run --with pymupdf python extract_pdf_images.py document.pdf
uv run --with pymupdf python extract_pdf_images.py document.pdf ./assets
Output:
Images are saved to output_dir (default: ./assets) with names like:
- img_page1_1.png
- img_page2_1.png
"""
import sys
import os
def extract_images(pdf_path: str, output_dir: str = "assets") -> list[str]:
"""
Extract all images from a PDF file.
Args:
pdf_path: Path to the PDF file
output_dir: Directory to save extracted images
Returns:
List of extracted image file paths
"""
try:
import fitz # PyMuPDF
except ImportError:
print("Error: PyMuPDF not installed. Run with:")
print(' uv run --with pymupdf python extract_pdf_images.py <pdf_path>')
sys.exit(1)
os.makedirs(output_dir, exist_ok=True)
doc = fitz.open(pdf_path)
extracted_files = []
for page_num in range(len(doc)):
page = doc[page_num]
image_list = page.get_images()
for img_index, img in enumerate(image_list):
xref = img[0]
base_image = doc.extract_image(xref)
image_bytes = base_image["image"]
image_ext = base_image["ext"]
# Create descriptive filename
img_filename = f"img_page{page_num + 1}_{img_index + 1}.{image_ext}"
img_path = os.path.join(output_dir, img_filename)
with open(img_path, "wb") as f:
f.write(image_bytes)
extracted_files.append(img_path)
print(f"Extracted: {img_filename} ({len(image_bytes):,} bytes)")
doc.close()
print(f"\nTotal: {len(extracted_files)} images extracted to {output_dir}/")
return extracted_files
def main():
if len(sys.argv) < 2 or sys.argv[1] in ("-h", "--help"):
print("Extract images from PDF files using PyMuPDF.")
print()
print("Usage: python extract_pdf_images.py <pdf_path> [output_dir]")
print()
print("Arguments:")
print(" pdf_path Path to the PDF file")
print(" output_dir Directory to save images (default: ./assets)")
print()
print("Example:")
print(" uv run --with pymupdf python extract_pdf_images.py document.pdf ./assets")
sys.exit(0 if "--help" in sys.argv or "-h" in sys.argv else 1)
pdf_path = sys.argv[1]
output_dir = sys.argv[2] if len(sys.argv) > 2 else "assets"
if not os.path.exists(pdf_path):
print(f"Error: File not found: {pdf_path}")
sys.exit(1)
extract_images(pdf_path, output_dir)
if __name__ == "__main__":
main()