From 143995b213f09b2cae0c98e94f0aa391d340f3b2 Mon Sep 17 00:00:00 2001 From: daymade Date: Mon, 23 Mar 2026 00:06:30 +0800 Subject: [PATCH] =?UTF-8?q?refactor:=20rename=20markdown-tools=20=E2=86=92?= =?UTF-8?q?=20doc-to-markdown=20(v2.0.0)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename skill to better reflect its purpose (document-to-markdown conversion) - Update SKILL.md name, description, and trigger keywords - Add benchmark reference (2026-03-22) - Update marketplace.json entry (name, skills path, version 2.0.0) Co-Authored-By: Claude Opus 4.6 (1M context) --- .claude-plugin/marketplace.json | 18 +- {markdown-tools => doc-to-markdown}/SKILL.md | 32 +- .../references/benchmark-2026-03-22.md | 163 +++ .../references/conversion-examples.md | 0 .../references/heavy-mode-guide.md | 0 .../references/tool-comparison.md | 0 doc-to-markdown/scripts/convert.py | 1150 +++++++++++++++++ .../scripts/convert_path.py | 0 .../scripts/extract_pdf_images.py | 0 .../scripts/merge_outputs.py | 0 .../scripts/validate_output.py | 0 markdown-tools/scripts/convert.py | 434 ------- 12 files changed, 1346 insertions(+), 451 deletions(-) rename {markdown-tools => doc-to-markdown}/SKILL.md (74%) create mode 100644 doc-to-markdown/references/benchmark-2026-03-22.md rename {markdown-tools => doc-to-markdown}/references/conversion-examples.md (100%) rename {markdown-tools => doc-to-markdown}/references/heavy-mode-guide.md (100%) rename {markdown-tools => doc-to-markdown}/references/tool-comparison.md (100%) create mode 100755 doc-to-markdown/scripts/convert.py rename {markdown-tools => doc-to-markdown}/scripts/convert_path.py (100%) rename {markdown-tools => doc-to-markdown}/scripts/extract_pdf_images.py (100%) rename {markdown-tools => doc-to-markdown}/scripts/merge_outputs.py (100%) rename {markdown-tools => doc-to-markdown}/scripts/validate_output.py (100%) delete mode 100755 markdown-tools/scripts/convert.py diff --git 
a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 76986aa..2286d3d 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -50,25 +50,23 @@ ] }, { - "name": "markdown-tools", - "description": "Convert documents (PDFs, Word, PowerPoint) to high-quality markdown with multi-tool orchestration. Supports Quick Mode (fast, single tool) and Heavy Mode (best quality, multi-tool merge with segment-level selection). Features PyMuPDF4LLM for LLM-optimized PDF conversion, pandoc for DOCX/PPTX structure preservation, quality validation with HTML reports, and image extraction with metadata", + "name": "doc-to-markdown", + "description": "Converts DOCX/PDF/PPTX to high-quality Markdown with automatic post-processing. Fixes pandoc grid tables, image paths, attribute noise, and code blocks. Supports Quick Mode (fast, single tool) and Heavy Mode (best quality, multi-tool merge). Trigger on \"convert document\", \"docx to markdown\", \"parse word\", \"doc to markdown\", \"extract images from document\".", "source": "./", "strict": false, - "version": "1.2.0", + "version": "2.0.0", "category": "document-conversion", "keywords": [ "markdown", - "pdf", "docx", + "pdf", "pptx", - "pymupdf4llm", + "converter", "pandoc", - "markitdown", - "heavy-mode", - "quality-validation" + "document" ], "skills": [ - "./markdown-tools" + "./doc-to-markdown" ] }, { @@ -942,4 +940,4 @@ ] } ] -} +} \ No newline at end of file diff --git a/markdown-tools/SKILL.md b/doc-to-markdown/SKILL.md similarity index 74% rename from markdown-tools/SKILL.md rename to doc-to-markdown/SKILL.md index 8bc71f4..966c9fc 100644 --- a/markdown-tools/SKILL.md +++ b/doc-to-markdown/SKILL.md @@ -1,11 +1,11 @@ --- -name: markdown-tools -description: Converts documents to markdown with multi-tool orchestration for best quality. Supports Quick Mode (fast, single tool) and Heavy Mode (best quality, multi-tool merge). 
Use when converting PDF/DOCX/PPTX files to markdown, extracting images from documents, validating conversion quality, or needing LLM-optimized document output. +name: doc-to-markdown +description: Converts DOCX/PDF/PPTX to high-quality Markdown with automatic post-processing. Fixes pandoc grid tables, image paths, attribute noise, and code blocks. Supports Quick Mode (fast, single tool) and Heavy Mode (best quality, multi-tool merge). Trigger on "convert document", "docx to markdown", "parse word", "doc to markdown", "extract images from document". --- -# Markdown Tools +# Doc to Markdown -Convert documents to high-quality markdown with intelligent multi-tool orchestration. +Convert documents to high-quality markdown with intelligent multi-tool orchestration and automatic DOCX post-processing. ## Dual Mode Architecture @@ -34,6 +34,9 @@ uv run --with pymupdf4llm --with markitdown scripts/convert.py document.pdf -o o # Heavy Mode - multi-tool parallel execution with merge uv run --with pymupdf4llm --with markitdown scripts/convert.py document.pdf -o output.md --heavy +# DOCX with deep python-docx parsing (experimental) +uv run --with pymupdf4llm --with markitdown --with python-docx scripts/convert.py document.docx -o output.md --docx-deep + # Check available tools uv run scripts/convert.py --list-tools ``` @@ -43,7 +46,7 @@ uv run scripts/convert.py --list-tools | Format | Quick Mode Tool | Heavy Mode Tools | |--------|----------------|------------------| | PDF | pymupdf4llm | pymupdf4llm + markitdown | -| DOCX | pandoc | pandoc + markitdown | +| DOCX | pandoc + post-processing | pandoc + markitdown | | PPTX | markitdown | markitdown + pandoc | | XLSX | markitdown | markitdown | @@ -53,6 +56,21 @@ uv run scripts/convert.py --list-tools - **markitdown**: Microsoft's universal converter, good for Office formats - **pandoc**: Excellent structure preservation for DOCX/PPTX +## DOCX Post-Processing (automatic) + +When converting DOCX files via pandoc, the following 
cleanups are applied automatically: + +| Problem | Fix | +|---------|-----| +| Grid tables (`+:---+` syntax) | Single-column -> blockquote, multi-column -> split images | +| Image double path (`media/media/`) | Flatten to `media/` | +| Pandoc attributes (`{width="..." height="..."}`) | Removed | +| Inline classes (`{.underline}`, `{.mark}`) | Removed | +| Indented dashed code blocks | Converted to fenced code blocks (```) | +| Escaped brackets (`\[...\]`) | Unescaped to `[...]` | +| Double-bracket links (`[[text]{...}](url)`) | Simplified to `[text](url)` | +| Escaped quotes in code (`\"`) | Fixed to `"` | + ## Heavy Mode Workflow Heavy Mode runs multiple tools in parallel and selects the best segments: @@ -117,7 +135,7 @@ python scripts/merge_outputs.py output1.md output2.md -o merged.md --verbose ## Path Conversion (Windows/WSL) ```bash -# Windows → WSL conversion +# Windows to WSL conversion python scripts/convert_path.py "C:\Users\name\Documents\file.pdf" # Output: /mnt/c/Users/name/Documents/file.pdf ``` @@ -147,7 +165,7 @@ brew install pandoc | Script | Purpose | |--------|---------| -| `convert.py` | Main orchestrator with Quick/Heavy mode | +| `convert.py` | Main orchestrator with Quick/Heavy mode + DOCX post-processing | | `merge_outputs.py` | Merge multiple markdown outputs | | `validate_output.py` | Quality validation with HTML report | | `extract_pdf_images.py` | PDF image extraction with metadata | diff --git a/doc-to-markdown/references/benchmark-2026-03-22.md b/doc-to-markdown/references/benchmark-2026-03-22.md new file mode 100644 index 0000000..27b2069 --- /dev/null +++ b/doc-to-markdown/references/benchmark-2026-03-22.md @@ -0,0 +1,163 @@ +# DOCX→Markdown 转换方案基准测试 + +> **测试日期**:2026-03-22 +> +> **测试文件**:`助教-【腾讯云🦞】小白实践 OpenClaw 保姆级教程.docx`(19MB,77 张图片,含 grid table 布局、JSON 代码块、多列图片并排、信息框) +> +> **测试方法**:5 个方案对同一文件转换,按 5 个维度各 10 分制打分 + +--- + +## 综合评分 + +| 维度 | Docling (IBM) | MarkItDown (MS) | Pandoc | Mammoth | **doc-to-markdown(我们)** | 
+|------|:---:|:---:|:---:|:---:|:---:| +| 表格质量 | 5 | 3 | 5 | 1~3 | **6** | +| 图片提取 | 4 | 2 | **10** | 5 | 7 | +| 文本完整性 | 8 | 7 | **9** | 7 | **9** | +| 格式清洁度 | 5 | 5 | 5 | 3 | **7** | +| 代码块 | 2 | 1 | N/A | 1 | **9** | +| **综合** | **4.8** | **3.6** | **7.3** | **3.4~3.6** | **7.6** | + +--- + +## 各方案详细分析 + +### 1. IBM Docling(综合 4.8) + +- **版本**:docling 2.x + Granite-Docling-258M +- **架构**:AI 驱动(VLM 视觉语言模型),DocTags 中间格式 → Markdown + +**致命问题**: +- 图片引用全部是 `<!-- image -->` 占位符(77 张图 0 张可显示),`ImageRefMode` API 对 DOCX 不可用 +- 标题层级全部丢失(0 个 `#`),所有标题退化为粗体文本 +- 代码块为零,JSON 和命令全部输出为普通段落 +- `api_key` 被错误转义为 `api\_key` + +**优点**: +- 文本内容完整,中文/emoji/链接保留良好 +- 无 grid table 或 HTML 残留 +- 表格语法正确(pipe table),但内容是占位符 + +**结论**:Docling 的优势在 PDF(AAAI 2025 论文场景),DOCX 支持远未达到生产级别。 + +### 2. Microsoft MarkItDown(综合 3.6) + +- **版本**:markitdown 0.1.5 +- **架构**:底层调用 mammoth → HTML → markdownify → Markdown + +**致命问题**: +- 77 张图片全部是截断的 base64 占位符(`data:image/png;base64,...`),默认 `keep_data_uris=False` 主动丢弃图片数据 +- 标题全部变为粗体文本(mammoth 无法识别 WPS 自定义样式) +- 代码块为零,JSON 被塞入表格单元格 +- 有序列表编号全部错误(输出为 `1. 1. 1.`) + +**优点**: +- 无 HTML 标签残留 +- 文本内容基本完整 + +**结论**:MarkItDown 的 markdownify 后处理反而引入破坏性截断。轻量场景可用,复杂 DOCX 不可靠。 + +### 3. Pandoc(综合 7.3) + +- **版本**:pandoc 3.9 +- **架构**:Haskell 原生 AST 解析,支持 60+ 格式 + +**测试了 3 种参数**: + +| 参数 | 结果 | +|------|------| +| `-t gfm` | 最差:24 个 HTML `<table>` 嵌套,74 个 HTML `<img>` | +| `-t markdown` | 最佳:grid table(可后处理),无 HTML | +| `-t markdown-raw_html-...` | 与 markdown 完全相同,参数无效果 | + +**问题**: +- Grid table 不可避免(原 docx 有多行单元格和嵌套表格,pipe table 无法表达) +- `{width="..." height="..."}` 68 处 +- `{.underline}` 6 处 +- 反斜杠过度转义 37 处 + +**优点**: +- 图片提取 10/10(77 张全部正确,路径结构一致) +- 文本完整性 9/10(内容、链接、emoji 全部保留) +- 最成熟稳定的底层引擎 + +**结论**:Pandoc 是最可靠的底层引擎,输出质量最高但需要后处理清洗 pandoc 私有语法。 + +### 4.
Mammoth(综合 3.4~3.6) + +- **版本**:mammoth 1.11.0 +- **架构**:python-docx 解析 → HTML/Markdown(Markdown 支持已废弃) + +**测试了 2 种方式**: + +| 方式 | 综合 | +|------|------| +| 方式A:直接转 Markdown | 3.4(表格完全丢失) | +| 方式B:转 HTML → markdownify | 3.6(有表格但嵌套被压扁) | + +**致命问题**: +- 标题全部丢失(WPS `styles.xml` 中样式定义为空,mammoth 无法映射 Heading) +- 代码块为零 +- 图片全部 base64 内嵌,单文件 28MB +- 方式B 中 markdownify 丢失 14 张图片(63/77) + +**结论**:Mammoth 的 Markdown 支持已废弃,对 WPS 导出的 docx 兼容性差。不推荐。 + +### 5. doc-to-markdown / 我们的方案(综合 7.6) + +- **版本**:doc-to-markdown 1.0(基于 pandoc + 6 个后处理函数) +- **架构**:Pandoc 转换 → 自动后处理(grid table 清理、图片路径修复、属性清理、代码块修复、转义修复) + +**后处理实际效果**: + +| 后处理函数 | 修复数量 | +|-----------|---------| +| `_convert_grid_tables` | 11 处 grid table → pipe table / blockquote | +| `_clean_pandoc_attributes` | 3437 字符属性清理 | +| `_fix_code_blocks` | 22 处缩进虚线 → ``` 代码块 | +| `_fix_escaped_brackets` | 10 处 | +| `_fix_double_bracket_links` | 1 处 | +| `_fix_image_paths` | 77 张图片路径修复 | + +**已知问题(待修复)**: +- 图片路径双层嵌套 bug:`--assets-dir` 指定目录内被 pandoc 再建一层 `media/` +- 2 处 grid table 残留(文末并排图片组未完全转换) + +**优点**: +- 代码块识别 9/10(JSON 带语言标签,命令行正确包裹) +- 格式清洁度 7/10(attributes、转义、grid table 大部分清理干净) +- 文本完整性 9/10(关键内容全部保留) + +**结论**:综合最优,核心价值在 pandoc 后处理层。剩余 2 个 bug 可修。 + +--- + +## 架构决策 + +``` +最终方案:Pandoc(底层引擎)+ doc-to-markdown 后处理(增值层) + +理由: +1. Pandoc 图片提取最可靠(10/10),文本最完整(9/10) +2. Pandoc 的问题(grid table、属性、转义)全部可后处理解决 +3. Docling/MarkItDown/Mammoth 的致命问题(图片丢失、标题丢失)无法后处理修复 +4. 
后处理层是我们的核心竞争力,成本低、可迭代 +``` + +--- + +## 测试文件特征 + +本次测试文件的难点在于: + +| 特征 | 说明 | 影响 | +|------|------|------| +| WPS 导出 | 非标准 Word 样式(Style ID 2/3 而非 Heading 1/2) | mammoth/markitdown/docling 标题全丢 | +| 多列图片布局 | 2x2、1x4 图片网格用表格排版 | pandoc 输出 grid table | +| 信息框/提示框 | 单列表格包裹文字 | pandoc 输出 grid table | +| 嵌套表格 | 表格内套表格 | pipe table 无法表达 | +| JSON 代码块 | 非代码块样式,用文本框/缩进表示 | 多数工具无法识别为代码 | +| 19MB 文件 | 77 张截图嵌入 | base64 方案导致 28MB 输出 | + +这些特征代表了真实世界中 WPS/飞书文档导出 docx 的典型困难,是有效的基准测试场景。 diff --git a/markdown-tools/references/conversion-examples.md b/doc-to-markdown/references/conversion-examples.md similarity index 100% rename from markdown-tools/references/conversion-examples.md rename to doc-to-markdown/references/conversion-examples.md diff --git a/markdown-tools/references/heavy-mode-guide.md b/doc-to-markdown/references/heavy-mode-guide.md similarity index 100% rename from markdown-tools/references/heavy-mode-guide.md rename to doc-to-markdown/references/heavy-mode-guide.md diff --git a/markdown-tools/references/tool-comparison.md b/doc-to-markdown/references/tool-comparison.md similarity index 100% rename from markdown-tools/references/tool-comparison.md rename to doc-to-markdown/references/tool-comparison.md diff --git a/doc-to-markdown/scripts/convert.py b/doc-to-markdown/scripts/convert.py new file mode 100755 index 0000000..a95eb86 --- /dev/null +++ b/doc-to-markdown/scripts/convert.py @@ -0,0 +1,1150 @@ +#!/usr/bin/env python3 +""" +Multi-tool document to markdown converter with intelligent orchestration. + +Supports Quick Mode (fast, single tool) and Heavy Mode (best quality, multi-tool merge). +DOCX files get automatic post-processing to fix pandoc artifacts. 
@dataclass
class PostProcessStats:
    """Counters describing what the DOCX post-processor changed.

    Every field starts at zero and is updated by the corresponding
    post-processing step.
    """
    grid_tables_converted: int = 0
    image_paths_fixed: int = 0
    attributes_removed: int = 0
    code_blocks_fixed: int = 0
    escaped_brackets_fixed: int = 0
    double_brackets_fixed: int = 0

    def any_fixes(self) -> bool:
        """Return True when at least one counter is greater than zero."""
        for counter_name in self.__dataclass_fields__:
            if getattr(self, counter_name) > 0:
                return True
        return False

    def summary(self) -> str:
        """Return a comma-separated report of the non-zero counters.

        Counters appear in a fixed order; when all of them are zero the
        string ``"no fixes needed"`` is returned instead.
        """
        labelled = (
            ("grid tables", self.grid_tables_converted),
            ("image paths", self.image_paths_fixed),
            ("attributes", self.attributes_removed),
            ("code blocks", self.code_blocks_fixed),
            ("escaped brackets", self.escaped_brackets_fixed),
            ("double brackets", self.double_brackets_fixed),
        )
        rendered = [f"{label}: {value}" for label, value in labelled if value]
        if not rendered:
            return "no fixes needed"
        return ", ".join(rendered)
def _convert_grid_tables(text: str, stats: PostProcessStats) -> str:
    """Convert pandoc grid tables to standard markdown.

    Single-column grid tables (info boxes) become blockquotes. Multi-column
    grid tables (typically side-by-side images) become pipe tables built by
    ``_build_pipe_table``. Border lines, including nested ones, are dropped.

    A table is the contiguous run of lines starting with ``+`` or ``|`` that
    begins on a border line and ends on the last border line of that run.
    Row separators inside a multi-row table therefore no longer end the table
    early, and the final border is no longer mistaken for a new table start.

    Args:
        text: Markdown produced by pandoc.
        stats: Counter object; ``grid_tables_converted`` is incremented once
            per converted table.

    Returns:
        The markdown with every terminated grid table rewritten. A border
        line that has no closing border in its run is emitted unchanged.
    """
    lines = text.split("\n")
    total = len(lines)
    result = []
    i = 0

    while i < total:
        line = lines[i]

        if not _is_grid_border(line):
            result.append(line)
            i += 1
            continue

        # Extent of the candidate table: every following line that still
        # looks like a grid row ("| ... |") or a border ("+---+").
        end = i + 1
        while end < total and lines[end].strip().startswith(("|", "+")):
            end += 1

        # The table closes on the LAST border of that run, not the first one,
        # so row separators between rows stay inside the table.
        closing = i
        for k in range(end - 1, i, -1):
            if _is_grid_border(lines[k]):
                closing = k
                break

        if closing == i:
            # No closing border: not a complete table, keep the line as-is.
            result.append(line)
            i += 1
            continue

        table_lines = lines[i:closing + 1]
        i = closing + 1

        stats.grid_tables_converted += 1
        num_cols = _count_grid_columns(table_lines[0])

        # Extract content lines (skip outer and nested borders).
        content_lines = []
        for tl in table_lines:
            if _is_grid_border(tl) or _is_nested_grid_border(tl):
                continue
            stripped = tl.strip()
            m = _RE_GRID_ROW.match(stripped)
            if m:
                content_lines.append(m.group(1).strip())
            elif stripped and stripped != "|":
                # Non-standard line inside the grid: keep its content.
                content_lines.append(stripped)

        if num_cols <= 1:
            # Single column -> blockquote.
            result.append("")
            for cl in content_lines:
                cleaned = cl.strip()
                # Strip outer pipes left over from nested grids.
                if cleaned.startswith("|") and cleaned.endswith("|"):
                    cleaned = cleaned[1:-1].strip()
                if _RE_GRID_BORDER.match(cleaned):
                    continue
                result.append(f"> {cleaned}" if cleaned else ">")
            result.append("")
        else:
            # Multi-column -> pipe table; each content line is one row.
            table_rows = []
            for cl in content_lines:
                cells = [
                    c.strip()
                    for c in cl.split("|")
                    if c.strip() and not _RE_GRID_BORDER.match(c.strip())
                ]
                if cells:
                    table_rows.append(cells)

            if table_rows:
                result.append("")
                result.extend(_build_pipe_table(table_rows))
                result.append("")

    return "\n".join(result)
def _clean_pandoc_attributes(text: str, stats: PostProcessStats) -> str:
    """Remove pandoc attribute annotations from markdown.

    Deletes every match of ``_RE_PANDOC_ATTR`` (brace groups carrying a
    ``width=".."`` / ``height=".."`` attribute) and of ``_RE_PANDOC_CLASS``
    (``{.underline}`` and ``{.mark}``).

    Args:
        text: Markdown produced by pandoc.
        stats: Counter object; ``attributes_removed`` is set to the number of
            attribute groups deleted (previously it held the number of
            characters deleted, which ``summary()`` mislabelled as a count
            of attributes). It is left untouched when nothing matched.

    Returns:
        The markdown without those attribute groups.
    """
    # subn() reports how many groups were actually removed.
    text, size_attr_hits = _RE_PANDOC_ATTR.subn("", text)
    text, class_attr_hits = _RE_PANDOC_CLASS.subn("", text)

    removed = size_attr_hits + class_attr_hits
    if removed:
        stats.attributes_removed = removed

    return text
+ + Code indicators: + - Has a language hint on the first line + - Contains JSON/code-like syntax ({, }, =, ;, ->, //) + - Contains URLs with protocols + - Has backslash line continuations + + Note indicators: + - Mostly CJK/prose text without code syntax + - Short single-line content + """ + text = "\n".join(lines) + stripped = text.strip() + + if not stripped: + return False + + # Code syntax indicators + code_chars = set('{}[]();=<>/\\') + code_char_count = sum(1 for c in stripped if c in code_chars) + + # If >5% of content is code syntax characters, treat as code + if len(stripped) > 0 and code_char_count / len(stripped) > 0.05: + return True + + # JSON-like structure + if stripped.startswith("{") or stripped.startswith("["): + return True + + # Command-like (starts with common command patterns) + first_line = lines[0].strip() if lines else "" + if re.match(r"^(curl|wget|npm|pip|brew|apt|docker|git|ssh|cd|ls|cat|echo|python|node|uv)\s", first_line): + return True + + return False + + +def _fix_code_blocks(text: str, stats: PostProcessStats) -> str: + """Convert pandoc's indented dashed-line blocks to fenced code blocks or blockquotes. 
+ + Pandoc wraps both code and notes in: + ------------------------------------------------------------------ + content here + + ------------------------------------------------------------------ + + With language hint -> code block: + ```json + content here + ``` + + Without language hint + prose content -> blockquote: + > content here + + Without language hint + code-like content -> code block: + ``` + content here + ``` + """ + lines = text.split("\n") + result = [] + i = 0 + + known_langs = _KNOWN_CODE_LANGS + + while i < len(lines): + line = lines[i] + + # Detect indented dashed line (2+ leading spaces, 3+ dashes) + if _RE_DASHED_LINE.match(line): + # Check if this is a pandoc simple table (multiple dashed columns + # on the same line, or content between dashes contains images) + # Simple table pattern: " ---- ----" (multiple dash groups separated by spaces) + # Gap can be 1+ spaces (pandoc uses varying gaps) + dash_parts = [p for p in line.strip().split() if p.strip()] + is_simple_table_border = len(dash_parts) > 1 and all( + re.match(r"^-+$", p.strip()) for p in dash_parts + ) + + if is_simple_table_border: + # This is a pandoc simple table border - collect rows until + # next simple table border, convert to pipe table + table_rows = [] + j = i + 1 + while j < len(lines): + next_line = lines[j] + # Check for closing simple table border + next_parts = [p for p in next_line.strip().split() if p.strip()] + is_next_border = len(next_parts) > 1 and all( + re.match(r"^-+$", p.strip()) for p in next_parts + ) + if is_next_border: + j += 1 + break + if next_line.strip(): + # Split by 2+ spaces to get columns (pandoc uses varying gaps) + cells = [c.strip() for c in re.split(r"\s{2,}", next_line.strip()) if c.strip()] + if cells: + table_rows.append(cells) + j += 1 + + if table_rows: + stats.code_blocks_fixed += 1 + result.append("") + result.extend(_build_pipe_table(table_rows)) + result.append("") + + i = j + continue + + # Not a simple table - look for content and 
closing dashed line + block_content = [] + lang_hint = "" + j = i + 1 + + while j < len(lines): + next_line = lines[j] + + if _RE_DASHED_LINE.match(next_line): + # Found closing dashed line + j += 1 + break + + block_content.append(next_line) + j += 1 + else: + # No closing dashed line found - not a block, keep as-is + result.append(line) + i += 1 + continue + + # If content contains images, treat as simple table (single-column) + has_images = any("![" in cl for cl in block_content) + if has_images: + result.append("") + for cl in block_content: + cl = cl.strip() + if cl: + result.append(cl) + result.append("") + i = j + continue + + # Check if first line is a language hint (e.g., " JSON\", " Plain Text\") + has_lang_hint = False + if block_content: + first = block_content[0].strip() + first_clean = first.rstrip("\\").strip() + if first_clean.lower() in known_langs: + lang_hint = first_clean.lower() + if lang_hint in ("plain text", "text", "plaintext"): + lang_hint = "" # No language tag for plain text + has_lang_hint = True + block_content = block_content[1:] + + # Clean content: remove leading 2-space indent, fix escaped quotes + cleaned = [] + for cl in block_content: + if cl.startswith(" "): + cl = cl[2:] + cl = cl.replace('\\"', '"') + if cl.endswith("\\"): + cl = cl[:-1] + cleaned.append(cl) + + # Remove trailing/leading empty lines + while cleaned and not cleaned[-1].strip(): + cleaned.pop() + while cleaned and not cleaned[0].strip(): + cleaned.pop(0) + + if cleaned: + stats.code_blocks_fixed += 1 + + # Decide: code block vs blockquote + if has_lang_hint or _is_code_content(cleaned): + # Code block + result.append("") + result.append(f"```{lang_hint}") + result.extend(cleaned) + result.append("```") + result.append("") + else: + # Note/callout -> blockquote + result.append("") + for cl in cleaned: + if cl.strip(): + result.append(f"> {cl}") + else: + result.append(">") + result.append("") + + i = j + else: + result.append(line) + i += 1 + + return 
def _fix_escaped_brackets(text: str, stats: PostProcessStats) -> str:
    r"""Unescape the brackets pandoc escaped: \[ becomes [ and \] becomes ].

    The number of replacements is stored in ``stats.escaped_brackets_fixed``
    when at least one bracket was unescaped.
    """
    unescaped, hits = _RE_ESCAPED_BRACKET.subn(r"\1", text)
    if hits > 0:
        stats.escaped_brackets_fixed = hits
    return unescaped
+ """ + result = [] + last_end = 0 + + for m in _RE_BOLD_PAIR.finditer(text): + start, end = m.start(), m.end() + result.append(text[last_end:start]) + + # Space before opening ** if preceded by CJK + if start > 0 and _RE_CJK_PUNCT.match(text[start - 1]): + result.append(' ') + + result.append(m.group(0)) + + # Space after closing ** if followed by CJK + if end < len(text) and _RE_CJK_PUNCT.match(text[end]): + result.append(' ') + + last_end = end + + result.append(text[last_end:]) + return ''.join(result) + + +def _cleanup_excessive_blank_lines(text: str) -> str: + """Collapse 3+ consecutive blank lines to 2.""" + return re.sub(r"\n{4,}", "\n\n\n", text) + + +def postprocess_docx_markdown( + text: str, + assets_dir: Optional[Path] = None, +) -> tuple[str, PostProcessStats]: + """Apply all DOCX-specific post-processing to pandoc markdown output. + + Returns (cleaned_text, stats). + """ + stats = PostProcessStats() + + # Order matters: grid tables first (they contain images with attributes) + text = _convert_grid_tables(text, stats) + text = _fix_image_paths(text, assets_dir, stats) + text = _clean_pandoc_attributes(text, stats) + text = _fix_code_blocks(text, stats) + text = _fix_double_bracket_links(text, stats) + text = _fix_escaped_brackets(text, stats) + text = _fix_cjk_bold_spacing(text) + text = _cleanup_excessive_blank_lines(text) + + return text, stats + + +# ── DOCX deep parsing (python-docx) ────────────────────────────────────────── + +def convert_with_docx_deep( + file_path: Path, assets_dir: Optional[Path] = None +) -> ConversionResult: + """Convert DOCX using python-docx direct parsing (experimental). 
+ + More precise than pandoc for: + - Table structure preservation + - Comment extraction + - Image extraction with position info + """ + try: + from docx import Document + from docx.opc.constants import RELATIONSHIP_TYPE as RT + except ImportError: + return ConversionResult( + markdown="", + tool="docx-deep", + success=False, + error="python-docx not installed. Run: pip install python-docx", + ) + + try: + doc = Document(str(file_path)) + md_parts = [] + images = [] + image_counter = 0 + + # Extract images from docx zip + if assets_dir: + assets_dir.mkdir(parents=True, exist_ok=True) + media_dir = assets_dir / "media" + media_dir.mkdir(exist_ok=True) + + with zipfile.ZipFile(str(file_path), "r") as zf: + for name in zf.namelist(): + if name.startswith("word/media/"): + img_name = Path(name).name + img_dest = media_dir / img_name + with zf.open(name) as src, open(img_dest, "wb") as dst: + dst.write(src.read()) + images.append(str(img_dest)) + + # Process paragraphs + for para in doc.paragraphs: + style_name = para.style.name if para.style else "" + text = para.text.strip() + + if not text: + md_parts.append("") + continue + + # Headings + if style_name.startswith("Heading"): + try: + level = int(style_name.split()[-1]) + except (ValueError, IndexError): + level = 1 + md_parts.append(f"{'#' * level} {text}") + md_parts.append("") + continue + + # Check for bold-only paragraphs (often sub-headings in Chinese docs) + all_bold = all(run.bold for run in para.runs if run.text.strip()) + if all_bold and para.runs and len(text) < 100: + md_parts.append(f"**{text}**") + md_parts.append("") + continue + + # Regular paragraph + md_parts.append(text) + md_parts.append("") + + # Process tables + for table in doc.tables: + md_parts.append("") + rows = table.rows + if not rows: + continue + + # Header row + header_cells = [cell.text.strip() for cell in rows[0].cells] + md_parts.append("| " + " | ".join(header_cells) + " |") + md_parts.append("| " + " | ".join(["---"] * 
def check_tool_available(tool: str) -> bool:
    """Report whether the named conversion tool can be used.

    ``pandoc`` is looked up on PATH; ``pymupdf4llm``, ``markitdown`` and
    ``docx-deep`` (python-docx) are probed by importing them. Any other
    name yields False.
    """
    if tool == "pandoc":
        return shutil.which("pandoc") is not None

    if tool not in ("pymupdf4llm", "markitdown", "docx-deep"):
        return False

    # One guarded import per Python-backed tool; an ImportError means the
    # package (or, for python-docx, its Document entry point) is missing.
    try:
        if tool == "pymupdf4llm":
            import pymupdf4llm
        elif tool == "markitdown":
            import markitdown
        else:
            from docx import Document
    except ImportError:
        return False
    return True
def convert_with_pandoc(
    file_path: Path, assets_dir: Optional[Path] = None
) -> ConversionResult:
    """Convert a document to markdown by running the pandoc executable.

    When ``assets_dir`` is given, embedded media is extracted with
    ``--extract-media``. Pandoc writes into a ``media/`` subdirectory of the
    directory it is given, so if ``assets_dir`` is itself named ``media`` its
    parent is passed instead; otherwise the files land in
    ``assets_dir/media``. Occurrences of that extraction path in the output
    are rewritten to the relative prefix ``media/``.

    Args:
        file_path: Document to convert.
        assets_dir: Optional directory for extracted media; created if missing.

    Returns:
        A ``ConversionResult`` holding the markdown and the images found
        under ``assets_dir`` on success, or ``success=False`` with an error
        message when pandoc exits non-zero or anything raises (missing
        binary, timeout, ...).
    """
    try:
        cmd = ["pandoc", str(file_path), "-t", "markdown", "--wrap=none"]

        extract_dir = None
        if assets_dir:
            assets_dir.mkdir(parents=True, exist_ok=True)
            # Point pandoc at the parent when assets_dir is already "media",
            # so its own media/ subdirectory lands exactly at assets_dir.
            if assets_dir.name == "media":
                extract_dir = assets_dir.parent
            else:
                extract_dir = assets_dir
            cmd.extend(["--extract-media", str(extract_dir)])

        # Pandoc always emits UTF-8. Decode explicitly instead of relying on
        # the locale encoding, which breaks CJK text on non-UTF-8 locales.
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            encoding="utf-8",
            timeout=120,
        )

        if result.returncode != 0:
            return ConversionResult(
                markdown="", tool="pandoc", success=False, error=result.stderr
            )

        md = result.stdout

        # Rewrite the extraction path to the relative 'media/' prefix.
        if extract_dir:
            abs_media = str(extract_dir / "media")
            md = md.replace(abs_media + "/", "media/")

        images = _collect_images(assets_dir) if assets_dir else []

        return ConversionResult(
            markdown=md, tool="pandoc", images=images, success=True
        )
    except Exception as e:
        return ConversionResult(
            markdown="", tool="pandoc", success=False, error=str(e)
        )
assets_dir) + + +def merge_results(results: list[ConversionResult]) -> ConversionResult: + """Merge results from multiple tools, selecting best segments.""" + if not results: + return ConversionResult(markdown="", tool="none", success=False) + + # Filter successful results + successful = [r for r in results if r.success and r.markdown.strip()] + if not successful: + # Return first error + return results[0] if results else ConversionResult( + markdown="", tool="none", success=False + ) + + if len(successful) == 1: + return successful[0] + + # Multiple successful results - merge them + # Strategy: Compare key metrics and select best + best = successful[0] + best_score = score_markdown(best.markdown) + + for result in successful[1:]: + score = score_markdown(result.markdown) + if score > best_score: + best = result + best_score = score + + # Merge images from all results + all_images = [] + seen = set() + for result in successful: + for img in result.images: + if img not in seen: + all_images.append(img) + seen.add(img) + + best.images = all_images + best.tool = f"merged({','.join(r.tool for r in successful)})" + + return best + + +def score_markdown(md: str) -> float: + """Score markdown quality for comparison.""" + score = 0.0 + + # Length (more content is generally better) + score += min(len(md) / 10000, 5.0) # Cap at 5 points + + # Tables (proper markdown tables) + table_count = md.count("|---|") + md.count("| ---") + score += min(table_count * 0.5, 3.0) + + # Images (referenced images) + image_count = md.count("![") + score += min(image_count * 0.3, 2.0) + + # Headings (proper hierarchy) + h1_count = md.count("\n# ") + h2_count = md.count("\n## ") + h3_count = md.count("\n### ") + if h1_count > 0 and h2_count >= h1_count: + score += 1.0 # Good hierarchy + + # Lists (structured content) + list_count = md.count("\n- ") + md.count("\n* ") + md.count("\n1. 
") + score += min(list_count * 0.1, 2.0) + + # Penalize pandoc artifacts (grid tables, attributes) + artifact_count = md.count("+:---") + md.count("+---+") + artifact_count += md.count('{width="') + md.count("{.underline}") + score -= artifact_count * 0.5 + + return score + + +def main(): + parser = argparse.ArgumentParser( + description="Convert documents to markdown with multi-tool orchestration", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Quick mode (default) + python convert.py document.pdf -o output.md + + # Heavy mode (best quality) + python convert.py document.pdf -o output.md --heavy + + # DOCX deep mode (python-docx parsing) + python convert.py document.docx -o output.md --docx-deep + + # With custom assets directory + python convert.py document.pdf -o output.md --assets-dir ./images + """, + ) + parser.add_argument("input", type=Path, nargs="?", help="Input document path") + parser.add_argument( + "-o", "--output", type=Path, help="Output markdown file" + ) + parser.add_argument( + "--heavy", + action="store_true", + help="Enable Heavy Mode (multi-tool, best quality)", + ) + parser.add_argument( + "--docx-deep", + action="store_true", + help="Use python-docx direct parsing (experimental, DOCX only)", + ) + parser.add_argument( + "--no-postprocess", + action="store_true", + help="Disable DOCX post-processing (keep raw pandoc output)", + ) + parser.add_argument( + "--assets-dir", + type=Path, + default=None, + help="Directory for extracted images (default: _assets/)", + ) + parser.add_argument( + "--tool", + choices=["pymupdf4llm", "markitdown", "pandoc", "docx-deep"], + help="Force specific tool (overrides auto-selection)", + ) + parser.add_argument( + "--list-tools", + action="store_true", + help="List available tools and exit", + ) + + args = parser.parse_args() + + # List tools mode + if args.list_tools: + tools = ["pymupdf4llm", "markitdown", "pandoc", "docx-deep"] + print("Available conversion tools:") + for 
tool in tools: + status = "+" if check_tool_available(tool) else "-" + print(f" {status} {tool}") + sys.exit(0) + + # Validate input + if args.input is None: + parser.error("the following arguments are required: input") + if not args.input.exists(): + print(f"Error: Input file not found: {args.input}", file=sys.stderr) + sys.exit(1) + + # Determine output path + output_path = args.output or args.input.with_suffix(".md") + + # Determine assets directory + assets_dir = args.assets_dir + if assets_dir is None: + assets_dir = output_path.parent / f"{output_path.stem}_assets" + + is_docx = args.input.suffix.lower() in (".docx", ".doc") + + # Handle --docx-deep mode + if args.docx_deep: + if not is_docx: + print("Error: --docx-deep only works with DOCX files.", file=sys.stderr) + sys.exit(1) + tools = ["docx-deep"] + elif args.tool: + tools = [args.tool] if check_tool_available(args.tool) else [] + else: + # Select tools + mode = "heavy" if args.heavy else "quick" + tools = select_tools(args.input, mode) + + mode = "docx-deep" if args.docx_deep else ("heavy" if args.heavy else "quick") + + if not tools: + print("Error: No conversion tools available.", file=sys.stderr) + print("Install with:", file=sys.stderr) + print(" pip install pymupdf4llm", file=sys.stderr) + print(" uv tool install markitdown[pdf]", file=sys.stderr) + print(" brew install pandoc", file=sys.stderr) + sys.exit(1) + + print(f"Converting: {args.input}") + print(f"Mode: {mode.upper()}") + print(f"Tools: {', '.join(tools)}") + + # Run conversions + results = [] + for tool in tools: + print(f" Running {tool}...", end=" ", flush=True) + + # Use separate assets dirs for each tool in heavy mode + tool_assets = None + if assets_dir and mode == "heavy" and len(tools) > 1: + tool_assets = assets_dir / tool + elif assets_dir: + tool_assets = assets_dir + + result = convert_single(args.input, tool, tool_assets) + results.append(result) + + if result.success: + print(f"ok ({len(result.markdown):,} chars, 
{len(result.images)} images)") + else: + print(f"FAIL ({result.error[:50]}...)") + + # Merge results if heavy mode + if mode == "heavy" and len(results) > 1: + print(" Merging results...", end=" ", flush=True) + final = merge_results(results) + print(f"ok (using {final.tool})") + else: + final = merge_results(results) + + if not final.success: + print(f"Error: Conversion failed: {final.error}", file=sys.stderr) + sys.exit(1) + + # Apply DOCX post-processing + if is_docx and not args.no_postprocess and "pandoc" in final.tool: + print(" Post-processing DOCX output...", end=" ", flush=True) + final.markdown, pp_stats = postprocess_docx_markdown( + final.markdown, assets_dir + ) + print(f"ok ({pp_stats.summary()})") + + # Write output + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(final.markdown) + + print(f"\nOutput: {output_path}") + print(f" Size: {len(final.markdown):,} characters") + if final.images: + print(f" Images: {len(final.images)} extracted") + + +if __name__ == "__main__": + main() diff --git a/markdown-tools/scripts/convert_path.py b/doc-to-markdown/scripts/convert_path.py similarity index 100% rename from markdown-tools/scripts/convert_path.py rename to doc-to-markdown/scripts/convert_path.py diff --git a/markdown-tools/scripts/extract_pdf_images.py b/doc-to-markdown/scripts/extract_pdf_images.py similarity index 100% rename from markdown-tools/scripts/extract_pdf_images.py rename to doc-to-markdown/scripts/extract_pdf_images.py diff --git a/markdown-tools/scripts/merge_outputs.py b/doc-to-markdown/scripts/merge_outputs.py similarity index 100% rename from markdown-tools/scripts/merge_outputs.py rename to doc-to-markdown/scripts/merge_outputs.py diff --git a/markdown-tools/scripts/validate_output.py b/doc-to-markdown/scripts/validate_output.py similarity index 100% rename from markdown-tools/scripts/validate_output.py rename to doc-to-markdown/scripts/validate_output.py diff --git a/markdown-tools/scripts/convert.py 
b/markdown-tools/scripts/convert.py deleted file mode 100755 index 9ac6f36..0000000 --- a/markdown-tools/scripts/convert.py +++ /dev/null @@ -1,434 +0,0 @@ -#!/usr/bin/env python3 -""" -Multi-tool document to markdown converter with intelligent orchestration. - -Supports Quick Mode (fast, single tool) and Heavy Mode (best quality, multi-tool merge). - -Usage: - # Quick Mode (default) - fast, single best tool - uv run --with pymupdf4llm --with markitdown scripts/convert.py document.pdf -o output.md - - # Heavy Mode - multi-tool parallel execution with merge - uv run --with pymupdf4llm --with markitdown scripts/convert.py document.pdf -o output.md --heavy - - # With image extraction - uv run --with pymupdf4llm scripts/convert.py document.pdf -o output.md --assets-dir ./images - -Dependencies: - - pymupdf4llm: PDF conversion (LLM-optimized) - - markitdown: PDF/DOCX/PPTX conversion - - pandoc: DOCX/PPTX conversion (system install: brew install pandoc) -""" - -import argparse -import subprocess -import sys -import tempfile -import shutil -from dataclasses import dataclass, field -from pathlib import Path -from typing import Optional - - -@dataclass -class ConversionResult: - """Result from a single tool conversion.""" - markdown: str - tool: str - images: list[str] = field(default_factory=list) - success: bool = True - error: str = "" - - -def check_tool_available(tool: str) -> bool: - """Check if a conversion tool is available.""" - if tool == "pymupdf4llm": - try: - import pymupdf4llm - return True - except ImportError: - return False - elif tool == "markitdown": - try: - import markitdown - return True - except ImportError: - return False - elif tool == "pandoc": - return shutil.which("pandoc") is not None - return False - - -def select_tools(file_path: Path, mode: str) -> list[str]: - """Select conversion tools based on file type and mode.""" - ext = file_path.suffix.lower() - - # Tool preferences by format - tool_map = { - ".pdf": { - "quick": ["pymupdf4llm", 
"markitdown"], # fallback order - "heavy": ["pymupdf4llm", "markitdown"], - }, - ".docx": { - "quick": ["pandoc", "markitdown"], - "heavy": ["pandoc", "markitdown"], - }, - ".doc": { - "quick": ["pandoc", "markitdown"], - "heavy": ["pandoc", "markitdown"], - }, - ".pptx": { - "quick": ["markitdown", "pandoc"], - "heavy": ["markitdown", "pandoc"], - }, - ".xlsx": { - "quick": ["markitdown"], - "heavy": ["markitdown"], - }, - } - - tools = tool_map.get(ext, {"quick": ["markitdown"], "heavy": ["markitdown"]}) - - if mode == "quick": - # Return first available tool - for tool in tools["quick"]: - if check_tool_available(tool): - return [tool] - return [] - else: # heavy - # Return all available tools - return [t for t in tools["heavy"] if check_tool_available(t)] - - -def convert_with_pymupdf4llm( - file_path: Path, assets_dir: Optional[Path] = None -) -> ConversionResult: - """Convert using PyMuPDF4LLM (best for PDFs).""" - try: - import pymupdf4llm - - kwargs = {} - images = [] - - if assets_dir: - assets_dir.mkdir(parents=True, exist_ok=True) - kwargs["write_images"] = True - kwargs["image_path"] = str(assets_dir) - kwargs["dpi"] = 150 - - # Use best table detection strategy - kwargs["table_strategy"] = "lines_strict" - - md_text = pymupdf4llm.to_markdown(str(file_path), **kwargs) - - # Collect extracted images - if assets_dir and assets_dir.exists(): - images = [str(p) for p in assets_dir.glob("*.png")] - images.extend([str(p) for p in assets_dir.glob("*.jpg")]) - - return ConversionResult( - markdown=md_text, tool="pymupdf4llm", images=images, success=True - ) - except Exception as e: - return ConversionResult( - markdown="", tool="pymupdf4llm", success=False, error=str(e) - ) - - -def convert_with_markitdown( - file_path: Path, assets_dir: Optional[Path] = None -) -> ConversionResult: - """Convert using markitdown.""" - try: - # markitdown CLI approach - result = subprocess.run( - ["markitdown", str(file_path)], - capture_output=True, - text=True, - timeout=120, 
- ) - - if result.returncode != 0: - return ConversionResult( - markdown="", - tool="markitdown", - success=False, - error=result.stderr, - ) - - return ConversionResult( - markdown=result.stdout, tool="markitdown", success=True - ) - except FileNotFoundError: - # Try Python API - try: - from markitdown import MarkItDown - - md = MarkItDown() - result = md.convert(str(file_path)) - return ConversionResult( - markdown=result.text_content, tool="markitdown", success=True - ) - except Exception as e: - return ConversionResult( - markdown="", tool="markitdown", success=False, error=str(e) - ) - except Exception as e: - return ConversionResult( - markdown="", tool="markitdown", success=False, error=str(e) - ) - - -def convert_with_pandoc( - file_path: Path, assets_dir: Optional[Path] = None -) -> ConversionResult: - """Convert using pandoc.""" - try: - cmd = ["pandoc", str(file_path), "-t", "markdown", "--wrap=none"] - - if assets_dir: - assets_dir.mkdir(parents=True, exist_ok=True) - cmd.extend(["--extract-media", str(assets_dir)]) - - result = subprocess.run( - cmd, capture_output=True, text=True, timeout=120 - ) - - if result.returncode != 0: - return ConversionResult( - markdown="", tool="pandoc", success=False, error=result.stderr - ) - - images = [] - if assets_dir and assets_dir.exists(): - images = [str(p) for p in assets_dir.rglob("*.png")] - images.extend([str(p) for p in assets_dir.rglob("*.jpg")]) - - return ConversionResult( - markdown=result.stdout, tool="pandoc", images=images, success=True - ) - except Exception as e: - return ConversionResult( - markdown="", tool="pandoc", success=False, error=str(e) - ) - - -def convert_single( - file_path: Path, tool: str, assets_dir: Optional[Path] = None -) -> ConversionResult: - """Run a single conversion tool.""" - converters = { - "pymupdf4llm": convert_with_pymupdf4llm, - "markitdown": convert_with_markitdown, - "pandoc": convert_with_pandoc, - } - - converter = converters.get(tool) - if not converter: - return 
ConversionResult( - markdown="", tool=tool, success=False, error=f"Unknown tool: {tool}" - ) - - return converter(file_path, assets_dir) - - -def merge_results(results: list[ConversionResult]) -> ConversionResult: - """Merge results from multiple tools, selecting best segments.""" - if not results: - return ConversionResult(markdown="", tool="none", success=False) - - # Filter successful results - successful = [r for r in results if r.success and r.markdown.strip()] - if not successful: - # Return first error - return results[0] if results else ConversionResult( - markdown="", tool="none", success=False - ) - - if len(successful) == 1: - return successful[0] - - # Multiple successful results - merge them - # Strategy: Compare key metrics and select best - best = successful[0] - best_score = score_markdown(best.markdown) - - for result in successful[1:]: - score = score_markdown(result.markdown) - if score > best_score: - best = result - best_score = score - - # Merge images from all results - all_images = [] - seen = set() - for result in successful: - for img in result.images: - if img not in seen: - all_images.append(img) - seen.add(img) - - best.images = all_images - best.tool = f"merged({','.join(r.tool for r in successful)})" - - return best - - -def score_markdown(md: str) -> float: - """Score markdown quality for comparison.""" - score = 0.0 - - # Length (more content is generally better) - score += min(len(md) / 10000, 5.0) # Cap at 5 points - - # Tables (proper markdown tables) - table_count = md.count("|---|") + md.count("| ---") - score += min(table_count * 0.5, 3.0) - - # Images (referenced images) - image_count = md.count("![") - score += min(image_count * 0.3, 2.0) - - # Headings (proper hierarchy) - h1_count = md.count("\n# ") - h2_count = md.count("\n## ") - h3_count = md.count("\n### ") - if h1_count > 0 and h2_count >= h1_count: - score += 1.0 # Good hierarchy - - # Lists (structured content) - list_count = md.count("\n- ") + md.count("\n* ") + 
md.count("\n1. ") - score += min(list_count * 0.1, 2.0) - - return score - - -def main(): - parser = argparse.ArgumentParser( - description="Convert documents to markdown with multi-tool orchestration", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" -Examples: - # Quick mode (default) - python convert.py document.pdf -o output.md - - # Heavy mode (best quality) - python convert.py document.pdf -o output.md --heavy - - # With custom assets directory - python convert.py document.pdf -o output.md --assets-dir ./images - """, - ) - parser.add_argument("input", type=Path, help="Input document path") - parser.add_argument( - "-o", "--output", type=Path, help="Output markdown file" - ) - parser.add_argument( - "--heavy", - action="store_true", - help="Enable Heavy Mode (multi-tool, best quality)", - ) - parser.add_argument( - "--assets-dir", - type=Path, - default=None, - help="Directory for extracted images (default: _assets/)", - ) - parser.add_argument( - "--tool", - choices=["pymupdf4llm", "markitdown", "pandoc"], - help="Force specific tool (overrides auto-selection)", - ) - parser.add_argument( - "--list-tools", - action="store_true", - help="List available tools and exit", - ) - - args = parser.parse_args() - - # List tools mode - if args.list_tools: - tools = ["pymupdf4llm", "markitdown", "pandoc"] - print("Available conversion tools:") - for tool in tools: - status = "✓" if check_tool_available(tool) else "✗" - print(f" {status} {tool}") - sys.exit(0) - - # Validate input - if not args.input.exists(): - print(f"Error: Input file not found: {args.input}", file=sys.stderr) - sys.exit(1) - - # Determine output path - output_path = args.output or args.input.with_suffix(".md") - - # Determine assets directory - assets_dir = args.assets_dir - if assets_dir is None and args.heavy: - assets_dir = output_path.parent / f"{output_path.stem}_assets" - - # Select tools - mode = "heavy" if args.heavy else "quick" - if args.tool: - tools = [args.tool] if 
check_tool_available(args.tool) else [] - else: - tools = select_tools(args.input, mode) - - if not tools: - print("Error: No conversion tools available.", file=sys.stderr) - print("Install with:", file=sys.stderr) - print(" pip install pymupdf4llm", file=sys.stderr) - print(" uv tool install markitdown[pdf]", file=sys.stderr) - print(" brew install pandoc", file=sys.stderr) - sys.exit(1) - - print(f"Converting: {args.input}") - print(f"Mode: {mode.upper()}") - print(f"Tools: {', '.join(tools)}") - - # Run conversions - results = [] - for tool in tools: - print(f" Running {tool}...", end=" ", flush=True) - - # Use separate assets dirs for each tool in heavy mode - tool_assets = None - if assets_dir and mode == "heavy" and len(tools) > 1: - tool_assets = assets_dir / tool - elif assets_dir: - tool_assets = assets_dir - - result = convert_single(args.input, tool, tool_assets) - results.append(result) - - if result.success: - print(f"✓ ({len(result.markdown):,} chars, {len(result.images)} images)") - else: - print(f"✗ ({result.error[:50]}...)") - - # Merge results if heavy mode - if mode == "heavy" and len(results) > 1: - print(" Merging results...", end=" ", flush=True) - final = merge_results(results) - print(f"✓ (using {final.tool})") - else: - final = merge_results(results) - - if not final.success: - print(f"Error: Conversion failed: {final.error}", file=sys.stderr) - sys.exit(1) - - # Write output - output_path.parent.mkdir(parents=True, exist_ok=True) - output_path.write_text(final.markdown) - - print(f"\nOutput: {output_path}") - print(f" Size: {len(final.markdown):,} characters") - if final.images: - print(f" Images: {len(final.images)} extracted") - - -if __name__ == "__main__": - main()