diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index 76986aa..2286d3d 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -50,25 +50,23 @@
]
},
{
- "name": "markdown-tools",
- "description": "Convert documents (PDFs, Word, PowerPoint) to high-quality markdown with multi-tool orchestration. Supports Quick Mode (fast, single tool) and Heavy Mode (best quality, multi-tool merge with segment-level selection). Features PyMuPDF4LLM for LLM-optimized PDF conversion, pandoc for DOCX/PPTX structure preservation, quality validation with HTML reports, and image extraction with metadata",
+ "name": "doc-to-markdown",
+ "description": "Converts DOCX/PDF/PPTX to high-quality Markdown with automatic post-processing. Fixes pandoc grid tables, image paths, attribute noise, and code blocks. Supports Quick Mode (fast, single tool) and Heavy Mode (best quality, multi-tool merge). Trigger on \"convert document\", \"docx to markdown\", \"parse word\", \"doc to markdown\", \"extract images from document\".",
"source": "./",
"strict": false,
- "version": "1.2.0",
+ "version": "2.0.0",
"category": "document-conversion",
"keywords": [
"markdown",
- "pdf",
"docx",
+ "pdf",
"pptx",
- "pymupdf4llm",
+ "converter",
"pandoc",
- "markitdown",
- "heavy-mode",
- "quality-validation"
+ "document"
],
"skills": [
- "./markdown-tools"
+ "./doc-to-markdown"
]
},
{
@@ -942,4 +940,4 @@
]
}
]
 }
diff --git a/markdown-tools/SKILL.md b/doc-to-markdown/SKILL.md
similarity index 74%
rename from markdown-tools/SKILL.md
rename to doc-to-markdown/SKILL.md
index 8bc71f4..966c9fc 100644
--- a/markdown-tools/SKILL.md
+++ b/doc-to-markdown/SKILL.md
@@ -1,11 +1,11 @@
---
-name: markdown-tools
-description: Converts documents to markdown with multi-tool orchestration for best quality. Supports Quick Mode (fast, single tool) and Heavy Mode (best quality, multi-tool merge). Use when converting PDF/DOCX/PPTX files to markdown, extracting images from documents, validating conversion quality, or needing LLM-optimized document output.
+name: doc-to-markdown
+description: Converts DOCX/PDF/PPTX to high-quality Markdown with automatic post-processing. Fixes pandoc grid tables, image paths, attribute noise, and code blocks. Supports Quick Mode (fast, single tool) and Heavy Mode (best quality, multi-tool merge). Trigger on "convert document", "docx to markdown", "parse word", "doc to markdown", "extract images from document".
---
-# Markdown Tools
+# Doc to Markdown
-Convert documents to high-quality markdown with intelligent multi-tool orchestration.
+Convert documents to high-quality markdown with intelligent multi-tool orchestration and automatic DOCX post-processing.
## Dual Mode Architecture
@@ -34,6 +34,9 @@ uv run --with pymupdf4llm --with markitdown scripts/convert.py document.pdf -o o
# Heavy Mode - multi-tool parallel execution with merge
uv run --with pymupdf4llm --with markitdown scripts/convert.py document.pdf -o output.md --heavy
+# DOCX with deep python-docx parsing (experimental)
+uv run --with pymupdf4llm --with markitdown --with python-docx scripts/convert.py document.docx -o output.md --docx-deep
+
# Check available tools
uv run scripts/convert.py --list-tools
```
@@ -43,7 +46,7 @@ uv run scripts/convert.py --list-tools
| Format | Quick Mode Tool | Heavy Mode Tools |
|--------|----------------|------------------|
| PDF | pymupdf4llm | pymupdf4llm + markitdown |
-| DOCX | pandoc | pandoc + markitdown |
+| DOCX | pandoc + post-processing | pandoc + markitdown |
| PPTX | markitdown | markitdown + pandoc |
| XLSX | markitdown | markitdown |
@@ -53,6 +56,21 @@ uv run scripts/convert.py --list-tools
- **markitdown**: Microsoft's universal converter, good for Office formats
- **pandoc**: Excellent structure preservation for DOCX/PPTX
+## DOCX Post-Processing (automatic)
+
+When converting DOCX files via pandoc, the following cleanups are applied automatically:
+
+| Problem | Fix |
+|---------|-----|
+| Grid tables (`+:---+` syntax) | Single-column -> blockquote, multi-column -> split images |
+| Image double path (`media/media/`) | Flatten to `media/` |
+| Pandoc attributes (`{width="..." height="..."}`) | Removed |
+| Inline classes (`{.underline}`, `{.mark}`) | Removed |
+| Indented dashed code blocks | Converted to fenced code blocks (```) |
+| Escaped brackets (`\[...\]`) | Unescaped to `[...]` |
+| Double-bracket links (`[[text]{...}](url)`) | Simplified to `[text](url)` |
+| Escaped quotes in code (`\"`) | Fixed to `"` |
+
## Heavy Mode Workflow
Heavy Mode runs multiple tools in parallel and selects the best segments:
@@ -117,7 +135,7 @@ python scripts/merge_outputs.py output1.md output2.md -o merged.md --verbose
## Path Conversion (Windows/WSL)
```bash
-# Windows → WSL conversion
+# Windows to WSL conversion
python scripts/convert_path.py "C:\Users\name\Documents\file.pdf"
# Output: /mnt/c/Users/name/Documents/file.pdf
```
@@ -147,7 +165,7 @@ brew install pandoc
| Script | Purpose |
|--------|---------|
-| `convert.py` | Main orchestrator with Quick/Heavy mode |
+| `convert.py` | Main orchestrator with Quick/Heavy mode + DOCX post-processing |
| `merge_outputs.py` | Merge multiple markdown outputs |
| `validate_output.py` | Quality validation with HTML report |
| `extract_pdf_images.py` | PDF image extraction with metadata |
diff --git a/doc-to-markdown/references/benchmark-2026-03-22.md b/doc-to-markdown/references/benchmark-2026-03-22.md
new file mode 100644
index 0000000..27b2069
--- /dev/null
+++ b/doc-to-markdown/references/benchmark-2026-03-22.md
@@ -0,0 +1,163 @@
+# DOCX→Markdown 转换方案基准测试
+
+> **测试日期**:2026-03-22
+>
+> **测试文件**:`助教-【腾讯云🦞】小白实践 OpenClaw 保姆级教程.docx`(19MB,77 张图片,含 grid table 布局、JSON 代码块、多列图片并排、信息框)
+>
+> **测试方法**:5 个方案对同一文件转换,按 5 个维度各 10 分制打分
+
+---
+
+## 综合评分
+
+| 维度 | Docling (IBM) | MarkItDown (MS) | Pandoc | Mammoth | **doc-to-markdown(我们)** |
+|------|:---:|:---:|:---:|:---:|:---:|
+| 表格质量 | 5 | 3 | 5 | 1~3 | **6** |
+| 图片提取 | 4 | 2 | **10** | 5 | 7 |
+| 文本完整性 | 8 | 7 | **9** | 7 | **9** |
+| 格式清洁度 | 5 | 5 | 5 | 3 | **7** |
+| 代码块 | 2 | 1 | N/A | 1 | **9** |
+| **综合** | **4.8** | **3.6** | **7.3** | **3.4~3.6** | **7.6** |
+
+---
+
+## 各方案详细分析
+
+### 1. IBM Docling(综合 4.8)
+
+- **版本**:docling 2.x + Granite-Docling-258M
+- **架构**:AI 驱动(VLM 视觉语言模型),DocTags 中间格式 → Markdown
+
+**致命问题**:
+- 图片引用全部是 `` 占位符(77 张图 0 张可显示),`ImageRefMode` API 对 DOCX 不可用
+- 标题层级全部丢失(0 个 `#`),所有标题退化为粗体文本
+- 代码块为零,JSON 和命令全部输出为普通段落
+- `api_key` 被错误转义为 `api\_key`
+
+**优点**:
+- 文本内容完整,中文/emoji/链接保留良好
+- 无 grid table 或 HTML 残留
+- 表格语法正确(pipe table),但内容是占位符
+
+**结论**:Docling 的优势在 PDF(AAAI 2025 论文场景),DOCX 支持远未达到生产级别。
+
+### 2. Microsoft MarkItDown(综合 3.6)
+
+- **版本**:markitdown 0.1.5
+- **架构**:底层调用 mammoth → HTML → markdownify → Markdown
+
+**致命问题**:
+- 77 张图片全部是截断的 base64 占位符(`data:image/png;base64,...`),默认 `keep_data_uris=False` 主动丢弃图片数据
+- 标题全部变为粗体文本(mammoth 无法识别 WPS 自定义样式)
+- 代码块为零,JSON 被塞入表格单元格
+- 有序列表编号全部错误(输出为 `1. 1. 1.`)
+
+**优点**:
+- 无 HTML 标签残留
+- 文本内容基本完整
+
+**结论**:MarkItDown 的 markdownify 后处理反而引入破坏性截断。轻量场景可用,复杂 DOCX 不可靠。
+
+### 3. Pandoc(综合 7.3)
+
+- **版本**:pandoc 3.9
+- **架构**:Haskell 原生 AST 解析,支持 60+ 格式
+
+**测试了 3 种参数**:
+
+| 参数 | 结果 |
+|------|------|
+| `-t gfm` | 最差:24 个 HTML `<table>` 嵌套,74 个 HTML `<br>` |
+| `-t markdown` | 最佳:grid table(可后处理),无 HTML |
+| `-t markdown-raw_html-...` | 与 markdown 完全相同,参数无效果 |
+
+**问题**:
+- Grid table 不可避免(原 docx 有多行单元格和嵌套表格,pipe table 无法表达)
+- `{width="..." height="..."}` 68 处
+- `{.underline}` 6 处
+- 反斜杠过度转义 37 处
+
+**优点**:
+- 图片提取 10/10(77 张全部正确,路径结构一致)
+- 文本完整性 9/10(内容、链接、emoji 全部保留)
+- 最成熟稳定的底层引擎
+
+**结论**:Pandoc 是最可靠的底层引擎,输出质量最高但需要后处理清洗 pandoc 私有语法。
+
+### 4. Mammoth(综合 3.4~3.6)
+
+- **版本**:mammoth 1.11.0
+- **架构**:python-docx 解析 → HTML/Markdown(Markdown 支持已废弃)
+
+**测试了 2 种方式**:
+
+| 方式 | 综合 |
+|------|------|
+| 方式A:直接转 Markdown | 3.4(表格完全丢失) |
+| 方式B:转 HTML → markdownify | 3.6(有表格但嵌套被压扁) |
+
+**致命问题**:
+- 标题全部丢失(WPS `styles.xml` 中样式定义为空,mammoth 无法映射 Heading)
+- 代码块为零
+- 图片全部 base64 内嵌,单文件 28MB
+- 方式B 中 markdownify 丢失 14 张图片(63/77)
+
+**结论**:Mammoth 的 Markdown 支持已废弃,对 WPS 导出的 docx 兼容性差。不推荐。
+
+### 5. doc-to-markdown / 我们的方案(综合 7.6)
+
+- **版本**:doc-to-markdown 1.0(基于 pandoc + 6 个后处理函数)
+- **架构**:Pandoc 转换 → 自动后处理(grid table 清理、图片路径修复、属性清理、代码块修复、转义修复)
+
+**后处理实际效果**:
+
+| 后处理函数 | 修复数量 |
+|-----------|---------|
+| `_convert_grid_tables` | 11 处 grid table → pipe table / blockquote |
+| `_clean_pandoc_attributes` | 3437 字符属性清理 |
+| `_fix_code_blocks` | 22 处缩进虚线 → ``` 代码块 |
+| `_fix_escaped_brackets` | 10 处 |
+| `_fix_double_bracket_links` | 1 处 |
+| `_fix_image_paths` | 77 张图片路径修复 |
+
+**已知问题(待修复)**:
+- 图片路径双层嵌套 bug:`--assets-dir` 指定目录内被 pandoc 再建一层 `media/`
+- 2 处 grid table 残留(文末并排图片组未完全转换)
+
+**优点**:
+- 代码块识别 9/10(JSON 带语言标签,命令行正确包裹)
+- 格式清洁度 7/10(attributes、转义、grid table 大部分清理干净)
+- 文本完整性 9/10(关键内容全部保留)
+
+**结论**:综合最优,核心价值在 pandoc 后处理层。剩余 2 个 bug 可修。
+
+---
+
+## 架构决策
+
+```
+最终方案:Pandoc(底层引擎)+ doc-to-markdown 后处理(增值层)
+
+理由:
+1. Pandoc 图片提取最可靠(10/10),文本最完整(9/10)
+2. Pandoc 的问题(grid table、属性、转义)全部可后处理解决
+3. Docling/MarkItDown/Mammoth 的致命问题(图片丢失、标题丢失)无法后处理修复
+4. 后处理层是我们的核心竞争力,成本低、可迭代
+```
+
+---
+
+## 测试文件特征
+
+本次测试文件的难点在于:
+
+| 特征 | 说明 | 影响 |
+|------|------|------|
+| WPS 导出 | 非标准 Word 样式(Style ID 2/3 而非 Heading 1/2) | mammoth/markitdown/docling 标题全丢 |
+| 多列图片布局 | 2x2、1x4 图片网格用表格排版 | pandoc 输出 grid table |
+| 信息框/提示框 | 单列表格包裹文字 | pandoc 输出 grid table |
+| 嵌套表格 | 表格内套表格 | pipe table 无法表达 |
+| JSON 代码块 | 非代码块样式,用文本框/缩进表示 | 多数工具无法识别为代码 |
+| 19MB 文件 | 77 张截图嵌入 | base64 方案导致 28MB 输出 |
+
+这些特征代表了真实世界中 WPS/飞书文档导出 docx 的典型困难,是有效的基准测试场景。
diff --git a/markdown-tools/references/conversion-examples.md b/doc-to-markdown/references/conversion-examples.md
similarity index 100%
rename from markdown-tools/references/conversion-examples.md
rename to doc-to-markdown/references/conversion-examples.md
diff --git a/markdown-tools/references/heavy-mode-guide.md b/doc-to-markdown/references/heavy-mode-guide.md
similarity index 100%
rename from markdown-tools/references/heavy-mode-guide.md
rename to doc-to-markdown/references/heavy-mode-guide.md
diff --git a/markdown-tools/references/tool-comparison.md b/doc-to-markdown/references/tool-comparison.md
similarity index 100%
rename from markdown-tools/references/tool-comparison.md
rename to doc-to-markdown/references/tool-comparison.md
diff --git a/doc-to-markdown/scripts/convert.py b/doc-to-markdown/scripts/convert.py
new file mode 100755
index 0000000..a95eb86
--- /dev/null
+++ b/doc-to-markdown/scripts/convert.py
@@ -0,0 +1,1150 @@
+#!/usr/bin/env python3
+"""
+Multi-tool document to markdown converter with intelligent orchestration.
+
+Supports Quick Mode (fast, single tool) and Heavy Mode (best quality, multi-tool merge).
+DOCX files get automatic post-processing to fix pandoc artifacts.
+
+Usage:
+ # Quick Mode (default) - fast, single best tool
+ uv run --with pymupdf4llm --with markitdown scripts/convert.py document.pdf -o output.md
+
+ # Heavy Mode - multi-tool parallel execution with merge
+ uv run --with pymupdf4llm --with markitdown scripts/convert.py document.pdf -o output.md --heavy
+
+ # DOCX deep mode - python-docx direct parsing (experimental)
+ uv run --with python-docx scripts/convert.py document.docx -o output.md --docx-deep
+
+ # With image extraction
+ uv run --with pymupdf4llm scripts/convert.py document.pdf -o output.md --assets-dir ./images
+
+Dependencies:
+ - pymupdf4llm: PDF conversion (LLM-optimized)
+ - markitdown: PDF/DOCX/PPTX conversion
+ - pandoc: DOCX/PPTX conversion (system install: brew install pandoc)
+ - python-docx: DOCX deep parsing (optional, for --docx-deep)
+"""
+
+import argparse
+import re
+import subprocess
+import sys
+import shutil
+import zipfile
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+
+
+@dataclass
+class ConversionResult:
+ """Result from a single tool conversion."""
+ markdown: str
+ tool: str
+ images: list[str] = field(default_factory=list)
+ success: bool = True
+ error: str = ""
+
+
+# ── Post-processing stats ────────────────────────────────────────────────────
+
+@dataclass
+class PostProcessStats:
+ """Track what the DOCX post-processor fixed."""
+ grid_tables_converted: int = 0
+ image_paths_fixed: int = 0
+ attributes_removed: int = 0
+ code_blocks_fixed: int = 0
+ escaped_brackets_fixed: int = 0
+ double_brackets_fixed: int = 0
+
+ def any_fixes(self) -> bool:
+ return any(
+ getattr(self, f) > 0
+ for f in self.__dataclass_fields__
+ )
+
+ def summary(self) -> str:
+ parts = []
+ if self.grid_tables_converted:
+ parts.append(f"grid tables: {self.grid_tables_converted}")
+ if self.image_paths_fixed:
+ parts.append(f"image paths: {self.image_paths_fixed}")
+ if self.attributes_removed:
+ parts.append(f"attributes: {self.attributes_removed}")
+ if self.code_blocks_fixed:
+ parts.append(f"code blocks: {self.code_blocks_fixed}")
+ if self.escaped_brackets_fixed:
+ parts.append(f"escaped brackets: {self.escaped_brackets_fixed}")
+ if self.double_brackets_fixed:
+ parts.append(f"double brackets: {self.double_brackets_fixed}")
+ return ", ".join(parts) if parts else "no fixes needed"
+
+
+# ── DOCX post-processing ─────────────────────────────────────────────────────
+
+# Regex patterns compiled once
+_RE_GRID_BORDER = re.compile(r"^\+[:=-][-:=]+(?:\+[:=-][-:=]+)*\+$")
+_RE_GRID_ROW = re.compile(r"^\|(.+)\|$")
+_RE_NESTED_GRID_BORDER = re.compile(r"^\|\s*\+[:=-][-:=]+\+\s*\|$")
+_RE_PANDOC_ATTR = re.compile(r"\{[^}]*(?:width|height)\s*=\s*\"[^\"]*\"[^}]*\}")
+_RE_PANDOC_CLASS = re.compile(r"\{\.(?:underline|mark)\}")
+_RE_DOUBLE_BRACKET_LINK = re.compile(r"\[\[([^\]]+)\]\(([^)]+)\)")
+_RE_DOUBLE_BRACKET_CLOSED = re.compile(r"\[\[([^\]]+)\]\]\(([^)]+)\)")
+_RE_DOUBLE_BRACKET_ATTR_LINK = re.compile(r"\[\[([^\]]+)\]\{[^}]*\}\]\(([^)]+)\)")
+_RE_ESCAPED_BRACKET = re.compile(r"\\(\[|])")
+# Matches single-column dashed line: " ------"
+# AND multi-column simple table border: " ---- -----"
+_RE_DASHED_LINE = re.compile(r"^(\s{2,})-{3,}[\s-]*$")
+_RE_ESCAPED_QUOTE = re.compile(r'\\"')
+# CJK + fullwidth punctuation range for bold spacing checks
+_RE_CJK_PUNCT = re.compile(r'[\u4e00-\u9fff\u3000-\u303f\uff01-\uffef,。、;:!?()【】「」《》""'']')
+_RE_BOLD_PAIR = re.compile(r'\*\*(.+?)\*\*')
+
+
+def _is_grid_border(line: str) -> bool:
+ """Check if a line is a grid table border like +---+ or +:---+."""
+ stripped = line.strip()
+ return bool(_RE_GRID_BORDER.match(stripped))
+
+
+def _is_nested_grid_border(line: str) -> bool:
+ """Check if a line is a nested grid border like | +---+ |."""
+ stripped = line.strip()
+ return bool(_RE_NESTED_GRID_BORDER.match(stripped))
+
+
+def _count_grid_columns(border_line: str) -> int:
+ """Count columns in a grid table border line."""
+ stripped = border_line.strip()
+ if not stripped.startswith("+"):
+ return 0
+ # Count + separators minus 1 = number of columns
+ return stripped.count("+") - 1
+
+
+
+# Languages recognized as code block hints in pandoc dashed-line blocks
+_KNOWN_CODE_LANGS = frozenset({
+ "json", "bash", "shell", "python", "javascript", "js",
+ "html", "css", "yaml", "xml", "sql", "plain text",
+ "text", "plaintext", "typescript", "ts", "go", "rust",
+ "java", "c", "cpp", "ruby", "php",
+})
+
+
+def _build_pipe_table(rows: list[list[str]]) -> list[str]:
+ """Build a standard markdown pipe table from rows of cells."""
+ if not rows:
+ return []
+ col_count = max(len(r) for r in rows)
+ lines = [
+ "| " + " | ".join([""] * col_count) + " |",
+ "| " + " | ".join(["---"] * col_count) + " |",
+ ]
+ for row in rows:
+ padded = row + [""] * (col_count - len(row))
+ lines.append("| " + " | ".join(padded) + " |")
+ return lines
+
+
+def _collect_images(directory: Path) -> list[str]:
+ """Collect image files from a directory (single glob pass)."""
+ if not directory.exists():
+ return []
+ image_exts = {".png", ".jpg", ".jpeg", ".gif", ".webp"}
+ return sorted(
+ str(p) for p in directory.rglob("*")
+ if p.suffix.lower() in image_exts
+ )
+
+
+def _convert_grid_tables(text: str, stats: PostProcessStats) -> str:
+ """Convert pandoc grid tables to standard markdown.
+
+ Single-column grid tables (info boxes) -> blockquotes.
+ Multi-column grid tables (side-by-side images) -> split into individual elements.
+ Nested grid tables are flattened.
+ """
+ lines = text.split("\n")
+ result = []
+ i = 0
+
+ while i < len(lines):
+ line = lines[i]
+
+ # Detect grid table start
+ if _is_grid_border(line):
+ # Collect the entire grid table
+ table_lines = [line]
+ i += 1
+ while i < len(lines):
+ table_lines.append(lines[i])
+ if _is_grid_border(lines[i]) and len(table_lines) > 1:
+ i += 1
+ break
+ i += 1
+ else:
+ # Reached end of file without closing border
+ # Just output as-is
+ result.extend(table_lines)
+ continue
+
+ stats.grid_tables_converted += 1
+ num_cols = _count_grid_columns(table_lines[0])
+
+ # Extract content lines (skip borders)
+ content_lines = []
+ for tl in table_lines:
+ if _is_grid_border(tl) or _is_nested_grid_border(tl):
+ continue
+ m = _RE_GRID_ROW.match(tl.strip())
+ if m:
+ content_lines.append(m.group(1).strip())
+ else:
+ # Non-standard line inside grid, keep content
+ stripped = tl.strip()
+ if stripped and stripped != "|":
+ content_lines.append(stripped)
+
+ if num_cols <= 1:
+ # Single column -> blockquote
+ result.append("")
+ for cl in content_lines:
+ # Strip outer pipes if present from nested grids
+ cleaned = cl.strip()
+ if cleaned.startswith("|") and cleaned.endswith("|"):
+ cleaned = cleaned[1:-1].strip()
+ # Skip nested grid borders
+ if _RE_GRID_BORDER.match(cleaned):
+ continue
+ if cleaned:
+ result.append(f"> {cleaned}")
+ else:
+ result.append(">")
+ result.append("")
+ else:
+ # Multi-column -> convert to standard pipe table
+ # Parse rows: each content_line is a row, split by | into cells
+ table_rows = []
+ for cl in content_lines:
+ cells = [c.strip() for c in cl.split("|") if c.strip() and not _RE_GRID_BORDER.match(c.strip())]
+ if cells:
+ table_rows.append(cells)
+
+ if table_rows:
+ result.append("")
+ result.extend(_build_pipe_table(table_rows))
+ result.append("")
+ else:
+ result.append(line)
+ i += 1
+
+ return "\n".join(result)
+
+
+def _fix_image_paths(text: str, assets_dir: Optional[Path], stats: PostProcessStats) -> str:
+    """Fix pandoc's double media path and verify images exist.
+
+    Pandoc extracts to <assets_dir>/media/ but references images as
+    <assets_dir>/media/media/. Fix the references.
+    Also flatten the actual directory if needed.
+    """
+    def fix_path(m: re.Match) -> str:
+        alt = m.group(1)
+        path = m.group(2)
+        new_path = path
+
+        # Fix double media/ path, then rebuild the image reference.
+        if "media/media/" in path:
+            new_path = path.replace("media/media/", "media/")
+            stats.image_paths_fixed += 1
+
+        return f"![{alt}]({new_path})"
+
+    text = re.sub(r"!\[([^\]]*)\]\(([^)]+)\)", fix_path, text)
+
+    # Flatten double media/ nesting if present (pandoc artifact)
+    if assets_dir:
+        double_media = assets_dir / "media" / "media"
+        single_media = assets_dir / "media"
+        try:
+            for f in double_media.iterdir():
+                dest = single_media / f.name
+                if not dest.exists():
+                    shutil.move(str(f), str(dest))
+            double_media.rmdir()
+        except OSError:  # covers FileNotFoundError (its subclass) too
+            pass
+
+    return text
+
+
+def _clean_pandoc_attributes(text: str, stats: PostProcessStats) -> str:
+ """Remove pandoc attribute annotations from markdown.
+
+ Removes: {width="..." height="..."}, {.underline}, {.mark}, etc.
+ """
+ count_before = len(text)
+
+ # Remove width/height attributes on images
+ text = _RE_PANDOC_ATTR.sub("", text)
+
+ # Remove class attributes like {.underline}
+ text = _RE_PANDOC_CLASS.sub("", text)
+
+ if len(text) != count_before:
+ # Rough count of removals
+ stats.attributes_removed = count_before - len(text)
+
+ return text
+
+
+def _is_code_content(lines: list[str]) -> bool:
+ """Heuristic: decide if content between dashed lines is code or a note/callout.
+
+ Code indicators:
+ - Has a language hint on the first line
+ - Contains JSON/code-like syntax ({, }, =, ;, ->, //)
+ - Contains URLs with protocols
+ - Has backslash line continuations
+
+ Note indicators:
+ - Mostly CJK/prose text without code syntax
+ - Short single-line content
+ """
+ text = "\n".join(lines)
+ stripped = text.strip()
+
+ if not stripped:
+ return False
+
+ # Code syntax indicators
+ code_chars = set('{}[]();=<>/\\')
+ code_char_count = sum(1 for c in stripped if c in code_chars)
+
+ # If >5% of content is code syntax characters, treat as code
+ if len(stripped) > 0 and code_char_count / len(stripped) > 0.05:
+ return True
+
+ # JSON-like structure
+ if stripped.startswith("{") or stripped.startswith("["):
+ return True
+
+ # Command-like (starts with common command patterns)
+ first_line = lines[0].strip() if lines else ""
+ if re.match(r"^(curl|wget|npm|pip|brew|apt|docker|git|ssh|cd|ls|cat|echo|python|node|uv)\s", first_line):
+ return True
+
+ return False
+
+
+def _fix_code_blocks(text: str, stats: PostProcessStats) -> str:
+    """Convert pandoc's indented dashed-line blocks to fenced code blocks or blockquotes.
+
+    Pandoc wraps both code and notes in:
+        ------------------------------------------------------------------
+        content here
+
+        ------------------------------------------------------------------
+
+    With language hint -> code block:
+        ```json
+        content here
+        ```
+
+    Without language hint + prose content -> blockquote:
+        > content here
+
+    Without language hint + code-like content -> code block:
+        ```
+        content here
+        ```
+    """
+    lines = text.split("\n")
+    result = []
+    i = 0
+
+    known_langs = _KNOWN_CODE_LANGS
+
+    while i < len(lines):
+        line = lines[i]
+
+        # Detect indented dashed line (2+ leading spaces, 3+ dashes)
+        if _RE_DASHED_LINE.match(line):
+            # Check if this is a pandoc simple table (multiple dashed columns
+            # on the same line, or content between dashes contains images)
+            # Simple table pattern: "  ---- ----" (multiple dash groups separated by spaces)
+            # Gap can be 1+ spaces (pandoc uses varying gaps)
+            dash_parts = [p for p in line.strip().split() if p.strip()]
+            is_simple_table_border = len(dash_parts) > 1 and all(
+                re.match(r"^-+$", p.strip()) for p in dash_parts
+            )
+
+            if is_simple_table_border:
+                # This is a pandoc simple table border - collect rows until
+                # next simple table border, convert to pipe table
+                table_rows = []
+                j = i + 1
+                while j < len(lines):
+                    next_line = lines[j]
+                    # Check for closing simple table border
+                    next_parts = [p for p in next_line.strip().split() if p.strip()]
+                    is_next_border = len(next_parts) > 1 and all(
+                        re.match(r"^-+$", p.strip()) for p in next_parts
+                    )
+                    if is_next_border:
+                        j += 1
+                        break
+                    if next_line.strip():
+                        # Split by 2+ spaces to get columns (pandoc uses varying gaps)
+                        cells = [c.strip() for c in re.split(r"\s{2,}", next_line.strip()) if c.strip()]
+                        if cells:
+                            table_rows.append(cells)
+                    j += 1
+
+                if table_rows:
+                    stats.code_blocks_fixed += 1
+                    result.append("")
+                    result.extend(_build_pipe_table(table_rows))
+                    result.append("")
+
+                i = j
+                continue
+
+            # Not a simple table - look for content and closing dashed line
+            block_content = []
+            lang_hint = ""
+            j = i + 1
+
+            while j < len(lines):
+                next_line = lines[j]
+
+                if _RE_DASHED_LINE.match(next_line):
+                    # Found closing dashed line
+                    j += 1
+                    break
+
+                block_content.append(next_line)
+                j += 1
+            else:
+                # No closing dashed line found - not a block, keep as-is
+                result.append(line)
+                i += 1
+                continue
+
+            # If content contains images, treat as simple table (single-column)
+            has_images = any("![" in cl for cl in block_content)
+            if has_images:
+                result.append("")
+                for cl in block_content:
+                    cl = cl.strip()
+                    if cl:
+                        result.append(cl)
+                result.append("")
+                i = j
+                continue
+
+            # Check if first line is a language hint (e.g., "  JSON\", "  Plain Text\")
+            has_lang_hint = False
+            if block_content:
+                first = block_content[0].strip()
+                first_clean = first.rstrip("\\").strip()
+                if first_clean.lower() in known_langs:
+                    lang_hint = first_clean.lower()
+                    if lang_hint in ("plain text", "text", "plaintext"):
+                        lang_hint = ""  # No language tag for plain text
+                    has_lang_hint = True
+                    block_content = block_content[1:]
+
+            # Clean content: remove leading 2-space indent, fix escaped quotes
+            cleaned = []
+            for cl in block_content:
+                if cl.startswith("  "):
+                    cl = cl[2:]
+                cl = cl.replace('\\"', '"')
+                if cl.endswith("\\"):
+                    cl = cl[:-1]
+                cleaned.append(cl)
+
+            # Remove trailing/leading empty lines
+            while cleaned and not cleaned[-1].strip():
+                cleaned.pop()
+            while cleaned and not cleaned[0].strip():
+                cleaned.pop(0)
+
+            if cleaned:
+                stats.code_blocks_fixed += 1
+
+                # Decide: code block vs blockquote
+                if has_lang_hint or _is_code_content(cleaned):
+                    # Code block, fence tagged with the detected language (if any)
+                    result.append("")
+                    result.append(f"```{lang_hint}")
+                    result.extend(cleaned)
+                    result.append("```")
+                    result.append("")
+                else:
+                    # Note/callout -> blockquote
+                    result.append("")
+                    for cl in cleaned:
+                        if cl.strip():
+                            result.append(f"> {cl}")
+                        else:
+                            result.append(">")
+                    result.append("")
+
+            i = j
+        else:
+            result.append(line)
+            i += 1
+
+    return "\n".join(result)
+
+
+def _fix_escaped_brackets(text: str, stats: PostProcessStats) -> str:
+ r"""Fix pandoc's escaped brackets: \[ -> [, \] -> ]."""
+ count = len(_RE_ESCAPED_BRACKET.findall(text))
+ if count:
+ stats.escaped_brackets_fixed = count
+ text = _RE_ESCAPED_BRACKET.sub(r"\1", text)
+ return text
+
+
+def _fix_double_bracket_links(text: str, stats: PostProcessStats) -> str:
+ """Fix double-bracket links: [[text]{.underline}](url) -> [text](url)."""
+ count = 0
+
+ def _replace_link(m: re.Match) -> str:
+ nonlocal count
+ count += 1
+ return f"[{m.group(1)}]({m.group(2)})"
+
+ text = _RE_DOUBLE_BRACKET_ATTR_LINK.sub(_replace_link, text)
+ text = _RE_DOUBLE_BRACKET_CLOSED.sub(_replace_link, text)
+ text = _RE_DOUBLE_BRACKET_LINK.sub(_replace_link, text)
+
+ stats.double_brackets_fixed = count
+ return text
+
+
+def _fix_cjk_bold_spacing(text: str) -> str:
+ """Add space between **bold** markers and adjacent CJK characters.
+
+ DOCX uses run-level styling for bold — no spaces between runs in CJK text.
+ Markdown renderers need whitespace around ** to recognize bold boundaries.
+ We find each **content** span, check the character before/after, and insert
+ a space only when the adjacent character is CJK (avoiding double spaces).
+ """
+ result = []
+ last_end = 0
+
+ for m in _RE_BOLD_PAIR.finditer(text):
+ start, end = m.start(), m.end()
+ result.append(text[last_end:start])
+
+ # Space before opening ** if preceded by CJK
+ if start > 0 and _RE_CJK_PUNCT.match(text[start - 1]):
+ result.append(' ')
+
+ result.append(m.group(0))
+
+ # Space after closing ** if followed by CJK
+ if end < len(text) and _RE_CJK_PUNCT.match(text[end]):
+ result.append(' ')
+
+ last_end = end
+
+ result.append(text[last_end:])
+ return ''.join(result)
+
+
+def _cleanup_excessive_blank_lines(text: str) -> str:
+ """Collapse 3+ consecutive blank lines to 2."""
+ return re.sub(r"\n{4,}", "\n\n\n", text)
+
+
+def postprocess_docx_markdown(
+ text: str,
+ assets_dir: Optional[Path] = None,
+) -> tuple[str, PostProcessStats]:
+ """Apply all DOCX-specific post-processing to pandoc markdown output.
+
+ Returns (cleaned_text, stats).
+ """
+ stats = PostProcessStats()
+
+ # Order matters: grid tables first (they contain images with attributes)
+ text = _convert_grid_tables(text, stats)
+ text = _fix_image_paths(text, assets_dir, stats)
+ text = _clean_pandoc_attributes(text, stats)
+ text = _fix_code_blocks(text, stats)
+ text = _fix_double_bracket_links(text, stats)
+ text = _fix_escaped_brackets(text, stats)
+ text = _fix_cjk_bold_spacing(text)
+ text = _cleanup_excessive_blank_lines(text)
+
+ return text, stats
+
+
+# ── DOCX deep parsing (python-docx) ──────────────────────────────────────────
+
+def convert_with_docx_deep(
+ file_path: Path, assets_dir: Optional[Path] = None
+) -> ConversionResult:
+ """Convert DOCX using python-docx direct parsing (experimental).
+
+ More precise than pandoc for:
+ - Table structure preservation
+ - Comment extraction
+ - Image extraction with position info
+ """
+ try:
+ from docx import Document
+ from docx.opc.constants import RELATIONSHIP_TYPE as RT
+ except ImportError:
+ return ConversionResult(
+ markdown="",
+ tool="docx-deep",
+ success=False,
+ error="python-docx not installed. Run: pip install python-docx",
+ )
+
+ try:
+ doc = Document(str(file_path))
+ md_parts = []
+ images = []
+ image_counter = 0
+
+ # Extract images from docx zip
+ if assets_dir:
+ assets_dir.mkdir(parents=True, exist_ok=True)
+ media_dir = assets_dir / "media"
+ media_dir.mkdir(exist_ok=True)
+
+ with zipfile.ZipFile(str(file_path), "r") as zf:
+ for name in zf.namelist():
+ if name.startswith("word/media/"):
+ img_name = Path(name).name
+ img_dest = media_dir / img_name
+ with zf.open(name) as src, open(img_dest, "wb") as dst:
+ dst.write(src.read())
+ images.append(str(img_dest))
+
+ # Process paragraphs
+ for para in doc.paragraphs:
+ style_name = para.style.name if para.style else ""
+ text = para.text.strip()
+
+ if not text:
+ md_parts.append("")
+ continue
+
+ # Headings
+ if style_name.startswith("Heading"):
+ try:
+ level = int(style_name.split()[-1])
+ except (ValueError, IndexError):
+ level = 1
+ md_parts.append(f"{'#' * level} {text}")
+ md_parts.append("")
+ continue
+
+ # Check for bold-only paragraphs (often sub-headings in Chinese docs)
+ all_bold = all(run.bold for run in para.runs if run.text.strip())
+ if all_bold and para.runs and len(text) < 100:
+ md_parts.append(f"**{text}**")
+ md_parts.append("")
+ continue
+
+ # Regular paragraph
+ md_parts.append(text)
+ md_parts.append("")
+
+ # Process tables
+ for table in doc.tables:
+ md_parts.append("")
+ rows = table.rows
+ if not rows:
+ continue
+
+ # Header row
+ header_cells = [cell.text.strip() for cell in rows[0].cells]
+ md_parts.append("| " + " | ".join(header_cells) + " |")
+ md_parts.append("| " + " | ".join(["---"] * len(header_cells)) + " |")
+
+ # Data rows
+ for row in rows[1:]:
+ cells = [cell.text.strip() for cell in row.cells]
+ md_parts.append("| " + " | ".join(cells) + " |")
+ md_parts.append("")
+
+ markdown = "\n".join(md_parts)
+
+ return ConversionResult(
+ markdown=markdown,
+ tool="docx-deep",
+ images=images,
+ success=True,
+ )
+ except Exception as e:
+ return ConversionResult(
+ markdown="", tool="docx-deep", success=False, error=str(e)
+ )
+
+
+# ── Existing tool converters ─────────────────────────────────────────────────
+
+def check_tool_available(tool: str) -> bool:
+ """Check if a conversion tool is available."""
+ if tool == "pymupdf4llm":
+ try:
+ import pymupdf4llm
+ return True
+ except ImportError:
+ return False
+ elif tool == "markitdown":
+ try:
+ import markitdown
+ return True
+ except ImportError:
+ return False
+ elif tool == "pandoc":
+ return shutil.which("pandoc") is not None
+ elif tool == "docx-deep":
+ try:
+ from docx import Document
+ return True
+ except ImportError:
+ return False
+ return False
+
+
+def select_tools(file_path: Path, mode: str) -> list[str]:
+ """Select conversion tools based on file type and mode."""
+ ext = file_path.suffix.lower()
+
+ # Tool preferences by format
+ tool_map = {
+ ".pdf": {
+ "quick": ["pymupdf4llm", "markitdown"], # fallback order
+ "heavy": ["pymupdf4llm", "markitdown"],
+ },
+ ".docx": {
+ "quick": ["pandoc", "markitdown"],
+ "heavy": ["pandoc", "markitdown"],
+ },
+ ".doc": {
+ "quick": ["pandoc", "markitdown"],
+ "heavy": ["pandoc", "markitdown"],
+ },
+ ".pptx": {
+ "quick": ["markitdown", "pandoc"],
+ "heavy": ["markitdown", "pandoc"],
+ },
+ ".xlsx": {
+ "quick": ["markitdown"],
+ "heavy": ["markitdown"],
+ },
+ }
+
+ tools = tool_map.get(ext, {"quick": ["markitdown"], "heavy": ["markitdown"]})
+
+ if mode == "quick":
+ # Return first available tool
+ for tool in tools["quick"]:
+ if check_tool_available(tool):
+ return [tool]
+ return []
+ else: # heavy
+ # Return all available tools
+ return [t for t in tools["heavy"] if check_tool_available(t)]
+
+
def convert_with_pymupdf4llm(
    file_path: Path, assets_dir: Optional[Path] = None
) -> ConversionResult:
    """Convert *file_path* with PyMuPDF4LLM (the preferred PDF backend).

    When *assets_dir* is supplied, it is created if missing, page images are
    written into it at 150 DPI, and the extracted files are listed in the
    result. Any exception is reported as a failed ConversionResult rather
    than raised.
    """
    try:
        import pymupdf4llm

        # "lines_strict" is the most precise table-detection strategy.
        options = {"table_strategy": "lines_strict"}
        if assets_dir:
            assets_dir.mkdir(parents=True, exist_ok=True)
            options.update(
                write_images=True,
                image_path=str(assets_dir),
                dpi=150,
            )

        text = pymupdf4llm.to_markdown(str(file_path), **options)
        extracted = _collect_images(assets_dir) if assets_dir else []

        return ConversionResult(
            markdown=text, tool="pymupdf4llm", images=extracted, success=True
        )
    except Exception as exc:
        return ConversionResult(
            markdown="", tool="pymupdf4llm", success=False, error=str(exc)
        )
+
+
def convert_with_markitdown(
    file_path: Path, assets_dir: Optional[Path] = None
) -> ConversionResult:
    """Convert *file_path* via markitdown.

    Prefers the markitdown CLI; if the executable is not on PATH, falls back
    to the markitdown Python API. *assets_dir* is accepted for signature
    parity with the other converters but is not used here. Errors are
    returned as failed ConversionResults, never raised.
    """
    try:
        proc = subprocess.run(
            ["markitdown", str(file_path)],
            capture_output=True,
            text=True,
            timeout=120,
        )

        if proc.returncode != 0:
            return ConversionResult(
                markdown="",
                tool="markitdown",
                success=False,
                error=proc.stderr,
            )

        return ConversionResult(
            markdown=proc.stdout, tool="markitdown", success=True
        )
    except FileNotFoundError:
        # CLI not installed: try the Python API instead.
        try:
            from markitdown import MarkItDown

            converted = MarkItDown().convert(str(file_path))
            return ConversionResult(
                markdown=converted.text_content, tool="markitdown", success=True
            )
        except Exception as exc:
            return ConversionResult(
                markdown="", tool="markitdown", success=False, error=str(exc)
            )
    except Exception as exc:
        # Covers subprocess timeouts and anything else unexpected.
        return ConversionResult(
            markdown="", tool="markitdown", success=False, error=str(exc)
        )
+
+
def convert_with_pandoc(
    file_path: Path, assets_dir: Optional[Path] = None
) -> ConversionResult:
    """Convert *file_path* to markdown by shelling out to pandoc.

    Pandoc's --extract-media=DIR always writes images into a media/
    subdirectory of DIR. When *assets_dir* is itself named 'media' we point
    --extract-media at its parent so the images land exactly at *assets_dir*;
    otherwise --extract-media is pointed at *assets_dir* and the images end
    up in *assets_dir*/media.

    Absolute media paths in pandoc's output are rewritten to a relative
    'media/' prefix. All failures (nonzero exit, timeout, missing pandoc
    binary) are returned as a failed ConversionResult, never raised.
    """
    try:
        # --wrap=none keeps paragraphs on single lines instead of hard-wrapping.
        cmd = ["pandoc", str(file_path), "-t", "markdown", "--wrap=none"]

        extract_dir = None
        if assets_dir:
            assets_dir.mkdir(parents=True, exist_ok=True)
            # Pandoc always creates a media/ subdirectory inside --extract-media.
            # Point it at the parent so media/ lands at assets_dir.
            if assets_dir.name == "media":
                extract_dir = assets_dir.parent
            else:
                extract_dir = assets_dir
            cmd.extend(["--extract-media", str(extract_dir)])

        result = subprocess.run(
            cmd, capture_output=True, text=True, timeout=120
        )

        if result.returncode != 0:
            return ConversionResult(
                markdown="", tool="pandoc", success=False, error=result.stderr
            )

        md = result.stdout

        # Convert absolute image paths to relative paths based on output location
        if extract_dir:
            abs_media = str(extract_dir / "media")
            # Replace absolute paths with relative 'media/' prefix.
            # NOTE(review): assumes the markdown output file will sit next to
            # the media/ directory — confirm against how callers place -o.
            md = md.replace(abs_media + "/", "media/")

        # NOTE(review): when assets_dir is not named 'media', pandoc wrote the
        # images into assets_dir/media; whether _collect_images scans
        # recursively is not visible here — TODO confirm it finds them.
        images = _collect_images(assets_dir) if assets_dir else []

        return ConversionResult(
            markdown=md, tool="pandoc", images=images, success=True
        )
    except Exception as e:
        # Timeout, missing pandoc executable, or any other unexpected error.
        return ConversionResult(
            markdown="", tool="pandoc", success=False, error=str(e)
        )
+
+
def convert_single(
    file_path: Path, tool: str, assets_dir: Optional[Path] = None
) -> ConversionResult:
    """Run the converter registered for *tool* on *file_path*.

    An unrecognized tool name yields a failed ConversionResult instead of
    raising, keeping the error-reporting contract of the converters.
    """
    dispatch = {
        "pymupdf4llm": convert_with_pymupdf4llm,
        "markitdown": convert_with_markitdown,
        "pandoc": convert_with_pandoc,
        "docx-deep": convert_with_docx_deep,
    }

    try:
        handler = dispatch[tool]
    except KeyError:
        return ConversionResult(
            markdown="", tool=tool, success=False, error=f"Unknown tool: {tool}"
        )

    return handler(file_path, assets_dir)
+
+
def merge_results(results: list[ConversionResult]) -> ConversionResult:
    """Select the best of several conversion results and pool their images.

    Among results that succeeded with non-blank markdown, the one scoring
    highest under score_markdown() wins (first wins on ties). When more than
    one result is usable, the winner's .images is replaced with the
    de-duplicated union of images from all usable results (input order
    preserved) and its .tool is rewritten to "merged(tool1,tool2,...)".

    NOTE: the winning result object is mutated in place, not copied.

    Fallbacks:
      - empty *results*: a generic failed ConversionResult.
      - no usable result: the first entry of *results* is returned verbatim
        (typically a failure, but it may also be a success whose markdown
        was blank).
    """
    if not results:
        return ConversionResult(markdown="", tool="none", success=False)

    # Usable = succeeded AND produced non-whitespace markdown.
    successful = [r for r in results if r.success and r.markdown.strip()]
    if not successful:
        # Nothing usable — surface the first result as-is.
        return results[0]

    if len(successful) == 1:
        return successful[0]

    # Highest-scoring candidate wins; max() keeps the first on ties.
    best = max(successful, key=lambda r: score_markdown(r.markdown))

    # Pool images from every usable result, de-duplicated, order preserved.
    merged_images = []
    seen = set()
    for result in successful:
        for img in result.images:
            if img not in seen:
                seen.add(img)
                merged_images.append(img)

    best.images = merged_images
    best.tool = f"merged({','.join(r.tool for r in successful)})"

    return best
+
+
def score_markdown(md: str) -> float:
    """Heuristically score markdown quality so results from different tools
    can be compared by merge_results.

    Rewards (each capped): content length, pipe tables, image references,
    a sane heading hierarchy (at least one H1 with as many or more H2s),
    and list items. Penalizes pandoc artifacts such as grid-table rules and
    attribute blocks. Higher is better; artifact-heavy output can go
    negative.
    """
    score = 0.0

    # Content length: more content is generally better (max 5 points).
    score += min(len(md) / 10000, 5.0)

    # Proper markdown pipe tables.
    table_count = md.count("|---|") + md.count("| ---")
    score += min(table_count * 0.5, 3.0)

    # Referenced images.
    score += min(md.count("![") * 0.3, 2.0)

    # Prefix a newline so headings/lists on the very first line are counted
    # too (a bare "\n# " search would miss a leading construct).
    body = "\n" + md

    # Heading hierarchy bonus.
    h1_count = body.count("\n# ")
    h2_count = body.count("\n## ")
    if h1_count > 0 and h2_count >= h1_count:
        score += 1.0  # good hierarchy

    # Structured list content (bulleted or numbered).
    list_count = body.count("\n- ") + body.count("\n* ") + body.count("\n1. ")
    score += min(list_count * 0.1, 2.0)

    # Penalize pandoc artifacts (grid tables, attribute noise).
    artifact_count = md.count("+:---") + md.count("+---+")
    artifact_count += md.count('{width="') + md.count("{.underline}")
    score -= artifact_count * 0.5

    return score
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description="Convert documents to markdown with multi-tool orchestration",
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog="""
+Examples:
+ # Quick mode (default)
+ python convert.py document.pdf -o output.md
+
+ # Heavy mode (best quality)
+ python convert.py document.pdf -o output.md --heavy
+
+ # DOCX deep mode (python-docx parsing)
+ python convert.py document.docx -o output.md --docx-deep
+
+ # With custom assets directory
+ python convert.py document.pdf -o output.md --assets-dir ./images
+ """,
+ )
+ parser.add_argument("input", type=Path, nargs="?", help="Input document path")
+ parser.add_argument(
+ "-o", "--output", type=Path, help="Output markdown file"
+ )
+ parser.add_argument(
+ "--heavy",
+ action="store_true",
+ help="Enable Heavy Mode (multi-tool, best quality)",
+ )
+ parser.add_argument(
+ "--docx-deep",
+ action="store_true",
+ help="Use python-docx direct parsing (experimental, DOCX only)",
+ )
+ parser.add_argument(
+ "--no-postprocess",
+ action="store_true",
+ help="Disable DOCX post-processing (keep raw pandoc output)",
+ )
+ parser.add_argument(
+ "--assets-dir",
+ type=Path,
+ default=None,
+ help="Directory for extracted images (default: