From 143995b213f09b2cae0c98e94f0aa391d340f3b2 Mon Sep 17 00:00:00 2001
From: daymade <daymadev89@gmail.com>
Date: Mon, 23 Mar 2026 00:06:30 +0800
Subject: [PATCH] =?UTF-8?q?refactor:=20rename=20markdown-tools=20=E2=86=92?=
 =?UTF-8?q?=20doc-to-markdown=20(v2.0.0)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Rename skill to better reflect its purpose (document-to-markdown conversion)
- Update SKILL.md name, description, and trigger keywords
- Add benchmark reference (2026-03-22)
- Update marketplace.json entry (name, skills path, version 2.0.0)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .claude-plugin/marketplace.json               |   18 +-
 {markdown-tools => doc-to-markdown}/SKILL.md  |   32 +-
 .../references/benchmark-2026-03-22.md        |  163 +++
 .../references/conversion-examples.md         |    0
 .../references/heavy-mode-guide.md            |    0
 .../references/tool-comparison.md             |    0
 doc-to-markdown/scripts/convert.py            | 1150 +++++++++++++++++
 .../scripts/convert_path.py                   |    0
 .../scripts/extract_pdf_images.py             |    0
 .../scripts/merge_outputs.py                  |    0
 .../scripts/validate_output.py                |    0
 markdown-tools/scripts/convert.py             |  434 -------
 12 files changed, 1346 insertions(+), 451 deletions(-)
 rename {markdown-tools => doc-to-markdown}/SKILL.md (74%)
 create mode 100644 doc-to-markdown/references/benchmark-2026-03-22.md
 rename {markdown-tools => doc-to-markdown}/references/conversion-examples.md (100%)
 rename {markdown-tools => doc-to-markdown}/references/heavy-mode-guide.md (100%)
 rename {markdown-tools => doc-to-markdown}/references/tool-comparison.md (100%)
 create mode 100755 doc-to-markdown/scripts/convert.py
 rename {markdown-tools => doc-to-markdown}/scripts/convert_path.py (100%)
 rename {markdown-tools => doc-to-markdown}/scripts/extract_pdf_images.py (100%)
 rename {markdown-tools => doc-to-markdown}/scripts/merge_outputs.py (100%)
 rename {markdown-tools => doc-to-markdown}/scripts/validate_output.py (100%)
 delete mode 100755 markdown-tools/scripts/convert.py

diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index 76986aa..2286d3d 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -50,25 +50,23 @@
       ]
     },
     {
-      "name": "markdown-tools",
-      "description": "Convert documents (PDFs, Word, PowerPoint) to high-quality markdown with multi-tool orchestration. Supports Quick Mode (fast, single tool) and Heavy Mode (best quality, multi-tool merge with segment-level selection). Features PyMuPDF4LLM for LLM-optimized PDF conversion, pandoc for DOCX/PPTX structure preservation, quality validation with HTML reports, and image extraction with metadata",
+      "name": "doc-to-markdown",
+      "description": "Converts DOCX/PDF/PPTX to high-quality Markdown with automatic post-processing. Fixes pandoc grid tables, image paths, attribute noise, and code blocks. Supports Quick Mode (fast, single tool) and Heavy Mode (best quality, multi-tool merge). Trigger on \"convert document\", \"docx to markdown\", \"parse word\", \"doc to markdown\", \"extract images from document\".",
       "source": "./",
       "strict": false,
-      "version": "1.2.0",
+      "version": "2.0.0",
       "category": "document-conversion",
       "keywords": [
         "markdown",
-        "pdf",
         "docx",
+        "pdf",
         "pptx",
-        "pymupdf4llm",
+        "converter",
         "pandoc",
-        "markitdown",
-        "heavy-mode",
-        "quality-validation"
+        "document"
       ],
       "skills": [
-        "./markdown-tools"
+        "./doc-to-markdown"
       ]
     },
     {
@@ -942,4 +940,4 @@
       ]
     }
   ]
-}
+}
\ No newline at end of file
diff --git a/markdown-tools/SKILL.md b/doc-to-markdown/SKILL.md
similarity index 74%
rename from markdown-tools/SKILL.md
rename to doc-to-markdown/SKILL.md
index 8bc71f4..966c9fc 100644
--- a/markdown-tools/SKILL.md
+++ b/doc-to-markdown/SKILL.md
@@ -1,11 +1,11 @@
 ---
-name: markdown-tools
-description: Converts documents to markdown with multi-tool orchestration for best quality. Supports Quick Mode (fast, single tool) and Heavy Mode (best quality, multi-tool merge). Use when converting PDF/DOCX/PPTX files to markdown, extracting images from documents, validating conversion quality, or needing LLM-optimized document output.
+name: doc-to-markdown
+description: Converts DOCX/PDF/PPTX to high-quality Markdown with automatic post-processing. Fixes pandoc grid tables, image paths, attribute noise, and code blocks. Supports Quick Mode (fast, single tool) and Heavy Mode (best quality, multi-tool merge). Trigger on "convert document", "docx to markdown", "parse word", "doc to markdown", "extract images from document".
 ---
 
-# Markdown Tools
+# Doc to Markdown
 
-Convert documents to high-quality markdown with intelligent multi-tool orchestration.
+Convert documents to high-quality markdown with intelligent multi-tool orchestration and automatic DOCX post-processing.
 
 ## Dual Mode Architecture
 
@@ -34,6 +34,9 @@ uv run --with pymupdf4llm --with markitdown scripts/convert.py document.pdf -o o
 # Heavy Mode - multi-tool parallel execution with merge
 uv run --with pymupdf4llm --with markitdown scripts/convert.py document.pdf -o output.md --heavy
 
+# DOCX with deep python-docx parsing (experimental)
+uv run --with pymupdf4llm --with markitdown --with python-docx scripts/convert.py document.docx -o output.md --docx-deep
+
 # Check available tools
 uv run scripts/convert.py --list-tools
 ```
@@ -43,7 +46,7 @@ uv run scripts/convert.py --list-tools
 | Format | Quick Mode Tool | Heavy Mode Tools |
 |--------|----------------|------------------|
 | PDF | pymupdf4llm | pymupdf4llm + markitdown |
-| DOCX | pandoc | pandoc + markitdown |
+| DOCX | pandoc + post-processing | pandoc + markitdown |
 | PPTX | markitdown | markitdown + pandoc |
 | XLSX | markitdown | markitdown |
 
@@ -53,6 +56,21 @@ uv run scripts/convert.py --list-tools
 - **markitdown**: Microsoft's universal converter, good for Office formats
 - **pandoc**: Excellent structure preservation for DOCX/PPTX
 
+## DOCX Post-Processing (automatic)
+
+When converting DOCX files via pandoc, the following cleanups are applied automatically:
+
+| Problem | Fix |
+|---------|-----|
+| Grid tables (`+:---+` syntax) | Single-column -> blockquote, multi-column -> pipe table |
+| Image double path (`media/media/`) | Flatten to `media/` |
+| Pandoc attributes (`{width="..." height="..."}`) | Removed |
+| Inline classes (`{.underline}`, `{.mark}`) | Removed |
+| Indented dashed code blocks | Converted to fenced code blocks (```) |
+| Escaped brackets (`\[...\]`) | Unescaped to `[...]` |
+| Double-bracket links (`[[text]{...}](url)`) | Simplified to `[text](url)` |
+| Escaped quotes in code (`\"`) | Fixed to `"` |
+
 ## Heavy Mode Workflow
 
 Heavy Mode runs multiple tools in parallel and selects the best segments:
@@ -117,7 +135,7 @@ python scripts/merge_outputs.py output1.md output2.md -o merged.md --verbose
 ## Path Conversion (Windows/WSL)
 
 ```bash
-# Windows → WSL conversion
+# Windows to WSL conversion
 python scripts/convert_path.py "C:\Users\name\Documents\file.pdf"
 # Output: /mnt/c/Users/name/Documents/file.pdf
 ```
@@ -147,7 +165,7 @@ brew install pandoc
 
 | Script | Purpose |
 |--------|---------|
-| `convert.py` | Main orchestrator with Quick/Heavy mode |
+| `convert.py` | Main orchestrator with Quick/Heavy mode + DOCX post-processing |
 | `merge_outputs.py` | Merge multiple markdown outputs |
 | `validate_output.py` | Quality validation with HTML report |
 | `extract_pdf_images.py` | PDF image extraction with metadata |
diff --git a/doc-to-markdown/references/benchmark-2026-03-22.md b/doc-to-markdown/references/benchmark-2026-03-22.md
new file mode 100644
index 0000000..27b2069
--- /dev/null
+++ b/doc-to-markdown/references/benchmark-2026-03-22.md
@@ -0,0 +1,163 @@
+# DOCX→Markdown 转换方案基准测试
+
+> **测试日期**：2026-03-22
+>
+> **测试文件**：`助教-【腾讯云🦞】小白实践 OpenClaw 保姆级教程.docx`（19MB，77 张图片，含 grid table 布局、JSON 代码块、多列图片并排、信息框）
+>
+> **测试方法**：5 个方案对同一文件转换，按 5 个维度各 10 分制打分
+
+---
+
+## 综合评分
+
+| 维度 | Docling (IBM) | MarkItDown (MS) | Pandoc | Mammoth | **doc-to-markdown（我们）** |
+|------|:---:|:---:|:---:|:---:|:---:|
+| 表格质量 | 5 | 3 | 5 | 1~3 | **6** |
+| 图片提取 | 4 | 2 | **10** | 5 | 7 |
+| 文本完整性 | 8 | 7 | **9** | 7 | **9** |
+| 格式清洁度 | 5 | 5 | 5 | 3 | **7** |
+| 代码块 | 2 | 1 | N/A | 1 | **9** |
+| **综合** | **4.8** | **3.6** | **7.3** | **3.4~3.6** | **7.6** |
+
+---
+
+## 各方案详细分析
+
+### 1. IBM Docling（综合 4.8）
+
+- **版本**：docling 2.x + Granite-Docling-258M
+- **架构**：AI 驱动（VLM 视觉语言模型），DocTags 中间格式 → Markdown
+
+**致命问题**：
+- 图片引用全部是 `<!-- image -->` 占位符（77 张图 0 张可显示），`ImageRefMode` API 对 DOCX 不可用
+- 标题层级全部丢失（0 个 `#`），所有标题退化为粗体文本
+- 代码块为零，JSON 和命令全部输出为普通段落
+- `api_key` 被错误转义为 `api\_key`
+
+**优点**：
+- 文本内容完整，中文/emoji/链接保留良好
+- 无 grid table 或 HTML 残留
+- 表格语法正确（pipe table），但内容是占位符
+
+**结论**：Docling 的优势在 PDF（AAAI 2025 论文场景），DOCX 支持远未达到生产级别。
+
+### 2. Microsoft MarkItDown（综合 3.6）
+
+- **版本**：markitdown 0.1.5
+- **架构**：底层调用 mammoth → HTML → markdownify → Markdown
+
+**致命问题**：
+- 77 张图片全部是截断的 base64 占位符（`data:image/png;base64,...`），默认 `keep_data_uris=False` 主动丢弃图片数据
+- 标题全部变为粗体文本（mammoth 无法识别 WPS 自定义样式）
+- 代码块为零，JSON 被塞入表格单元格
+- 有序列表编号全部错误（输出为 `1. 1. 1.`）
+
+**优点**：
+- 无 HTML 标签残留
+- 文本内容基本完整
+
+**结论**：MarkItDown 的 markdownify 后处理反而引入破坏性截断。轻量场景可用，复杂 DOCX 不可靠。
+
+### 3. Pandoc（综合 7.3）
+
+- **版本**：pandoc 3.9
+- **架构**：Haskell 原生 AST 解析，支持 60+ 格式
+
+**测试了 3 种参数**：
+
+| 参数 | 结果 |
+|------|------|
+| `-t gfm` | 最差：24 个 HTML `<table>` 嵌套，74 个 HTML `<img>` |
+| `-t markdown` | 最佳：grid table（可后处理），无 HTML |
+| `-t markdown-raw_html-...` | 与 markdown 完全相同，参数无效果 |
+
+**问题**：
+- Grid table 不可避免（原 docx 有多行单元格和嵌套表格，pipe table 无法表达）
+- `{width="..." height="..."}` 68 处
+- `{.underline}` 6 处
+- 反斜杠过度转义 37 处
+
+**优点**：
+- 图片提取 10/10（77 张全部正确，路径结构一致）
+- 文本完整性 9/10（内容、链接、emoji 全部保留）
+- 最成熟稳定的底层引擎
+
+**结论**：Pandoc 是最可靠的底层引擎，输出质量最高但需要后处理清洗 pandoc 私有语法。
+
+### 4. Mammoth（综合 3.4~3.6）
+
+- **版本**：mammoth 1.11.0
+- **架构**：自带 DOCX（OOXML）解析器 → HTML/Markdown（Markdown 支持已废弃）
+
+**测试了 2 种方式**：
+
+| 方式 | 综合 |
+|------|------|
+| 方式A：直接转 Markdown | 3.4（表格完全丢失） |
+| 方式B：转 HTML → markdownify | 3.6（有表格但嵌套被压扁） |
+
+**致命问题**：
+- 标题全部丢失（WPS `styles.xml` 中样式定义为空，mammoth 无法映射 Heading）
+- 代码块为零
+- 图片全部 base64 内嵌，单文件 28MB
+- 方式B 中 markdownify 丢失 14 张图片（63/77）
+
+**结论**：Mammoth 的 Markdown 支持已废弃，对 WPS 导出的 docx 兼容性差。不推荐。
+
+### 5. doc-to-markdown / 我们的方案（综合 7.6）
+
+- **版本**：doc-to-markdown 1.0（基于 pandoc + 6 个后处理函数）
+- **架构**：Pandoc 转换 → 自动后处理（grid table 清理、图片路径修复、属性清理、代码块修复、转义修复）
+
+**后处理实际效果**：
+
+| 后处理函数 | 修复数量 |
+|-----------|---------|
+| `_convert_grid_tables` | 11 处 grid table → pipe table / blockquote |
+| `_clean_pandoc_attributes` | 3437 字符属性清理 |
+| `_fix_code_blocks` | 22 处缩进虚线 → ``` 代码块 |
+| `_fix_escaped_brackets` | 10 处 |
+| `_fix_double_bracket_links` | 1 处 |
+| `_fix_image_paths` | 77 张图片路径修复 |
+
+**已知问题（待修复）**：
+- 图片路径双层嵌套 bug：`--assets-dir` 指定目录内被 pandoc 再建一层 `media/`
+- 2 处 grid table 残留（文末并排图片组未完全转换）
+
+**优点**：
+- 代码块识别 9/10（JSON 带语言标签，命令行正确包裹）
+- 格式清洁度 7/10（attributes、转义、grid table 大部分清理干净）
+- 文本完整性 9/10（关键内容全部保留）
+
+**结论**：综合最优，核心价值在 pandoc 后处理层。剩余 2 个 bug 可修。
+
+---
+
+## 架构决策
+
+```
+最终方案：Pandoc（底层引擎）+ doc-to-markdown 后处理（增值层）
+
+理由：
+1. Pandoc 图片提取最可靠（10/10），文本最完整（9/10）
+2. Pandoc 的问题（grid table、属性、转义）全部可后处理解决
+3. Docling/MarkItDown/Mammoth 的致命问题（图片丢失、标题丢失）无法后处理修复
+4. 后处理层是我们的核心竞争力，成本低、可迭代
+```
+
+---
+
+## 测试文件特征
+
+本次测试文件的难点在于：
+
+| 特征 | 说明 | 影响 |
+|------|------|------|
+| WPS 导出 | 非标准 Word 样式（Style ID 2/3 而非 Heading 1/2） | mammoth/markitdown/docling 标题全丢 |
+| 多列图片布局 | 2x2、1x4 图片网格用表格排版 | pandoc 输出 grid table |
+| 信息框/提示框 | 单列表格包裹文字 | pandoc 输出 grid table |
+| 嵌套表格 | 表格内套表格 | pipe table 无法表达 |
+| JSON 代码块 | 非代码块样式，用文本框/缩进表示 | 多数工具无法识别为代码 |
+| 19MB 文件 | 77 张截图嵌入 | base64 方案导致 28MB 输出 |
+
+这些特征代表了真实世界中 WPS/飞书文档导出 docx 的典型困难，是有效的基准测试场景。
diff --git a/markdown-tools/references/conversion-examples.md b/doc-to-markdown/references/conversion-examples.md
similarity index 100%
rename from markdown-tools/references/conversion-examples.md
rename to doc-to-markdown/references/conversion-examples.md
diff --git a/markdown-tools/references/heavy-mode-guide.md b/doc-to-markdown/references/heavy-mode-guide.md
similarity index 100%
rename from markdown-tools/references/heavy-mode-guide.md
rename to doc-to-markdown/references/heavy-mode-guide.md
diff --git a/markdown-tools/references/tool-comparison.md b/doc-to-markdown/references/tool-comparison.md
similarity index 100%
rename from markdown-tools/references/tool-comparison.md
rename to doc-to-markdown/references/tool-comparison.md
diff --git a/doc-to-markdown/scripts/convert.py b/doc-to-markdown/scripts/convert.py
new file mode 100755
index 0000000..a95eb86
--- /dev/null
+++ b/doc-to-markdown/scripts/convert.py
@@ -0,0 +1,1150 @@
+#!/usr/bin/env python3
+"""
+Multi-tool document to markdown converter with intelligent orchestration.
+
+Supports Quick Mode (fast, single tool) and Heavy Mode (best quality, multi-tool merge).
+DOCX files get automatic post-processing to fix pandoc artifacts.
+
+Usage:
+    # Quick Mode (default) - fast, single best tool
+    uv run --with pymupdf4llm --with markitdown scripts/convert.py document.pdf -o output.md
+
+    # Heavy Mode - multi-tool parallel execution with merge
+    uv run --with pymupdf4llm --with markitdown scripts/convert.py document.pdf -o output.md --heavy
+
+    # DOCX deep mode - python-docx direct parsing (experimental)
+    uv run --with python-docx scripts/convert.py document.docx -o output.md --docx-deep
+
+    # With image extraction
+    uv run --with pymupdf4llm scripts/convert.py document.pdf -o output.md --assets-dir ./images
+
+Dependencies:
+    - pymupdf4llm: PDF conversion (LLM-optimized)
+    - markitdown: PDF/DOCX/PPTX conversion
+    - pandoc: DOCX/PPTX conversion (system install: brew install pandoc)
+    - python-docx: DOCX deep parsing (optional, for --docx-deep)
+"""
+
+import argparse
+import re
+import subprocess
+import sys
+import shutil
+import zipfile
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+
+
+@dataclass
+class ConversionResult:
+    """Result from a single tool conversion."""
+    markdown: str  # converted markdown text
+    tool: str  # name of the tool this result came from
+    images: list[str] = field(default_factory=list)  # presumably image paths as strings; verify against callers
+    success: bool = True  # a result counts as successful unless stated otherwise
+    error: str = ""  # presumably the failure message when success is False; verify against callers
+
+
+# ── Post-processing stats ────────────────────────────────────────────────────
+
+@dataclass
+class PostProcessStats:
+    """Counters describing what the DOCX post-processor changed."""
+    grid_tables_converted: int = 0
+    image_paths_fixed: int = 0
+    attributes_removed: int = 0
+    code_blocks_fixed: int = 0
+    escaped_brackets_fixed: int = 0
+    double_brackets_fixed: int = 0
+
+    def any_fixes(self) -> bool:
+        """Return True if at least one counter is greater than zero."""
+        for name in self.__dataclass_fields__:
+            if getattr(self, name) > 0:
+                return True
+        return False
+
+    def summary(self) -> str:
+        """Return the non-zero counters as comma-separated "label: count" pairs."""
+        # Labels are paired with values here so the report order is explicit.
+        labelled = (
+            ("grid tables", self.grid_tables_converted),
+            ("image paths", self.image_paths_fixed),
+            ("attributes", self.attributes_removed),
+            ("code blocks", self.code_blocks_fixed),
+            ("escaped brackets", self.escaped_brackets_fixed),
+            ("double brackets", self.double_brackets_fixed),
+        )
+        # Falsy (zero) counters are omitted; an empty report gets a fixed message.
+        shown = [f"{label}: {value}" for label, value in labelled if value]
+        return ", ".join(shown) if shown else "no fixes needed"
+
+
+# ── DOCX post-processing ─────────────────────────────────────────────────────
+
+# Regex patterns compiled once
+_RE_GRID_BORDER = re.compile(r"^\+[:=-][-:=]+(?:\+[:=-][-:=]+)*\+$")
+_RE_GRID_ROW = re.compile(r"^\|(.+)\|$")
+_RE_NESTED_GRID_BORDER = re.compile(r"^\|\s*\+[:=-][-:=]+\+\s*\|$")
+_RE_PANDOC_ATTR = re.compile(r"\{[^}]*(?:width|height)\s*=\s*\"[^\"]*\"[^}]*\}")
+_RE_PANDOC_CLASS = re.compile(r"\{\.(?:underline|mark)\}")
+_RE_DOUBLE_BRACKET_LINK = re.compile(r"\[\[([^\]]+)\]\(([^)]+)\)")
+_RE_DOUBLE_BRACKET_CLOSED = re.compile(r"\[\[([^\]]+)\]\]\(([^)]+)\)")
+_RE_DOUBLE_BRACKET_ATTR_LINK = re.compile(r"\[\[([^\]]+)\]\{[^}]*\}\]\(([^)]+)\)")
+_RE_ESCAPED_BRACKET = re.compile(r"\\(\[|])")
+# Matches single-column dashed line: "  ------"
+# AND multi-column simple table border: "  ---- -----"
+_RE_DASHED_LINE = re.compile(r"^(\s{2,})-{3,}[\s-]*$")
+_RE_ESCAPED_QUOTE = re.compile(r'\\"')
+# CJK ideographs, CJK/fullwidth punctuation and curly quotes (never ASCII quotes), as \u escapes
+_RE_CJK_PUNCT = re.compile(r"[\u4e00-\u9fff\u3000-\u303f\uff01-\uffef\u2018\u2019\u201c\u201d]")
+_RE_BOLD_PAIR = re.compile(r'\*\*(.+?)\*\*')
+
+
+def _is_grid_border(line: str) -> bool:
+    """Return True if *line*, minus outer whitespace, is a grid border (+---+, +:---+)."""
+    candidate = line.strip()
+    return _RE_GRID_BORDER.match(candidate) is not None
+
+
+def _is_nested_grid_border(line: str) -> bool:
+    """Return True if *line*, minus outer whitespace, is a nested border (| +---+ |)."""
+    candidate = line.strip()
+    return _RE_NESTED_GRID_BORDER.match(candidate) is not None
+
+
+def _count_grid_columns(border_line: str) -> int:
+    """Return the column count implied by a border line (0 if it does not start with "+")."""
+    border = border_line.strip()
+    # A border with N columns contains N + 1 "+" separators.
+    if border.startswith("+"):
+        return border.count("+") - 1
+    return 0
+
+
+
+# Lower-case labels accepted as a language hint on the first line of a dashed-line block
+_KNOWN_CODE_LANGS = frozenset({
+    "json", "bash", "shell", "python", "javascript", "js",
+    "html", "css", "yaml", "xml", "sql", "plain text",
+    "text", "plaintext", "typescript", "ts", "go", "rust",
+    "java", "c", "cpp", "ruby", "php",
+})
+
+
+def _build_pipe_table(rows: list[list[str]]) -> list[str]:
+    """Render *rows* as pipe-table lines under an empty header row.
+
+    Returns an empty list when there are no rows.
+    """
+    if not rows:
+        return []
+    width = max(map(len, rows))
+    # Pad every row to the widest one so each line has the same cell count.
+    padded = [row + [""] * (width - len(row)) for row in rows]
+    # Header cells are left blank; the "---" row is the header separator.
+    table = [[""] * width, ["---"] * width, *padded]
+    return ["| " + " | ".join(cells) + " |" for cells in table]
+
+
+def _collect_images(directory: Path) -> list[str]:
+    """Return sorted string paths of entries under *directory* with an image suffix.
+
+    Searches recursively, ignoring suffix case; a missing directory yields [].
+    """
+    if not directory.exists():
+        return []
+    wanted = (".png", ".jpg", ".jpeg", ".gif", ".webp")
+    return sorted(str(entry) for entry in directory.rglob("*") if entry.suffix.lower() in wanted)
+
+
+def _convert_grid_tables(text: str, stats: PostProcessStats) -> str:
+    """Convert pandoc grid tables to standard markdown.
+
+    Single-column tables (info boxes) become blockquotes; multi-column tables
+    become pipe tables with an empty header row. Nested grid borders are
+    dropped, which flattens nested tables into their parent.
+
+    A table is the run of consecutive "+"/"|" lines starting at a border line,
+    cut back to the last border in that run, so row-separator borders stay
+    inside the table. A border with no closing border is copied through as-is.
+
+    ``stats.grid_tables_converted`` is incremented once per converted table.
+    """
+    lines = text.split("\n")
+    result = []
+    i = 0
+
+    while i < len(lines):
+        line = lines[i]
+        if not _is_grid_border(line):
+            result.append(line)
+            i += 1
+            continue
+
+        # Scan the contiguous table region (lines starting with "+" or "|")
+        # and remember the last border line seen in it.
+        last_border = i
+        j = i + 1
+        while j < len(lines) and lines[j].strip().startswith(("+", "|")):
+            if _is_grid_border(lines[j]):
+                last_border = j
+            j += 1
+        if last_border == i:
+            # No closing border: not a complete table, keep the line as-is.
+            result.append(line)
+            i += 1
+            continue
+
+        table_lines = lines[i:last_border + 1]
+        i = last_border + 1
+        stats.grid_tables_converted += 1
+        num_cols = _count_grid_columns(table_lines[0])
+
+        # Extract content lines (skip outer, row-separator and nested borders).
+        content_lines = []
+        for tl in table_lines:
+            if _is_grid_border(tl) or _is_nested_grid_border(tl):
+                continue
+            stripped = tl.strip()
+            m = _RE_GRID_ROW.match(stripped)
+            if m:
+                content_lines.append(m.group(1).strip())
+            elif stripped and stripped != "|":
+                # Non-standard line inside the grid: keep its content.
+                content_lines.append(stripped)
+
+        if num_cols <= 1:
+            # Single column -> blockquote.
+            result.append("")
+            for cl in content_lines:
+                cleaned = cl.strip()
+                # Strip outer pipes left over from a nested grid.
+                if cleaned.startswith("|") and cleaned.endswith("|"):
+                    cleaned = cleaned[1:-1].strip()
+                if _RE_GRID_BORDER.match(cleaned):
+                    continue  # nested grid border
+                result.append(f"> {cleaned}" if cleaned else ">")
+            result.append("")
+            continue
+
+        # Multi-column -> pipe table; empty cells and border fragments are dropped.
+        table_rows = []
+        for cl in content_lines:
+            cells = [c.strip() for c in cl.split("|") if c.strip() and not _RE_GRID_BORDER.match(c.strip())]
+            if cells:
+                table_rows.append(cells)
+        if table_rows:
+            result.append("")
+            result.extend(_build_pipe_table(table_rows))
+            result.append("")
+
+    return "\n".join(result)
+
+
+def _fix_image_paths(text: str, assets_dir: Optional[Path], stats: PostProcessStats) -> str:
+    """Rewrite doubled ``media/media/`` image paths and flatten the directory.
+
+    Markdown image paths containing ``media/media/`` are rewritten to use
+    ``media/`` and counted in ``stats.image_paths_fixed``. If *assets_dir* is
+    given, files in ``<assets_dir>/media/media`` are moved up one level
+    (existing names are never overwritten) and the emptied directory removed.
+    Moves are best-effort: a failure is reported on stderr and skipped.
+    """
+    def fix_path(m: re.Match) -> str:
+        alt, path = m.group(1), m.group(2)
+        if "media/media/" in path:
+            path = path.replace("media/media/", "media/")
+            stats.image_paths_fixed += 1
+        return f"![{alt}]({path})"
+
+    text = re.sub(r"!\[([^\]]*)\]\(([^)]+)\)", fix_path, text)
+
+    # Flatten double media/ nesting on disk if present (pandoc artifact).
+    double_media = assets_dir / "media" / "media" if assets_dir else None
+    if double_media is not None and double_media.is_dir():
+        for f in list(double_media.iterdir()):  # snapshot: entries are moved while looping
+            dest = double_media.parent / f.name
+            if dest.exists():
+                continue  # never overwrite; the file stays in media/media
+            try:
+                shutil.move(str(f), str(dest))
+            except OSError as exc:
+                print(f"Warning: could not move {f} to {dest}: {exc}", file=sys.stderr)
+        try:
+            double_media.rmdir()
+        except OSError:
+            pass  # directory not empty (skipped or failed moves): leave it
+
+    return text
+
+
+def _clean_pandoc_attributes(text: str, stats: PostProcessStats) -> str:
+    """Remove pandoc attribute annotations from markdown.
+
+    Strips ``{...}`` blocks that carry a quoted ``width`` or ``height`` value
+    and the inline classes ``{.underline}`` and ``{.mark}``.
+
+    ``stats.attributes_removed`` is set to the number of annotations removed
+    (a count, like the other counters, rather than a number of characters).
+    It is left untouched when nothing matches.
+    """
+    # subn() returns the new text together with the number of substitutions.
+    text, sized = _RE_PANDOC_ATTR.subn("", text)
+    text, classed = _RE_PANDOC_CLASS.subn("", text)
+
+    removed = sized + classed
+    if removed:
+        stats.attributes_removed = removed
+    return text
+
+
+def _is_code_content(lines: list[str]) -> bool:
+    """Heuristic: decide whether a dashed-line block holds code or prose.
+
+    The text (lines joined with newlines, then stripped) counts as code when:
+    - more than 5% of its characters are code punctuation ``{}[]();=<>/\\``, or
+    - it starts with ``{`` or ``[`` (JSON-like), or
+    - the first line starts with a common shell command followed by
+      whitespace (curl, git, pip, ...).
+
+    Empty or whitespace-only content is never code.
+
+    Args:
+        lines: Content lines of the block, without the dashed delimiters.
+
+    Returns:
+        True if the content looks like code, False otherwise.
+    """
+    body = "\n".join(lines).strip()
+    if not body:
+        return False
+
+    # Density of code punctuation across the whole block.
+    punctuation = set('{}[]();=<>/\\')
+    hits = sum(ch in punctuation for ch in body)
+    if hits / len(body) > 0.05:
+        return True
+
+    if body.startswith(("{", "[")):
+        return True
+
+    # Command-like: only the first line is inspected.
+    opener = lines[0].strip()
+    command = re.match(
+        r"^(curl|wget|npm|pip|brew|apt|docker|git|ssh|cd|ls|cat|echo|python|node|uv)\s",
+        opener,
+    )
+    return command is not None
+
+
+def _is_simple_table_border(line: str) -> bool:
+    """Return True for a multi-column dashed border such as "  ----  -----"."""
+    parts = line.split()
+    return len(parts) > 1 and all(re.match(r"^-+$", p) for p in parts)
+
+
+def _fix_code_blocks(text: str, stats: PostProcessStats) -> str:
+    """Convert pandoc's indented dashed-line blocks to fenced code blocks or blockquotes.
+
+    Pandoc wraps both code and notes in:
+      ------------------------------------------------------------------
+      content here
+
+      ------------------------------------------------------------------
+
+    With language hint -> code block:
+      ```json
+      content here
+      ```
+
+    Without language hint + prose content -> blockquote:
+      > content here
+
+    Without language hint + code-like content -> code block:
+      ```
+      content here
+      ```
+
+    A pair of multi-column dashed borders (a simple table) becomes a pipe
+    table, and a dashed block containing images is unwrapped to its lines.
+    An opening dashed line or border with no closing counterpart is copied
+    through unchanged. ``stats.code_blocks_fixed`` counts every code block,
+    blockquote and table produced.
+    """
+    lines = text.split("\n")
+    result = []
+    i = 0
+
+    while i < len(lines):
+        line = lines[i]
+
+        # Detect indented dashed line (2+ leading spaces, 3+ dashes)
+        if not _RE_DASHED_LINE.match(line):
+            result.append(line)
+            i += 1
+            continue
+
+        if _is_simple_table_border(line):
+            # Simple table: collect rows up to the closing border, emit a pipe table
+            table_rows = []
+            j = i + 1
+            while j < len(lines):
+                next_line = lines[j]
+                if _is_simple_table_border(next_line):
+                    j += 1
+                    break
+                if next_line.strip():
+                    # Split by 2+ spaces to get columns (pandoc uses varying gaps)
+                    cells = [c.strip() for c in re.split(r"\s{2,}", next_line.strip()) if c.strip()]
+                    if cells:
+                        table_rows.append(cells)
+                j += 1
+            else:
+                # No closing border - keep the line instead of consuming the rest of the text
+                result.append(line)
+                i += 1
+                continue
+
+            if table_rows:
+                stats.code_blocks_fixed += 1
+                result.append("")
+                result.extend(_build_pipe_table(table_rows))
+                result.append("")
+
+            i = j
+            continue
+
+        # Not a simple table - look for content and closing dashed line
+        block_content = []
+        lang_hint = ""
+        j = i + 1
+
+        while j < len(lines):
+            next_line = lines[j]
+
+            if _RE_DASHED_LINE.match(next_line):
+                # Found closing dashed line
+                j += 1
+                break
+
+            block_content.append(next_line)
+            j += 1
+        else:
+            # No closing dashed line found - not a block, keep as-is
+            result.append(line)
+            i += 1
+            continue
+
+        # If content contains images, treat as simple table (single-column)
+        has_images = any("![" in cl for cl in block_content)
+        if has_images:
+            result.append("")
+            for cl in block_content:
+                cl = cl.strip()
+                if cl:
+                    result.append(cl)
+            result.append("")
+            i = j
+            continue
+
+        # Check if first line is a language hint (e.g., "  JSON\", "  Plain Text\")
+        has_lang_hint = False
+        if block_content:
+            first = block_content[0].strip()
+            first_clean = first.rstrip("\\").strip()
+            if first_clean.lower() in _KNOWN_CODE_LANGS:
+                lang_hint = first_clean.lower()
+                if lang_hint in ("plain text", "text", "plaintext"):
+                    lang_hint = ""  # No language tag for plain text
+                has_lang_hint = True
+                block_content = block_content[1:]
+
+        # Clean content: remove leading 2-space indent, fix escaped quotes
+        cleaned = []
+        for cl in block_content:
+            if cl.startswith("  "):
+                cl = cl[2:]
+            cl = cl.replace('\\"', '"')
+            if cl.endswith("\\"):
+                cl = cl[:-1]
+            cleaned.append(cl)
+
+        # Remove trailing/leading empty lines
+        while cleaned and not cleaned[-1].strip():
+            cleaned.pop()
+        while cleaned and not cleaned[0].strip():
+            cleaned.pop(0)
+
+        if cleaned:
+            stats.code_blocks_fixed += 1
+            # Decide: code block vs blockquote
+            if has_lang_hint or _is_code_content(cleaned):
+                # Code block
+                result.append("")
+                result.append(f"```{lang_hint}")
+                result.extend(cleaned)
+                result.append("```")
+                result.append("")
+            else:
+                # Note/callout -> blockquote
+                result.append("")
+                for cl in cleaned:
+                    if cl.strip():
+                        result.append(f"> {cl}")
+                    else:
+                        result.append(">")
+                result.append("")
+
+        i = j
+
+    return "\n".join(result)
+
+
+def _fix_escaped_brackets(text: str, stats: PostProcessStats) -> str:
+    r"""Unescape pandoc's bracket escapes: \[ -> [ and \] -> ]."""
+    unescaped, hits = _RE_ESCAPED_BRACKET.subn(r"\1", text)
+    if not hits:
+        return text
+    stats.escaped_brackets_fixed = hits
+    return unescaped
+
+
+def _fix_double_bracket_links(text: str, stats: PostProcessStats) -> str:
+    """Collapse double-bracket links such as [[text]{.underline}](url) to [text](url).
+
+    Three shapes are rewritten, in this order: [[text]{attrs}](url),
+    [[text]](url) and [[text](url). The total number of rewrites is stored
+    in ``stats.double_brackets_fixed``, overwriting any previous value.
+    """
+    simplified = r"[\1](\2)"
+    total = 0
+    # Most specific pattern first, so the bare "[[text](url)" form runs last.
+    for pattern in (_RE_DOUBLE_BRACKET_ATTR_LINK, _RE_DOUBLE_BRACKET_CLOSED, _RE_DOUBLE_BRACKET_LINK):
+        text, hits = pattern.subn(simplified, text)
+        total += hits
+    stats.double_brackets_fixed = total
+    return text
+
+
+def _fix_cjk_bold_spacing(text: str) -> str:
+    """Pad **bold** spans with a space where they touch a CJK character.
+
+    Bold in DOCX is run-level styling, so CJK text carries no spaces between
+    runs, while Markdown renderers need whitespace around ** to recognise the
+    bold boundaries. Every _RE_BOLD_PAIR match is copied through unchanged and
+    gains one space on each side whose neighbour matches _RE_CJK_PUNCT.
+    """
+    pieces = []
+    cursor = 0  # end offset of the previous bold span
+
+    for span in _RE_BOLD_PAIR.finditer(text):
+        begin, stop = span.span()
+        # Text between the previous span and this one is copied verbatim.
+        pieces.append(text[cursor:begin])
+
+        # A span starting at offset 0 has no left neighbour to test.
+        left_is_cjk = begin > 0 and _RE_CJK_PUNCT.match(text[begin - 1])
+        # A span ending at len(text) has no right neighbour to test.
+        right_is_cjk = stop < len(text) and _RE_CJK_PUNCT.match(text[stop])
+        if left_is_cjk:
+            pieces.append(' ')
+        pieces.append(span.group(0))
+        if right_is_cjk:
+            pieces.append(' ')
+        cursor = stop
+
+    pieces.append(text[cursor:])
+    return ''.join(pieces)
+
+
+def _cleanup_excessive_blank_lines(text: str) -> str:
+    """Squeeze runs of four or more newlines (3+ blank lines) into three newlines."""
+    return re.compile(r"\n{4,}").sub("\n" * 3, text)
+
+
+def postprocess_docx_markdown(
+    text: str,
+    assets_dir: Optional[Path] = None,
+) -> tuple[str, PostProcessStats]:
+    """Apply all DOCX-specific post-processing to pandoc markdown output.
+
+    Returns (cleaned_text, stats); stats is a new PostProcessStats for this call.
+    """
+    stats = PostProcessStats()
+    # Every pass below receives the text produced by the pass before it.
+    # Order matters: grid tables first (they contain images with attributes)
+    text = _convert_grid_tables(text, stats)
+    text = _fix_image_paths(text, assets_dir, stats)  # only pass given assets_dir
+    text = _clean_pandoc_attributes(text, stats)
+    text = _fix_code_blocks(text, stats)
+    text = _fix_double_bracket_links(text, stats)
+    text = _fix_escaped_brackets(text, stats)
+    text = _fix_cjk_bold_spacing(text)
+    text = _cleanup_excessive_blank_lines(text)
+    # The last two passes take no stats argument, so they record nothing in it.
+    return text, stats
+
+
+# ── DOCX deep parsing (python-docx) ──────────────────────────────────────────
+
+def convert_with_docx_deep(
+    file_path: Path, assets_dir: Optional[Path] = None
+) -> ConversionResult:
+    """Convert DOCX using python-docx direct parsing (experimental).
+
+    Emits all paragraphs first (headings, short all-bold lines, plain text),
+    then all tables as pipe tables, so tables are not interleaved with text.
+    With assets_dir set, word/media/* is copied to assets_dir/media and listed
+    in the result's images. Any failure is returned as success=False.
+    """
+    try:
+        from docx import Document
+    except ImportError:
+        return ConversionResult(
+            markdown="",
+            tool="docx-deep",
+            success=False,
+            error="python-docx not installed. Run: pip install python-docx",
+        )
+
+    def _cell_text(cell) -> str:
+        # A raw "|" or line break inside a cell would split the table row.
+        return cell.text.strip().replace("|", "\\|").replace("\n", "<br>")
+
+    try:
+        doc = Document(str(file_path))
+        md_parts = []
+        images = []
+
+        # Extract images from docx zip (directory entries are skipped)
+        if assets_dir:
+            assets_dir.mkdir(parents=True, exist_ok=True)
+            media_dir = assets_dir / "media"
+            media_dir.mkdir(exist_ok=True)
+
+            with zipfile.ZipFile(str(file_path), "r") as zf:
+                for name in zf.namelist():
+                    if name.startswith("word/media/") and not name.endswith("/"):
+                        img_dest = media_dir / Path(name).name
+                        img_dest.write_bytes(zf.read(name))
+                        images.append(str(img_dest))
+
+        # Process paragraphs
+        for para in doc.paragraphs:
+            style_name = (para.style.name if para.style else "") or ""
+            text = para.text.strip()
+
+            if not text:
+                md_parts.append("")
+                continue
+
+            # Headings
+            if style_name.startswith("Heading"):
+                try:
+                    level = min(max(int(style_name.split()[-1]), 1), 6)  # Markdown: 1-6
+                except (ValueError, IndexError):
+                    level = 1
+                md_parts.append(f"{'#' * level} {text}")
+                md_parts.append("")
+                continue
+
+            # Check for bold-only paragraphs (often sub-headings in Chinese docs)
+            all_bold = all(run.bold for run in para.runs if run.text.strip())
+            if all_bold and para.runs and len(text) < 100:
+                md_parts.append(f"**{text}**")
+                md_parts.append("")
+                continue
+
+            # Regular paragraph
+            md_parts.append(text)
+            md_parts.append("")
+
+        # Process tables
+        for table in doc.tables:
+            md_parts.append("")
+            rows = table.rows
+            if not rows:
+                continue
+
+            # Header row
+            header_cells = [_cell_text(cell) for cell in rows[0].cells]
+            md_parts.append("| " + " | ".join(header_cells) + " |")
+            md_parts.append("| " + " | ".join(["---"] * len(header_cells)) + " |")
+
+            # Data rows
+            for row in rows[1:]:
+                cells = [_cell_text(cell) for cell in row.cells]
+                md_parts.append("| " + " | ".join(cells) + " |")
+            md_parts.append("")
+
+        markdown = "\n".join(md_parts)
+
+        return ConversionResult(
+            markdown=markdown,
+            tool="docx-deep",
+            images=images,
+            success=True,
+        )
+    except Exception as e:
+        return ConversionResult(
+            markdown="", tool="docx-deep", success=False, error=str(e)
+        )
+
+
+# ── Existing tool converters ─────────────────────────────────────────────────
+
+def check_tool_available(tool: str) -> bool:
+    """Return True when *tool* can be used here; unknown tool names give False."""
+    if tool == "pymupdf4llm":
+        try:
+            import pymupdf4llm
+            return True
+        except ImportError:
+            return False
+    elif tool == "markitdown":
+        try:
+            import markitdown
+            return True
+        except ImportError:
+            return shutil.which("markitdown") is not None  # CLI-only install
+    elif tool == "pandoc":
+        return shutil.which("pandoc") is not None
+    elif tool == "docx-deep":
+        try:
+            from docx import Document
+            return True
+        except ImportError:
+            return False
+    return False
+
+
+def select_tools(file_path: Path, mode: str) -> list[str]:
+    """Choose the converters to run for *file_path*.
+
+    Candidates come from a per-suffix preference table (suffix lower-cased);
+    suffixes missing from the table fall back to markitdown alone. Only tools
+    for which check_tool_available() is true are ever returned.
+
+    mode "quick" -> a one-item list with the first available candidate, or []
+    when none is available; any other mode (heavy) -> every available
+    candidate, in preference order.
+    """
+    pdf_first = ["pymupdf4llm", "markitdown"]
+    pandoc_first = ["pandoc", "markitdown"]
+    markitdown_first = ["markitdown", "pandoc"]
+    markitdown_only = ["markitdown"]
+
+    # Quick and heavy currently share one preference order per format.
+    preferences = {
+        ".pdf": {"quick": pdf_first, "heavy": pdf_first},
+        ".docx": {"quick": pandoc_first, "heavy": pandoc_first},
+        ".doc": {"quick": pandoc_first, "heavy": pandoc_first},
+        ".pptx": {"quick": markitdown_first, "heavy": markitdown_first},
+        ".xlsx": {"quick": markitdown_only, "heavy": markitdown_only},
+    }
+    fallback = {"quick": markitdown_only, "heavy": markitdown_only}
+    chosen = preferences.get(file_path.suffix.lower(), fallback)
+
+    if mode != "quick":
+        # Heavy: run everything that is installed.
+        return [name for name in chosen["heavy"] if check_tool_available(name)]
+
+    # Quick: stop probing at the first tool that is installed.
+    first_hit = next(
+        (name for name in chosen["quick"] if check_tool_available(name)),
+        None,
+    )
+    if first_hit is None:
+        return []
+    return [first_hit]
+
+
+def convert_with_pymupdf4llm(
+    file_path: Path, assets_dir: Optional[Path] = None
+) -> ConversionResult:
+    """Convert *file_path* with pymupdf4llm; any failure becomes success=False."""
+    try:
+        import pymupdf4llm
+
+        # Always request the "lines_strict" table detection strategy.
+        options = {"table_strategy": "lines_strict"}
+        if assets_dir:
+            # Images are only written when a target directory was supplied.
+            assets_dir.mkdir(parents=True, exist_ok=True)
+            options.update(
+                write_images=True,
+                image_path=str(assets_dir),
+                dpi=150,
+            )
+
+        rendered = pymupdf4llm.to_markdown(str(file_path), **options)
+        extracted = _collect_images(assets_dir) if assets_dir else []
+
+        return ConversionResult(
+            markdown=rendered,
+            tool="pymupdf4llm",
+            images=extracted,
+            success=True,
+        )
+    except Exception as exc:
+        return ConversionResult(
+            markdown="", tool="pymupdf4llm", success=False, error=str(exc)
+        )
+
+
+def convert_with_markitdown(
+    file_path: Path, assets_dir: Optional[Path] = None
+) -> ConversionResult:
+    """Convert *file_path* with markitdown (assets_dir is accepted but unused).
+
+    The `markitdown` executable is tried first with a 120 second timeout; if
+    launching it raises FileNotFoundError, the markitdown Python API is used
+    instead. Failures are reported as success=False rather than raised.
+    """
+
+    def _failure(message: str) -> ConversionResult:
+        return ConversionResult(
+            markdown="", tool="markitdown", success=False, error=message
+        )
+
+    try:
+        proc = subprocess.run(
+            ["markitdown", str(file_path)],
+            capture_output=True,
+            text=True,
+            timeout=120,
+        )
+        if proc.returncode == 0:
+            return ConversionResult(
+                markdown=proc.stdout, tool="markitdown", success=True
+            )
+        # Non-zero exit status: surface the CLI's stderr as the error text.
+        return _failure(proc.stderr)
+    except FileNotFoundError:
+        pass  # executable not found; fall through to the Python API
+    except Exception as exc:
+        return _failure(str(exc))
+
+    try:
+        from markitdown import MarkItDown
+
+        converted = MarkItDown().convert(str(file_path))
+        return ConversionResult(
+            markdown=converted.text_content, tool="markitdown", success=True
+        )
+    except Exception as exc:
+        return _failure(str(exc))
+
+
+def convert_with_pandoc(
+    file_path: Path, assets_dir: Optional[Path] = None
+) -> ConversionResult:
+    """Convert using pandoc.
+
+    Pandoc's --extract-media=DIR creates a media/ subdirectory inside DIR.
+    When assets_dir is itself named 'media' its parent is passed, so pandoc's
+    media/ lands exactly at assets_dir; otherwise assets_dir is passed as-is
+    and the files end up in assets_dir/media. Output is decoded as UTF-8.
+    """
+    try:
+        cmd = ["pandoc", str(file_path), "-t", "markdown", "--wrap=none"]
+
+        extract_dir = None
+        if assets_dir:
+            assets_dir.mkdir(parents=True, exist_ok=True)
+            # Pandoc always creates a media/ subdirectory inside --extract-media.
+            # Point it at the parent so media/ lands at assets_dir.
+            if assets_dir.name == "media":
+                extract_dir = assets_dir.parent
+            else:
+                extract_dir = assets_dir
+            cmd.extend(["--extract-media", str(extract_dir)])
+        # Pandoc writes UTF-8 whatever the locale; the platform default can mangle CJK.
+        result = subprocess.run(
+            cmd, capture_output=True, text=True, encoding="utf-8", timeout=120
+        )
+
+        if result.returncode != 0:
+            return ConversionResult(
+                markdown="", tool="pandoc", success=False, error=result.stderr
+            )
+
+        md = result.stdout
+
+        # Image links carry the extraction directory as passed to pandoc
+        if extract_dir:
+            abs_media = str(extract_dir / "media")
+            # Replace that directory prefix with a relative 'media/' prefix
+            md = md.replace(abs_media + "/", "media/")
+
+        images = _collect_images(assets_dir) if assets_dir else []
+
+        return ConversionResult(
+            markdown=md, tool="pandoc", images=images, success=True
+        )
+    except Exception as e:
+        return ConversionResult(
+            markdown="", tool="pandoc", success=False, error=str(e)
+        )
+
+
+def convert_single(
+    file_path: Path, tool: str, assets_dir: Optional[Path] = None
+) -> ConversionResult:
+    """Dispatch to the converter registered under *tool*.
+
+    An unrecognised tool name yields a failed result instead of raising.
+    """
+    dispatch = {
+        "pymupdf4llm": convert_with_pymupdf4llm,
+        "markitdown": convert_with_markitdown,
+        "pandoc": convert_with_pandoc,
+        "docx-deep": convert_with_docx_deep,
+    }
+    if tool in dispatch:
+        return dispatch[tool](file_path, assets_dir)
+    return ConversionResult(
+        markdown="", tool=tool, success=False, error=f"Unknown tool: {tool}"
+    )
+
+
+def merge_results(results: list[ConversionResult]) -> ConversionResult:
+    """Pick the best of several conversion results.
+
+    Despite the name, no text is merged: the successful, non-blank result
+    with the highest score_markdown() value wins (the earliest on a tie).
+    With no usable result the first entry is returned as-is (or a failed
+    placeholder when *results* is empty); a single usable result is returned
+    as-is too.
+
+    When several results are usable the winner is modified in place: images
+    becomes the de-duplicated union over all usable results, and tool becomes
+    "merged(<all usable tools>)" rather than the winner's own name.
+    """
+    if not results:
+        return ConversionResult(markdown="", tool="none", success=False)
+
+    usable = [
+        candidate
+        for candidate in results
+        if candidate.success and candidate.markdown.strip()
+    ]
+    if not usable:
+        # Nothing worked: report the first failure.
+        return results[0]
+    if len(usable) == 1:
+        return usable[0]
+
+    # max() keeps the first maximal element, matching a strict ">" scan.
+    winner = max(usable, key=lambda r: score_markdown(r.markdown))
+
+    # dict.fromkeys() de-duplicates while preserving first-seen order.
+    pooled_images = list(
+        dict.fromkeys(img for r in usable for img in r.images)
+    )
+    tool_names = ",".join(r.tool for r in usable)
+
+    winner.images = pooled_images
+    winner.tool = f"merged({tool_names})"
+
+    return winner
+
+
+def score_markdown(md: str) -> float:
+    """Score markdown quality for comparison (higher is better, can be negative)."""
+    score = 0.0
+
+    # Length (more content is generally better)
+    score += min(len(md) / 10000, 5.0)  # Cap at 5 points
+
+    # Tables (proper markdown tables)
+    table_count = md.count("|---|") + md.count("| ---")
+    score += min(table_count * 0.5, 3.0)
+
+    # Images (referenced images)
+    image_count = md.count("![")
+    score += min(image_count * 0.3, 2.0)
+
+    # Headings (proper hierarchy); "\n" is prepended so line 1 is counted too
+    body = "\n" + md
+    h1_count = body.count("\n# ")
+    h2_count = body.count("\n## ")
+    if h1_count > 0 and h2_count >= h1_count:
+        score += 1.0  # Good hierarchy
+
+    # Lists (structured content)
+    list_count = body.count("\n- ") + body.count("\n* ") + body.count("\n1. ")
+    score += min(list_count * 0.1, 2.0)
+
+    # Penalize pandoc artifacts (grid tables, attributes)
+    artifact_count = md.count("+:---") + md.count("+---+")
+    artifact_count += md.count('{width="') + md.count("{.underline}")
+    score -= artifact_count * 0.5
+
+    return score
+
+
+def main():
+    """CLI entry point: parse arguments, run the converters, write the markdown file."""
+    parser = argparse.ArgumentParser(
+        description="Convert documents to markdown with multi-tool orchestration",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+    # Quick mode (default)
+    python convert.py document.pdf -o output.md
+
+    # Heavy mode (best quality)
+    python convert.py document.pdf -o output.md --heavy
+
+    # DOCX deep mode (python-docx parsing)
+    python convert.py document.docx -o output.md --docx-deep
+
+    # With custom assets directory
+    python convert.py document.pdf -o output.md --assets-dir ./images
+        """,
+    )
+    parser.add_argument("input", type=Path, nargs="?", help="Input document path")
+    parser.add_argument("-o", "--output", type=Path, help="Output markdown file")
+    parser.add_argument(
+        "--heavy",
+        action="store_true",
+        help="Enable Heavy Mode (multi-tool, best quality)",
+    )
+    parser.add_argument(
+        "--docx-deep",
+        action="store_true",
+        help="Use python-docx direct parsing (experimental, DOCX only)",
+    )
+    parser.add_argument(
+        "--no-postprocess",
+        action="store_true",
+        help="Disable DOCX post-processing (keep raw pandoc output)",
+    )
+    parser.add_argument(
+        "--assets-dir",
+        type=Path,
+        default=None,
+        help="Directory for extracted images (default: <output>_assets/)",
+    )
+    parser.add_argument(
+        "--tool",
+        choices=["pymupdf4llm", "markitdown", "pandoc", "docx-deep"],
+        help="Force specific tool (overrides auto-selection)",
+    )
+    parser.add_argument(
+        "--list-tools",
+        action="store_true",
+        help="List available tools and exit",
+    )
+
+    args = parser.parse_args()
+
+    # List tools mode
+    if args.list_tools:
+        tools = ["pymupdf4llm", "markitdown", "pandoc", "docx-deep"]
+        print("Available conversion tools:")
+        for tool in tools:
+            status = "+" if check_tool_available(tool) else "-"
+            print(f"  {status} {tool}")
+        sys.exit(0)
+
+    # Validate input
+    if args.input is None:
+        parser.error("the following arguments are required: input")
+    if not args.input.exists():
+        print(f"Error: Input file not found: {args.input}", file=sys.stderr)
+        sys.exit(1)
+
+    # Determine output path
+    output_path = args.output or args.input.with_suffix(".md")
+
+    # Determine assets directory
+    assets_dir = args.assets_dir
+    if assets_dir is None:
+        assets_dir = output_path.parent / f"{output_path.stem}_assets"
+
+    is_docx = args.input.suffix.lower() in (".docx", ".doc")
+
+    # Handle --docx-deep mode
+    if args.docx_deep:
+        if not is_docx:
+            print("Error: --docx-deep only works with DOCX files.", file=sys.stderr)
+            sys.exit(1)
+        tools = ["docx-deep"]
+    elif args.tool:
+        tools = [args.tool] if check_tool_available(args.tool) else []
+    else:
+        # Select tools
+        mode = "heavy" if args.heavy else "quick"
+        tools = select_tools(args.input, mode)
+
+    mode = "docx-deep" if args.docx_deep else ("heavy" if args.heavy else "quick")
+
+    if not tools:
+        print("Error: No conversion tools available.", file=sys.stderr)
+        print("Install with:", file=sys.stderr)
+        print("  pip install pymupdf4llm", file=sys.stderr)
+        print("  uv tool install markitdown[pdf]", file=sys.stderr)
+        print("  brew install pandoc", file=sys.stderr)
+        sys.exit(1)
+
+    print(f"Converting: {args.input}")
+    print(f"Mode: {mode.upper()}")
+    print(f"Tools: {', '.join(tools)}")
+
+    # Run conversions
+    results = []
+    for tool in tools:
+        print(f"  Running {tool}...", end=" ", flush=True)
+
+        # Use separate assets dirs for each tool in heavy mode
+        tool_assets = None
+        if assets_dir and mode == "heavy" and len(tools) > 1:
+            tool_assets = assets_dir / tool
+        elif assets_dir:
+            tool_assets = assets_dir
+
+        result = convert_single(args.input, tool, tool_assets)
+        results.append(result)
+
+        if result.success:
+            print(f"ok ({len(result.markdown):,} chars, {len(result.images)} images)")
+        else:
+            print(f"FAIL ({result.error[:50]}...)")
+
+    # Merge results if heavy mode
+    if mode == "heavy" and len(results) > 1:
+        print("  Merging results...", end=" ", flush=True)
+        final = merge_results(results)
+        print(f"ok (using {final.tool})")
+    else:
+        final = merge_results(results)
+
+    if not final.success:
+        print(f"Error: Conversion failed: {final.error}", file=sys.stderr)
+        sys.exit(1)
+
+    # Pandoc-specific clean-up only when pandoc's own output was selected:
+    # merge_results() relabels the winner "merged(a,b)", so look the winner up.
+    won_by = next((t for t, r in zip(tools, results) if r is final), final.tool)
+    if is_docx and not args.no_postprocess and won_by == "pandoc":
+        print("  Post-processing DOCX output...", end=" ", flush=True)
+        final.markdown, pp_stats = postprocess_docx_markdown(final.markdown, assets_dir)
+        print(f"ok ({pp_stats.summary()})")
+
+    # Write output
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    # Always UTF-8: the platform default encoding may be unable to encode CJK text.
+    output_path.write_text(final.markdown, encoding="utf-8")
+
+    print(f"\nOutput: {output_path}")
+    print(f"  Size: {len(final.markdown):,} characters")
+    if final.images:
+        print(f"  Images: {len(final.images)} extracted")
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script
+    main()
diff --git a/markdown-tools/scripts/convert_path.py b/doc-to-markdown/scripts/convert_path.py
similarity index 100%
rename from markdown-tools/scripts/convert_path.py
rename to doc-to-markdown/scripts/convert_path.py
diff --git a/markdown-tools/scripts/extract_pdf_images.py b/doc-to-markdown/scripts/extract_pdf_images.py
similarity index 100%
rename from markdown-tools/scripts/extract_pdf_images.py
rename to doc-to-markdown/scripts/extract_pdf_images.py
diff --git a/markdown-tools/scripts/merge_outputs.py b/doc-to-markdown/scripts/merge_outputs.py
similarity index 100%
rename from markdown-tools/scripts/merge_outputs.py
rename to doc-to-markdown/scripts/merge_outputs.py
diff --git a/markdown-tools/scripts/validate_output.py b/doc-to-markdown/scripts/validate_output.py
similarity index 100%
rename from markdown-tools/scripts/validate_output.py
rename to doc-to-markdown/scripts/validate_output.py
diff --git a/markdown-tools/scripts/convert.py b/markdown-tools/scripts/convert.py
deleted file mode 100755
index 9ac6f36..0000000
--- a/markdown-tools/scripts/convert.py
+++ /dev/null
@@ -1,434 +0,0 @@
-#!/usr/bin/env python3
-"""
-Multi-tool document to markdown converter with intelligent orchestration.
-
-Supports Quick Mode (fast, single tool) and Heavy Mode (best quality, multi-tool merge).
-
-Usage:
-    # Quick Mode (default) - fast, single best tool
-    uv run --with pymupdf4llm --with markitdown scripts/convert.py document.pdf -o output.md
-
-    # Heavy Mode - multi-tool parallel execution with merge
-    uv run --with pymupdf4llm --with markitdown scripts/convert.py document.pdf -o output.md --heavy
-
-    # With image extraction
-    uv run --with pymupdf4llm scripts/convert.py document.pdf -o output.md --assets-dir ./images
-
-Dependencies:
-    - pymupdf4llm: PDF conversion (LLM-optimized)
-    - markitdown: PDF/DOCX/PPTX conversion
-    - pandoc: DOCX/PPTX conversion (system install: brew install pandoc)
-"""
-
-import argparse
-import subprocess
-import sys
-import tempfile
-import shutil
-from dataclasses import dataclass, field
-from pathlib import Path
-from typing import Optional
-
-
-@dataclass
-class ConversionResult:
-    """Result from a single tool conversion."""
-    markdown: str
-    tool: str
-    images: list[str] = field(default_factory=list)
-    success: bool = True
-    error: str = ""
-
-
-def check_tool_available(tool: str) -> bool:
-    """Check if a conversion tool is available."""
-    if tool == "pymupdf4llm":
-        try:
-            import pymupdf4llm
-            return True
-        except ImportError:
-            return False
-    elif tool == "markitdown":
-        try:
-            import markitdown
-            return True
-        except ImportError:
-            return False
-    elif tool == "pandoc":
-        return shutil.which("pandoc") is not None
-    return False
-
-
-def select_tools(file_path: Path, mode: str) -> list[str]:
-    """Select conversion tools based on file type and mode."""
-    ext = file_path.suffix.lower()
-
-    # Tool preferences by format
-    tool_map = {
-        ".pdf": {
-            "quick": ["pymupdf4llm", "markitdown"],  # fallback order
-            "heavy": ["pymupdf4llm", "markitdown"],
-        },
-        ".docx": {
-            "quick": ["pandoc", "markitdown"],
-            "heavy": ["pandoc", "markitdown"],
-        },
-        ".doc": {
-            "quick": ["pandoc", "markitdown"],
-            "heavy": ["pandoc", "markitdown"],
-        },
-        ".pptx": {
-            "quick": ["markitdown", "pandoc"],
-            "heavy": ["markitdown", "pandoc"],
-        },
-        ".xlsx": {
-            "quick": ["markitdown"],
-            "heavy": ["markitdown"],
-        },
-    }
-
-    tools = tool_map.get(ext, {"quick": ["markitdown"], "heavy": ["markitdown"]})
-
-    if mode == "quick":
-        # Return first available tool
-        for tool in tools["quick"]:
-            if check_tool_available(tool):
-                return [tool]
-        return []
-    else:  # heavy
-        # Return all available tools
-        return [t for t in tools["heavy"] if check_tool_available(t)]
-
-
-def convert_with_pymupdf4llm(
-    file_path: Path, assets_dir: Optional[Path] = None
-) -> ConversionResult:
-    """Convert using PyMuPDF4LLM (best for PDFs)."""
-    try:
-        import pymupdf4llm
-
-        kwargs = {}
-        images = []
-
-        if assets_dir:
-            assets_dir.mkdir(parents=True, exist_ok=True)
-            kwargs["write_images"] = True
-            kwargs["image_path"] = str(assets_dir)
-            kwargs["dpi"] = 150
-
-        # Use best table detection strategy
-        kwargs["table_strategy"] = "lines_strict"
-
-        md_text = pymupdf4llm.to_markdown(str(file_path), **kwargs)
-
-        # Collect extracted images
-        if assets_dir and assets_dir.exists():
-            images = [str(p) for p in assets_dir.glob("*.png")]
-            images.extend([str(p) for p in assets_dir.glob("*.jpg")])
-
-        return ConversionResult(
-            markdown=md_text, tool="pymupdf4llm", images=images, success=True
-        )
-    except Exception as e:
-        return ConversionResult(
-            markdown="", tool="pymupdf4llm", success=False, error=str(e)
-        )
-
-
-def convert_with_markitdown(
-    file_path: Path, assets_dir: Optional[Path] = None
-) -> ConversionResult:
-    """Convert using markitdown."""
-    try:
-        # markitdown CLI approach
-        result = subprocess.run(
-            ["markitdown", str(file_path)],
-            capture_output=True,
-            text=True,
-            timeout=120,
-        )
-
-        if result.returncode != 0:
-            return ConversionResult(
-                markdown="",
-                tool="markitdown",
-                success=False,
-                error=result.stderr,
-            )
-
-        return ConversionResult(
-            markdown=result.stdout, tool="markitdown", success=True
-        )
-    except FileNotFoundError:
-        # Try Python API
-        try:
-            from markitdown import MarkItDown
-
-            md = MarkItDown()
-            result = md.convert(str(file_path))
-            return ConversionResult(
-                markdown=result.text_content, tool="markitdown", success=True
-            )
-        except Exception as e:
-            return ConversionResult(
-                markdown="", tool="markitdown", success=False, error=str(e)
-            )
-    except Exception as e:
-        return ConversionResult(
-            markdown="", tool="markitdown", success=False, error=str(e)
-        )
-
-
-def convert_with_pandoc(
-    file_path: Path, assets_dir: Optional[Path] = None
-) -> ConversionResult:
-    """Convert using pandoc."""
-    try:
-        cmd = ["pandoc", str(file_path), "-t", "markdown", "--wrap=none"]
-
-        if assets_dir:
-            assets_dir.mkdir(parents=True, exist_ok=True)
-            cmd.extend(["--extract-media", str(assets_dir)])
-
-        result = subprocess.run(
-            cmd, capture_output=True, text=True, timeout=120
-        )
-
-        if result.returncode != 0:
-            return ConversionResult(
-                markdown="", tool="pandoc", success=False, error=result.stderr
-            )
-
-        images = []
-        if assets_dir and assets_dir.exists():
-            images = [str(p) for p in assets_dir.rglob("*.png")]
-            images.extend([str(p) for p in assets_dir.rglob("*.jpg")])
-
-        return ConversionResult(
-            markdown=result.stdout, tool="pandoc", images=images, success=True
-        )
-    except Exception as e:
-        return ConversionResult(
-            markdown="", tool="pandoc", success=False, error=str(e)
-        )
-
-
-def convert_single(
-    file_path: Path, tool: str, assets_dir: Optional[Path] = None
-) -> ConversionResult:
-    """Run a single conversion tool."""
-    converters = {
-        "pymupdf4llm": convert_with_pymupdf4llm,
-        "markitdown": convert_with_markitdown,
-        "pandoc": convert_with_pandoc,
-    }
-
-    converter = converters.get(tool)
-    if not converter:
-        return ConversionResult(
-            markdown="", tool=tool, success=False, error=f"Unknown tool: {tool}"
-        )
-
-    return converter(file_path, assets_dir)
-
-
-def merge_results(results: list[ConversionResult]) -> ConversionResult:
-    """Merge results from multiple tools, selecting best segments."""
-    if not results:
-        return ConversionResult(markdown="", tool="none", success=False)
-
-    # Filter successful results
-    successful = [r for r in results if r.success and r.markdown.strip()]
-    if not successful:
-        # Return first error
-        return results[0] if results else ConversionResult(
-            markdown="", tool="none", success=False
-        )
-
-    if len(successful) == 1:
-        return successful[0]
-
-    # Multiple successful results - merge them
-    # Strategy: Compare key metrics and select best
-    best = successful[0]
-    best_score = score_markdown(best.markdown)
-
-    for result in successful[1:]:
-        score = score_markdown(result.markdown)
-        if score > best_score:
-            best = result
-            best_score = score
-
-    # Merge images from all results
-    all_images = []
-    seen = set()
-    for result in successful:
-        for img in result.images:
-            if img not in seen:
-                all_images.append(img)
-                seen.add(img)
-
-    best.images = all_images
-    best.tool = f"merged({','.join(r.tool for r in successful)})"
-
-    return best
-
-
-def score_markdown(md: str) -> float:
-    """Score markdown quality for comparison."""
-    score = 0.0
-
-    # Length (more content is generally better)
-    score += min(len(md) / 10000, 5.0)  # Cap at 5 points
-
-    # Tables (proper markdown tables)
-    table_count = md.count("|---|") + md.count("| ---")
-    score += min(table_count * 0.5, 3.0)
-
-    # Images (referenced images)
-    image_count = md.count("![")
-    score += min(image_count * 0.3, 2.0)
-
-    # Headings (proper hierarchy)
-    h1_count = md.count("\n# ")
-    h2_count = md.count("\n## ")
-    h3_count = md.count("\n### ")
-    if h1_count > 0 and h2_count >= h1_count:
-        score += 1.0  # Good hierarchy
-
-    # Lists (structured content)
-    list_count = md.count("\n- ") + md.count("\n* ") + md.count("\n1. ")
-    score += min(list_count * 0.1, 2.0)
-
-    return score
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Convert documents to markdown with multi-tool orchestration",
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-        epilog="""
-Examples:
-    # Quick mode (default)
-    python convert.py document.pdf -o output.md
-
-    # Heavy mode (best quality)
-    python convert.py document.pdf -o output.md --heavy
-
-    # With custom assets directory
-    python convert.py document.pdf -o output.md --assets-dir ./images
-        """,
-    )
-    parser.add_argument("input", type=Path, help="Input document path")
-    parser.add_argument(
-        "-o", "--output", type=Path, help="Output markdown file"
-    )
-    parser.add_argument(
-        "--heavy",
-        action="store_true",
-        help="Enable Heavy Mode (multi-tool, best quality)",
-    )
-    parser.add_argument(
-        "--assets-dir",
-        type=Path,
-        default=None,
-        help="Directory for extracted images (default: <output>_assets/)",
-    )
-    parser.add_argument(
-        "--tool",
-        choices=["pymupdf4llm", "markitdown", "pandoc"],
-        help="Force specific tool (overrides auto-selection)",
-    )
-    parser.add_argument(
-        "--list-tools",
-        action="store_true",
-        help="List available tools and exit",
-    )
-
-    args = parser.parse_args()
-
-    # List tools mode
-    if args.list_tools:
-        tools = ["pymupdf4llm", "markitdown", "pandoc"]
-        print("Available conversion tools:")
-        for tool in tools:
-            status = "✓" if check_tool_available(tool) else "✗"
-            print(f"  {status} {tool}")
-        sys.exit(0)
-
-    # Validate input
-    if not args.input.exists():
-        print(f"Error: Input file not found: {args.input}", file=sys.stderr)
-        sys.exit(1)
-
-    # Determine output path
-    output_path = args.output or args.input.with_suffix(".md")
-
-    # Determine assets directory
-    assets_dir = args.assets_dir
-    if assets_dir is None and args.heavy:
-        assets_dir = output_path.parent / f"{output_path.stem}_assets"
-
-    # Select tools
-    mode = "heavy" if args.heavy else "quick"
-    if args.tool:
-        tools = [args.tool] if check_tool_available(args.tool) else []
-    else:
-        tools = select_tools(args.input, mode)
-
-    if not tools:
-        print("Error: No conversion tools available.", file=sys.stderr)
-        print("Install with:", file=sys.stderr)
-        print("  pip install pymupdf4llm", file=sys.stderr)
-        print("  uv tool install markitdown[pdf]", file=sys.stderr)
-        print("  brew install pandoc", file=sys.stderr)
-        sys.exit(1)
-
-    print(f"Converting: {args.input}")
-    print(f"Mode: {mode.upper()}")
-    print(f"Tools: {', '.join(tools)}")
-
-    # Run conversions
-    results = []
-    for tool in tools:
-        print(f"  Running {tool}...", end=" ", flush=True)
-
-        # Use separate assets dirs for each tool in heavy mode
-        tool_assets = None
-        if assets_dir and mode == "heavy" and len(tools) > 1:
-            tool_assets = assets_dir / tool
-        elif assets_dir:
-            tool_assets = assets_dir
-
-        result = convert_single(args.input, tool, tool_assets)
-        results.append(result)
-
-        if result.success:
-            print(f"✓ ({len(result.markdown):,} chars, {len(result.images)} images)")
-        else:
-            print(f"✗ ({result.error[:50]}...)")
-
-    # Merge results if heavy mode
-    if mode == "heavy" and len(results) > 1:
-        print("  Merging results...", end=" ", flush=True)
-        final = merge_results(results)
-        print(f"✓ (using {final.tool})")
-    else:
-        final = merge_results(results)
-
-    if not final.success:
-        print(f"Error: Conversion failed: {final.error}", file=sys.stderr)
-        sys.exit(1)
-
-    # Write output
-    output_path.parent.mkdir(parents=True, exist_ok=True)
-    output_path.write_text(final.markdown)
-
-    print(f"\nOutput: {output_path}")
-    print(f"  Size: {len(final.markdown):,} characters")
-    if final.images:
-        print(f"  Images: {len(final.images)} extracted")
-
-
-if __name__ == "__main__":
-    main()