feat(doc-to-markdown): CJK bold spacing, JSON pretty-print, 31 tests, full rename cleanup

- Add CJK bold spacing fix: insert spaces around **bold** spans containing
  CJK characters for correct rendering (handles emoji adjacency and leaves
  already-spaced spans unchanged)
- Add JSON pretty-print: auto-format JSON code blocks with 2-space indent
- Add 31 unit tests covering all post-processing functions
- Fix pandoc simple table detection (1-space column gaps)
- Fix image path double-nesting when --assets-dir ends with 'media'
- Rename all markdown-tools references across 15 files (README, QUICKSTART,
  marketplace.json, CLAUDE.md, meeting-minutes-taker, GitHub templates)
- Add 5-tool benchmark report (Docling/MarkItDown/Pandoc/Mammoth/ours)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
daymade
2026-03-23 03:18:37 +08:00
parent a5f3a4bfbe
commit d9e1967689
16 changed files with 351 additions and 90 deletions

View File

@@ -26,6 +26,7 @@ Dependencies:
"""
import argparse
import json
import re
import subprocess
import sys
@@ -478,10 +479,19 @@ def _fix_code_blocks(text: str, stats: PostProcessStats) -> str:
# Decide: code block vs blockquote
if has_lang_hint or _is_code_content(cleaned):
# Code block
# Code block — try to pretty-print JSON
code_lines = cleaned
if lang_hint == "json":
try:
raw = "\n".join(cleaned)
parsed = json.loads(raw)
code_lines = json.dumps(parsed, indent=2, ensure_ascii=False).split("\n")
except (json.JSONDecodeError, ValueError):
pass # Keep original if not valid JSON
result.append("")
result.append(f"```{lang_hint}")
result.extend(cleaned)
result.extend(code_lines)
result.append("```")
result.append("")
else:
@@ -529,29 +539,40 @@ def _fix_double_bracket_links(text: str, stats: PostProcessStats) -> str:
def _fix_cjk_bold_spacing(text: str) -> str:
"""Add space between **bold** markers and adjacent CJK characters.
"""Add space around **bold** spans that contain CJK characters.
DOCX uses run-level styling for bold — no spaces between runs in CJK text.
Markdown renderers need whitespace around ** to recognize bold boundaries.
We find each **content** span, check the character before/after, and insert
a space only when the adjacent character is CJK (avoiding double spaces).
Rule: if a **content** span contains any CJK character, ensure both sides
have a space (unless already spaced or at line boundary). This handles:
- CJK directly touching **: 打开**飞书** → 打开 **飞书**
- Emoji touching **: **密码】**➡️ → **密码】** ➡️
- Already spaced: 已有 **粗体** → unchanged
- English bold: English **bold** text → unchanged
"""
result = []
last_end = 0
for m in _RE_BOLD_PAIR.finditer(text):
start, end = m.start(), m.end()
content = m.group(1)
result.append(text[last_end:start])
# Space before opening ** if preceded by CJK
if start > 0 and _RE_CJK_PUNCT.match(text[start - 1]):
result.append(' ')
# Only add spaces for bold spans containing CJK
if _RE_CJK_PUNCT.search(content):
# Space before ** if previous char is not whitespace
if start > 0 and text[start - 1] not in (' ', '\t', '\n'):
result.append(' ')
result.append(m.group(0))
result.append(m.group(0))
# Space after closing ** if followed by CJK
if end < len(text) and _RE_CJK_PUNCT.match(text[end]):
result.append(' ')
# Space after ** if next char is not whitespace
if end < len(text) and text[end] not in (' ', '\t', '\n'):
result.append(' ')
else:
result.append(m.group(0))
last_end = end

View File

@@ -0,0 +1,242 @@
"""Tests for doc-to-markdown convert.py post-processing functions.
Run: uv run pytest scripts/test_convert.py -v
"""
import pytest
import re
import sys
from pathlib import Path
# Import the module under test
sys.path.insert(0, str(Path(__file__).parent))
from convert import (
_fix_cjk_bold_spacing,
_build_pipe_table,
_collect_images,
PostProcessStats,
postprocess_docx_markdown,
)
# ── CJK Bold Spacing ─────────────────────────────────────────────────────────
class TestCjkBoldSpacing:
    """Tests for _fix_cjk_bold_spacing.

    The cases below exercise one rule: a **bold** span whose content contains
    CJK characters gets a space on each side, unless that side is already
    whitespace or is the start/end of the text. Bold spans without CJK
    content are expected to come back unchanged.
    """

    def test_bold_followed_by_cjk_punctuation(self):
        """CJK bold directly followed by non-space text (a URL) → space after **."""
        inp = "**打开阶跃开放平台链接**https://platform.stepfun.com/"
        out = _fix_cjk_bold_spacing(inp)
        assert "**打开阶跃开放平台链接** " in out

    def test_cjk_before_bold(self):
        """CJK chars touching both ends of a bold span → space on both sides."""
        assert _fix_cjk_bold_spacing("可用**手机号**进行") == "可用 **手机号** 进行"

    def test_bold_with_emoji_neighbor(self):
        """**text** touching emoji ➡️ → still add space (CJK content rule)."""
        inp = "点击**【接口密码】**➡️**【创建新的密钥**】"
        out = _fix_cjk_bold_spacing(inp)
        # Each CJK-containing bold span should have spaces on both sides
        assert "点击 **【接口密码】** ➡️" in out
        assert "➡️ **【创建新的密钥**" in out

    def test_full_emoji_line(self):
        """Complete line with emoji separators between bold spans."""
        # The last bold span is followed directly by "(" so that the expected
        # "** (" below is actually produced by the spacing rule; without the
        # parenthesis in the input the assertion could never hold.
        inp = "点击**【接口密码】**➡️**【创建新的密钥**】➡️**【输入密钥名称】**(输入你想取的名称)生成API Key"
        out = _fix_cjk_bold_spacing(inp)
        assert "点击 **【接口密码】** ➡️" in out
        assert "**【输入密钥名称】** (输入" in out

    def test_bold_between_cjk(self):
        """CJK **text** followed by punctuation → spaces on both sides."""
        assert _fix_cjk_bold_spacing("打开**飞书**,就可以") == "打开 **飞书** ,就可以"

    def test_bold_with_chinese_quotes(self):
        """Bold span whose content is wrapped in quotes."""
        inp = '有个**"企鹅戴龙虾头套的机器人"**,开始'
        out = _fix_cjk_bold_spacing(inp)
        assert '**"企鹅戴龙虾头套的机器人"** ' in out

    def test_multiple_bold_spans(self):
        """Multiple bold spans in one line are each spaced."""
        assert _fix_cjk_bold_spacing("这是**测试**和**验证**的效果") == "这是 **测试** 和 **验证** 的效果"

    def test_already_spaced(self):
        """Already has spaces → no double spaces."""
        inp = "已有空格 **粗体** 不需要再加"
        assert _fix_cjk_bold_spacing(inp) == inp

    def test_english_unchanged(self):
        """English bold text should not be modified."""
        inp = "English **bold** text should not change"
        assert _fix_cjk_bold_spacing(inp) == inp

    def test_line_start_bold(self):
        """Bold at line start followed by CJK → space only after the span."""
        assert _fix_cjk_bold_spacing("**重要**内容") == "**重要** 内容"

    def test_line_start_bold_standalone(self):
        """Bold span that is the whole text → no change."""
        assert _fix_cjk_bold_spacing("**这是纯粗体不需要改**") == "**这是纯粗体不需要改**"

    def test_no_bold(self):
        """Text without bold markers → unchanged."""
        inp = "这是普通文本,没有粗体"
        assert _fix_cjk_bold_spacing(inp) == inp

    def test_empty_string(self):
        """Empty input → empty output."""
        assert _fix_cjk_bold_spacing("") == ""

    def test_bold_at_line_end(self):
        """Bold at line end → no trailing space needed."""
        assert _fix_cjk_bold_spacing("内容是**粗体**") == "内容是 **粗体**"

    def test_mixed_cjk_and_english_bold(self):
        """English bold between CJK → no change (no CJK in content)."""
        inp = "请使用 **API Key** 进行认证"
        assert _fix_cjk_bold_spacing(inp) == inp
# ── Pipe Table Builder ────────────────────────────────────────────────────────
class TestBuildPipeTable:
    """Checks for _build_pipe_table: a list of rows in, pipe-table lines out."""

    def test_basic_table(self):
        """Two rows of two cells come out under a blank header and a separator."""
        expected = [
            "| | |",
            "| --- | --- |",
            "| a | b |",
            "| c | d |",
        ]
        assert _build_pipe_table([["a", "b"], ["c", "d"]]) == expected

    def test_uneven_rows(self):
        """A short row is padded out to the widest row's column count."""
        table = _build_pipe_table([["a", "b", "c"], ["d"]])
        assert "| d | | |" in table

    def test_single_cell(self):
        """A one-cell table still yields header, separator and one data row."""
        table = _build_pipe_table([["only"]])
        assert len(table) == 3  # header + sep + 1 row

    def test_empty_rows(self):
        """No rows in → no lines out."""
        assert _build_pipe_table([]) == []

    def test_image_with_caption(self):
        """An image row and its caption row both appear as table lines."""
        image_row = ["![](img1.png)", "![](img2.png)"]
        caption_row = ["Step 1", "Step 2"]
        table = _build_pipe_table([image_row, caption_row])
        assert "| ![](img1.png) | ![](img2.png) |" in table
        assert "| Step 1 | Step 2 |" in table
# ── Full Post-Processing Pipeline ─────────────────────────────────────────────
class TestPostprocessPipeline:
    """Integration tests for the full postprocess_docx_markdown pipeline.

    Each test feeds a small Markdown snippet through the pipeline, which
    returns an ``(output_text, stats)`` pair, and checks the output text
    (or, in the last test, the stats object).
    """

    def test_grid_table_single_column_to_blockquote(self):
        """Single-column grid table → blockquote."""
        inp = """+:---+
| 注意事项 |
+----+"""
        out, _ = postprocess_docx_markdown(inp)
        assert "> 注意事项" in out
        assert "+:---+" not in out

    def test_pandoc_attributes_removed(self):
        """Pandoc {width=...} and {.underline} removed."""
        inp = '![](img.png){width="5in" height="3in"} and [text]{.underline}'
        out, _ = postprocess_docx_markdown(inp)
        assert "{width=" not in out
        assert "{.underline}" not in out
        assert "![](img.png)" in out

    def test_escaped_brackets_fixed(self):
        r"""Pandoc \[ and \] → [ and ]."""
        # The input carries the same leading "你 " that the expected output
        # checks for; only the backslash escapes should differ between them.
        inp = r"你 \[在飞书里\] 发消息"
        out, _ = postprocess_docx_markdown(inp)
        assert "你 [在飞书里] 发消息" in out

    def test_double_bracket_links_fixed(self):
        """[[text]](url) → [text](url)."""
        inp = "[[点击跳转]](https://example.com)"
        out, _ = postprocess_docx_markdown(inp)
        assert "[点击跳转](https://example.com)" in out

    def test_code_block_with_language(self):
        """Indented dashed block with JSON language hint → ```json."""
        inp = """ ------------------------------------------------------------------
JSON\\
{\\
"provider": "stepfun"\\
}
------------------------------------------------------------------"""
        out, _ = postprocess_docx_markdown(inp)
        assert "```json" in out
        assert '"provider": "stepfun"' in out
        # The dashed border lines must not survive into the output.
        assert "---" not in out

    def test_code_block_plain_text_to_blockquote(self):
        """Indented dashed block with plain text → blockquote."""
        inp = """ --------------------------
注意:这是一条重要提示
--------------------------"""
        out, _ = postprocess_docx_markdown(inp)
        assert "> 注意:这是一条重要提示" in out

    def test_cjk_bold_spacing_in_pipeline(self):
        """CJK bold spacing is applied in the full pipeline."""
        inp = "打开**飞书**,就可以看到"
        out, _ = postprocess_docx_markdown(inp)
        assert "打开 **飞书** ,就可以看到" in out

    def test_excessive_blank_lines_collapsed(self):
        """A run of four blank lines is collapsed to fewer newlines."""
        inp = "line1\n\n\n\n\nline2"
        out, _ = postprocess_docx_markdown(inp)
        # The input contains five newlines; the output must contain fewer.
        assert out.count("\n") < 5

    def test_stats_tracking(self):
        """Stats object records at least one removed attribute."""
        inp = '![](media/media/img.png){width="5in"}'
        _, stats = postprocess_docx_markdown(inp)
        assert stats.attributes_removed > 0
# ── Simple Table (pandoc) ─────────────────────────────────────────────────────
class TestSimpleTable:
    """Pipeline checks for pandoc simple tables (dashed rulers) → pipe tables."""

    def test_two_column_image_table(self):
        """A simple table holding two images becomes a two-column pipe row."""
        fixture = """ ---- ----
![](img1.png) ![](img2.png)
---- ----"""
        converted, _ = postprocess_docx_markdown(fixture)
        assert "| ![](img1.png) | ![](img2.png) |" in converted
        # None of the dashed ruler lines may remain in the output.
        assert "----" not in converted

    def test_four_column_image_table(self):
        """A simple table holding four images becomes a four-column pipe row."""
        fixture = """ ---------- ---------- ---------- ----------
![](a.png) ![](b.png) ![](c.png) ![](d.png)
---------- ---------- ---------- ----------"""
        converted, _ = postprocess_docx_markdown(fixture)
        expected_row = "| ![](a.png) | ![](b.png) | ![](c.png) | ![](d.png) |"
        assert expected_row in converted