Files
daymade d9e1967689 feat(doc-to-markdown): CJK bold spacing, JSON pretty-print, 31 tests, full rename cleanup
- Add CJK bold spacing fix: insert spaces around **bold** spans containing
  CJK characters for correct rendering (handles emoji adjacency, already-spaced)
- Add JSON pretty-print: auto-format JSON code blocks with 2-space indent
- Add 31 unit tests covering all post-processing functions
- Fix pandoc simple table detection (1-space column gaps)
- Fix image path double-nesting when --assets-dir ends with 'media'
- Rename all markdown-tools references across 15 files (README, QUICKSTART,
  marketplace.json, CLAUDE.md, meeting-minutes-taker, GitHub templates)
- Add 5-tool benchmark report (Docling/MarkItDown/Pandoc/Mammoth/ours)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-23 03:18:37 +08:00

243 lines
9.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Tests for doc-to-markdown convert.py post-processing functions.
Run: uv run pytest scripts/test_convert.py -v
"""
import pytest
import re
import sys
from pathlib import Path
# Import the module under test
sys.path.insert(0, str(Path(__file__).parent))
from convert import (
_fix_cjk_bold_spacing,
_build_pipe_table,
_collect_images,
PostProcessStats,
postprocess_docx_markdown,
)
# ── CJK Bold Spacing ─────────────────────────────────────────────────────────
class TestCjkBoldSpacing:
    """Unit tests for ``_fix_cjk_bold_spacing``.

    The expectations below pin the following contract: a ``**bold**`` span
    whose content contains CJK characters gets a single space on each side
    that touches other text; spans that are already spaced, spans at a line
    boundary, and spans with purely English content are left alone.
    """

    def test_bold_followed_by_cjk_punctuation(self):
        """Bold span directly followed by a full-width colon gets a space after ``**``."""
        # The fixture must really contain the CJK colon the test is named for;
        # without it the span would only ever be tested against the URL.
        inp = "**打开阶跃开放平台链接**:https://platform.stepfun.com/"
        out = _fix_cjk_bold_spacing(inp)
        assert "**打开阶跃开放平台链接** " in out

    def test_cjk_before_bold(self):
        """CJK characters touching both sides of the span → space on both sides."""
        assert _fix_cjk_bold_spacing("可用**手机号**进行") == "可用 **手机号** 进行"

    def test_bold_with_emoji_neighbor(self):
        """Bold spans separated by an emoji are still spaced (CJK content rule)."""
        inp = "点击**【接口密码】**➡️**【创建新的密钥**】"
        out = _fix_cjk_bold_spacing(inp)
        # Each CJK-containing bold span should have spaces on both sides.
        assert "点击 **【接口密码】** ➡️" in out
        assert "➡️ **【创建新的密钥**" in out

    def test_full_emoji_line(self):
        """Complete line: emoji separators plus a parenthesised hint after the last span."""
        # The parenthesised hint is required by the second assertion, which
        # looks for "** (" right after the final bold span.
        inp = (
            "点击**【接口密码】**➡️**【创建新的密钥**】➡️"
            "**【输入密钥名称】**(输入你想取的名称)生成API Key"
        )
        out = _fix_cjk_bold_spacing(inp)
        assert "点击 **【接口密码】** ➡️" in out
        assert "**【输入密钥名称】** (输入" in out

    def test_bold_between_cjk(self):
        """CJK text before and punctuation after the span → spaces on both sides."""
        assert _fix_cjk_bold_spacing("打开**飞书**,就可以") == "打开 **飞书** ,就可以"

    def test_bold_with_chinese_quotes(self):
        """Bold span whose CJK content is wrapped in double quotes."""
        inp = '有个**"企鹅戴龙虾头套的机器人"**,开始'
        out = _fix_cjk_bold_spacing(inp)
        assert '**"企鹅戴龙虾头套的机器人"** ' in out

    def test_multiple_bold_spans(self):
        """Several bold spans on one line are each spaced independently."""
        assert (
            _fix_cjk_bold_spacing("这是**测试**和**验证**的效果")
            == "这是 **测试** 和 **验证** 的效果"
        )

    def test_already_spaced(self):
        """Input that already has the spaces must not gain double spaces."""
        inp = "已有空格 **粗体** 不需要再加"
        assert _fix_cjk_bold_spacing(inp) == inp

    def test_english_unchanged(self):
        """English-only lines are returned unmodified."""
        inp = "English **bold** text should not change"
        assert _fix_cjk_bold_spacing(inp) == inp

    def test_line_start_bold(self):
        """Span at line start: only the trailing side receives a space."""
        assert _fix_cjk_bold_spacing("**重要**内容") == "**重要** 内容"

    def test_line_start_bold_standalone(self):
        """A line consisting solely of one bold span is left untouched."""
        assert _fix_cjk_bold_spacing("**这是纯粗体不需要改**") == "**这是纯粗体不需要改**"

    def test_no_bold(self):
        """Text without bold markers is returned unchanged."""
        inp = "这是普通文本,没有粗体"
        assert _fix_cjk_bold_spacing(inp) == inp

    def test_empty_string(self):
        """The empty string maps to the empty string."""
        assert _fix_cjk_bold_spacing("") == ""

    def test_bold_at_line_end(self):
        """Span at line end: a leading space is added, no trailing space."""
        assert _fix_cjk_bold_spacing("内容是**粗体**") == "内容是 **粗体**"

    def test_mixed_cjk_and_english_bold(self):
        """English bold content between CJK text is unchanged (no CJK inside the span)."""
        inp = "请使用 **API Key** 进行认证"
        assert _fix_cjk_bold_spacing(inp) == inp
# ── Pipe Table Builder ────────────────────────────────────────────────────────
class TestBuildPipeTable:
    """Checks that ``_build_pipe_table`` renders row lists as pipe-table lines."""

    def test_basic_table(self):
        """Two 2-cell rows yield an empty header, a separator and both data rows."""
        expected = [
            "| | |",
            "| --- | --- |",
            "| a | b |",
            "| c | d |",
        ]
        assert _build_pipe_table([["a", "b"], ["c", "d"]]) == expected

    def test_uneven_rows(self):
        """A short row is padded out to the widest row's column count."""
        table = _build_pipe_table([["a", "b", "c"], ["d"]])
        assert "| d | | |" in table

    def test_single_cell(self):
        """One cell still produces header + separator + one data row."""
        table = _build_pipe_table([["only"]])
        assert len(table) == 3  # header + sep + 1 row

    def test_empty_rows(self):
        """No rows in → empty list out."""
        table = _build_pipe_table([])
        assert table == []

    def test_image_with_caption(self):
        """An image row and its caption row both appear as pipe rows."""
        table = _build_pipe_table(
            [
                ["![](img1.png)", "![](img2.png)"],
                ["Step 1", "Step 2"],
            ]
        )
        for wanted in (
            "| ![](img1.png) | ![](img2.png) |",
            "| Step 1 | Step 2 |",
        ):
            assert wanted in table
# ── Full Post-Processing Pipeline ─────────────────────────────────────────────
class TestPostprocessPipeline:
    """Integration tests for the full ``postprocess_docx_markdown`` pipeline.

    Every test feeds a small pandoc-style markdown fragment through the
    pipeline, which returns an ``(output, stats)`` pair, and checks the
    cleaned-up output (or, in the last test, the stats counters).
    """

    def test_grid_table_single_column_to_blockquote(self):
        """Single-column grid table → blockquote."""
        inp = (
            "+:---+\n"
            "| 注意事项 |\n"
            "+----+"
        )
        out, _ = postprocess_docx_markdown(inp)
        assert "> 注意事项" in out
        assert "+:---+" not in out

    def test_pandoc_attributes_removed(self):
        """Pandoc ``{width=...}`` and ``{.underline}`` attributes are stripped."""
        inp = '![](img.png){width="5in" height="3in"} and [text]{.underline}'
        out, _ = postprocess_docx_markdown(inp)
        assert "{width=" not in out
        assert "{.underline}" not in out
        # The image reference itself must survive attribute removal.
        assert "![](img.png)" in out

    def test_escaped_brackets_fixed(self):
        r"""Pandoc \[ and \] → [ and ]."""
        # The fixture carries the leading "你 " that the expected output
        # contains; the pipeline only has to unescape the brackets.
        inp = r"你 \[在飞书里\] 发消息"
        out, _ = postprocess_docx_markdown(inp)
        assert "你 [在飞书里] 发消息" in out
        assert "\\[" not in out

    def test_double_bracket_links_fixed(self):
        """``[[text]](url)`` → ``[text](url)``."""
        inp = "[[点击跳转]](https://example.com)"
        out, _ = postprocess_docx_markdown(inp)
        assert "[点击跳转](https://example.com)" in out

    def test_code_block_with_language(self):
        """Dashed block with a JSON language hint → fenced ```json block."""
        # Content lines end with a literal backslash, as in the original
        # fixture. NOTE(review): only the first ruler line carries leading
        # whitespace here — confirm against real pandoc output.
        inp = (
            " ------------------------------------------------------------------\n"
            "JSON\\\n"
            "{\\\n"
            '"provider": "stepfun"\\\n'
            "}\n"
            "------------------------------------------------------------------"
        )
        out, _ = postprocess_docx_markdown(inp)
        assert "```json" in out
        assert '"provider": "stepfun"' in out
        # No remnant of the dashed ruler lines may be left behind.
        assert "---" not in out

    def test_code_block_plain_text_to_blockquote(self):
        """Dashed block with plain text → blockquote."""
        inp = (
            " --------------------------\n"
            "注意:这是一条重要提示\n"
            "--------------------------"
        )
        out, _ = postprocess_docx_markdown(inp)
        assert "> 注意:这是一条重要提示" in out

    def test_cjk_bold_spacing_in_pipeline(self):
        """CJK bold spacing is applied as part of the full pipeline."""
        inp = "打开**飞书**,就可以看到"
        out, _ = postprocess_docx_markdown(inp)
        assert "打开 **飞书** ,就可以看到" in out

    def test_excessive_blank_lines_collapsed(self):
        """A run of four blank lines is collapsed to fewer newlines."""
        inp = "line1\n\n\n\n\nline2"  # five newlines in a row
        out, _ = postprocess_docx_markdown(inp)
        assert out.count("\n") < 5

    def test_stats_tracking(self):
        """The returned stats object counts removed pandoc attributes."""
        inp = '![](media/media/img.png){width="5in"}'
        _, stats = postprocess_docx_markdown(inp)
        assert stats.attributes_removed > 0
# ── Simple Table (pandoc) ─────────────────────────────────────────────────────
class TestSimpleTable:
    """Pandoc simple tables (dash-ruled rows) must come out as pipe tables."""

    def test_two_column_image_table(self):
        """Two images between dashed rulers → one 2-column pipe row."""
        source = "\n".join(
            [
                " ---- ----",
                "![](img1.png) ![](img2.png)",
                "---- ----",
            ]
        )
        converted, _stats = postprocess_docx_markdown(source)
        assert "| ![](img1.png) | ![](img2.png) |" in converted
        # The dashed rulers must not leak into the output.
        assert "----" not in converted

    def test_four_column_image_table(self):
        """Four images between dashed rulers → one 4-column pipe row."""
        source = "\n".join(
            [
                " ---------- ---------- ---------- ----------",
                "![](a.png) ![](b.png) ![](c.png) ![](d.png)",
                "---------- ---------- ---------- ----------",
            ]
        )
        converted, _stats = postprocess_docx_markdown(source)
        assert "| ![](a.png) | ![](b.png) | ![](c.png) | ![](d.png) |" in converted