- Add CJK bold spacing fix: insert spaces around **bold** spans containing CJK characters for correct rendering (handles emoji adjacency, already-spaced) - Add JSON pretty-print: auto-format JSON code blocks with 2-space indent - Add 31 unit tests covering all post-processing functions - Fix pandoc simple table detection (1-space column gaps) - Fix image path double-nesting when --assets-dir ends with 'media' - Rename all markdown-tools references across 15 files (README, QUICKSTART, marketplace.json, CLAUDE.md, meeting-minutes-taker, GitHub templates) - Add 5-tool benchmark report (Docling/MarkItDown/Pandoc/Mammoth/ours) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
243 lines
9.4 KiB
Python
243 lines
9.4 KiB
Python
"""Tests for doc-to-markdown convert.py post-processing functions.
|
||
|
||
Run: uv run pytest scripts/test_convert.py -v
|
||
"""
|
||
|
||
import pytest
|
||
import re
|
||
import sys
|
||
from pathlib import Path
|
||
|
||
# Import the module under test
|
||
sys.path.insert(0, str(Path(__file__).parent))
|
||
from convert import (
|
||
_fix_cjk_bold_spacing,
|
||
_build_pipe_table,
|
||
_collect_images,
|
||
PostProcessStats,
|
||
postprocess_docx_markdown,
|
||
)
|
||
|
||
|
||
# ── CJK Bold Spacing ─────────────────────────────────────────────────────────
|
||
|
||
|
||
class TestCjkBoldSpacing:
    """Unit tests for ``_fix_cjk_bold_spacing``.

    Every case passes a single line of Markdown to the helper and checks
    where spaces are, or are not, inserted around ``**bold**`` spans.
    """

    def test_bold_followed_by_cjk_punctuation(self):
        """A full-width colon touching the closing ``**`` is pushed off by a space."""
        source = "**打开阶跃开放平台链接**:https://platform.stepfun.com/"
        fixed = _fix_cjk_bold_spacing(source)
        assert "**打开阶跃开放平台链接** :" in fixed

    def test_cjk_before_bold(self):
        """CJK text glued to both ends of a bold span gets a space on each side."""
        fixed = _fix_cjk_bold_spacing("可用**手机号**进行")
        assert fixed == "可用 **手机号** 进行"

    def test_bold_with_emoji_neighbor(self):
        """An emoji arrow next to a CJK bold span is separated just like CJK text."""
        source = "点击**【接口密码】**➡️**【创建新的密钥**】"
        fixed = _fix_cjk_bold_spacing(source)
        # Both bold spans contain CJK, so each is expected to be spaced
        # away from the arrow that sits between them.
        assert "点击 **【接口密码】** ➡️" in fixed
        assert "➡️ **【创建新的密钥**" in fixed

    def test_full_emoji_line(self):
        """A whole line of bold spans chained by emoji arrows is spaced correctly."""
        source = "点击**【接口密码】**➡️**【创建新的密钥**】➡️**【输入密钥名称】**(输入你想取的名称),生成API Key"
        fixed = _fix_cjk_bold_spacing(source)
        assert "点击 **【接口密码】** ➡️" in fixed
        assert "**【输入密钥名称】** (输入" in fixed

    def test_bold_between_cjk(self):
        """Bold span between CJK text and CJK punctuation is spaced on both sides."""
        fixed = _fix_cjk_bold_spacing("打开**飞书**,就可以")
        assert fixed == "打开 **飞书** ,就可以"

    def test_bold_with_chinese_quotes(self):
        """A bold span whose content is wrapped in quote marks is still spaced."""
        source = '有个**"企鹅戴龙虾头套的机器人"**,开始'
        fixed = _fix_cjk_bold_spacing(source)
        assert '**"企鹅戴龙虾头套的机器人"** ,' in fixed

    def test_multiple_bold_spans(self):
        """Several bold spans on one line are each handled independently."""
        fixed = _fix_cjk_bold_spacing("这是**测试**和**验证**的效果")
        assert fixed == "这是 **测试** 和 **验证** 的效果"

    def test_already_spaced(self):
        """Input that already has the spaces comes back untouched (no doubling)."""
        source = "已有空格 **粗体** 不需要再加"
        fixed = _fix_cjk_bold_spacing(source)
        assert fixed == source

    def test_english_unchanged(self):
        """A purely English line with bold text is returned as-is."""
        source = "English **bold** text should not change"
        fixed = _fix_cjk_bold_spacing(source)
        assert fixed == source

    def test_line_start_bold(self):
        """Bold at the start of the line only needs a space on its trailing side."""
        fixed = _fix_cjk_bold_spacing("**重要**内容")
        assert fixed == "**重要** 内容"

    def test_line_start_bold_standalone(self):
        """A line that is nothing but one bold span is left alone."""
        source = "**这是纯粗体不需要改**"
        fixed = _fix_cjk_bold_spacing(source)
        assert fixed == "**这是纯粗体不需要改**"

    def test_no_bold(self):
        """Lines without any ``**`` markers pass through unchanged."""
        source = "这是普通文本,没有粗体"
        fixed = _fix_cjk_bold_spacing(source)
        assert fixed == source

    def test_empty_string(self):
        """The empty string maps to the empty string."""
        fixed = _fix_cjk_bold_spacing("")
        assert fixed == ""

    def test_bold_at_line_end(self):
        """Bold at the end of the line gets a leading space but no trailing one."""
        fixed = _fix_cjk_bold_spacing("内容是**粗体**")
        assert fixed == "内容是 **粗体**"

    def test_mixed_cjk_and_english_bold(self):
        """An already-spaced English-only bold span between CJK text is unchanged."""
        source = "请使用 **API Key** 进行认证"
        fixed = _fix_cjk_bold_spacing(source)
        assert fixed == source
|
||
|
||
|
||
# ── Pipe Table Builder ────────────────────────────────────────────────────────
|
||
|
||
|
||
class TestBuildPipeTable:
    """Unit tests for ``_build_pipe_table`` (list of rows → pipe-table lines)."""

    def test_basic_table(self):
        """A 2x2 grid gives an empty header row, a separator, then the data rows."""
        table = _build_pipe_table([["a", "b"], ["c", "d"]])
        expected = [
            "| | |",
            "| --- | --- |",
            "| a | b |",
            "| c | d |",
        ]
        assert table == expected

    def test_uneven_rows(self):
        """A short row is padded with empty cells up to the widest row."""
        table = _build_pipe_table([["a", "b", "c"], ["d"]])
        assert "| d | | |" in table

    def test_single_cell(self):
        """A 1x1 grid still produces header, separator and one data line."""
        table = _build_pipe_table([["only"]])
        line_count = len(table)
        assert line_count == 3  # header + sep + 1 row

    def test_empty_rows(self):
        """No rows in → no lines out."""
        table = _build_pipe_table([])
        assert table == []

    def test_image_with_caption(self):
        """Image cells and the caption cells beneath them end up in matching columns."""
        # NOTE(review): the first row's cells are empty strings here although
        # the docstring talks about images, and the empty-cell rendering
        # expected below ("|  |  |") differs from test_basic_table and
        # test_uneven_rows ("| | |"). Confirm these literals against the
        # actual output of _build_pipe_table.
        grid = [
            ["", ""],
            ["Step 1", "Step 2"],
        ]
        table = _build_pipe_table(grid)
        assert "|  |  |" in table
        assert "| Step 1 | Step 2 |" in table
|
||
|
||
|
||
# ── Full Post-Processing Pipeline ─────────────────────────────────────────────
|
||
|
||
|
||
class TestPostprocessPipeline:
    """Integration tests for the full ``postprocess_docx_markdown`` pipeline.

    ``postprocess_docx_markdown`` is called with a Markdown string and its
    result is unpacked as ``(text, stats)``. Tests that do not inspect one of
    the two values bind it to ``_``.
    """

    def test_grid_table_single_column_to_blockquote(self):
        """Single-column grid table → blockquote."""
        grid = "+:---+\n| 注意事项 |\n+----+"
        out, _ = postprocess_docx_markdown(grid)
        assert "> 注意事项" in out
        assert "+:---+" not in out

    def test_pandoc_attributes_removed(self):
        """Pandoc {width=...} and {.underline} removed."""
        inp = '{width="5in" height="3in"} and [text]{.underline}'
        out, _ = postprocess_docx_markdown(inp)
        assert "{width=" not in out
        assert "{.underline}" not in out
        # The previous check here was `"" in out`, which holds for every
        # string and so verified nothing. Check that the span's own text
        # survives attribute stripping instead.
        # NOTE(review): the input starts directly with `{width=...}`; if an
        # image reference was meant to precede it, restore it and assert on
        # that reference as well.
        assert "text" in out

    def test_escaped_brackets_fixed(self):
        r"""Pandoc \[ and \] → [ and ]."""
        inp = r"你 \[在飞书里\] 发消息"
        out, _ = postprocess_docx_markdown(inp)
        assert "你 [在飞书里] 发消息" in out

    def test_double_bracket_links_fixed(self):
        """[[text]](url) → [text](url)."""
        inp = "[[点击跳转]](https://example.com)"
        out, _ = postprocess_docx_markdown(inp)
        assert "[点击跳转](https://example.com)" in out

    def test_code_block_with_language(self):
        """Indented dashed block with JSON language hint → ```json."""
        # NOTE(review): only the first line of this literal carries leading
        # whitespace although the docstring describes an indented block;
        # confirm the indentation of the remaining lines.
        inp = """ ------------------------------------------------------------------
JSON\\
{\\
"provider": "stepfun"\\
}
------------------------------------------------------------------"""
        out, _ = postprocess_docx_markdown(inp)
        assert "```json" in out
        assert '"provider": "stepfun"' in out
        assert "---" not in out

    def test_code_block_plain_text_to_blockquote(self):
        """Indented dashed block with plain text → blockquote."""
        # NOTE(review): same caveat as test_code_block_with_language about
        # the indentation inside the literal.
        inp = """ --------------------------
注意:这是一条重要提示
--------------------------"""
        out, _ = postprocess_docx_markdown(inp)
        assert "> 注意:这是一条重要提示" in out

    def test_cjk_bold_spacing_in_pipeline(self):
        """CJK bold spacing is applied in the full pipeline."""
        inp = "打开**飞书**,就可以看到"
        out, _ = postprocess_docx_markdown(inp)
        assert "打开 **飞书** ,就可以看到" in out

    def test_excessive_blank_lines_collapsed(self):
        """A run of five newlines between two lines comes back shorter."""
        inp = "line1\n\n\n\n\nline2"
        out, _ = postprocess_docx_markdown(inp)
        # The input holds exactly five "\n"; any collapsing lowers the count.
        assert out.count("\n") < 5

    def test_stats_tracking(self):
        """Stats object reports at least one removed attribute."""
        inp = '{width="5in"}'
        _, stats = postprocess_docx_markdown(inp)
        assert stats.attributes_removed > 0
|
||
|
||
|
||
# ── Simple Table (pandoc) ─────────────────────────────────────────────────────
|
||
|
||
|
||
class TestSimpleTable:
|
||
"""Test pandoc simple table (indented dashes with spaces) → pipe table."""
|
||
|
||
def test_two_column_image_table(self):
|
||
"""Two images side by side in simple table → pipe table."""
|
||
inp = """ ---- ----
|
||
 
|
||
|
||
---- ----"""
|
||
out, stats = postprocess_docx_markdown(inp)
|
||
assert "|  |  |" in out
|
||
assert "----" not in out
|
||
|
||
def test_four_column_image_table(self):
|
||
"""Four images in simple table → 4-column pipe table."""
|
||
inp = """ ---------- ---------- ---------- ----------
|
||
   
|
||
|
||
---------- ---------- ---------- ----------"""
|
||
out, stats = postprocess_docx_markdown(inp)
|
||
assert "|  |  |  |  |" in out
|