feat(doc-to-markdown): CJK bold spacing, JSON pretty-print, 31 tests, full rename cleanup
- Add CJK bold spacing fix: insert spaces around **bold** spans containing CJK characters for correct rendering (handles emoji adjacency and already-spaced text)
- Add JSON pretty-print: auto-format JSON code blocks with 2-space indent
- Add 31 unit tests covering all post-processing functions
- Fix pandoc simple table detection (1-space column gaps)
- Fix image path double-nesting when --assets-dir ends with 'media'
- Rename all markdown-tools references across 15 files (README, QUICKSTART, marketplace.json, CLAUDE.md, meeting-minutes-taker, GitHub templates)
- Add 5-tool benchmark report (Docling/MarkItDown/Pandoc/Mammoth/ours)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -26,6 +26,7 @@ Dependencies:
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
@@ -478,10 +479,19 @@ def _fix_code_blocks(text: str, stats: PostProcessStats) -> str:
|
||||
|
||||
# Decide: code block vs blockquote
|
||||
if has_lang_hint or _is_code_content(cleaned):
|
||||
# Code block
|
||||
# Code block — try to pretty-print JSON
|
||||
code_lines = cleaned
|
||||
if lang_hint == "json":
|
||||
try:
|
||||
raw = "\n".join(cleaned)
|
||||
parsed = json.loads(raw)
|
||||
code_lines = json.dumps(parsed, indent=2, ensure_ascii=False).split("\n")
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
pass # Keep original if not valid JSON
|
||||
|
||||
result.append("")
|
||||
result.append(f"```{lang_hint}")
|
||||
result.extend(cleaned)
|
||||
result.extend(code_lines)
|
||||
result.append("```")
|
||||
result.append("")
|
||||
else:
|
||||
@@ -529,29 +539,40 @@ def _fix_double_bracket_links(text: str, stats: PostProcessStats) -> str:
|
||||
|
||||
|
||||
def _fix_cjk_bold_spacing(text: str) -> str:
|
||||
"""Add space between **bold** markers and adjacent CJK characters.
|
||||
"""Add space around **bold** spans that contain CJK characters.
|
||||
|
||||
DOCX uses run-level styling for bold — no spaces between runs in CJK text.
|
||||
Markdown renderers need whitespace around ** to recognize bold boundaries.
|
||||
We find each **content** span, check the character before/after, and insert
|
||||
a space only when the adjacent character is CJK (avoiding double spaces).
|
||||
|
||||
Rule: if a **content** span contains any CJK character, ensure both sides
|
||||
have a space (unless already spaced or at line boundary). This handles:
|
||||
- CJK directly touching **: 打开**飞书** → 打开 **飞书**
|
||||
- Emoji touching **: **密码】**➡️ → **密码】** ➡️
|
||||
- Already spaced: 已有 **粗体** → unchanged
|
||||
- English bold: English **bold** text → unchanged
|
||||
"""
|
||||
result = []
|
||||
last_end = 0
|
||||
|
||||
for m in _RE_BOLD_PAIR.finditer(text):
|
||||
start, end = m.start(), m.end()
|
||||
content = m.group(1)
|
||||
|
||||
result.append(text[last_end:start])
|
||||
|
||||
# Space before opening ** if preceded by CJK
|
||||
if start > 0 and _RE_CJK_PUNCT.match(text[start - 1]):
|
||||
result.append(' ')
|
||||
# Only add spaces for bold spans containing CJK
|
||||
if _RE_CJK_PUNCT.search(content):
|
||||
# Space before ** if previous char is not whitespace
|
||||
if start > 0 and text[start - 1] not in (' ', '\t', '\n'):
|
||||
result.append(' ')
|
||||
|
||||
result.append(m.group(0))
|
||||
result.append(m.group(0))
|
||||
|
||||
# Space after closing ** if followed by CJK
|
||||
if end < len(text) and _RE_CJK_PUNCT.match(text[end]):
|
||||
result.append(' ')
|
||||
# Space after ** if next char is not whitespace
|
||||
if end < len(text) and text[end] not in (' ', '\t', '\n'):
|
||||
result.append(' ')
|
||||
else:
|
||||
result.append(m.group(0))
|
||||
|
||||
last_end = end
|
||||
|
||||
|
||||
242
doc-to-markdown/scripts/test_convert.py
Normal file
242
doc-to-markdown/scripts/test_convert.py
Normal file
@@ -0,0 +1,242 @@
|
||||
"""Tests for doc-to-markdown convert.py post-processing functions.
|
||||
|
||||
Run: uv run pytest scripts/test_convert.py -v
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Import the module under test
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
from convert import (
|
||||
_fix_cjk_bold_spacing,
|
||||
_build_pipe_table,
|
||||
_collect_images,
|
||||
PostProcessStats,
|
||||
postprocess_docx_markdown,
|
||||
)
|
||||
|
||||
|
||||
# ── CJK Bold Spacing ─────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestCjkBoldSpacing:
    """Checks for _fix_cjk_bold_spacing: whitespace around **bold** spans holding CJK text."""

    def test_bold_followed_by_cjk_punctuation(self):
        """A colon glued to the closing ** ends up separated from it by a space."""
        raw = "**打开阶跃开放平台链接**:https://platform.stepfun.com/"
        spaced = _fix_cjk_bold_spacing(raw)
        assert "**打开阶跃开放平台链接** :" in spaced

    def test_cjk_before_bold(self):
        """CJK text touching both sides of the span gets a space on each side."""
        expected = "可用 **手机号** 进行"
        actual = _fix_cjk_bold_spacing("可用**手机号**进行")
        assert actual == expected

    def test_bold_with_emoji_neighbor(self):
        """An emoji touching a CJK bold span is spaced away from it as well."""
        raw = "点击**【接口密码】**➡️**【创建新的密钥**】"
        spaced = _fix_cjk_bold_spacing(raw)
        # Both bold spans hold CJK text, so each must be spaced off from the emoji.
        assert "点击 **【接口密码】** ➡️" in spaced
        assert "➡️ **【创建新的密钥**" in spaced

    def test_full_emoji_line(self):
        """A whole line of bold spans chained by emoji separators."""
        raw = "点击**【接口密码】**➡️**【创建新的密钥**】➡️**【输入密钥名称】**(输入你想取的名称),生成API Key"
        spaced = _fix_cjk_bold_spacing(raw)
        assert "点击 **【接口密码】** ➡️" in spaced
        assert "**【输入密钥名称】** (输入" in spaced

    def test_bold_between_cjk(self):
        """Bold span wedged between CJK text and punctuation is spaced on both sides."""
        expected = "打开 **飞书** ,就可以"
        actual = _fix_cjk_bold_spacing("打开**飞书**,就可以")
        assert actual == expected

    def test_bold_with_chinese_quotes(self):
        """Bold span whose content is wrapped in quotation marks."""
        raw = '有个**"企鹅戴龙虾头套的机器人"**,开始'
        spaced = _fix_cjk_bold_spacing(raw)
        assert '**"企鹅戴龙虾头套的机器人"** ,' in spaced

    def test_multiple_bold_spans(self):
        """Several bold spans on the same line are each spaced independently."""
        expected = "这是 **测试** 和 **验证** 的效果"
        actual = _fix_cjk_bold_spacing("这是**测试**和**验证**的效果")
        assert actual == expected

    def test_already_spaced(self):
        """Input that already has the spaces must come back without doubled spaces."""
        raw = "已有空格 **粗体** 不需要再加"
        spaced = _fix_cjk_bold_spacing(raw)
        assert spaced == raw

    def test_english_unchanged(self):
        """Bold spans in English text are left exactly as they were."""
        raw = "English **bold** text should not change"
        spaced = _fix_cjk_bold_spacing(raw)
        assert spaced == raw

    def test_line_start_bold(self):
        """Bold span opening the line: only the trailing side gets a space."""
        expected = "**重要** 内容"
        actual = _fix_cjk_bold_spacing("**重要**内容")
        assert actual == expected

    def test_line_start_bold_standalone(self):
        """A bold span that is the entire input has no neighbours and stays as is."""
        raw = "**这是纯粗体不需要改**"
        expected = "**这是纯粗体不需要改**"
        assert _fix_cjk_bold_spacing(raw) == expected

    def test_no_bold(self):
        """Text with no bold markers passes through untouched."""
        raw = "这是普通文本,没有粗体"
        spaced = _fix_cjk_bold_spacing(raw)
        assert spaced == raw

    def test_empty_string(self):
        """The empty string maps to the empty string."""
        spaced = _fix_cjk_bold_spacing("")
        assert spaced == ""

    def test_bold_at_line_end(self):
        """Bold span closing the line: a leading space is added, no trailing one."""
        expected = "内容是 **粗体**"
        actual = _fix_cjk_bold_spacing("内容是**粗体**")
        assert actual == expected

    def test_mixed_cjk_and_english_bold(self):
        """An already-spaced English bold span inside CJK text is returned unchanged."""
        raw = "请使用 **API Key** 进行认证"
        spaced = _fix_cjk_bold_spacing(raw)
        assert spaced == raw
|
||||
|
||||
|
||||
# ── Pipe Table Builder ────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestBuildPipeTable:
|
||||
"""Test _build_pipe_table: rows → markdown pipe table."""
|
||||
|
||||
def test_basic_table(self):
|
||||
rows = [["a", "b"], ["c", "d"]]
|
||||
result = _build_pipe_table(rows)
|
||||
assert result == [
|
||||
"| | |",
|
||||
"| --- | --- |",
|
||||
"| a | b |",
|
||||
"| c | d |",
|
||||
]
|
||||
|
||||
def test_uneven_rows(self):
|
||||
"""Rows with different column counts → padded."""
|
||||
rows = [["a", "b", "c"], ["d"]]
|
||||
result = _build_pipe_table(rows)
|
||||
assert "| d | | |" in result
|
||||
|
||||
def test_single_cell(self):
|
||||
rows = [["only"]]
|
||||
result = _build_pipe_table(rows)
|
||||
assert len(result) == 3 # header + sep + 1 row
|
||||
|
||||
def test_empty_rows(self):
|
||||
assert _build_pipe_table([]) == []
|
||||
|
||||
def test_image_with_caption(self):
|
||||
"""Images and captions should pair correctly in table."""
|
||||
rows = [
|
||||
["", ""],
|
||||
["Step 1", "Step 2"],
|
||||
]
|
||||
result = _build_pipe_table(rows)
|
||||
assert "|  |  |" in result
|
||||
assert "| Step 1 | Step 2 |" in result
|
||||
|
||||
|
||||
# ── Full Post-Processing Pipeline ─────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestPostprocessPipeline:
|
||||
"""Integration tests for the full postprocess_docx_markdown pipeline."""
|
||||
|
||||
def test_grid_table_single_column_to_blockquote(self):
|
||||
"""Single-column grid table → blockquote."""
|
||||
inp = """+:---+
|
||||
| 注意事项 |
|
||||
+----+"""
|
||||
out, stats = postprocess_docx_markdown(inp)
|
||||
assert "> 注意事项" in out
|
||||
assert "+:---+" not in out
|
||||
|
||||
def test_pandoc_attributes_removed(self):
|
||||
"""Pandoc {width=...} and {.underline} removed."""
|
||||
inp = '{width="5in" height="3in"} and [text]{.underline}'
|
||||
out, stats = postprocess_docx_markdown(inp)
|
||||
assert "{width=" not in out
|
||||
assert "{.underline}" not in out
|
||||
assert "" in out
|
||||
|
||||
def test_escaped_brackets_fixed(self):
|
||||
r"""Pandoc \[ and \] → [ and ]."""
|
||||
inp = r"你 \[在飞书里\] 发消息"
|
||||
out, stats = postprocess_docx_markdown(inp)
|
||||
assert "你 [在飞书里] 发消息" in out
|
||||
|
||||
def test_double_bracket_links_fixed(self):
|
||||
"""[[text]](url) → [text](url)."""
|
||||
inp = "[[点击跳转]](https://example.com)"
|
||||
out, stats = postprocess_docx_markdown(inp)
|
||||
assert "[点击跳转](https://example.com)" in out
|
||||
|
||||
def test_code_block_with_language(self):
|
||||
"""Indented dashed block with JSON language hint → ```json."""
|
||||
inp = """ ------------------------------------------------------------------
|
||||
JSON\\
|
||||
{\\
|
||||
"provider": "stepfun"\\
|
||||
}
|
||||
------------------------------------------------------------------"""
|
||||
out, stats = postprocess_docx_markdown(inp)
|
||||
assert "```json" in out
|
||||
assert '"provider": "stepfun"' in out
|
||||
assert "---" not in out
|
||||
|
||||
def test_code_block_plain_text_to_blockquote(self):
|
||||
"""Indented dashed block with plain text → blockquote."""
|
||||
inp = """ --------------------------
|
||||
注意:这是一条重要提示
|
||||
--------------------------"""
|
||||
out, stats = postprocess_docx_markdown(inp)
|
||||
assert "> 注意:这是一条重要提示" in out
|
||||
|
||||
def test_cjk_bold_spacing_in_pipeline(self):
|
||||
"""CJK bold spacing is applied in the full pipeline."""
|
||||
inp = "打开**飞书**,就可以看到"
|
||||
out, stats = postprocess_docx_markdown(inp)
|
||||
assert "打开 **飞书** ,就可以看到" in out
|
||||
|
||||
def test_excessive_blank_lines_collapsed(self):
|
||||
"""4+ blank lines → 2 blank lines."""
|
||||
inp = "line1\n\n\n\n\nline2"
|
||||
out, stats = postprocess_docx_markdown(inp)
|
||||
assert out.count("\n") < 5
|
||||
|
||||
def test_stats_tracking(self):
|
||||
"""Stats object correctly tracks fix counts."""
|
||||
inp = '{width="5in"}'
|
||||
out, stats = postprocess_docx_markdown(inp)
|
||||
assert stats.attributes_removed > 0
|
||||
|
||||
|
||||
# ── Simple Table (pandoc) ─────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestSimpleTable:
|
||||
"""Test pandoc simple table (indented dashes with spaces) → pipe table."""
|
||||
|
||||
def test_two_column_image_table(self):
|
||||
"""Two images side by side in simple table → pipe table."""
|
||||
inp = """ ---- ----
|
||||
 
|
||||
|
||||
---- ----"""
|
||||
out, stats = postprocess_docx_markdown(inp)
|
||||
assert "|  |  |" in out
|
||||
assert "----" not in out
|
||||
|
||||
def test_four_column_image_table(self):
|
||||
"""Four images in simple table → 4-column pipe table."""
|
||||
inp = """ ---------- ---------- ---------- ----------
|
||||
   
|
||||
|
||||
---------- ---------- ---------- ----------"""
|
||||
out, stats = postprocess_docx_markdown(inp)
|
||||
assert "|  |  |  |  |" in out
|
||||
Reference in New Issue
Block a user