fix(pdf-creator): restore list spacing preprocessor for pandoc

- Add _ensure_list_spacing() to handle lists without blank lines before them
- Modify _md_to_html() to preprocess markdown content and pass it to pandoc via stdin
- Add automated test suite (scripts/tests/test_list_rendering.py)
- Fix: Lists without preceding blank lines now render correctly
- Original markdown files remain unmodified (preprocessing in memory only)

Root cause: Pandoc's Markdown reader requires a blank line before a list that follows a paragraph (unlike CommonMark, which allows a list to interrupt a paragraph). Without preprocessing, lists following paragraphs render as plain text.

Tested scenarios:
✅ Lists with blank lines (normal case)
✅ Lists without blank lines (critical fix)
✅ Ordered lists without blank lines
✅ Original file integrity preserved

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-11 13:58:39 +08:00
parent c49e23e7ef
commit 6dc2805f03
2 changed files with 199 additions and 3 deletions
--- a/pdf-creator/scripts/md_to_pdf.py
+++ b/pdf-creator/scripts/md_to_pdf.py
@@ -19,6 +19,7 @@ Requirements:
 import os
 import platform
 import re
 import shutil
 import subprocess
 import sys
@@ -146,15 +147,44 @@ blockquote {
 """
 def _ensure_list_spacing(text: str) -> str:
    """Ensure blank lines before list items for proper markdown parsing.
    Both Python markdown library and pandoc require a blank line before a list
    when it follows a paragraph. Without it, list items render as plain text.
    This preprocessor adds blank lines before list items when needed, without
    modifying the user's original markdown file.
    """
    lines = text.split('\n')
    result = []
    list_re = re.compile(r'^(\s*)([-*+]|\d+\.)\s')
    for i, line in enumerate(lines):
        if i > 0 and list_re.match(line):
            prev = lines[i - 1]
            if prev.strip() and not list_re.match(prev):
                result.append('')
        result.append(line)
    return '\n'.join(result)
 def _md_to_html(md_file: str) -> str:
-    """Convert markdown to HTML using pandoc."""
+    """Convert markdown to HTML using pandoc with list spacing preprocessing.
    Reads the markdown file, preprocesses it to ensure proper list spacing,
    then passes the content to pandoc via stdin. The original file is not modified.
    """
    if not shutil.which('pandoc'):
        print("Error: pandoc not found. Install with: brew install pandoc", file=sys.stderr)
        sys.exit(1)
    # Read and preprocess markdown to ensure list spacing
    md_content = Path(md_file).read_text(encoding='utf-8')
    md_content = _ensure_list_spacing(md_content)
    result = subprocess.run(
-        ['pandoc', md_file, '-f', 'markdown', '-t', 'html'],
+        ['pandoc', '-f', 'markdown', '-t', 'html'],
-        capture_output=True, text=True,
+        input=md_content, capture_output=True, text=True,
    )
    if result.returncode != 0:
        print(f"Error: pandoc failed: {result.stderr}", file=sys.stderr)
--- a/pdf-creator/scripts/tests/test_list_rendering.py
+++ b/pdf-creator/scripts/tests/test_list_rendering.py
@@ -0,0 +1,166 @@
 #!/usr/bin/env python3
 """
 Test list rendering in PDF generation.
 Verifies that markdown lists are correctly rendered in PDFs,
 even when they don't have blank lines before them.
 The original markdown files are NOT modified - preprocessing
 happens in memory during conversion.
 """
 import subprocess
 import sys
 import tempfile
 from pathlib import Path
 # Markdown fixture written to a temporary file by run_test(). It holds four
 # scenarios, each a paragraph followed by a list: per their headings,
 # scenarios 1 and 4 have a blank line before the list, scenarios 2 and 3 do
 # not. run_test() also compares the file on disk against this exact string to
 # prove the converter did not modify its input, so edit it with care.
 TEST_MARKDOWN = """# 测试列表解析
 ## 场景1：列表前有空行（正常）
 这是一段文字。
 - 列表项 1
 - 列表项 2
 - 列表项 3
 ## 场景2：列表前没有空行（关键测试）
 这是一段文字。
 - 列表项 1
 - 列表项 2
 - 列表项 3
 ## 场景3：有序列表前没有空行
 这是一段文字。
 1. 第一项
 2. 第二项
 3. 第三项
 ## 场景4：有序列表前有空行（正常）
 这是一段文字。
 1. 第一项
 2. 第二项
 3. 第三项
 """
 def run_test():
     """Render TEST_MARKDOWN to PDF and verify each scenario's list rendering.

     Writes the fixture to a temporary .md file, converts it with md_to_pdf.py
     (run through `uv run --with weasyprint`), extracts the PDF text with
     `pdftotext`, then checks that the source file is unchanged and that every
     scenario's list was rendered as a list. Progress and results are printed.
     The temporary .md/.pdf/.txt files are not deleted; their paths are
     printed when all checks pass.

     Each scenario is checked against its own section of the extracted text
     (from its heading up to the next scenario heading) so that one scenario
     rendering correctly cannot mask another that did not. Ordered items must
     start a line: a list that collapsed into its paragraph still contains
     the item text as running prose, so a plain substring check cannot
     detect that failure.

     Returns:
         True if conversion succeeded, the source file is unchanged and all
         four scenarios pass; False otherwise, including on any exception.
     """
     import re  # local import: this module's import block does not include re

     def section_of(text, number):
         """Return the text from scenario *number*'s heading to the next one."""
         start = text.find(f'场景{number}')
         if start == -1:
             return ""
         end = text.find(f'场景{number + 1}', start)
         return text[start:] if end == -1 else text[start:end]

     def has_bullets(section):
         """True if the section shows a rendered bullet, not a literal dash."""
         return '• 列表项 1' in section and '- 列表项 1' not in section

     def has_ordered_items(section):
         """True if items 1 and 2 each begin a line of the section."""
         return all(
             re.search(rf'^\s*{number}\.\s*{label}', section, re.MULTILINE)
             for number, label in ((1, '第一项'), (2, '第二项'))
         )

     # Create temporary files
     with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False, encoding='utf-8') as md_file:
         md_file.write(TEST_MARKDOWN)
         md_path = md_file.name
     # with_suffix() swaps only the final extension; str.replace() would also
     # rewrite a ".md" occurring earlier in the temporary directory path.
     pdf_path = str(Path(md_path).with_suffix('.pdf'))
     txt_path = str(Path(md_path).with_suffix('.txt'))
     try:
         # Generate PDF
         script_dir = Path(__file__).parent.parent
         md_to_pdf = script_dir / 'md_to_pdf.py'
         print(f"生成 PDF: {md_path} -> {pdf_path}")
         result = subprocess.run(
             ['uv', 'run', '--with', 'weasyprint', str(md_to_pdf), md_path, pdf_path],
             capture_output=True, text=True, cwd=script_dir.parent
         )
         if result.returncode != 0:
             print(f"❌ PDF 生成失败: {result.stderr}")
             return False
         print("✅ PDF 已生成")
         # Extract text from PDF
         result = subprocess.run(
             ['pdftotext', pdf_path, txt_path],
             capture_output=True, text=True
         )
         if result.returncode != 0:
             print(f"❌ 文本提取失败: {result.stderr}")
             return False
         # Read extracted text
         with open(txt_path, 'r', encoding='utf-8') as f:
             pdf_text = f.read()
         # Verify original file was not modified
         with open(md_path, 'r', encoding='utf-8') as f:
             original_content = f.read()
         if original_content != TEST_MARKDOWN:
             print("❌ 原始文件被修改了！")
             return False
         print("✅ 原始文件未被修改")
         # Verify list rendering, one scenario section at a time
         print("\n=== 列表渲染验证 ===")
         tests_passed = 0
         tests_total = 4
         # Test 1: List with blank line before it
         if has_bullets(section_of(pdf_text, 1)):
             print("✅ 场景1: 列表前有空行 - 正确渲染")
             tests_passed += 1
         else:
             print("❌ 场景1: 列表前有空行 - 渲染失败")
         # Test 2: Critical test - list without blank line before it
         scene2_section = section_of(pdf_text, 2)
         if has_bullets(scene2_section):
             print("✅ 场景2: 列表前没有空行 - 正确渲染（关键测试）")
             tests_passed += 1
         else:
             print("❌ 场景2: 列表前没有空行 - 渲染失败")
             print(f"   实际内容: {scene2_section}")
         # Test 3: Ordered list without blank line
         if has_ordered_items(section_of(pdf_text, 3)):
             print("✅ 场景3: 有序列表前没有空行 - 正确渲染")
             tests_passed += 1
         else:
             print("❌ 场景3: 有序列表前没有空行 - 渲染失败")
         # Test 4: Ordered list with blank line
         if has_ordered_items(section_of(pdf_text, 4)):
             print("✅ 场景4: 有序列表前有空行 - 正确渲染")
             tests_passed += 1
         else:
             print("❌ 场景4: 有序列表前有空行 - 渲染失败")
         print(f"\n=== 测试结果: {tests_passed}/{tests_total} 通过 ===")
         if tests_passed != tests_total:
             print(f"\n❌ {tests_total - tests_passed} 个测试失败")
             return False
         print("\n✅ 所有测试通过！")
         print("\n生成的文件:")
         print(f"  Markdown: {md_path}")
         print(f"  PDF:      {pdf_path}")
         print(f"  Text:     {txt_path}")
         return True
     except Exception as e:
         # Top-level boundary of the test script: report anything unexpected
         # (e.g. a missing `uv` or `pdftotext` binary) as a failed run.
         print(f"❌ 测试失败: {e}")
         import traceback
         traceback.print_exc()
         return False
 if __name__ == '__main__':
     # Exit status mirrors the test outcome: 0 when run_test() succeeds,
     # 1 otherwise.
     sys.exit(0 if run_test() else 1)