fix(pdf-creator): restore list spacing preprocessor for pandoc
- Add _ensure_list_spacing() to handle lists without blank lines before them - Modify _md_to_html() to preprocess markdown content via stdin - Add automated test suite (scripts/tests/test_list_rendering.py) - Fix: Lists without preceding blank lines now render correctly - Original markdown files remain unmodified (preprocessing in memory only) Root cause: Pandoc requires blank lines before lists per CommonMark spec. Without preprocessing, lists following paragraphs render as plain text. Tested scenarios: ✅ Lists with blank lines (normal case) ✅ Lists without blank lines (critical fix) ✅ Ordered lists without blank lines ✅ Original file integrity preserved Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -19,6 +19,7 @@ Requirements:
|
|||||||
|
|
||||||
import os
|
import os
|
||||||
import platform
|
import platform
|
||||||
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
@@ -146,15 +147,44 @@ blockquote {
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def _ensure_list_spacing(text: str) -> str:
|
||||||
|
"""Ensure blank lines before list items for proper markdown parsing.
|
||||||
|
|
||||||
|
Both Python markdown library and pandoc require a blank line before a list
|
||||||
|
when it follows a paragraph. Without it, list items render as plain text.
|
||||||
|
|
||||||
|
This preprocessor adds blank lines before list items when needed, without
|
||||||
|
modifying the user's original markdown file.
|
||||||
|
"""
|
||||||
|
lines = text.split('\n')
|
||||||
|
result = []
|
||||||
|
list_re = re.compile(r'^(\s*)([-*+]|\d+\.)\s')
|
||||||
|
for i, line in enumerate(lines):
|
||||||
|
if i > 0 and list_re.match(line):
|
||||||
|
prev = lines[i - 1]
|
||||||
|
if prev.strip() and not list_re.match(prev):
|
||||||
|
result.append('')
|
||||||
|
result.append(line)
|
||||||
|
return '\n'.join(result)
|
||||||
|
|
||||||
|
|
||||||
def _md_to_html(md_file: str) -> str:
|
def _md_to_html(md_file: str) -> str:
|
||||||
"""Convert markdown to HTML using pandoc."""
|
"""Convert markdown to HTML using pandoc with list spacing preprocessing.
|
||||||
|
|
||||||
|
Reads the markdown file, preprocesses it to ensure proper list spacing,
|
||||||
|
then passes the content to pandoc via stdin. The original file is not modified.
|
||||||
|
"""
|
||||||
if not shutil.which('pandoc'):
|
if not shutil.which('pandoc'):
|
||||||
print("Error: pandoc not found. Install with: brew install pandoc", file=sys.stderr)
|
print("Error: pandoc not found. Install with: brew install pandoc", file=sys.stderr)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
|
# Read and preprocess markdown to ensure list spacing
|
||||||
|
md_content = Path(md_file).read_text(encoding='utf-8')
|
||||||
|
md_content = _ensure_list_spacing(md_content)
|
||||||
|
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
['pandoc', md_file, '-f', 'markdown', '-t', 'html'],
|
['pandoc', '-f', 'markdown', '-t', 'html'],
|
||||||
capture_output=True, text=True,
|
input=md_content, capture_output=True, text=True,
|
||||||
)
|
)
|
||||||
if result.returncode != 0:
|
if result.returncode != 0:
|
||||||
print(f"Error: pandoc failed: {result.stderr}", file=sys.stderr)
|
print(f"Error: pandoc failed: {result.stderr}", file=sys.stderr)
|
||||||
|
|||||||
166
pdf-creator/scripts/tests/test_list_rendering.py
Executable file
166
pdf-creator/scripts/tests/test_list_rendering.py
Executable file
@@ -0,0 +1,166 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Test list rendering in PDF generation.
|
||||||
|
|
||||||
|
Verifies that markdown lists are correctly rendered in PDFs,
|
||||||
|
even when they don't have blank lines before them.
|
||||||
|
|
||||||
|
The original markdown files are NOT modified - preprocessing
|
||||||
|
happens in memory during conversion.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Test markdown content with various list scenarios
|
||||||
|
TEST_MARKDOWN = """# 测试列表解析
|
||||||
|
|
||||||
|
## 场景1:列表前有空行(正常)
|
||||||
|
|
||||||
|
这是一段文字。
|
||||||
|
|
||||||
|
- 列表项 1
|
||||||
|
- 列表项 2
|
||||||
|
- 列表项 3
|
||||||
|
|
||||||
|
## 场景2:列表前没有空行(关键测试)
|
||||||
|
|
||||||
|
这是一段文字。
|
||||||
|
- 列表项 1
|
||||||
|
- 列表项 2
|
||||||
|
- 列表项 3
|
||||||
|
|
||||||
|
## 场景3:有序列表前没有空行
|
||||||
|
|
||||||
|
这是一段文字。
|
||||||
|
1. 第一项
|
||||||
|
2. 第二项
|
||||||
|
3. 第三项
|
||||||
|
|
||||||
|
## 场景4:有序列表前有空行(正常)
|
||||||
|
|
||||||
|
这是一段文字。
|
||||||
|
|
||||||
|
1. 第一项
|
||||||
|
2. 第二项
|
||||||
|
3. 第三项
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def run_test():
|
||||||
|
"""Run the list rendering test."""
|
||||||
|
# Create temporary files
|
||||||
|
with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False, encoding='utf-8') as md_file:
|
||||||
|
md_file.write(TEST_MARKDOWN)
|
||||||
|
md_path = md_file.name
|
||||||
|
|
||||||
|
pdf_path = md_path.replace('.md', '.pdf')
|
||||||
|
txt_path = md_path.replace('.md', '.txt')
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Generate PDF
|
||||||
|
script_dir = Path(__file__).parent.parent
|
||||||
|
md_to_pdf = script_dir / 'md_to_pdf.py'
|
||||||
|
|
||||||
|
print(f"生成 PDF: {md_path} -> {pdf_path}")
|
||||||
|
result = subprocess.run(
|
||||||
|
['uv', 'run', '--with', 'weasyprint', str(md_to_pdf), md_path, pdf_path],
|
||||||
|
capture_output=True, text=True, cwd=script_dir.parent
|
||||||
|
)
|
||||||
|
|
||||||
|
if result.returncode != 0:
|
||||||
|
print(f"❌ PDF 生成失败: {result.stderr}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
print(f"✅ PDF 已生成")
|
||||||
|
|
||||||
|
# Extract text from PDF
|
||||||
|
result = subprocess.run(
|
||||||
|
['pdftotext', pdf_path, txt_path],
|
||||||
|
capture_output=True, text=True
|
||||||
|
)
|
||||||
|
|
||||||
|
if result.returncode != 0:
|
||||||
|
print(f"❌ 文本提取失败: {result.stderr}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Read extracted text
|
||||||
|
with open(txt_path, 'r', encoding='utf-8') as f:
|
||||||
|
pdf_text = f.read()
|
||||||
|
|
||||||
|
# Verify original file was not modified
|
||||||
|
with open(md_path, 'r', encoding='utf-8') as f:
|
||||||
|
original_content = f.read()
|
||||||
|
|
||||||
|
if original_content != TEST_MARKDOWN:
|
||||||
|
print("❌ 原始文件被修改了!")
|
||||||
|
return False
|
||||||
|
|
||||||
|
print("✅ 原始文件未被修改")
|
||||||
|
|
||||||
|
# Verify list rendering
|
||||||
|
print("\n=== 列表渲染验证 ===")
|
||||||
|
|
||||||
|
tests_passed = 0
|
||||||
|
tests_total = 4
|
||||||
|
|
||||||
|
# Test 1: List with blank line before it
|
||||||
|
if '• 列表项 1' in pdf_text:
|
||||||
|
print("✅ 场景1: 列表前有空行 - 正确渲染")
|
||||||
|
tests_passed += 1
|
||||||
|
else:
|
||||||
|
print("❌ 场景1: 列表前有空行 - 渲染失败")
|
||||||
|
|
||||||
|
# Test 2: Critical test - list without blank line before it
|
||||||
|
scene2_start = pdf_text.find('场景2')
|
||||||
|
scene2_section = pdf_text[scene2_start:scene2_start+200] if scene2_start != -1 else ""
|
||||||
|
|
||||||
|
if '• 列表项 1' in scene2_section and '- 列表项 1' not in scene2_section:
|
||||||
|
print("✅ 场景2: 列表前没有空行 - 正确渲染(关键测试)")
|
||||||
|
tests_passed += 1
|
||||||
|
else:
|
||||||
|
print("❌ 场景2: 列表前没有空行 - 渲染失败")
|
||||||
|
print(f" 实际内容: {scene2_section}")
|
||||||
|
|
||||||
|
# Test 3: Ordered list without blank line
|
||||||
|
scene3_start = pdf_text.find('场景3')
|
||||||
|
scene3_section = pdf_text[scene3_start:scene3_start+200] if scene3_start != -1 else ""
|
||||||
|
|
||||||
|
if '1. 第一项' in scene3_section and '2. 第二项' in scene3_section:
|
||||||
|
print("✅ 场景3: 有序列表前没有空行 - 正确渲染")
|
||||||
|
tests_passed += 1
|
||||||
|
else:
|
||||||
|
print("❌ 场景3: 有序列表前没有空行 - 渲染失败")
|
||||||
|
|
||||||
|
# Test 4: Ordered list with blank line
|
||||||
|
if '1. 第一项' in pdf_text and '2. 第二项' in pdf_text:
|
||||||
|
print("✅ 场景4: 有序列表前有空行 - 正确渲染")
|
||||||
|
tests_passed += 1
|
||||||
|
else:
|
||||||
|
print("❌ 场景4: 有序列表前有空行 - 渲染失败")
|
||||||
|
|
||||||
|
print(f"\n=== 测试结果: {tests_passed}/{tests_total} 通过 ===")
|
||||||
|
|
||||||
|
if tests_passed == tests_total:
|
||||||
|
print("\n✅ 所有测试通过!")
|
||||||
|
print(f"\n生成的文件:")
|
||||||
|
print(f" Markdown: {md_path}")
|
||||||
|
print(f" PDF: {pdf_path}")
|
||||||
|
print(f" Text: {txt_path}")
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
print(f"\n❌ {tests_total - tests_passed} 个测试失败")
|
||||||
|
return False
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ 测试失败: {e}")
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
success = run_test()
|
||||||
|
sys.exit(0 if success else 1)
|
||||||
Reference in New Issue
Block a user