fix(pdf-creator): resolve CJK text garbled in weasyprint code blocks

weasyprint renders <pre> blocks with monospace fonts that lack CJK glyphs,
causing Chinese/Japanese/Korean characters to display as garbled text.

Fix: add _fix_cjk_code_blocks() preprocessor that detects CJK in <pre><code>
and converts to <div class="cjk-code-block"> with inherited body font.
Pure-ASCII code blocks are left untouched.

Also adds code/pre/pre-code CSS rules to both themes (default + warm-terra)
that were previously missing entirely.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
daymade
2026-04-08 15:01:13 +08:00
parent 9242af5fcb
commit edaeaa89f4
5 changed files with 115 additions and 2 deletions

View File

@@ -416,7 +416,7 @@
"description": "Create PDF documents from markdown with Chinese font support. Supports theme system (default for formal docs, warm-terra for training materials) and dual backend (weasyprint or Chrome). Triggers include convert to PDF, generate PDF, markdown to PDF, or printable documents",
"source": "./",
"strict": false,
"version": "1.2.0",
"version": "1.3.0",
"category": "document-conversion",
"keywords": [
"pdf",
@@ -425,6 +425,7 @@
"chrome",
"themes",
"chinese-fonts",
"cjk",
"document-generation",
"legal",
"reports",

View File

@@ -57,4 +57,6 @@ uv run --with weasyprint scripts/batch_convert.py *.md --output-dir ./pdfs
**weasyprint import error**: Run with `uv run --with weasyprint` or use `--backend chrome` instead.
**CJK text in code blocks garbled (weasyprint)**: The script auto-detects code blocks containing Chinese/Japanese/Korean characters and converts them to styled divs with CJK-capable fonts. If you still see issues, use `--backend chrome` which has native CJK support. Alternatively, convert code blocks to markdown tables before generating the PDF.
**Chrome header/footer appearing**: The script passes `--no-pdf-header-footer`. If it still appears, your Chrome version may not support this flag — update Chrome.

View File

@@ -125,6 +125,35 @@ def _ensure_list_spacing(text: str) -> str:
return "\n".join(result)
# Matches any single character from the CJK scripts this preprocessor looks for.
# Kana and Hangul are listed explicitly: Japanese or Korean code comments may
# contain no Han ideographs at all and would otherwise go undetected.
_CJK_RANGE = re.compile(
    r"[\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff"  # Han: Unified, Ext. A, Compatibility
    r"\U00020000-\U0002a6df\U0002a700-\U0002ebef"  # Han: Ext. B through Ext. F
    r"\u3000-\u303f\uff00-\uffef"  # CJK punctuation, half/full-width forms
    r"\u3040-\u30ff\u31f0-\u31ff"  # Hiragana, Katakana, Katakana phonetic ext.
    r"\u1100-\u11ff\u3130-\u318f\uac00-\ud7af]"  # Hangul Jamo, compat. Jamo, syllables
)

# One <pre><code>...</code></pre> block. Attributes are allowed on BOTH tags
# because pandoc emits <pre class="sourceCode LANG"><code class="sourceCode LANG">
# for fenced blocks that declare a language. The body uses `.*?` (not `.+?`) so
# an empty block matches on its own instead of running into the next block.
_PRE_CODE_BLOCK = re.compile(
    r"<pre(?:\s[^>]*)?><code(?:\s[^>]*)?>(.*?)</code></pre>",
    re.DOTALL,
)


def _fix_cjk_code_blocks(html: str) -> str:
    """Replace <pre><code> blocks containing CJK with styled divs.

    weasyprint renders <pre> blocks using monospace fonts that lack CJK glyphs,
    causing garbled output. Any code block whose body contains at least one
    character matched by ``_CJK_RANGE`` is rewritten as
    ``<div class="cjk-code-block">BODY</div>`` so it picks up the document's
    CJK font stack instead. Blocks without CJK are returned unchanged.

    Args:
        html: HTML fragment to scan (the pandoc output in ``_md_to_html``).

    Returns:
        The HTML with every CJK-containing code block converted. Attributes on
        the replaced ``<pre>``/``<code>`` tags are dropped; the inner markup is
        preserved verbatim.
    """

    def _replace_if_cjk(match: re.Match) -> str:
        content = match.group(1)
        if _CJK_RANGE.search(content):
            return f'<div class="cjk-code-block">{content}</div>'
        # Leave pure non-CJK blocks byte-for-byte as they were.
        return match.group(0)

    return _PRE_CODE_BLOCK.sub(_replace_if_cjk, html)
def _md_to_html(md_file: str) -> str:
"""Convert markdown to HTML using pandoc with list spacing preprocessing."""
if not shutil.which("pandoc"):
@@ -147,7 +176,9 @@ def _md_to_html(md_file: str) -> str:
print(f"Error: pandoc failed: {result.stderr}", file=sys.stderr)
sys.exit(1)
return result.stdout
html = result.stdout
html = _fix_cjk_code_blocks(html)
return html
def _build_full_html(html_content: str, css: str, title: str) -> str:

View File

@@ -86,3 +86,46 @@ hr {
border-top: 1px solid #ccc;
margin: 1.5em 0;
}
/* Base style for every <code> element. The font stack tries Menlo first,
   then named CJK families, and finally the generic monospace fallback. */
code {
font-family: 'Menlo', 'PingFang SC', 'Heiti SC', 'Noto Sans CJK SC', monospace;
background: #f5f5f5;
padding: 1px 4px;
border-radius: 3px;
font-size: 10pt;
}
/* Box for preformatted blocks. pre-wrap keeps the author's whitespace and
   line breaks while still wrapping long lines; break-all additionally
   allows a break inside a word so a long token cannot overflow the box. */
pre {
background: #f5f5f5;
border: 1px solid #ddd;
border-radius: 4px;
padding: 12px 16px;
margin: 1em 0;
overflow-wrap: break-word;
white-space: pre-wrap;
word-break: break-all;
}
/* <code> nested in <pre>: clears the background, padding and radius set by
   the generic `code` rule (the <pre> box already provides them) and uses a
   slightly smaller font size with a looser line height. */
pre code {
font-family: 'Menlo', 'PingFang SC', 'Heiti SC', 'Noto Sans CJK SC', monospace;
background: none;
padding: 0;
border-radius: 0;
font-size: 9pt;
line-height: 1.6;
}
/* CJK code blocks converted to styled divs by preprocessor.
   Uses inherit to reuse body's CJK font (weasyprint may not find PingFang SC).
   Background, border, radius, padding and margin repeat the `pre` rule so the
   div looks like a code box; pre-wrap keeps the code's line breaks now that
   the <pre> wrapper has been replaced by a <div>. */
.cjk-code-block {
font-family: inherit;
background: #f5f5f5;
border: 1px solid #ddd;
border-radius: 4px;
padding: 12px 16px;
margin: 1em 0;
font-size: 10pt;
line-height: 1.8;
white-space: pre-wrap;
word-break: break-all;
}

View File

@@ -110,12 +110,48 @@ header, .date {
}
/* Base style for every <code> element. The font stack tries Menlo first,
   then named CJK families, and finally the generic monospace fallback. */
code {
font-family: 'Menlo', 'PingFang SC', 'Microsoft YaHei', 'Noto Sans CJK SC', monospace;
background: #faf5f0;
padding: 1px 4px;
border-radius: 3px;
font-size: 12px;
}
/* Box for preformatted blocks. pre-wrap keeps the author's whitespace and
   line breaks while still wrapping long lines; break-all additionally
   allows a break inside a word so a long token cannot overflow the box. */
pre {
background: #faf5f0;
border: 1px solid #e2d6c8;
border-radius: 4px;
padding: 12px 16px;
margin: 10px 0;
overflow-wrap: break-word;
white-space: pre-wrap;
word-break: break-all;
}
/* <code> nested in <pre>: clears the background, padding and radius set by
   the generic `code` rule (the <pre> box already provides them) and uses a
   slightly smaller font size with a looser line height. */
pre code {
font-family: 'Menlo', 'PingFang SC', 'Microsoft YaHei', 'Noto Sans CJK SC', monospace;
background: none;
padding: 0;
border-radius: 0;
font-size: 11px;
line-height: 1.6;
}
/* CJK code blocks converted to styled divs by preprocessor.
   Uses inherit to reuse body's CJK font (weasyprint may not resolve all font names).
   Background, border, radius, padding and margin repeat the `pre` rule so the
   div looks like a code box; pre-wrap keeps the code's line breaks now that
   the <pre> wrapper has been replaced by a <div>. */
.cjk-code-block {
font-family: inherit;
background: #faf5f0;
border: 1px solid #e2d6c8;
border-radius: 4px;
padding: 12px 16px;
margin: 10px 0;
font-size: 12px;
line-height: 1.7;
white-space: pre-wrap;
word-break: break-all;
}
strong {
color: #1f1b17;
}