fix(pdf-creator): resolve CJK text garbled in weasyprint code blocks
weasyprint renders <pre> blocks with monospace fonts that lack CJK glyphs, causing Chinese/Japanese/Korean characters to display as garbled text. Fix: add _fix_cjk_code_blocks() preprocessor that detects CJK in <pre><code> and converts to <div class="cjk-code-block"> with inherited body font. Pure-ASCII code blocks are left untouched. Also adds code/pre/pre-code CSS rules to both themes (default + warm-terra) that were previously missing entirely. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -57,4 +57,6 @@ uv run --with weasyprint scripts/batch_convert.py *.md --output-dir ./pdfs
|
||||
|
||||
**weasyprint import error**: Run with `uv run --with weasyprint` or use `--backend chrome` instead.
|
||||
|
||||
**CJK text in code blocks garbled (weasyprint)**: The script auto-detects code blocks containing Chinese/Japanese/Korean characters and converts them to styled divs with CJK-capable fonts. If you still see issues, use `--backend chrome` which has native CJK support. Alternatively, convert code blocks to markdown tables before generating the PDF.
|
||||
|
||||
**Chrome header/footer appearing**: The script passes `--no-pdf-header-footer`. If it still appears, your Chrome version may not support this flag — update Chrome.
|
||||
|
||||
@@ -125,6 +125,35 @@ def _ensure_list_spacing(text: str) -> str:
|
||||
return "\n".join(result)
|
||||
|
||||
|
||||
_CJK_RANGE = re.compile(
|
||||
r"[\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff"
|
||||
r"\U00020000-\U0002a6df\U0002a700-\U0002ebef"
|
||||
r"\u3000-\u303f\uff00-\uffef]"
|
||||
)
|
||||
|
||||
|
||||
def _fix_cjk_code_blocks(html: str) -> str:
|
||||
"""Replace <pre><code> blocks containing CJK with styled divs.
|
||||
|
||||
weasyprint renders <pre> blocks using monospace fonts that lack CJK glyphs,
|
||||
causing garbled output. This converts CJK-heavy code blocks to styled divs
|
||||
that use the document's CJK font stack instead.
|
||||
"""
|
||||
|
||||
def _replace_if_cjk(match: re.Match) -> str:
|
||||
content = match.group(1)
|
||||
if _CJK_RANGE.search(content):
|
||||
return f'<div class="cjk-code-block">{content}</div>'
|
||||
return match.group(0)
|
||||
|
||||
return re.sub(
|
||||
r"<pre><code(?:\s[^>]*)?>(.+?)</code></pre>",
|
||||
_replace_if_cjk,
|
||||
html,
|
||||
flags=re.DOTALL,
|
||||
)
|
||||
|
||||
|
||||
def _md_to_html(md_file: str) -> str:
|
||||
"""Convert markdown to HTML using pandoc with list spacing preprocessing."""
|
||||
if not shutil.which("pandoc"):
|
||||
@@ -147,7 +176,9 @@ def _md_to_html(md_file: str) -> str:
|
||||
print(f"Error: pandoc failed: {result.stderr}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
return result.stdout
|
||||
html = result.stdout
|
||||
html = _fix_cjk_code_blocks(html)
|
||||
return html
|
||||
|
||||
|
||||
def _build_full_html(html_content: str, css: str, title: str) -> str:
|
||||
|
||||
@@ -86,3 +86,46 @@ hr {
|
||||
border-top: 1px solid #ccc;
|
||||
margin: 1.5em 0;
|
||||
}
|
||||
|
||||
/* Inline code: Menlo first, then CJK-capable faces, then generic monospace. */
code {
    font-size: 10pt;
    font-family: 'Menlo', 'PingFang SC', 'Heiti SC', 'Noto Sans CJK SC', monospace;
    padding: 1px 4px;
    border-radius: 3px;
    background: #f5f5f5;
}
|
||||
|
||||
/* Code block container: a bordered box whose long lines wrap. */
pre {
    margin: 1em 0;
    padding: 12px 16px;
    border: 1px solid #ddd;
    border-radius: 4px;
    background: #f5f5f5;
    white-space: pre-wrap;
    word-break: break-all;
    overflow-wrap: break-word;
}
|
||||
|
||||
/* Code inside a block: no background, padding or rounding of its own. */
pre code {
    font-size: 9pt;
    line-height: 1.6;
    font-family: 'Menlo', 'PingFang SC', 'Heiti SC', 'Noto Sans CJK SC', monospace;
    padding: 0;
    border-radius: 0;
    background: none;
}
|
||||
|
||||
/* Code blocks containing CJK text are rewritten to divs by the preprocessor.
   The font is inherited so the body's CJK font is reused (weasyprint may not
   find PingFang SC). */
.cjk-code-block {
    margin: 1em 0;
    padding: 12px 16px;
    border: 1px solid #ddd;
    border-radius: 4px;
    background: #f5f5f5;
    font-family: inherit;
    font-size: 10pt;
    line-height: 1.8;
    white-space: pre-wrap;
    word-break: break-all;
}
|
||||
|
||||
@@ -110,12 +110,48 @@ header, .date {
|
||||
}
|
||||
|
||||
/* Inline code: Menlo first, then CJK-capable faces, then generic monospace. */
code {
    font-size: 12px;
    font-family: 'Menlo', 'PingFang SC', 'Microsoft YaHei', 'Noto Sans CJK SC', monospace;
    padding: 1px 4px;
    border-radius: 3px;
    background: #faf5f0;
}
|
||||
|
||||
/* Code block container: a bordered box whose long lines wrap. */
pre {
    margin: 10px 0;
    padding: 12px 16px;
    border: 1px solid #e2d6c8;
    border-radius: 4px;
    background: #faf5f0;
    white-space: pre-wrap;
    word-break: break-all;
    overflow-wrap: break-word;
}
|
||||
|
||||
/* Code inside a block: no background, padding or rounding of its own. */
pre code {
    font-size: 11px;
    line-height: 1.6;
    font-family: 'Menlo', 'PingFang SC', 'Microsoft YaHei', 'Noto Sans CJK SC', monospace;
    padding: 0;
    border-radius: 0;
    background: none;
}
|
||||
|
||||
/* Code blocks containing CJK text are rewritten to divs by the preprocessor.
   The font is inherited so the body's CJK font is reused (weasyprint may not
   resolve all font names). */
.cjk-code-block {
    margin: 10px 0;
    padding: 12px 16px;
    border: 1px solid #e2d6c8;
    border-radius: 4px;
    background: #faf5f0;
    font-family: inherit;
    font-size: 12px;
    line-height: 1.7;
    white-space: pre-wrap;
    word-break: break-all;
}
|
||||
|
||||
/* Text colour for bold runs. */
strong { color: #1f1b17; }
|
||||
|
||||
Reference in New Issue
Block a user