Files
zhengchen.tao 480f4a0e99
Build and Deploy Blog / build (push) Successful in 28s
style: 三篇文章正文 ASCII 标点统一为中文标点
新增 scripts/cn-punct.py 做转换:跳过代码块/URL/链接 URL 部分,保留数学公式、数字列表、英文紧贴的标识符括号 (DNS(...))、嵌套数学记号 (GF(2⁸)) 等。
2026-05-12 11:00:34 +08:00

391 lines
13 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Convert ASCII punctuation to Chinese punctuation in CJK context.
Strategy:
- Skip fenced code blocks entirely.
- Mask out inline code, markdown links, bare URLs as opaque blobs (placeholder
char from the Private Use Area), so paren matching can span them.
- Process YAML front matter only by converting `description:` / `title:` /
`summary:` string values — leaves `tags: [...]` arrays alone.
- For each prose line in the body, decide if it's "Chinese-flavored" (>= 3 CJK
chars). On Chinese lines, convert ASCII , . : ; ? ! ( ) → Chinese
counterparts where it makes sense.
Preservation rules (kept as ASCII):
- Number lists / decimals: `1,234`, `RS(255, 239)`, `1:8` (ratio).
- Math context: a comma/colon between two math expressions (one of `∇ ∂ √
≡ ≈ ± ¹²³⁴⁵⁶⁷⁸⁹⁰ ₀₁₂₃₄₅₆₇₈₉` or Greek letters within a 20-char window
on each side).
- Nested inside an existing Chinese paren: only converts prose-like content
(no math/digit indicator), preserving notation like `GF(2⁸) 列混合)`.
- English-attached parens: `DNS(Domain Name System)` (the `(` immediately
follows an English letter/digit) stays ASCII.
- Inside ASCII parens we chose to keep ASCII (e.g. `cookie(Ch8, Ch9)`),
inner punctuation stays ASCII.
Usage:
python scripts/cn-punct.py path/to/file1.md path/to/file2.md ...
This is the one-shot conversion script used in the 2026-05 blog cleanup. It is
deliberately conservative; if you re-run it on already-converted files it
should be a near no-op.
"""
import re
import sys

# Matches any character in the CJK Unified Ideographs block (U+4E00–U+9FFF).
CJK_RE = re.compile(r'[一-鿿]')

# Private-use-area character used to mask opaque blobs (inline code, links,
# bare URLs) so punctuation/paren scanning treats each blob as one char.
# NOTE(review): the original literal was lost in transit (it pasted as an
# empty string, which would make masking silently delete the blobs);
# U+E000 restores the documented "private use area" behavior.
PLACEHOLDER = '\ue000'

# Walk past these to find the "real" neighbor of a punctuation mark.
# NOTE(review): fullwidth closers ()》」 etc.) are absent here — confirm
# that is intentional and not another character lost in transit.
WEAK = set(' \t*_"\'`)]}>“”‘’')

# Punctuation that signals Chinese context for adjacent ASCII punctuation.
CHINESE_PUNCT = set(
    ',。:;?!、()'
    '【】「」『』'
    '“”‘’…—《》〈〉'
)

# Math-specific characters: superscript / subscript digits, operators, Greek
# letters that almost only appear in formulas. Used to detect comma/colon
# sitting between two math expressions (where it must stay ASCII).
MATH_CHARS = set(
    '∇∂√≡≈±'
    '¹²³⁴⁵⁶⁷⁸⁹⁰'
    '₀₁₂₃₄₅₆₇₈₉'
    'αβγδεζηθικ'
    'λμνξπρστυφ'
    'χψω'
    'ΓΔΘΛΞΠΣΦΨΩ'
    '∞∑∏∫·×÷'
)
def is_cjk(ch: str) -> bool:
    """Return True when *ch* is a single CJK ideograph ('' is never CJK)."""
    if not ch:
        return False
    return CJK_RE.match(ch) is not None
def is_chinese_context(ch: str) -> bool:
    """Return True when *ch* reads as Chinese: a CJK ideograph or Chinese punctuation."""
    if is_cjk(ch):
        return True
    return ch in CHINESE_PUNCT
def is_ascii_alnum(ch: str) -> bool:
    """Return True when *ch* is a single ASCII letter or digit ('' gives False)."""
    if not ch:
        return False
    return ord(ch) < 128 and ch.isalnum()
def find_strong_neighbor(text: str, idx: int, direction: int) -> str:
    """Step from *idx* in *direction* (+1 or -1), skipping WEAK characters.

    Returns the first 'strong' character found, or '' when the string
    boundary is reached before one appears.
    """
    limit = len(text)
    pos = idx + direction
    while 0 <= pos < limit:
        candidate = text[pos]
        if candidate not in WEAK:
            return candidate
        pos += direction
    return ''
def looks_like_math_context(text: str, idx: int, window: int = 20) -> bool:
    """Heuristic: the punctuation at *idx* separates two math expressions.

    True only when math-specific characters appear within *window* chars on
    BOTH sides of the position.
    """
    def has_math(chunk: str) -> bool:
        return any(c in MATH_CHARS for c in chunk)

    before = text[max(0, idx - window):idx]
    after = text[idx + 1:idx + 1 + window]
    return has_math(before) and has_math(after)
def convert_parens(text: str, aggressive: bool, depth_offset: int = 0) -> str:
    """Convert ( ) to () when in Chinese context.

    Tracks Chinese-paren depth so that a paren nested inside an outer Chinese
    paren only converts if its own content contains CJK — preserves math
    notation like `GF(2⁸) 列混合)`.

    Skips conversion when the immediate preceding char is an English letter or
    digit (e.g. `DNS(Domain Name System)`), since such parens behave like a
    function call / abbreviation expansion in the source language.

    NOTE(review): the fullwidth paren literals below were lost in the pasted
    copy (they appeared as empty strings); restored from the docstring's
    stated contract.
    """
    n = len(text)
    out = []
    i = 0
    cn_depth = depth_offset
    while i < n:
        ch = text[i]
        if ch == '(':  # fullwidth open paren — entering a Chinese parenthetical
            cn_depth += 1
            out.append(ch)
            i += 1
            continue
        if ch == ')':  # fullwidth close paren
            cn_depth = max(0, cn_depth - 1)
            out.append(ch)
            i += 1
            continue
        if ch == '(':
            # Scan forward for the matching ASCII close paren.
            depth = 1
            j = i + 1
            while j < n and depth > 0:
                if text[j] == '(':
                    depth += 1
                elif text[j] == ')':
                    depth -= 1
                    if depth == 0:
                        break  # leave j pointing at the matching ')'
                j += 1
            if depth == 0:
                content = text[i + 1:j]
                left = find_strong_neighbor(text, i, -1)
                right = find_strong_neighbor(text, j, +1)
                immediate_prev = text[i - 1] if i > 0 else ''
                content_has_cjk = bool(CJK_RE.search(content))
                neighbor_chinese = (
                    is_chinese_context(left) or is_chinese_context(right)
                )
                # Digits, operators, super-/subscripts hint at math notation.
                content_has_math = bool(re.search(
                    r'[\d=+\-*/×÷^¹²³⁴-⁹⁰₀-₉]',
                    content,
                ))
                if cn_depth > 0:
                    # Nested inside Chinese paren — convert prose-like content
                    should_convert = content_has_cjk or not content_has_math
                elif is_ascii_alnum(immediate_prev):
                    # Attached to English identifier — leave ASCII
                    should_convert = content_has_cjk
                else:
                    should_convert = (
                        content_has_cjk
                        or (aggressive and neighbor_chinese)
                    )
                # Recurse into the content; if we convert this pair, the
                # content is nested one level deeper in Chinese parens.
                inner_offset = cn_depth + (1 if should_convert else 0)
                converted_content = convert_parens(content, aggressive, inner_offset)
                if should_convert:
                    out.append('(')
                    out.append(converted_content)
                    out.append(')')
                else:
                    out.append('(')
                    out.append(converted_content)
                    out.append(')')
                i = j + 1
                continue
            # Unbalanced '(' — fall through and emit it verbatim.
        out.append(ch)
        i += 1
    return ''.join(out)
def convert_punct(text: str, aggressive: bool) -> str:
    """Convert ASCII , . : ; ? ! to Chinese counterparts.

    Tracks both Chinese-paren depth and ASCII-paren depth:
    - Inside `(...)` (fullwidth) → aggressive (those are Chinese
      parentheticals).
    - Inside `(...)` (ASCII) → conservative (the surviving ASCII parens were
      kept ASCII for a reason — likely English-attached or notation).

    NOTE(review): the fullwidth punctuation literals below were lost in the
    pasted copy (they appeared as empty strings); restored from the module
    docstring's stated ASCII → Chinese mapping.
    """
    chars = list(text)
    n = len(chars)
    out = []
    cn_paren_depth = 0
    ascii_paren_depth = 0
    for i, ch in enumerate(chars):
        if ch == '(':  # fullwidth open paren
            cn_paren_depth += 1
            out.append(ch)
            continue
        if ch == ')':  # fullwidth close paren
            cn_paren_depth = max(0, cn_paren_depth - 1)
            out.append(ch)
            continue
        if ch == '(':
            ascii_paren_depth += 1
            out.append(ch)
            continue
        if ch == ')':
            ascii_paren_depth = max(0, ascii_paren_depth - 1)
            out.append(ch)
            continue
        prev = chars[i - 1] if i > 0 else ''
        nxt = chars[i + 1] if i + 1 < n else ''
        in_cn_paren = cn_paren_depth > 0
        in_ascii_paren = ascii_paren_depth > 0
        if ch == ',':
            # Number-list separator: digit, [space,] digit → keep ASCII
            prev_is_digit = prev.isascii() and prev.isdigit()
            after_space_nxt = chars[i + 2] if (nxt == ' ' and i + 2 < n) else nxt
            nxt_is_digit = (
                after_space_nxt
                and after_space_nxt.isascii()
                and after_space_nxt.isdigit()
            )
            if prev_is_digit and nxt_is_digit:
                out.append(ch)
                continue
            if looks_like_math_context(text, i):
                out.append(ch)
                continue
            if is_chinese_context(prev) or is_chinese_context(nxt):
                out.append(',')
                continue
            if in_ascii_paren:
                out.append(ch)
                continue
            if in_cn_paren or aggressive:
                out.append(',')
                continue
        elif ch == '.':
            if is_ascii_alnum(nxt):
                pass  # decimal / file ext / version
            elif is_chinese_context(prev):
                out.append('。')
                continue
            elif prev in WEAK:
                # e.g. sentence ending in `中文**.` — look past the markup.
                left = find_strong_neighbor(text, i, -1)
                if is_chinese_context(left):
                    out.append('。')
                    continue
        elif ch in (':', ';', '?', '!'):
            mapping = {':': ':', ';': ';', '?': '?', '!': '!'}
            if ch == ':':
                # Ratio / time notation like "1:8" or "12:34" → keep ASCII colon
                prev_is_digit = prev.isascii() and prev.isdigit()
                nxt_is_digit = nxt.isascii() and nxt.isdigit()
                if prev_is_digit and nxt_is_digit:
                    out.append(ch)
                    continue
            if ch in (':', ';') and looks_like_math_context(text, i):
                out.append(ch)
                continue
            left = find_strong_neighbor(text, i, -1)
            right = find_strong_neighbor(text, i, +1)
            if is_chinese_context(left) or is_chinese_context(right):
                out.append(mapping[ch])
                continue
            if in_ascii_paren:
                out.append(ch)
                continue
            if in_cn_paren or aggressive:
                out.append(mapping[ch])
                continue
        # Default: keep the character unchanged.
        out.append(ch)
    return ''.join(out)
def line_is_chinese(line: str) -> bool:
    """Heuristic: a line containing at least 3 CJK chars is 'Chinese-flavored'."""
    hits = 0
    for c in line:
        if is_cjk(c):
            hits += 1
            if hits >= 3:
                return True
    return False
def process_text(text: str, aggressive: bool) -> str:
    """Run the paren pass first, then the remaining punctuation pass."""
    with_parens = convert_parens(text, aggressive)
    return convert_punct(with_parens, aggressive)
# Markdown image, markdown link, bare URL, or inline code (in this priority)
OPAQUE_RE = re.compile(
    r'!?\[[^\]]*\]\([^\)]+\)'   # ![alt](url) or [text](url)
    r'|https?://[^\s\)\]\>]+'   # bare http(s) URL; stops at whitespace/closers
    r'|`[^`\n]+`'               # single-line inline code span
)
# A whole fenced ``` code block, captured so re.split keeps it as its own part
# (odd-indexed parts of the split are the fences).
FENCE_RE = re.compile(r'(```[\s\S]*?```)')
def process_segment(segment: str, aggressive: bool) -> str:
    """Mask opaque blobs, run conversions, restore (recursively processing
    markdown link text)."""
    blobs = []

    def mask(match: re.Match) -> str:
        blobs.append(match.group(0))
        return PLACEHOLDER

    converted = process_text(OPAQUE_RE.sub(mask, segment), aggressive)

    pieces = []
    blob_idx = 0
    for ch in converted:
        if ch != PLACEHOLDER:
            pieces.append(ch)
            continue
        token = blobs[blob_idx]
        blob_idx += 1
        m = re.match(r'(!?\[)([^\]]*)(\]\()([^\)]+)(\))', token)
        if m:
            # Markdown link/image: convert the visible text, keep the URL.
            converted_text = process_text(m.group(2), aggressive)
            token = m.group(1) + converted_text + m.group(3) + m.group(4) + m.group(5)
        pieces.append(token)
    return ''.join(pieces)
def process_body_segment(segment: str) -> str:
    """Process a non-fenced body segment line by line, choosing aggressive mode
    based on whether each logical line is Chinese-flavored."""
    return '\n'.join(
        process_segment(ln, aggressive=line_is_chinese(ln))
        for ln in segment.split('\n')
    )
def process_yaml_frontmatter(text: str) -> str:
    """Convert only quoted string values for description / title / summary keys.
    Leaves list/array values like `tags: [...]` alone."""
    key_value_re = re.compile(
        r'^(\s*(?:description|title|summary):\s*")([^"\n]*)(")',
        re.MULTILINE,
    )

    def rewrite(match: re.Match) -> str:
        value = match.group(2)
        converted = process_segment(value, aggressive=line_is_chinese(value))
        return match.group(1) + converted + match.group(3)

    return key_value_re.sub(rewrite, text)
def process_markdown(content: str) -> str:
    """Split YAML front matter and fenced code from prose; convert only prose."""
    front = ''
    body = content
    if content.startswith(('---\n', '---\r\n')):
        fm = re.match(r'(---\r?\n[\s\S]*?\r?\n---\r?\n)', content)
        if fm:
            front = process_yaml_frontmatter(fm.group(1))
            body = content[fm.end():]
    pieces = []
    for idx, part in enumerate(FENCE_RE.split(body)):
        if idx % 2 == 1:
            # Captured fenced code block — pass through untouched.
            pieces.append(part)
        else:
            pieces.append(process_body_segment(part))
    return front + ''.join(pieces)
def main() -> None:
    """CLI entry point: convert each file given on argv, rewriting changed ones.

    newline='' preserves the file's original line endings on round-trip.
    """
    updated = []
    for path in sys.argv[1:]:
        with open(path, encoding='utf-8', newline='') as fh:
            original = fh.read()
        result = process_markdown(original)
        if result == original:
            print(f'unchanged {path}')
            continue
        with open(path, 'w', encoding='utf-8', newline='') as fh:
            fh.write(result)
        updated.append(path)
        print(f'updated {path}')
    print(f'\nTotal updated: {len(updated)}')


if __name__ == '__main__':
    main()