style: 三篇文章正文 ASCII 标点统一为中文标点
Build and Deploy Blog / build (push) Successful in 28s

新增 scripts/cn-punct.py 做转换：跳过代码块/URL/链接 URL 部分，保留数学公式、数字列表、英文紧贴的标识符括号（如 DNS(...)）、嵌套数学记号（如 GF(2⁸)）等。
This commit is contained in:
2026-05-12 11:00:34 +08:00
parent 3ff184a03a
commit 480f4a0e99
4 changed files with 904 additions and 514 deletions
+390
View File
@@ -0,0 +1,390 @@
"""Convert ASCII punctuation to Chinese punctuation in CJK context.
Strategy:
- Skip fenced code blocks entirely.
- Mask out inline code, markdown links, bare URLs as opaque blobs (placeholder
char from the Private Use Area), so paren matching can span them.
- Process YAML front matter only by converting `description:` / `title:` /
`summary:` string values — leaves `tags: [...]` arrays alone.
- For each prose line in the body, decide if it's "Chinese-flavored" (>= 3 CJK
chars). On Chinese lines, convert ASCII , . : ; ? ! ( ) → Chinese
counterparts where it makes sense.
Preservation rules (kept as ASCII):
- Number lists / decimals: `1,234`, `RS(255, 239)`, `1:8` (ratio).
- Math context: a comma/colon between two math expressions (one of `∇ ∂ √
≡ ≈ ± ¹²³⁴⁵⁶⁷⁸⁹⁰ ₀₁₂₃₄₅₆₇₈₉` or Greek letters within a 20-char window
on each side).
- Nested inside an existing Chinese paren: only converts prose-like content
(no math/digit indicator), preserving notation like `GF(2⁸) 列混合)`.
- English-attached parens: `DNS(Domain Name System)` (the `(` immediately
follows an English letter/digit) stays ASCII.
- Inside ASCII parens we chose to keep ASCII (e.g. `cookie(Ch8, Ch9)`),
inner punctuation stays ASCII.
Usage:
python scripts/cn-punct.py path/to/file1.md path/to/file2.md ...
This is the one-shot conversion script used in the 2026-05 blog cleanup. It is
deliberately conservative; if you re-run it on already-converted files it
should be a near no-op.
"""
import re
import sys
# Matches any char in the CJK Unified Ideographs block (U+4E00–U+9FFF).
CJK_RE = re.compile(r'[一-鿿]')

# Private-use char used to mask opaque blobs (inline code, URLs, links) so
# paren/punctuation matching can span them without looking inside.
# Must be a real single character: an empty placeholder would make the
# restore loop in process_segment silently drop every masked blob.
PLACEHOLDER = '\ue000'

# Walk past these to find the "real" neighbor of a punctuation mark.
WEAK = set(' \t*_"\'`)]}>“”‘’')

# Punctuation that signals Chinese context for adjacent ASCII punctuation.
# Full-width forms only — listing ASCII punctuation here would wrongly
# mark plain English text as Chinese context.
CHINESE_PUNCT = set(
    '，。：；？！、（）'
    '【】「」『』'
    '“”‘’…—《》〈〉'
)

# Math-specific characters: superscript / subscript digits, operators, Greek
# letters that almost only appear in formulas. Used to detect comma/colon
# sitting between two math expressions (where it must stay ASCII).
MATH_CHARS = set(
    '∇∂√≡≈±'
    '¹²³⁴⁵⁶⁷⁸⁹⁰'
    '₀₁₂₃₄₅₆₇₈₉'
    'αβγδεζηθικ'
    'λμνξπρστυφ'
    'χψω'
    'ΓΔΘΛΞΠΣΦΨΩ'
    '∞∑∏∫·×÷'
)
def is_cjk(ch: str) -> bool:
    """Return True if *ch* begins with a CJK Unified Ideograph (U+4E00–U+9FFF)."""
    return bool(ch) and '\u4e00' <= ch[0] <= '\u9fff'
def is_chinese_context(ch: str) -> bool:
    """Return True if *ch* is a CJK ideograph or a Chinese punctuation mark."""
    if is_cjk(ch):
        return True
    return ch in CHINESE_PUNCT
def is_ascii_alnum(ch: str) -> bool:
    """Return True if *ch* is a single ASCII letter or digit ('' gives False)."""
    if not ch:
        return False
    return ord(ch) < 128 and ch.isalnum()
def find_strong_neighbor(text: str, idx: int, direction: int) -> str:
    """Walk past WEAK chars (quotes, brackets, emphasis markers, spaces) to
    find the nearest 'strong' character, or '' if we hit a boundary.

    :param direction: -1 to scan left, +1 to scan right.
    """
    pos = idx + direction
    limit = len(text)
    while 0 <= pos < limit and text[pos] in WEAK:
        pos += direction
    return text[pos] if 0 <= pos < limit else ''
def looks_like_math_context(text: str, idx: int, window: int = 20) -> bool:
    """Heuristic: the punctuation at *idx* separates two math expressions when
    math-specific characters appear within *window* chars on both sides."""
    before = text[max(0, idx - window):idx]
    after = text[idx + 1:idx + 1 + window]
    # A side "contains math" when it shares at least one char with MATH_CHARS.
    return (not MATH_CHARS.isdisjoint(before)
            and not MATH_CHARS.isdisjoint(after))
def convert_parens(text: str, aggressive: bool, depth_offset: int = 0) -> str:
    """Convert ASCII ( ) to full-width （ ） when in Chinese context.

    Tracks Chinese-paren depth so that a paren nested inside an outer Chinese
    paren only converts if its own content contains CJK — preserves math
    notation like `GF(2⁸) 列混合）`.

    Skips conversion when the immediate preceding char is an English letter or
    digit (e.g. `DNS(Domain Name System)`), since such parens behave like a
    function call / abbreviation expansion in the source language.

    :param text: line (or paren content when recursing) to process.
    :param aggressive: also convert parens with non-CJK content when their
        strong neighbors are Chinese.
    :param depth_offset: Chinese-paren depth inherited from the caller when
        recursing into paren content.
    """
    # Digits, operators, super/subscript digits → content looks like math.
    # (Hoisted out of the scan loop; was re.search with an inline pattern.)
    math_indicator = re.compile(r'[\d=+\-*/×÷^¹²³⁴-⁹⁰₀-₉]')
    n = len(text)
    out = []
    i = 0
    cn_depth = depth_offset
    while i < n:
        ch = text[i]
        if ch == '（':  # entering a pre-existing Chinese paren
            cn_depth += 1
            out.append(ch)
            i += 1
            continue
        if ch == '）':  # leaving one; clamp at 0 for unbalanced input
            cn_depth = max(0, cn_depth - 1)
            out.append(ch)
            i += 1
            continue
        if ch == '(':
            # Find the matching ')', honoring nested ASCII parens.
            depth = 1
            j = i + 1
            while j < n and depth > 0:
                if text[j] == '(':
                    depth += 1
                elif text[j] == ')':
                    depth -= 1
                    if depth == 0:
                        break
                j += 1
            if depth == 0:
                content = text[i + 1:j]
                left = find_strong_neighbor(text, i, -1)
                right = find_strong_neighbor(text, j, +1)
                immediate_prev = text[i - 1] if i > 0 else ''
                content_has_cjk = bool(CJK_RE.search(content))
                neighbor_chinese = (
                    is_chinese_context(left) or is_chinese_context(right)
                )
                content_has_math = bool(math_indicator.search(content))
                if cn_depth > 0:
                    # Nested inside Chinese paren — convert prose-like content
                    should_convert = content_has_cjk or not content_has_math
                elif is_ascii_alnum(immediate_prev):
                    # Attached to English identifier — leave ASCII
                    should_convert = content_has_cjk
                else:
                    should_convert = (
                        content_has_cjk
                        or (aggressive and neighbor_chinese)
                    )
                # Recurse into the content; if we converted this pair the
                # content is now inside one more level of Chinese paren.
                inner_offset = cn_depth + (1 if should_convert else 0)
                converted_content = convert_parens(
                    content, aggressive, inner_offset)
                open_p, close_p = ('（', '）') if should_convert else ('(', ')')
                out.append(open_p)
                out.append(converted_content)
                out.append(close_p)
                i = j + 1
                continue
            # Unmatched '(' — emit as-is and keep scanning.
        out.append(ch)
        i += 1
    return ''.join(out)
def convert_punct(text: str, aggressive: bool) -> str:
    """Convert ASCII , . : ; ? ! to full-width Chinese counterparts.

    Tracks both Chinese-paren depth and ASCII-paren depth:
    - Inside `（...）` → aggressive (those are Chinese parentheticals).
    - Inside `(...)` → conservative (the surviving ASCII parens were kept
      ASCII for a reason — likely English-attached or notation).

    :param text: a single logical line (paren conversion already applied).
    :param aggressive: convert even without adjacent Chinese context.
    """
    # Target chars for : ; ? ! — hoisted out of the loop (was rebuilt per char).
    fullwidth = {':': '：', ';': '；', '?': '？', '!': '！'}
    chars = list(text)
    n = len(chars)
    out = []
    cn_paren_depth = 0      # depth inside （ ）
    ascii_paren_depth = 0   # depth inside ( )
    for i, ch in enumerate(chars):
        if ch == '（':
            cn_paren_depth += 1
            out.append(ch)
            continue
        if ch == '）':
            cn_paren_depth = max(0, cn_paren_depth - 1)
            out.append(ch)
            continue
        if ch == '(':
            ascii_paren_depth += 1
            out.append(ch)
            continue
        if ch == ')':
            ascii_paren_depth = max(0, ascii_paren_depth - 1)
            out.append(ch)
            continue
        prev = chars[i - 1] if i > 0 else ''
        nxt = chars[i + 1] if i + 1 < n else ''
        in_cn_paren = cn_paren_depth > 0
        in_ascii_paren = ascii_paren_depth > 0
        if ch == ',':
            # Number-list separator: digit, [space,] digit → keep ASCII
            prev_is_digit = prev.isascii() and prev.isdigit()
            after_space_nxt = chars[i + 2] if (nxt == ' ' and i + 2 < n) else nxt
            nxt_is_digit = (
                after_space_nxt
                and after_space_nxt.isascii()
                and after_space_nxt.isdigit()
            )
            if prev_is_digit and nxt_is_digit:
                out.append(ch)
                continue
            if looks_like_math_context(text, i):
                out.append(ch)
                continue
            if is_chinese_context(prev) or is_chinese_context(nxt):
                out.append('，')
                continue
            if in_ascii_paren:
                out.append(ch)
                continue
            if in_cn_paren or aggressive:
                out.append('，')
                continue
        elif ch == '.':
            if is_ascii_alnum(nxt):
                pass  # decimal / file ext / version — keep ASCII
            elif is_chinese_context(prev):
                out.append('。')
                continue
            elif prev in WEAK:
                # `.` after a quote/bracket: look through the weak chars to
                # decide whether the sentence it closes is Chinese.
                left = find_strong_neighbor(text, i, -1)
                if is_chinese_context(left):
                    out.append('。')
                    continue
        elif ch in (':', ';', '?', '!'):
            if ch == ':':
                # Ratio / time notation like "1:8" or "12:34" → keep ASCII colon
                prev_is_digit = prev.isascii() and prev.isdigit()
                nxt_is_digit = nxt.isascii() and nxt.isdigit()
                if prev_is_digit and nxt_is_digit:
                    out.append(ch)
                    continue
            if ch in (':', ';') and looks_like_math_context(text, i):
                out.append(ch)
                continue
            left = find_strong_neighbor(text, i, -1)
            right = find_strong_neighbor(text, i, +1)
            if is_chinese_context(left) or is_chinese_context(right):
                out.append(fullwidth[ch])
                continue
            if in_ascii_paren:
                out.append(ch)
                continue
            if in_cn_paren or aggressive:
                out.append(fullwidth[ch])
                continue
        out.append(ch)
    return ''.join(out)
def line_is_chinese(line: str) -> bool:
    """Heuristic: a line with at least 3 CJK chars is 'Chinese-flavored'."""
    return sum(map(is_cjk, line)) >= 3
def process_text(text: str, aggressive: bool) -> str:
    """Run the paren pass first, then the punctuation pass, over *text*."""
    with_parens = convert_parens(text, aggressive)
    return convert_punct(with_parens, aggressive)
# Opaque spans masked before conversion, tried in this priority order:
# markdown image / link, bare URL, inline code.  Each match becomes one
# PLACEHOLDER char so paren matching can span it without looking inside.
OPAQUE_RE = re.compile(
    r'!?\[[^\]]*\]\([^\)]+\)'   # ![alt](url) or [text](url)
    r'|https?://[^\s\)\]\>]+'   # bare URL, stops at whitespace/closers
    r'|`[^`\n]+`'               # inline code span
)
# Whole fenced code block.  The capture group makes re.split keep the
# fences, which then sit at odd indices of the result list.
FENCE_RE = re.compile(r'(```[\s\S]*?```)')
def process_segment(segment: str, aggressive: bool) -> str:
    """Mask opaque blobs, run conversions, restore (recursively processing
    markdown link text)."""
    blobs = []

    def _mask(match: re.Match) -> str:
        # Remember the blob; leave a single placeholder char in its place.
        blobs.append(match.group(0))
        return PLACEHOLDER

    masked = OPAQUE_RE.sub(_mask, segment)
    converted = process_text(masked, aggressive)
    link_re = re.compile(r'(!?\[)([^\]]*)(\]\()([^\)]+)(\))')
    pieces = []
    blob_idx = 0
    for ch in converted:
        if ch != PLACEHOLDER:
            pieces.append(ch)
            continue
        token = blobs[blob_idx]
        blob_idx += 1
        link = link_re.match(token)
        if link:
            # Markdown link/image: convert only the visible text part,
            # never the URL.
            text_part = process_text(link.group(2), aggressive)
            token = (link.group(1) + text_part
                     + link.group(3) + link.group(4) + link.group(5))
        pieces.append(token)
    return ''.join(pieces)
def process_body_segment(segment: str) -> str:
    """Process a non-fenced body segment line by line; a line is converted
    aggressively exactly when it is Chinese-flavored."""
    return '\n'.join(
        process_segment(ln, aggressive=line_is_chinese(ln))
        for ln in segment.split('\n')
    )
def process_yaml_frontmatter(text: str) -> str:
    """Convert only quoted string values for description / title / summary keys.

    Leaves list/array values like `tags: [...]` alone."""
    key_value_re = re.compile(
        r'^(\s*(?:description|title|summary):\s*")([^"\n]*)(")',
        re.MULTILINE,
    )

    def _convert(match: re.Match) -> str:
        value = match.group(2)
        converted = process_segment(value, aggressive=line_is_chinese(value))
        return match.group(1) + converted + match.group(3)

    return key_value_re.sub(_convert, text)
def process_markdown(content: str) -> str:
    """Convert a whole markdown document: selected front-matter values plus
    all prose outside fenced code blocks."""
    front = ''
    body = content
    if content.startswith(('---\n', '---\r\n')):
        fm = re.match(r'(---\r?\n[\s\S]*?\r?\n---\r?\n)', content)
        if fm:
            front = process_yaml_frontmatter(fm.group(1))
            body = content[fm.end():]
    pieces = []
    # FENCE_RE.split keeps fenced blocks at odd indices — pass them through.
    for idx, chunk in enumerate(FENCE_RE.split(body)):
        pieces.append(chunk if idx % 2 else process_body_segment(chunk))
    return front + ''.join(pieces)
def main() -> None:
    """CLI entry point: convert each file named on argv in place, printing
    per-file status and a final count of files rewritten."""
    updated = []
    for path in sys.argv[1:]:
        # newline='' preserves the file's original line endings round-trip.
        with open(path, encoding='utf-8', newline='') as fh:
            original = fh.read()
        converted = process_markdown(original)
        if converted == original:
            print(f'unchanged {path}')
            continue
        with open(path, 'w', encoding='utf-8', newline='') as fh:
            fh.write(converted)
        updated.append(path)
        print(f'updated {path}')
    print(f'\nTotal updated: {len(updated)}')


if __name__ == '__main__':
    main()