"""Convert ASCII punctuation to Chinese punctuation in CJK context.

Strategy:
  - Skip fenced code blocks entirely.
  - Mask out inline code, markdown links and bare URLs as opaque blobs (one
    placeholder character from the Private Use Area each), so that paren
    matching can span them.
  - Process YAML front matter only by converting quoted `description:` /
    `title:` / `summary:` string values; `tags: [...]` arrays are left alone.
  - For each prose line in the body, decide whether it is "Chinese-flavored"
    (>= 3 CJK chars).  On Chinese lines, convert ASCII `, . : ; ? ! ( )` to
    their Chinese (full-width / ideographic) counterparts where it makes
    sense.

Preservation rules (kept as ASCII):
  - Number lists / decimals: `1,234`, `RS(255, 239)`, `1:8` (ratio).
  - Math context: a comma/colon between two math expressions (one of
    `∇ ∂ √ ≡ ≈ ± ¹²³⁴⁵⁶⁷⁸⁹⁰ ₀₁₂₃₄₅₆₇₈₉` or Greek letters within a 20-char
    window on each side).
  - Nested inside an existing Chinese (full-width) paren: only prose-like
    content is converted (no math/digit indicator), so in a full-width pair
    wrapping `GF(2⁸) 列混合` the inner `(2⁸)` stays ASCII.
  - English-attached parens: `DNS(Domain Name System)` (the `(` immediately
    follows an English letter/digit) stays ASCII.
  - Inside ASCII parens that were kept ASCII (e.g. `cookie(Ch8, Ch9)`),
    inner punctuation stays ASCII.

Usage:
    python scripts/cn-punct.py path/to/file1.md path/to/file2.md ...

This is the one-shot conversion script used in the 2026-05 blog cleanup.
It is deliberately conservative; if you re-run it on already-converted files
it should be a near no-op.

Maintenance note: every full-width (U+FFxx) mark and the private-use
placeholder are written as `\\u` escapes on purpose.  Width-folding or
PUA-stripping tools would otherwise turn them into their ASCII twins (or into
nothing) and silently change what this script does.
"""
import re
import sys

# Chinese punctuation targets, spelled as escapes (see the module docstring).
CN_COMMA = '\uff0c'        # full-width comma
CN_COLON = '\uff1a'        # full-width colon
CN_SEMICOLON = '\uff1b'    # full-width semicolon
CN_QUESTION = '\uff1f'     # full-width question mark
CN_EXCLAMATION = '\uff01'  # full-width exclamation mark
CN_LPAREN = '\uff08'       # full-width left parenthesis
CN_RPAREN = '\uff09'       # full-width right parenthesis
CN_PERIOD = '。'           # ideographic full stop

CJK_RE = re.compile(r'[一-鿿]')
PLACEHOLDER = '\ue000'  # private-use char for opaque blobs

# Walk past these to find the "real" neighbor of a punctuation mark.
WEAK = set(' \t*_"\'`)]}>“”‘’')

# Punctuation that signals Chinese context for adjacent ASCII punctuation.
CHINESE_PUNCT = set(
    CN_COMMA + CN_PERIOD + CN_COLON + CN_SEMICOLON
    + CN_QUESTION + CN_EXCLAMATION + '、' + CN_LPAREN + CN_RPAREN
    + '【】「」『』'
    + '“”‘’…—《》〈〉'
)

# Math-specific characters: superscript / subscript digits, operators, Greek
# letters that almost only appear in formulas.  Used to detect comma/colon
# sitting between two math expressions (where it must stay ASCII).
MATH_CHARS = set(
    '∇∂√≡≈±'
    '¹²³⁴⁵⁶⁷⁸⁹⁰'
    '₀₁₂₃₄₅₆₇₈₉'
    'αβγδεζηθικ'
    'λμνξπρστυφ'
    'χψω'
    'ΓΔΘΛΞΠΣΦΨΩ'
    '∞∑∏∫·×÷'
)

# Digits / operators that mark parenthesised content as notation, not prose.
_MATH_CONTENT_RE = re.compile(r'[\d=+\-*/×÷^¹²³⁴-⁹⁰₀-₉]')

# ASCII mark -> Chinese mark for the "simple" punctuation handled together.
_SIMPLE_MARKS = {
    ':': CN_COLON,
    ';': CN_SEMICOLON,
    '?': CN_QUESTION,
    '!': CN_EXCLAMATION,
}

# Splits a stashed markdown link/image into bracket, text, "](", target, ")".
_LINK_RE = re.compile(r'(!?\[)([^\]]*)(\]\()([^\)]+)(\))')


def is_cjk(ch: str) -> bool:
    """Return True if `ch` is a non-empty string starting with a CJK char."""
    return bool(ch and CJK_RE.match(ch))


def is_chinese_context(ch: str) -> bool:
    """Return True if `ch` is a CJK char or a Chinese punctuation mark."""
    return is_cjk(ch) or (ch in CHINESE_PUNCT)


def is_ascii_alnum(ch: str) -> bool:
    """Return True if `ch` is a single ASCII letter or digit ('' -> False)."""
    return bool(ch) and ord(ch) < 128 and ch.isalnum()


def find_strong_neighbor(text: str, idx: int, direction: int) -> str:
    """Walk past WEAK chars to find the nearest 'strong' character.

    `direction` is the step added to `idx` (-1 looks left, +1 looks right).
    Returns '' if the text boundary is reached first.
    """
    n = len(text)
    i = idx + direction
    while 0 <= i < n:
        ch = text[i]
        if ch not in WEAK:
            return ch
        i += direction
    return ''


def looks_like_math_context(text: str, idx: int, window: int = 20) -> bool:
    """Heuristic: the mark at `idx` sits between two math expressions.

    True when both the `window` chars before `idx` and the `window` chars
    after it contain at least one character from MATH_CHARS.
    """
    left_window = text[max(0, idx - window):idx]
    right_window = text[idx + 1:idx + 1 + window]
    return (
        any(c in MATH_CHARS for c in left_window)
        and any(c in MATH_CHARS for c in right_window)
    )


def _match_ascii_paren(text: str, start: int) -> int:
    """Return the index of the `)` closing the `(` at `start`, or -1."""
    depth = 0
    for j in range(start, len(text)):
        c = text[j]
        if c == '(':
            depth += 1
        elif c == ')':
            depth -= 1
            if depth == 0:
                return j
    return -1


def convert_parens(text: str, aggressive: bool, depth_offset: int = 0) -> str:
    """Convert ASCII `(` `)` pairs to full-width parens in Chinese context.

    Tracks Chinese-paren depth (seeded with `depth_offset` for recursive
    calls) so that a paren nested inside an outer Chinese paren only converts
    when its content is prose-like: it contains CJK, or it contains no
    digit/operator.  This preserves notation such as the inner `(2⁸)` of
    `GF(2⁸)` inside a converted outer pair.

    At depth 0, a paren whose immediately preceding char is an ASCII letter
    or digit (e.g. `DNS(Domain Name System)`) converts only if its content
    contains CJK, since such parens behave like a function call /
    abbreviation expansion.  Any other paren converts if its content contains
    CJK, or if `aggressive` is set and a strong neighbor on either side is
    Chinese.  Unbalanced `(` are copied through unchanged.
    """
    n = len(text)
    out = []
    i = 0
    cn_depth = depth_offset
    while i < n:
        ch = text[i]
        if ch == CN_LPAREN:
            cn_depth += 1
        elif ch == CN_RPAREN:
            cn_depth = max(0, cn_depth - 1)
        elif ch == '(':
            j = _match_ascii_paren(text, i)
            if j != -1:
                content = text[i + 1:j]
                immediate_prev = text[i - 1] if i > 0 else ''
                content_has_cjk = bool(CJK_RE.search(content))

                if cn_depth > 0:
                    # Nested inside Chinese paren: convert prose-like content.
                    content_has_math = bool(_MATH_CONTENT_RE.search(content))
                    should_convert = content_has_cjk or not content_has_math
                elif is_ascii_alnum(immediate_prev):
                    # Attached to an English identifier: leave ASCII unless
                    # the content itself is Chinese.
                    should_convert = content_has_cjk
                else:
                    left = find_strong_neighbor(text, i, -1)
                    right = find_strong_neighbor(text, j, +1)
                    neighbor_chinese = (
                        is_chinese_context(left) or is_chinese_context(right)
                    )
                    should_convert = (
                        content_has_cjk or (aggressive and neighbor_chinese)
                    )

                # The content is one level deeper only if this pair becomes
                # a Chinese paren.
                inner_offset = cn_depth + (1 if should_convert else 0)
                inner = convert_parens(content, aggressive, inner_offset)
                if should_convert:
                    out.extend((CN_LPAREN, inner, CN_RPAREN))
                else:
                    out.extend(('(', inner, ')'))
                i = j + 1
                continue
        out.append(ch)
        i += 1
    return ''.join(out)


def convert_punct(text: str, aggressive: bool) -> str:
    """Convert ASCII `, . : ; ? !` to their Chinese counterparts.

    Expects parens to have been settled by `convert_parens` already and
    tracks both paren depths while scanning:
      - Inside full-width parens: aggressive (Chinese parentheticals).
      - Inside ASCII parens: conservative (the surviving ASCII parens were
        kept ASCII for a reason, likely English-attached or notation).

    A mark directly touching Chinese context is converted regardless of
    depth; otherwise `aggressive` (or being inside a full-width paren)
    decides, unless an enclosing ASCII paren vetoes it.  Periods are only
    converted when they follow Chinese context and are not followed by an
    ASCII letter/digit (decimal, file extension, version).
    """
    chars = list(text)
    n = len(chars)
    out = []
    cn_paren_depth = 0
    ascii_paren_depth = 0

    for i, ch in enumerate(chars):
        if ch == CN_LPAREN:
            cn_paren_depth += 1
            out.append(ch)
            continue
        if ch == CN_RPAREN:
            cn_paren_depth = max(0, cn_paren_depth - 1)
            out.append(ch)
            continue
        if ch == '(':
            ascii_paren_depth += 1
            out.append(ch)
            continue
        if ch == ')':
            ascii_paren_depth = max(0, ascii_paren_depth - 1)
            out.append(ch)
            continue

        prev = chars[i - 1] if i > 0 else ''
        nxt = chars[i + 1] if i + 1 < n else ''
        in_cn_paren = cn_paren_depth > 0
        in_ascii_paren = ascii_paren_depth > 0

        if ch == ',':
            # Number-list separator: digit, [space,] digit -> keep ASCII.
            prev_is_digit = prev.isascii() and prev.isdigit()
            after_space_nxt = (
                chars[i + 2] if (nxt == ' ' and i + 2 < n) else nxt
            )
            nxt_is_digit = bool(
                after_space_nxt
                and after_space_nxt.isascii()
                and after_space_nxt.isdigit()
            )
            if prev_is_digit and nxt_is_digit:
                out.append(ch)
                continue
            if looks_like_math_context(text, i):
                out.append(ch)
                continue
            if is_chinese_context(prev) or is_chinese_context(nxt):
                out.append(CN_COMMA)
                continue
            if in_ascii_paren:
                out.append(ch)
                continue
            if in_cn_paren or aggressive:
                out.append(CN_COMMA)
                continue

        elif ch == '.':
            if is_ascii_alnum(nxt):
                pass  # decimal / file ext / version
            elif is_chinese_context(prev):
                out.append(CN_PERIOD)
                continue
            elif prev in WEAK:
                # e.g. a closing quote or emphasis marker before the period.
                left = find_strong_neighbor(text, i, -1)
                if is_chinese_context(left):
                    out.append(CN_PERIOD)
                    continue

        elif ch in _SIMPLE_MARKS:
            if ch == ':':
                # Ratio / time notation like "1:8" or "12:34" -> keep ASCII.
                prev_is_digit = prev.isascii() and prev.isdigit()
                nxt_is_digit = nxt.isascii() and nxt.isdigit()
                if prev_is_digit and nxt_is_digit:
                    out.append(ch)
                    continue
            if ch in (':', ';') and looks_like_math_context(text, i):
                out.append(ch)
                continue
            left = find_strong_neighbor(text, i, -1)
            right = find_strong_neighbor(text, i, +1)
            if is_chinese_context(left) or is_chinese_context(right):
                out.append(_SIMPLE_MARKS[ch])
                continue
            if in_ascii_paren:
                out.append(ch)
                continue
            if in_cn_paren or aggressive:
                out.append(_SIMPLE_MARKS[ch])
                continue

        # Anything not converted above is copied through unchanged.
        out.append(ch)

    return ''.join(out)


def line_is_chinese(line: str) -> bool:
    """Heuristic: does this line have enough CJK (>= 3 chars) to call it
    'Chinese-flavored'?"""
    cjk_count = sum(1 for c in line if is_cjk(c))
    return cjk_count >= 3


def process_text(text: str, aggressive: bool) -> str:
    """Run paren conversion first, then punctuation conversion, on `text`."""
    return convert_punct(convert_parens(text, aggressive), aggressive)


# Markdown image, markdown link, bare URL, or inline code (in this priority)
OPAQUE_RE = re.compile(
    r'!?\[[^\]]*\]\([^\)]+\)'
    r'|https?://[^\s\)\]\>]+'
    r'|`[^`\n]+`'
)

FENCE_RE = re.compile(r'(```[\s\S]*?```)')


def _choose_placeholder(text: str) -> str:
    """Return a private-use char that does not occur in `text`.

    Prefers PLACEHOLDER; falls back to another char of the U+E000..U+F8FF
    range so that a pre-existing private-use char in the input cannot be
    mistaken for a masked blob when restoring.
    """
    if PLACEHOLDER and PLACEHOLDER not in text:
        return PLACEHOLDER
    for codepoint in range(0xE000, 0xF900):
        candidate = chr(codepoint)
        if candidate not in text:
            return candidate
    raise ValueError('no free Private Use Area character left for masking')


def process_segment(segment: str, aggressive: bool) -> str:
    """Mask opaque blobs, run conversions, then restore the blobs.

    Each match of OPAQUE_RE is replaced by a single placeholder char before
    conversion and put back afterwards in order.  Markdown link/image tokens
    get their bracketed text converted as well; their target, inline code and
    bare URLs are restored verbatim.
    """
    placeholder = _choose_placeholder(segment)
    saved = []

    def stash(m: re.Match) -> str:
        saved.append(m.group(0))
        return placeholder

    masked = OPAQUE_RE.sub(stash, segment)
    converted = process_text(masked, aggressive)

    # Conversion maps characters one-for-one, so placeholders come back in
    # the same number and order as the stashed tokens.
    out = []
    idx = 0
    for ch in converted:
        if ch != placeholder:
            out.append(ch)
            continue
        tok = saved[idx]
        idx += 1
        m = _LINK_RE.match(tok)
        if m:
            inside = process_text(m.group(2), aggressive)
            tok = m.group(1) + inside + m.group(3) + m.group(4) + m.group(5)
        out.append(tok)
    return ''.join(out)


def process_body_segment(segment: str) -> str:
    """Process a non-fenced body segment line by line, choosing aggressive
    mode based on whether each line is Chinese-flavored."""
    return '\n'.join(
        process_segment(line, aggressive=line_is_chinese(line))
        for line in segment.split('\n')
    )


def process_yaml_frontmatter(text: str) -> str:
    """Convert only double-quoted string values of the description / title /
    summary keys.

    Everything else in the front matter, including list/array values like
    `tags: [...]`, is left alone.
    """
    def replace_value(m: re.Match) -> str:
        prefix, value, suffix = m.group(1), m.group(2), m.group(3)
        converted = process_segment(value, aggressive=line_is_chinese(value))
        return prefix + converted + suffix

    return re.sub(
        r'^(\s*(?:description|title|summary):\s*")([^"\n]*)(")',
        replace_value,
        text,
        flags=re.MULTILINE,
    )


def process_markdown(content: str) -> str:
    """Convert a whole markdown document and return the new text.

    A leading `---` ... `---` block is treated as YAML front matter and
    handled by `process_yaml_frontmatter`.  The remaining body is split on
    triple-backtick fences: fenced blocks are copied verbatim and everything
    else goes through `process_body_segment`.
    """
    front = ''
    body = content
    if content.startswith(('---\n', '---\r\n')):
        m = re.match(r'(---\r?\n[\s\S]*?\r?\n---\r?\n)', content)
        if m:
            front = process_yaml_frontmatter(m.group(1))
            body = content[m.end():]

    # FENCE_RE has one capturing group, so odd indices are the fenced blocks.
    parts = FENCE_RE.split(body)
    out = []
    for i, part in enumerate(parts):
        if i % 2 == 1:
            out.append(part)
        else:
            out.append(process_body_segment(part))
    return front + ''.join(out)


def main() -> None:
    """Convert every file named on the command line in place and report
    which ones changed."""
    changed = []
    for path in sys.argv[1:]:
        # newline='' keeps the file's own line endings on read and write.
        with open(path, encoding='utf-8', newline='') as f:
            content = f.read()
        new_content = process_markdown(content)
        if new_content != content:
            with open(path, 'w', encoding='utf-8', newline='') as f:
                f.write(new_content)
            changed.append(path)
            print(f'updated   {path}')
        else:
            print(f'unchanged {path}')
    print(f'\nTotal updated: {len(changed)}')


if __name__ == '__main__':
    main()