From dd622f83c1e410e987587cab7568ac6bc06b5f74 Mon Sep 17 00:00:00 2001
From: Trevin Chow <trevin@trevinchow.com>
Date: Sun, 19 Apr 2026 10:05:48 -0700
Subject: [PATCH] fix(annotations): use Unicode script properties for CJK
 detection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address review feedback on #471 from @coderabbitai. The BMP-only
codepoint ranges missed two classes of characters:

- Non-BMP Han extensions (CJK Unified Ideographs Extension B, C, D, E, F)
  such as 𠀀. A long string of Extension-B characters would still be
  tokenized as a single unbreakable unit and overflow the box.
- Halfwidth Katakana (U+FF65-U+FF9F) such as ｶ. Same failure mode.

Switch to Unicode script property escapes (\p{Script=Han},
\p{Script=Hiragana}, \p{Script=Katakana}, \p{Script=Hangul}) which
cover these cases without enumerating ranges. tsconfig target is ES2020;
property escapes require ES2018+ so this is safe.

Verified coverage: 漢 あ ア 가 𠀀 ｶ all match; A and digits do not.
---
 src/lib/exporter/annotationRenderer.ts | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/lib/exporter/annotationRenderer.ts b/src/lib/exporter/annotationRenderer.ts
index 0b895a0..c0d5657 100644
--- a/src/lib/exporter/annotationRenderer.ts
+++ b/src/lib/exporter/annotationRenderer.ts
@@ -10,12 +10,12 @@ import {
 let blurScratchCanvas: HTMLCanvasElement | null = null;
 let blurScratchCtx: CanvasRenderingContext2D | null = null;
 
-// Matches a single code point in Hiragana, Katakana, CJK Unified Ideographs
-// Extension A, CJK Unified Ideographs, Hangul Syllables, or CJK Compatibility
-// Ideographs. Used to split CJK text at character boundaries during wrap,
-// since CJK scripts have no word-separating whitespace.
-const CJK_CHAR =
-	/[\u3040-\u309f\u30a0-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uac00-\ud7af\uf900-\ufaff]/u;
+// Matches a single code point whose script is Han (including the non-BMP
+// Extensions B-F), Hiragana, Katakana (including halfwidth forms), or
+// Hangul. Used to split CJK text at character boundaries during wrap,
+// since CJK scripts have no word-separating whitespace. Unicode script
+// property escapes require ES2018+; tsconfig target is ES2020.
+const CJK_CHAR = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}]/u;
 
 function tokenizeForWrap(line: string): string[] {
 	// Split Latin text on whitespace (preserving the whitespace as its own token,