fix(annotations): wrap CJK text at character boundaries in export renderer

renderText split each line on whitespace, which works for Latin text but leaves CJK strings as a single unbreakable token because CJK scripts have no word-separating whitespace. Result: CJK annotation text overflows the clipped annotation box even though the editor's HTML preview wraps it correctly via CSS word-break: break-word. Replace the ad-hoc whitespace split with a tokenizeForWrap helper that emits each CJK character (Hiragana, Katakana, Hangul Syllables, CJK Unified Ideographs + Extension A, and CJK Compatibility Ideographs) as its own token, while keeping Latin words + whitespace intact. The existing width-measurement wrap loop then handles CJK per-character, matching the editor's behavior. Closes #449
2026-04-19 02:49:17 -07:00
parent fd6a0778fb
commit f04c2b7c14
1 changed files with 37 additions and 4 deletions
@@ -10,6 +10,39 @@ import {
 let blurScratchCanvas: HTMLCanvasElement | null = null;
 let blurScratchCtx: CanvasRenderingContext2D | null = null;

+// Matches a single code point in Hiragana, Katakana, CJK Unified Ideographs
+// Extension A, CJK Unified Ideographs, Hangul Syllables, or CJK Compatibility
+// Ideographs. Used to split CJK text at character boundaries during wrap,
+// since CJK scripts have no word-separating whitespace.
+const CJK_CHAR =
+	/[\u3040-\u309f\u30a0-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uac00-\ud7af\uf900-\ufaff]/u;
+
+function tokenizeForWrap(line: string): string[] {
+	// Split Latin text on whitespace (preserving the whitespace as its own token,
+	// matching the original behavior), and split CJK runs into individual
+	// characters so each one becomes a breakable unit. This mirrors the editor's
+	// CSS `word-break: break-word` handling for CJK content.
+	const tokens: string[] = [];
+	let buffer = "";
+	const chars = Array.from(line);
+	const flushBuffer = () => {
+		if (buffer) {
+			tokens.push(...buffer.split(/(\s+)/).filter((s) => s.length > 0));
+			buffer = "";
+		}
+	};
+	for (const ch of chars) {
+		if (CJK_CHAR.test(ch)) {
+			flushBuffer();
+			tokens.push(ch);
+		} else {
+			buffer += ch;
+		}
+	}
+	flushBuffer();
+	return tokens;
+}
+
 // SVG path data for each arrow direction
 const ARROW_PATHS: Record<ArrowDirection, string[]> = {
 	up: ["M 50 20 L 50 80", "M 50 20 L 35 35", "M 50 20 L 65 35"],
@@ -249,13 +282,13 @@ function renderText(
 			lines.push("");
 			continue;
 		}
-		const words = rawLine.split(/(\s+)/);
+		const tokens = tokenizeForWrap(rawLine);
 		let current = "";
-		for (const word of words) {
-			const test = current + word;
+		for (const token of tokens) {
+			const test = current + token;
 			if (current && ctx.measureText(test).width > availableWidth) {
 				lines.push(current);
-				current = word.trimStart();
+				current = token.trimStart();
 			} else {
 				current = test;
 			}