From f04c2b7c14d1062d9cd74cd88a0191709b2810e8 Mon Sep 17 00:00:00 2001 From: Trevin Chow Date: Sun, 19 Apr 2026 02:49:17 -0700 Subject: [PATCH] fix(annotations): wrap CJK text at character boundaries in export renderer renderText split each line on whitespace, which works for Latin text but leaves CJK strings as a single unbreakable token because CJK scripts have no word-separating whitespace. Result: CJK annotation text overflows the clipped annotation box even though the editor's HTML preview wraps it correctly via CSS word-break: break-word. Replace the ad-hoc whitespace split with a tokenizeForWrap helper that emits each CJK character (Hiragana, Katakana, Hangul Syllables, CJK Unified Ideographs + Extension A, and CJK Compatibility Ideographs) as its own token, while keeping Latin words + whitespace intact. The existing width-measurement wrap loop then handles CJK per-character, matching the editor's behavior. Closes #449 --- src/lib/exporter/annotationRenderer.ts | 41 +++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/src/lib/exporter/annotationRenderer.ts b/src/lib/exporter/annotationRenderer.ts index b0c4948..0b895a0 100644 --- a/src/lib/exporter/annotationRenderer.ts +++ b/src/lib/exporter/annotationRenderer.ts @@ -10,6 +10,39 @@ import { let blurScratchCanvas: HTMLCanvasElement | null = null; let blurScratchCtx: CanvasRenderingContext2D | null = null; +// Matches a single code point in Hiragana, Katakana, CJK Unified Ideographs +// Extension A, CJK Unified Ideographs, Hangul Syllables, or CJK Compatibility +// Ideographs. Used to split CJK text at character boundaries during wrap, +// since CJK scripts have no word-separating whitespace. 
/**
 * Matches a single code point that may be wrapped on its own.
 *
 * Ranges, in order:
 * - U+3040–U+309F   Hiragana
 * - U+30A0–U+30FF   Katakana
 * - U+3400–U+4DBF   CJK Unified Ideographs Extension A
 * - U+4E00–U+9FFF   CJK Unified Ideographs
 * - U+AC00–U+D7AF   Hangul Syllables
 * - U+F900–U+FAFF   CJK Compatibility Ideographs
 * - U+20000–U+2FA1F supplementary-plane ideographs (Extension B onward and
 *                   the CJK Compatibility Ideographs Supplement)
 * - U+30000–U+323AF tertiary-plane ideographs (Extensions G and H)
 *
 * The last two ranges are written with `\u{…}` escapes, which require the
 * `u` flag. Without them, ideographs outside the BMP (for example U+20BB7)
 * are treated as Latin-like text and a run of them becomes one unbreakable
 * token.
 */
const CJK_CHAR =
	/[\u3040-\u309f\u30a0-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uac00-\ud7af\uf900-\ufaff\u{20000}-\u{2fa1f}\u{30000}-\u{323af}]/u;

/**
 * Splits one line of text into the units a width-based wrap loop may break
 * between.
 *
 * - Every code point matched by `CJK_CHAR` becomes its own token.
 * - Everything else is accumulated and then split on whitespace runs, with
 *   each whitespace run kept as a separate token. Empty strings produced by
 *   the split are dropped.
 *
 * Concatenating the returned tokens reproduces `line` exactly.
 *
 * @param line - A single line of text.
 * @returns The tokens in source order; an empty array for an empty string.
 */
function tokenizeForWrap(line: string): string[] {
	const tokens: string[] = [];
	let buffer = "";

	// Emits the pending non-CJK run as word / whitespace tokens.
	const flushBuffer = (): void => {
		if (!buffer) {
			return;
		}
		// The capture group keeps the whitespace separators in the result.
		// Pushing in a loop avoids spreading a large array into push().
		for (const piece of buffer.split(/(\s+)/)) {
			if (piece.length > 0) {
				tokens.push(piece);
			}
		}
		buffer = "";
	};

	// String iteration yields whole code points, so a surrogate pair is
	// tested and emitted as one character and never split in half.
	for (const ch of line) {
		if (CJK_CHAR.test(ch)) {
			flushBuffer();
			tokens.push(ch);
		} else {
			buffer += ch;
		}
	}
	flushBuffer();
	return tokens;
}

// ---------------------------------------------------------------------------
// Remaining diff content from the patch, carried over verbatim (one diff line
// per comment line). These hunks touch definitions that are only partially
// visible here, so they are preserved rather than rewritten.
//
//  // SVG path data for each arrow direction
//  const ARROW_PATHS: Record = {
//  	up: ["M 50 20 L 50 80", "M 50 20 L 35 35", "M 50 20 L 65 35"],
// @@ -249,13 +282,13 @@ function renderText(
//  			lines.push("");
//  			continue;
//  		}
// -		const words = rawLine.split(/(\s+)/);
// +		const tokens = tokenizeForWrap(rawLine);
//  		let current = "";
// -		for (const word of words) {
// -			const test = current + word;
// +		for (const token of tokens) {
// +			const test = current + token;
//  			if (current && ctx.measureText(test).width > availableWidth) {
//  				lines.push(current);
// -				current = word.trimStart();
// +				current = token.trimStart();
//  			} else {
//  				current = test;
//  			}
// ---------------------------------------------------------------------------