fix(annotations): wrap CJK text at character boundaries in export renderer

renderText split each line on whitespace, which works for Latin text
but leaves most CJK strings as a single unbreakable token: Chinese and
Japanese are written without word-separating whitespace, and a Korean
word can still be wider than the box. Result: CJK annotation text
overflows the clipped annotation box even though the editor's HTML
preview wraps it correctly via CSS word-break: break-word.

Replace the ad-hoc whitespace split with a tokenizeForWrap helper
that emits each CJK character (Hiragana, Katakana, Hangul Syllables,
CJK Unified Ideographs + Extension A, and CJK Compatibility
Ideographs) as its own token, while keeping Latin words + whitespace
intact. The existing width-measurement wrap loop then handles CJK
per-character, matching the editor's behavior.

Closes #449
This commit is contained in:
Trevin Chow
2026-04-19 02:49:17 -07:00
parent fd6a0778fb
commit f04c2b7c14
+37 -4
View File
@@ -10,6 +10,39 @@ import {
let blurScratchCanvas: HTMLCanvasElement | null = null;
let blurScratchCtx: CanvasRenderingContext2D | null = null;
// Single-code-point matcher for the scripts that get per-character wrapping:
// Hiragana (U+3040-309F), Katakana (U+30A0-30FF), CJK Unified Ideographs
// Extension A (U+3400-4DBF), CJK Unified Ideographs (U+4E00-9FFF), Hangul
// Syllables (U+AC00-D7AF) and CJK Compatibility Ideographs (U+F900-FAFF).
// The regex carries no `g`/`y` flag, so repeated `.test()` calls are stateless.
const CJK_CHAR =
  /[\u3040-\u309f\u30a0-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uac00-\ud7af\uf900-\ufaff]/u;

/**
 * Breaks one line of text into the units a width-based wrap loop may break
 * between.
 *
 * - Every code point matched by `CJK_CHAR` becomes a token of its own, so a
 *   run of CJK text can be broken between any two characters.
 * - Everything else is grouped into runs and split on whitespace; each
 *   whitespace run is kept as a separate token and empty pieces are dropped.
 *
 * Concatenating the returned tokens reproduces `line` exactly. Per the
 * accompanying change description, this is meant to mirror the editor's CSS
 * `word-break: break-word` behavior for CJK content.
 *
 * @param line A single line of text (no wrapping applied yet).
 * @returns The tokens in source order; an empty array for an empty line.
 */
function tokenizeForWrap(line: string): string[] {
  const result: string[] = [];
  let pending = "";

  // Emits the accumulated non-CJK run as word / whitespace tokens.
  const emitPending = (): void => {
    if (pending === "") {
      return;
    }
    for (const piece of pending.split(/(\s+)/)) {
      // The capturing split yields "" at the edges when the run starts or
      // ends with whitespace; those carry no text and are skipped.
      if (piece.length > 0) {
        result.push(piece);
      }
    }
    pending = "";
  };

  // Iterating the string directly walks it by code point, so a surrogate
  // pair is never split in half.
  for (const codePoint of line) {
    if (!CJK_CHAR.test(codePoint)) {
      pending += codePoint;
      continue;
    }
    emitPending();
    result.push(codePoint);
  }

  emitPending();
  return result;
}
// SVG path data for each arrow direction
const ARROW_PATHS: Record<ArrowDirection, string[]> = {
up: ["M 50 20 L 50 80", "M 50 20 L 35 35", "M 50 20 L 65 35"],
@@ -249,13 +282,13 @@ function renderText(
lines.push("");
continue;
}
const words = rawLine.split(/(\s+)/);
const tokens = tokenizeForWrap(rawLine);
let current = "";
for (const word of words) {
const test = current + word;
for (const token of tokens) {
const test = current + token;
if (current && ctx.measureText(test).width > availableWidth) {
lines.push(current);
current = word.trimStart();
current = token.trimStart();
} else {
current = test;
}