395 lines
11 KiB
TypeScript
395 lines
11 KiB
TypeScript
import type {
|
|
GuideAction,
|
|
GuideEvent,
|
|
GuideSession,
|
|
GuideStepCandidate,
|
|
GuideTargetRole,
|
|
OcrBlock,
|
|
} from "./contracts";
|
|
|
|
/** Fallback for `maxNearbyText`: how many distinct nearby text snippets a candidate keeps. */
const DEFAULT_MAX_NEARBY_TEXT = 5;

/** Fallback for `clickRadius`: the distance from the event point at which proximity scoring reaches zero. */
const DEFAULT_CLICK_RADIUS = 0.18;

/** Minimum score a ranked text region needs before it is accepted as the event's target. */
const TARGET_SCORE_THRESHOLD = 0.32;
|
|
|
|
/**
 * A piece of recognized text together with the box it occupies.
 * Built either from a single OCR block or by merging several blocks into a phrase.
 */
interface TextRegion {
  /** Text content of the region. */
  text: string;
  /** Recognition confidence as reported by OCR (averaged for merged phrases). */
  confidence: number;
  /** Bounding box, using the same shape as `OcrBlock["box"]`. */
  box: OcrBlock["box"];
}
|
|
|
|
/** Tuning knobs accepted by `buildGuideStepCandidates`. */
export interface BuildGuideStepCandidatesOptions {
  /** Maximum number of distinct nearby text entries per candidate; values below 1 are raised to 1. */
  maxNearbyText?: number;
  /** Distance at which click proximity stops contributing to a region's score; values below 0.01 are raised to 0.01. */
  clickRadius?: number;
}
|
|
|
|
/**
 * Turns every recorded event of a session into a guide step candidate.
 *
 * Events are processed in ascending `timeMs` order. For each event the snapshot
 * registered under the event's id (if any) supplies the OCR blocks, which are
 * ranked against the event; the best-ranked region that reaches
 * `TARGET_SCORE_THRESHOLD` becomes the target region.
 *
 * @param session Recorded session providing `events`, `snapshots` and `ocrBlocks`.
 * @param options Optional limits; see `BuildGuideStepCandidatesOptions`.
 * @returns One candidate per event, ordered by event time.
 */
export function buildGuideStepCandidates(
  session: GuideSession,
  options: BuildGuideStepCandidatesOptions = {},
): GuideStepCandidate[] {
  const nearbyLimit = Math.max(1, options.maxNearbyText ?? DEFAULT_MAX_NEARBY_TEXT);
  const radius = Math.max(0.01, options.clickRadius ?? DEFAULT_CLICK_RADIUS);

  // When several snapshots share an event id, the last one in the list wins.
  const snapshotForEvent = new Map(
    session.snapshots.map((snapshot) => [snapshot.eventId, snapshot]),
  );
  const blocksForSnapshot = groupOcrBlocksBySnapshot(session.ocrBlocks);

  // Sort a copy so the session's own event array is left untouched.
  const chronological = [...session.events].sort((a, b) => a.timeMs - b.timeMs);

  const candidates: GuideStepCandidate[] = [];
  for (const event of chronological) {
    const snapshot = snapshotForEvent.get(event.id);
    const blocks = snapshot ? (blocksForSnapshot.get(snapshot.id) ?? []) : [];
    const ranked = rankTextRegionsForEvent(event, blocks, radius);

    // `ranked` is sorted best-first, so the first entry above the threshold is the best one.
    const target = ranked.find((entry) => entry.score >= TARGET_SCORE_THRESHOLD)?.region;
    const nearbyText = uniqueText(ranked.map((entry) => entry.region.text)).slice(0, nearbyLimit);

    const label = normalizeEventLabelForTarget(event);
    const point = getEventPoint(event);
    const ocrText = normalizeText(target?.text);
    // With a pointer position the on-screen text is preferred; without one the event label is.
    const targetText = point ? (ocrText ?? label) : (label ?? ocrText);

    candidates.push({
      id: `candidate-${event.id}`,
      eventId: event.id,
      snapshotId: snapshot?.id,
      timeMs: event.timeMs,
      action: inferAction(event),
      targetText,
      targetRole: inferTargetRole(targetText),
      position: point ? describeEventPosition(point) : undefined,
      nearbyText,
      confidence: calculateCandidateConfidence(event, target, ranked[0]?.score),
    });
  }
  return candidates;
}
|
|
|
|
/**
 * Buckets OCR blocks by the snapshot they belong to.
 *
 * @param ocrBlocks Blocks to group; their relative order is kept inside each bucket.
 * @returns Map from `snapshotId` to the blocks carrying that id.
 */
function groupOcrBlocksBySnapshot(ocrBlocks: OcrBlock[]): Map<string, OcrBlock[]> {
  return ocrBlocks.reduce((buckets, block) => {
    const bucket = buckets.get(block.snapshotId);
    if (bucket === undefined) {
      buckets.set(block.snapshotId, [block]);
    } else {
      bucket.push(block);
    }
    return buckets;
  }, new Map<string, OcrBlock[]>());
}
|
|
|
|
/**
 * Scores every text region derived from `blocks` against the event and returns
 * them best-first.
 *
 * Ties on score are broken by higher OCR confidence, then by longer text.
 *
 * @param event Event whose point (if it has one) drives the proximity scoring.
 * @param blocks OCR blocks of the snapshot associated with the event.
 * @param clickRadius Distance at which proximity stops contributing to the score.
 * @returns Regions paired with their score, sorted in descending order.
 */
function rankTextRegionsForEvent(
  event: GuideEvent,
  blocks: OcrBlock[],
  clickRadius: number,
): Array<{ region: TextRegion; score: number }> {
  const point = getEventPoint(event);
  const scored: Array<{ region: TextRegion; score: number }> = [];

  for (const region of buildTextRegions(blocks)) {
    // Regions whose text collapses to nothing are never ranked.
    if (normalizeText(region.text) === undefined) {
      continue;
    }
    scored.push({ region, score: scoreRegion(region, point, clickRadius) });
  }

  scored.sort((a, b) => {
    const byScore = b.score - a.score;
    const byConfidence = b.region.confidence - a.region.confidence;
    const byLength = b.region.text.length - a.region.text.length;
    // `||` moves on to the next criterion whenever the current difference is 0 (or NaN).
    return byScore || byConfidence || byLength;
  });
  return scored;
}
|
|
|
|
/**
 * Converts raw OCR blocks into text regions.
 *
 * Blocks whose normalized text is empty or fails `isUsefulOcrText` are dropped.
 * The result lists merged phrase regions first, followed by the individual
 * word regions they were built from.
 */
function buildTextRegions(blocks: OcrBlock[]): TextRegion[] {
  const words: TextRegion[] = [];
  for (const block of blocks) {
    const text = normalizeText(block.text);
    if (text && isUsefulOcrText(text)) {
      words.push({ text, confidence: block.confidence, box: block.box });
    }
  }
  return [...buildPhraseRegions(words), ...words];
}
|
|
|
|
/**
 * Merges word regions that sit on the same visual line and close together into
 * phrase regions.
 *
 * Words are first assigned to rows by vertical centre, each row is then ordered
 * left-to-right and cut into segments at large horizontal gaps. Only segments
 * with at least two words, whose merged text has at least three characters,
 * produce a phrase.
 *
 * @param regions Word-level regions; the array itself is not reordered.
 * @returns Phrase regions in row order, then segment order.
 */
function buildPhraseRegions(regions: TextRegion[]): TextRegion[] {
  const rows: TextRegion[][] = [];
  const topToBottom = [...regions].sort((a, b) => regionCenterY(a) - regionCenterY(b));

  for (const region of topToBottom) {
    const y = regionCenterY(region);
    // A word joins the first row whose first member is vertically close enough;
    // the tolerance scales with the row's average height but never drops below 0.012.
    const row = rows.find(
      (members) =>
        Math.abs(regionCenterY(members[0]) - y) <= Math.max(0.012, averageHeight(members) * 0.9),
    );
    if (row === undefined) {
      rows.push([region]);
    } else {
      row.push(region);
    }
  }

  return rows
    .flatMap((row) => splitLineIntoSegments([...row].sort((a, b) => a.box.x - b.box.x)))
    .filter((segment) => segment.length >= 2)
    .map((segment) => mergeRegions(segment))
    .filter((phrase) => phrase.text.length >= 3);
}
|
|
|
|
/**
 * Cuts one line of regions into runs of horizontally adjacent regions.
 *
 * A new segment starts whenever the gap between a region's left edge and the
 * previous region's right edge exceeds `max(0.025, 3 × average height of the
 * current segment)`.
 *
 * @param line Regions of one line, expected in left-to-right order.
 * @returns Segments in input order; an empty line yields no segments.
 */
function splitLineIntoSegments(line: TextRegion[]): TextRegion[][] {
  const segments: TextRegion[][] = [];
  let open: TextRegion[] | undefined;

  for (const region of line) {
    if (open !== undefined) {
      const last = open[open.length - 1];
      const gap = region.box.x - (last.box.x + last.box.width);
      const limit = Math.max(0.025, averageHeight(open) * 3);
      if (gap <= limit || Number.isNaN(gap) || Number.isNaN(limit)) {
        // Close enough (or not comparable): keep extending the open segment.
        open.push(region);
        continue;
      }
    }
    // First region of the line, or the gap was too wide: start a fresh segment.
    open = [region];
    segments.push(open);
  }
  return segments;
}
|
|
|
|
/**
 * Combines several regions into one.
 *
 * The merged text is the member texts joined by single spaces, the confidence
 * is the mean of the members' confidences (each clamped to [0, 1]), and the box
 * is the smallest box enclosing every member box.
 *
 * @param regions Regions to merge, in reading order.
 */
function mergeRegions(regions: TextRegion[]): TextRegion {
  let left = Infinity;
  let top = Infinity;
  let right = -Infinity;
  let bottom = -Infinity;
  let confidenceSum = 0;
  const words: string[] = [];

  for (const { text, confidence, box } of regions) {
    left = Math.min(left, box.x);
    top = Math.min(top, box.y);
    right = Math.max(right, box.x + box.width);
    bottom = Math.max(bottom, box.y + box.height);
    confidenceSum += clamp01(confidence);
    words.push(text);
  }

  return {
    text: words.join(" "),
    confidence: confidenceSum / regions.length,
    box: {
      x: left,
      y: top,
      width: right - left,
      height: bottom - top,
    },
  };
}
|
|
|
|
/**
 * Rates how likely a text region is the thing the event acted on.
 *
 * Without an event point the score is just the clamped OCR confidence.
 * Otherwise it is a weighted sum, clamped to [0, 1]:
 * proximity of the region centre to the point (0.35), OCR confidence (0.2),
 * a flat 0.35 bonus when the point lies inside the box padded by 0.025, and
 * text quality (0.2).
 *
 * @param region Region to score.
 * @param click Event point, or `null` when the event has none.
 * @param clickRadius Distance at which the proximity term reaches zero.
 */
function scoreRegion(
  region: TextRegion,
  click: { x: number; y: number } | null,
  clickRadius: number,
): number {
  const confidence = clamp01(region.confidence);
  if (!click) {
    return confidence;
  }

  const { x, y, width, height } = region.box;
  const dx = x + width / 2 - click.x;
  const dy = y + height / 2 - click.y;
  const proximity = clamp01(1 - Math.hypot(dx, dy) / clickRadius);
  const containmentBonus = pointInsideExpandedBox(click, region, 0.025) ? 0.35 : 0;
  const quality = calculateTextQuality(region.text);

  return clamp01(proximity * 0.35 + confidence * 0.2 + containmentBonus + quality * 0.2);
}
|
|
|
|
/**
 * Extracts the event's position as a point in the unit square.
 *
 * `normalizedX`/`normalizedY` are preferred and clamped to [0, 1]. If they are
 * missing or not finite numbers, `x`/`y` are used instead, but only when both
 * already lie within [0, 1]. Values outside that range are not accepted as
 * normalized coordinates.
 *
 * Non-finite or non-numeric coordinates are treated as absent rather than
 * being coerced to 0, so a corrupt value never fabricates a top-left point.
 *
 * @param event Event to read the coordinates from.
 * @returns The point, or `null` when the event carries no usable position.
 */
function getEventPoint(event: GuideEvent): { x: number; y: number } | null {
  const isFiniteNumber = (value: unknown): value is number =>
    typeof value === "number" && Number.isFinite(value);
  const isUnitInterval = (value: unknown): value is number =>
    isFiniteNumber(value) && value >= 0 && value <= 1;
  // Inputs are known to be finite here, so a plain min/max clamp is sufficient.
  const clampToUnit = (value: number): number => Math.min(1, Math.max(0, value));

  const { normalizedX, normalizedY, x, y } = event;
  if (isFiniteNumber(normalizedX) && isFiniteNumber(normalizedY)) {
    return { x: clampToUnit(normalizedX), y: clampToUnit(normalizedY) };
  }
  if (isUnitInterval(x) && isUnitInterval(y)) {
    return { x, y };
  }
  return null;
}
|
|
|
|
/**
 * Tests whether a point falls inside a region's box after growing the box by
 * `padding` on every side. Points exactly on the grown edge count as inside.
 *
 * @param point Point to test.
 * @param region Anything carrying a `box`.
 * @param padding Amount added to each side of the box.
 */
function pointInsideExpandedBox(
  point: { x: number; y: number },
  region: Pick<TextRegion, "box">,
  padding: number,
): boolean {
  const { x, y, width, height } = region.box;
  const withinX = point.x >= x - padding && point.x <= x + width + padding;
  const withinY = point.y >= y - padding && point.y <= y + height + padding;
  return withinX && withinY;
}
|
|
|
|
/**
 * Maps an event to the action a guide step should describe.
 *
 * Click events, and hotkey events that carry a usable point, are reported as
 * `"click"`; everything else is `"manual"`.
 */
function inferAction(event: GuideEvent): GuideAction {
  if (event.kind === "click") {
    return "click";
  }
  const isPointedHotkey = event.kind === "hotkey" && getEventPoint(event) !== null;
  return isPointedHotkey ? "click" : "manual";
}
|
|
|
|
/**
 * Guesses the UI role of a target from its visible text, using English and
 * Vietnamese keyword lists.
 *
 * Checks run in the order button → menu → tab → field; the first match wins.
 *
 * The tab keywords ("tab", "tabs", "thẻ") are matched as whole words only.
 * Matching "tab" as a bare substring classified unrelated words such as
 * "Table", "Database" or "Stable" as tabs and shadowed the field check.
 *
 * @param text Target text, if any.
 * @returns `undefined` when there is no text, otherwise the inferred role
 *   (`"unknown"` when no keyword matches).
 */
function inferTargetRole(text: string | undefined): GuideTargetRole | undefined {
  if (!text) {
    return undefined;
  }

  const normalized = text.toLowerCase();

  const looksLikeButton =
    /\b(ok|save|create|next|continue|login|submit|cancel|done|apply|open|start|finish)\b/.test(
      normalized,
    ) || /(lưu|tiếp|đăng nhập|hủy|áp dụng|mở|bắt đầu|hoàn tất)/i.test(text);
  if (looksLikeButton) {
    return "button";
  }

  if (/\b(menu|file|edit|view|settings)\b/.test(normalized) || /(menu|cài đặt)/i.test(text)) {
    return "menu";
  }

  // `\b` only understands ASCII word characters, so it cannot delimit "thẻ".
  // The lookarounds act as Unicode-aware word boundaries for all tab keywords.
  if (/(?<![\p{L}\p{N}_])(?:tabs?|thẻ)(?![\p{L}\p{N}_])/iu.test(text)) {
    return "tab";
  }

  if (/\b(search|name|email|password|input|field)\b/.test(normalized)) {
    return "field";
  }

  return "unknown";
}
|
|
|
|
/**
 * Estimates how trustworthy a step candidate is.
 *
 * With a target region the value is `0.45 + 0.25 × OCR confidence + 0.3 × score`
 * (both inputs clamped to [0, 1]), rounded to two decimals. Without one it
 * falls back to fixed values: 0.75 when the event has a usable label, 0.45 when
 * it only has a point, and 0.3 otherwise.
 *
 * @param event Event the candidate was built from.
 * @param targetRegion Region chosen as the target, if any.
 * @param score Score of the best-ranked region; treated as 0 when missing.
 */
function calculateCandidateConfidence(
  event: GuideEvent,
  targetRegion: TextRegion | undefined,
  score: number | undefined,
): number {
  if (!targetRegion) {
    if (normalizeEventLabelForTarget(event)) {
      return 0.75;
    }
    return getEventPoint(event) ? 0.45 : 0.3;
  }

  const ocrPart = clamp01(targetRegion.confidence) * 0.25;
  const scorePart = clamp01(score ?? 0) * 0.3;
  return roundConfidence(0.45 + ocrPart + scorePart);
}
|
|
|
|
/**
 * Normalizes the given texts and removes duplicates, comparing
 * case-insensitively.
 *
 * Empty and missing values are skipped. The first spelling encountered for each
 * text is the one that is kept, and first-seen order is preserved.
 */
function uniqueText(values: Array<string | undefined>): string[] {
  // A Map keeps insertion order, so the values come back in first-seen order.
  const firstSpellingByKey = new Map<string, string>();
  for (const raw of values) {
    const cleaned = normalizeText(raw);
    if (!cleaned) {
      continue;
    }
    const key = cleaned.toLowerCase();
    if (!firstSpellingByKey.has(key)) {
      firstSpellingByKey.set(key, cleaned);
    }
  }
  return [...firstSpellingByKey.values()];
}
|
|
|
|
/**
 * Collapses every run of whitespace into a single space and trims the ends.
 *
 * @returns The cleaned text, or `undefined` when the input is missing or
 *   nothing but whitespace.
 */
function normalizeText(value: string | undefined): string | undefined {
  if (value == null) {
    return undefined;
  }
  const collapsed = value.replace(/\s+/g, " ").trim();
  return collapsed === "" ? undefined : collapsed;
}
|
|
|
|
/**
 * Returns the event's label when it can serve as a target description.
 *
 * The label is whitespace-normalized first. Labels of the form
 * "ctrl marker", "ctrl+f12 marker", "control marker" and "manual marker"
 * (case-insensitive) are rejected and yield `undefined`, as does an empty or
 * missing label.
 */
function normalizeEventLabelForTarget(event: GuideEvent): string | undefined {
  const label = normalizeText(event.label);
  if (label === undefined) {
    return undefined;
  }

  const markerPatterns = [
    /^(?:ctrl(?:\s*\+\s*f12)?|control)\s+marker$/i,
    /^manual\s+marker$/i,
  ];
  return markerPatterns.some((pattern) => pattern.test(label)) ? undefined : label;
}
|
|
|
|
/**
 * Builds the position payload of a candidate from an event point.
 *
 * Both coordinates are clamped to [0, 1] and then reported three ways: rounded
 * to three decimals, as whole percentages, and as a coarse screen-region
 * description.
 */
function describeEventPosition(point: { x: number; y: number }): GuideStepCandidate["position"] {
  const x = clamp01(point.x);
  const y = clamp01(point.y);
  const toPercent = (fraction: number): number => Math.round(fraction * 100);

  return {
    normalizedX: roundPosition(x),
    normalizedY: roundPosition(y),
    xPercent: toPercent(x),
    yPercent: toPercent(y),
    description: describeScreenRegion(x, y),
  };
}
|
|
|
|
/**
 * Names the cell of a 3×3 screen grid that contains the given point.
 *
 * Coordinates below 0.33 fall in the first band and above 0.66 in the last;
 * everything else is the middle band. The central cell is called just
 * `"center"`; all others are `"<vertical> <horizontal>"`, e.g. `"top left"`.
 */
function describeScreenRegion(x: number, y: number): string {
  let vertical = "middle";
  if (y < 0.33) {
    vertical = "top";
  } else if (y > 0.66) {
    vertical = "bottom";
  }

  let horizontal = "center";
  if (x < 0.33) {
    horizontal = "left";
  } else if (x > 0.66) {
    horizontal = "right";
  }

  if (vertical === "middle" && horizontal === "center") {
    return "center";
  }
  return `${vertical} ${horizontal}`;
}
|
|
|
|
/**
 * Decides whether an OCR text is worth keeping as a region.
 *
 * The text must contain at least one ASCII letter, digit, or character in the
 * `À-ỹ` range, and must not be exactly one character long.
 */
function isUsefulOcrText(text: string): boolean {
  const hasLetterOrDigit = /[A-Za-z0-9À-ỹ]/.test(text);
  return hasLetterOrDigit && text.length !== 1;
}
|
|
|
|
/**
 * Heuristic quality score in [0, 1] for a piece of OCR text.
 *
 * Starts at 0.35, adds 0.5 for multi-word text (contains a space) and 0.25 for
 * text of four or more characters, subtracts 0.25 when the text contains the
 * Unicode replacement character U+FFFD, and subtracts 0.35 when the text has no
 * letters or digits at all.
 *
 * The "no letters or digits" test is Unicode-aware. An ASCII-only `\W` test
 * would count accented and other non-ASCII letters as symbols and penalise
 * legitimate non-English words.
 */
function calculateTextQuality(text: string): number {
  let score = 0.35;
  if (text.includes(" ")) {
    score += 0.5;
  }
  if (text.length >= 4) {
    score += 0.25;
  }
  // U+FFFD marks characters that could not be decoded.
  if (/\uFFFD/.test(text)) {
    score -= 0.25;
  }
  // Only punctuation, symbols, underscores or whitespace: no letter or digit in any script.
  if (/^[^\p{L}\p{N}]+$/u.test(text)) {
    score -= 0.35;
  }
  // `score` is always finite here, so a plain min/max clamp is sufficient.
  return Math.min(1, Math.max(0, score));
}
|
|
|
|
/** Vertical midpoint of a region's box (`y` plus half the height). */
function regionCenterY(region: TextRegion): number {
  const { y, height } = region.box;
  return y + height / 2;
}
|
|
|
|
/**
 * Mean box height of the given regions.
 *
 * An empty list yields `NaN` (0 divided by 0).
 */
function averageHeight(regions: TextRegion[]): number {
  let totalHeight = 0;
  for (const region of regions) {
    totalHeight += region.box.height;
  }
  return totalHeight / regions.length;
}
|
|
|
|
/** Clamps a confidence to [0, 1] and rounds it to two decimal places. */
function roundConfidence(value: number): number {
  const hundredths = Math.round(clamp01(value) * 100);
  return hundredths / 100;
}
|
|
|
|
/** Clamps a coordinate to [0, 1] and rounds it to three decimal places. */
function roundPosition(value: number): number {
  const thousandths = Math.round(clamp01(value) * 1000);
  return thousandths / 1000;
}
|
|
|
|
/**
 * Restricts a number to the range [0, 1].
 *
 * Non-finite input (`NaN`, `Infinity`, `-Infinity`) is mapped to 0.
 */
function clamp01(value: number): number {
  return Number.isFinite(value) ? Math.min(1, Math.max(0, value)) : 0;
}
|