Add auto guide generation with bundled OCR

This commit is contained in:
huanld
2026-05-28 07:07:30 +07:00
parent 8117d4826f
commit 24a16c693a
61 changed files with 8734 additions and 193 deletions
+88
View File
@@ -27,6 +27,94 @@ interface Window {
invokeNativeBridge: <TData = unknown>(
request: import("../src/native/contracts").NativeBridgeRequest,
) => Promise<import("../src/native/contracts").NativeBridgeResponse<TData>>;
/**
 * Guide-generation API. Every method resolves to a `GuideIpcResult` envelope around the
 * payload type named in its signature.
 * NOTE(review): presumably backed by the `guide:*` IPC channels — confirm against the preload script.
 */
guide: {
// Session lifecycle: both calls take a recording id and resolve to the session.
startSession: (
recordingId: import("../src/guide/contracts").GuideRecordingIdInput,
) => Promise<
import("../src/guide/contracts").GuideIpcResult<
import("../src/guide/contracts").GuideSession
>
>;
readSession: (
recordingId: import("../src/guide/contracts").GuideRecordingIdInput,
) => Promise<
import("../src/guide/contracts").GuideIpcResult<
import("../src/guide/contracts").GuideSession
>
>;
// Resolves to both the updated session and the event that was created for the marker.
addMarker: (input: import("../src/guide/contracts").AddGuideMarkerInput) => Promise<
import("../src/guide/contracts").GuideIpcResult<{
session: import("../src/guide/contracts").GuideSession;
event: import("../src/guide/contracts").GuideEvent;
}>
>;
// Pipeline steps: each resolves to the updated session.
finalizeEvents: (
input: import("../src/guide/contracts").FinalizeGuideEventsInput,
) => Promise<
import("../src/guide/contracts").GuideIpcResult<
import("../src/guide/contracts").GuideSession
>
>;
writeSnapshot: (
input: import("../src/guide/contracts").WriteGuideSnapshotInput,
) => Promise<
import("../src/guide/contracts").GuideIpcResult<
import("../src/guide/contracts").GuideSession
>
>;
runOcr: (
input: import("../src/guide/contracts").RunGuideOcrInput,
) => Promise<
import("../src/guide/contracts").GuideIpcResult<
import("../src/guide/contracts").GuideSession
>
>;
generateDraft: (
input: import("../src/guide/contracts").GenerateGuideDraftInput,
) => Promise<
import("../src/guide/contracts").GuideIpcResult<
import("../src/guide/contracts").GuideSession
>
>;
// AI settings: both calls resolve to the current `GuideAiSettings`.
getAiSettings: () => Promise<
import("../src/guide/contracts").GuideIpcResult<
import("../src/guide/contracts").GuideAiSettings
>
>;
saveAiSettings: (
input: import("../src/guide/contracts").SaveGuideAiSettingsInput,
) => Promise<
import("../src/guide/contracts").GuideIpcResult<
import("../src/guide/contracts").GuideAiSettings
>
>;
saveGuide: (
input: import("../src/guide/contracts").SaveGuideInput,
) => Promise<
import("../src/guide/contracts").GuideIpcResult<
import("../src/guide/contracts").GuideSession
>
>;
// Exports share one input type and resolve to an `ExportGuideResult`.
exportMarkdown: (
input: import("../src/guide/contracts").ExportGuideInput,
) => Promise<
import("../src/guide/contracts").GuideIpcResult<
import("../src/guide/contracts").ExportGuideResult
>
>;
exportHtml: (
input: import("../src/guide/contracts").ExportGuideInput,
) => Promise<
import("../src/guide/contracts").GuideIpcResult<
import("../src/guide/contracts").ExportGuideResult
>
>;
// On success the payload is the literal `{ discarded: true }`.
discardSession: (input: import("../src/guide/contracts").DiscardGuideSessionInput) => Promise<
import("../src/guide/contracts").GuideIpcResult<{
discarded: true;
}>
>;
};
getSources: (opts: Electron.SourcesOptions) => Promise<ProcessedDesktopSource[]>;
switchToEditor: () => Promise<void>;
switchToHud: () => Promise<void>;
+181
View File
@@ -0,0 +1,181 @@
import type {
GeneratedGuide,
GuideLanguage,
GuideSession,
GuideStepCandidate,
} from "../../../src/guide/contracts";
import { buildGuideDraftPrompt } from "../../../src/guide/promptBuilder";
import type { DeepSeekGuideConfigProvider } from "./deepseekSettingsStore";
/** Contract for a component that turns a guide session and its step candidates into a generated guide. */
export interface GuideDraftClient {
/** Produces a guide in the requested language from the given session and candidates. */
generate(input: {
session: GuideSession;
candidates: GuideStepCandidate[];
language: GuideLanguage;
}): Promise<GeneratedGuide>;
}
/**
 * Error type thrown by the DeepSeek guide client.
 *
 * `code` identifies the failure category and `retryable` (default `false`) tells the caller
 * whether repeating the request may succeed.
 */
export class DeepSeekGuideClientError extends Error {
  readonly code: "guide-ai-key-missing" | "guide-ai-request-failed" | "guide-ai-invalid-output";
  readonly retryable: boolean;

  constructor(
    code: "guide-ai-key-missing" | "guide-ai-request-failed" | "guide-ai-invalid-output",
    message: string,
    retryable = false,
  ) {
    super(message);
    this.code = code;
    this.retryable = retryable;
    this.name = "DeepSeekGuideClientError";
  }
}
/**
 * The part of the chat-completions response body that this module reads.
 * Every level is optional because the parsed payload is cast to this type without validation.
 */
interface DeepSeekChatResponse {
choices?: Array<{
message?: {
content?: string;
};
}>;
}
/**
 * Guide draft client that calls a DeepSeek-style `/chat/completions` endpoint.
 *
 * Settings come from the injected `configProvider` when one is supplied; otherwise the
 * constructor fallbacks are used, which default to the `DEEPSEEK_API_KEY`,
 * `DEEPSEEK_BASE_URL` and `DEEPSEEK_MODEL` environment variables.
 */
export class DeepSeekGuideClient implements GuideDraftClient {
  constructor(
    private readonly configProvider?: DeepSeekGuideConfigProvider,
    private readonly fallbackApiKey = process.env.DEEPSEEK_API_KEY,
    private readonly fallbackBaseUrl = process.env.DEEPSEEK_BASE_URL ?? "https://api.deepseek.com",
    private readonly fallbackModel = process.env.DEEPSEEK_MODEL ?? "deepseek-chat",
  ) {}

  /**
   * Requests a guide draft from the model and parses its JSON answer.
   *
   * @param input Session, step candidates and language handed to the prompt builder.
   * @returns The guide parsed from the first choice of the response.
   * @throws DeepSeekGuideClientError with code
   *   - `guide-ai-key-missing` when no API key is configured;
   *   - `guide-ai-request-failed` (retryable) when the request cannot be sent, the server
   *     answers with a non-OK status, or the body cannot be read;
   *   - `guide-ai-invalid-output` when the body is not JSON, carries no content, or the
   *     content is not a guide.
   */
  async generate(input: {
    session: GuideSession;
    candidates: GuideStepCandidate[];
    language: GuideLanguage;
  }): Promise<GeneratedGuide> {
    const config = await this.resolveConfig();
    if (!config.apiKey) {
      throw new DeepSeekGuideClientError(
        "guide-ai-key-missing",
        "DeepSeek API key is not configured.",
      );
    }

    let response: Response;
    try {
      // A single trailing slash on the base URL is dropped so the joined path has no "//".
      response = await fetch(`${config.baseUrl.replace(/\/$/, "")}/chat/completions`, {
        method: "POST",
        headers: {
          "content-type": "application/json",
          authorization: `Bearer ${config.apiKey}`,
        },
        body: JSON.stringify({
          model: config.model,
          temperature: 0.2,
          response_format: { type: "json_object" },
          messages: [
            {
              role: "system",
              content:
                "You convert UI interaction telemetry into concise software user-guide steps.",
            },
            {
              role: "user",
              content: buildGuideDraftPrompt(input),
            },
          ],
        }),
      });
    } catch (error) {
      throw new DeepSeekGuideClientError(
        "guide-ai-request-failed",
        `DeepSeek request failed: ${error instanceof Error ? error.message : String(error)}`,
        true,
      );
    }

    if (!response.ok) {
      throw new DeepSeekGuideClientError(
        "guide-ai-request-failed",
        `DeepSeek returned HTTP ${response.status}.`,
        true,
      );
    }

    // Reading the body can fail in two ways: the bytes are not JSON (SyntaxError) or the
    // stream breaks while being read. Both are reported as client errors so callers never
    // see a raw SyntaxError/TypeError.
    let payload: DeepSeekChatResponse | null;
    try {
      payload = (await response.json()) as DeepSeekChatResponse | null;
    } catch (error) {
      const detail = error instanceof Error ? error.message : String(error);
      if (error instanceof SyntaxError) {
        throw new DeepSeekGuideClientError(
          "guide-ai-invalid-output",
          `DeepSeek returned a non-JSON response: ${detail}`,
        );
      }
      throw new DeepSeekGuideClientError(
        "guide-ai-request-failed",
        `DeepSeek response could not be read: ${detail}`,
        true,
      );
    }

    // `payload` may legitimately be the JSON literal `null`, hence the leading `?.`.
    const content = payload?.choices?.[0]?.message?.content;
    if (!content) {
      throw new DeepSeekGuideClientError(
        "guide-ai-invalid-output",
        "DeepSeek returned an empty response.",
      );
    }

    return parseGeneratedGuide(content);
  }

  /** Returns the provider's configuration when a provider was injected, else the constructor fallbacks. */
  private async resolveConfig(): Promise<{ apiKey?: string; baseUrl: string; model: string }> {
    if (this.configProvider) {
      return await this.configProvider.getDeepSeekConfig();
    }

    return {
      apiKey: this.fallbackApiKey,
      baseUrl: this.fallbackBaseUrl,
      model: this.fallbackModel,
    };
  }
}
/**
 * Turns the raw model output into a `GeneratedGuide`.
 *
 * Malformed JSON and JSON of an unexpected shape are both reported as a
 * `guide-ai-invalid-output` `DeepSeekGuideClientError`.
 */
function parseGeneratedGuide(content: string): GeneratedGuide {
  const describe = (cause: unknown): string =>
    cause instanceof Error ? cause.message : String(cause);

  try {
    const decoded = JSON.parse(stripCodeFence(content)) as unknown;
    const guide = normalizeGeneratedGuide(decoded);
    if (!guide) {
      // Thrown inside the try on purpose so it is wrapped like a JSON syntax error.
      throw new Error("Unexpected guide JSON shape.");
    }
    return guide;
  } catch (cause) {
    throw new DeepSeekGuideClientError(
      "guide-ai-invalid-output",
      `DeepSeek response is not valid guide JSON: ${describe(cause)}`,
    );
  }
}
/**
 * Removes a surrounding Markdown code fence (``` or ```json) from model output.
 *
 * The text is trimmed first: both patterns are anchored to the very start/end of the string,
 * so without the initial trim a trailing newline after the closing fence (or leading
 * whitespace before the opening one) would leave the fence in place.
 *
 * @param content Raw text returned by the model.
 * @returns The text without the fence and without surrounding whitespace.
 */
function stripCodeFence(content: string): string {
  return content
    .trim()
    .replace(/^```(?:json)?\s*/i, "")
    .replace(/\s*```$/i, "")
    .trim();
}
/**
 * Validates an untrusted value and reshapes it into a `GeneratedGuide`.
 *
 * Returns `null` unless the value is an object with a string `title` and an array `steps`.
 * Steps that are not objects, or lack a string `title` / `instruction`, are dropped.
 * A missing or non-finite `order` falls back to the step's 1-based position in the input
 * array, and a missing or blank `id` falls back to `guide-step-<order>`.
 */
function normalizeGeneratedGuide(value: unknown): GeneratedGuide | null {
  if (typeof value !== "object" || value === null) {
    return null;
  }

  const candidate = value as Partial<GeneratedGuide>;
  const rawSteps = candidate.steps;
  if (typeof candidate.title !== "string" || !Array.isArray(rawSteps)) {
    return null;
  }

  const toStep = (entry: unknown, position: number): GeneratedGuide["steps"][number] | null => {
    if (typeof entry !== "object" || entry === null) {
      return null;
    }

    const fields = entry as Partial<GeneratedGuide["steps"][number]>;
    if (typeof fields.title !== "string" || typeof fields.instruction !== "string") {
      return null;
    }

    const rawOrder = fields.order;
    const order =
      typeof rawOrder === "number" && Number.isFinite(rawOrder) ? rawOrder : position + 1;
    const hasUsableId = typeof fields.id === "string" && fields.id.trim() !== "";

    return {
      id: hasUsableId ? (fields.id as string) : `guide-step-${order}`,
      order,
      title: fields.title,
      instruction: fields.instruction,
      // Optional fields are only copied when they are strings.
      ...(typeof fields.screenshotPath === "string" && { screenshotPath: fields.screenshotPath }),
      ...(typeof fields.sourceCandidateId === "string" && {
        sourceCandidateId: fields.sourceCandidateId,
      }),
    };
  };

  const steps = rawSteps.flatMap((entry, position) => {
    const step = toStep(entry, position);
    return step === null ? [] : [step];
  });

  return {
    title: candidate.title,
    summary: typeof candidate.summary === "string" ? candidate.summary : undefined,
    steps,
  };
}
+157
View File
@@ -0,0 +1,157 @@
import fs from "node:fs/promises";
import path from "node:path";
import type { GuideAiSettings, SaveGuideAiSettingsInput } from "../../../src/guide/contracts";
/** Resolved connection settings for the DeepSeek client; `apiKey` is absent when none is configured. */
export interface DeepSeekGuideConfig {
apiKey?: string;
baseUrl: string;
model: string;
}
/** Source of DeepSeek connection settings, resolved asynchronously on each call. */
export interface DeepSeekGuideConfigProvider {
getDeepSeekConfig(): Promise<DeepSeekGuideConfig>;
}
/**
 * JSON shape written to the settings file.
 * There is no field for the API key itself: only the name of the environment variable that holds it.
 */
interface PersistedGuideAiSettings {
schemaVersion: 1;
deepseek?: {
apiKeyEnvName?: string;
baseUrl?: string;
model?: string;
// ISO-8601 timestamp written by `save`.
updatedAt?: string;
};
}
// Defaults applied by the normalize* helpers when a value is missing or rejected.
const DEFAULT_DEEPSEEK_API_KEY_ENV_NAME = "DEEPSEEK_API_KEY";
const DEFAULT_DEEPSEEK_BASE_URL = "https://api.deepseek.com";
const DEFAULT_DEEPSEEK_MODEL = "deepseek-chat";
/**
 * File-backed store for the DeepSeek guide settings.
 *
 * Only non-secret values are persisted (environment-variable name, base URL, model and a
 * timestamp). The API key is always read from `process.env` under the configured name.
 */
export class DeepSeekSettingsStore implements DeepSeekGuideConfigProvider {
  /** @param filePath Location of the JSON settings file. */
  constructor(private readonly filePath: string) {}

  /**
   * Reports the effective settings for display.
   * `hasApiKey` / `storage` reflect whether the configured environment variable is set.
   */
  async getStatus(): Promise<GuideAiSettings> {
    const raw = await this.readSettings();
    const apiKeyEnvName = normalizeEnvName(raw?.deepseek?.apiKeyEnvName);
    const activeApiKey = process.env[apiKeyEnvName];

    return {
      deepseek: {
        hasApiKey: Boolean(activeApiKey),
        apiKeyEnvName,
        baseUrl: normalizeBaseUrl(raw?.deepseek?.baseUrl ?? process.env.DEEPSEEK_BASE_URL),
        model: normalizeModel(raw?.deepseek?.model ?? process.env.DEEPSEEK_MODEL),
        storage: activeApiKey ? "environment" : "none",
        encryptionAvailable: false,
        updatedAt: raw?.deepseek?.updatedAt,
      },
    };
  }

  /**
   * Merges `input` into the stored settings, writes them, and returns the refreshed status.
   * Fields omitted from `input` keep their stored value.
   */
  async save(input: SaveGuideAiSettingsInput): Promise<GuideAiSettings> {
    const current = (await this.readSettings()) ?? { schemaVersion: 1 };
    const currentDeepSeek = current.deepseek ?? {};
    const nextDeepSeek = {
      ...currentDeepSeek,
      baseUrl: normalizeBaseUrl(input.baseUrl ?? currentDeepSeek.baseUrl),
      model: normalizeModel(input.model ?? currentDeepSeek.model),
      updatedAt: new Date().toISOString(),
    };

    // Clearing takes precedence over setting a new name.
    if (input.clearDeepseekApiKeyEnvName) {
      delete nextDeepSeek.apiKeyEnvName;
    } else if (input.deepseekApiKeyEnvName !== undefined) {
      nextDeepSeek.apiKeyEnvName = normalizeEnvName(input.deepseekApiKeyEnvName);
    }

    await this.writeSettings({
      schemaVersion: 1,
      deepseek: nextDeepSeek,
    });

    return await this.getStatus();
  }

  /** Resolves the configuration used for requests: stored values first, then environment, then defaults. */
  async getDeepSeekConfig(): Promise<DeepSeekGuideConfig> {
    const raw = await this.readSettings();
    const apiKeyEnvName = normalizeEnvName(raw?.deepseek?.apiKeyEnvName);
    return {
      apiKey: process.env[apiKeyEnvName],
      baseUrl: normalizeBaseUrl(raw?.deepseek?.baseUrl ?? process.env.DEEPSEEK_BASE_URL),
      model: normalizeModel(raw?.deepseek?.model ?? process.env.DEEPSEEK_MODEL),
    };
  }

  /**
   * Loads and normalizes the settings file.
   *
   * Returns `null` when the file is missing, unreadable, not JSON, or not schema version 1.
   * When the file still contains a legacy `deepseek.apiKey` entry, it is rewritten without it.
   */
  private async readSettings(): Promise<PersistedGuideAiSettings | null> {
    let normalized: PersistedGuideAiSettings | null;
    let needsScrub: boolean;
    try {
      const content = await fs.readFile(this.filePath, "utf-8");
      const parsed = JSON.parse(content) as unknown;
      normalized = normalizePersistedSettings(parsed);
      needsScrub = normalized !== null && hasLegacyStoredSecret(parsed);
    } catch {
      return null;
    }

    if (normalized && needsScrub) {
      // Best effort: a failed rewrite (e.g. read-only location) must not discard settings
      // that were read successfully; the scrub is attempted again on the next read.
      await this.writeSettings(normalized).catch(() => undefined);
    }
    return normalized;
  }

  /**
   * Writes the settings through a temporary file followed by a rename, so the target file
   * is replaced in one step. The temporary file is removed if either step fails.
   */
  private async writeSettings(settings: PersistedGuideAiSettings): Promise<void> {
    await fs.mkdir(path.dirname(this.filePath), { recursive: true });
    const tempPath = `${this.filePath}.${process.pid}.${Date.now()}.tmp`;
    try {
      await fs.writeFile(tempPath, JSON.stringify(settings, null, 2), "utf-8");
      await fs.rename(tempPath, this.filePath);
    } catch (error) {
      await fs.rm(tempPath, { force: true }).catch(() => undefined);
      throw error;
    }
  }
}
/**
 * Tells whether a parsed settings document still carries a `deepseek.apiKey` entry.
 *
 * Any defined value counts — a plaintext string just as much as an object — because the
 * current schema has no `apiKey` field at all and whatever is stored there should be removed.
 *
 * @param input Parsed, unvalidated settings JSON.
 * @returns `true` when `input.deepseek.apiKey` is present.
 */
function hasLegacyStoredSecret(input: unknown): boolean {
  if (typeof input !== "object" || input === null) {
    return false;
  }

  const deepseek = (input as { deepseek?: unknown }).deepseek;
  if (typeof deepseek !== "object" || deepseek === null) {
    return false;
  }

  return (deepseek as { apiKey?: unknown }).apiKey !== undefined;
}
/**
 * Validates a parsed settings document and copies the known fields into a fresh object.
 *
 * The input comes straight from `JSON.parse`, so each field is type-checked: anything that
 * is not a string is treated as absent instead of being passed on (a non-string value would
 * otherwise make later `.trim()` calls throw).
 *
 * @param input Parsed, unvalidated settings JSON.
 * @returns The normalized settings, or `null` when the input is not an object with `schemaVersion` 1.
 */
function normalizePersistedSettings(input: unknown): PersistedGuideAiSettings | null {
  if (!input || typeof input !== "object") {
    return null;
  }

  const raw = input as { schemaVersion?: unknown; deepseek?: unknown };
  if (raw.schemaVersion !== 1) {
    return null;
  }

  const deepseek: Record<string, unknown> =
    typeof raw.deepseek === "object" && raw.deepseek !== null
      ? (raw.deepseek as Record<string, unknown>)
      : {};
  const stringOrUndefined = (value: unknown): string | undefined =>
    typeof value === "string" ? value : undefined;

  return {
    schemaVersion: 1,
    deepseek: {
      apiKeyEnvName: normalizeEnvName(stringOrUndefined(deepseek.apiKeyEnvName)),
      baseUrl: stringOrUndefined(deepseek.baseUrl),
      model: stringOrUndefined(deepseek.model),
      updatedAt: stringOrUndefined(deepseek.updatedAt),
    },
  };
}
/**
 * Returns the trimmed name when it is a valid identifier (letter or underscore first, then
 * letters, digits or underscores); otherwise the default environment-variable name.
 */
function normalizeEnvName(value: string | undefined): string {
  const trimmed = value?.trim() ?? "";
  const isIdentifier = trimmed !== "" && /^[A-Za-z_][A-Za-z0-9_]*$/.test(trimmed);
  if (isIdentifier) {
    return trimmed;
  }
  return DEFAULT_DEEPSEEK_API_KEY_ENV_NAME;
}
/**
 * Returns a cleaned-up http(s) base URL without a trailing slash.
 * Blank input, unparsable input and other protocols all yield the default base URL.
 */
function normalizeBaseUrl(value: string | undefined): string {
  const candidate = value?.trim() || DEFAULT_DEEPSEEK_BASE_URL;

  let parsed: URL;
  try {
    parsed = new URL(candidate);
  } catch {
    return DEFAULT_DEEPSEEK_BASE_URL;
  }

  const isHttp = parsed.protocol === "https:" || parsed.protocol === "http:";
  if (!isHttp) {
    return DEFAULT_DEEPSEEK_BASE_URL;
  }
  return parsed.toString().replace(/\/$/, "");
}
/** Returns the trimmed model name, or the default model when the input is missing or blank. */
function normalizeModel(value: string | undefined): string {
  const trimmed = value?.trim();
  return trimmed ? trimmed : DEFAULT_DEEPSEEK_MODEL;
}
+152
View File
@@ -0,0 +1,152 @@
import type { IpcMain } from "electron";
import type {
AddGuideMarkerInput,
DiscardGuideSessionInput,
ExportGuideInput,
ExportGuideResult,
FinalizeGuideEventsInput,
GenerateGuideDraftInput,
GuideAiSettings,
GuideEvent,
GuideIpcResult,
GuideSession,
RunGuideOcrInput,
SaveGuideAiSettingsInput,
SaveGuideInput,
WriteGuideSnapshotInput,
} from "../../src/guide/contracts";
import type { DeepSeekSettingsStore } from "./ai/deepseekSettingsStore";
import { GuideStore, GuideStoreError } from "./guideStore";
/**
 * Registers one `ipcMain.handle` listener per `guide:*` channel.
 *
 * Each listener forwards its single argument to the matching `GuideStore` (or AI settings
 * store) method and wraps the outcome with `toGuideResult`, so failures are returned to the
 * caller as a result object instead of being thrown across IPC.
 *
 * @param ipcMain Electron main-process IPC registry.
 * @param store Guide store that serves the session channels.
 * @param aiSettingsStore Optional settings store; without it the two AI settings channels
 *   respond with a `guide-internal-error` result.
 */
export function registerGuideIpcHandlers(
  ipcMain: IpcMain,
  store: GuideStore,
  aiSettingsStore?: DeepSeekSettingsStore,
): void {
  // Shared registration: the action runs inside toGuideResult so synchronous throws are caught too.
  function handle<TInput, TData>(
    channel: string,
    action: (input: TInput) => Promise<TData>,
  ): void {
    ipcMain.handle(channel, async (_event, input: TInput): Promise<GuideIpcResult<TData>> => {
      return await toGuideResult(() => action(input));
    });
  }

  handle<AddGuideMarkerInput["recordingId"], GuideSession>("guide:start-session", (recordingId) =>
    store.startSession(recordingId),
  );
  handle<AddGuideMarkerInput["recordingId"], GuideSession>("guide:read-session", (recordingId) =>
    store.readSession(recordingId),
  );
  handle<AddGuideMarkerInput, { session: GuideSession; event: GuideEvent }>(
    "guide:add-marker",
    (input) => store.addMarker(input),
  );
  handle<FinalizeGuideEventsInput, GuideSession>("guide:finalize-events", (input) =>
    store.finalizeEvents(input),
  );
  handle<WriteGuideSnapshotInput, GuideSession>("guide:write-snapshot", (input) =>
    store.writeSnapshot(input),
  );
  handle<RunGuideOcrInput, GuideSession>("guide:run-ocr", (input) => store.runOcr(input));
  handle<GenerateGuideDraftInput, GuideSession>("guide:generate-draft", (input) =>
    store.generateDraft(input),
  );
  handle<void, GuideAiSettings>("guide:get-ai-settings", () =>
    requireAiSettingsStore(aiSettingsStore).getStatus(),
  );
  handle<SaveGuideAiSettingsInput, GuideAiSettings>("guide:save-ai-settings", (input) =>
    requireAiSettingsStore(aiSettingsStore).save(input),
  );
  handle<SaveGuideInput, GuideSession>("guide:save-guide", (input) => store.saveGuide(input));
  handle<ExportGuideInput, ExportGuideResult>("guide:export-markdown", (input) =>
    store.exportMarkdown(input),
  );
  handle<ExportGuideInput, ExportGuideResult>("guide:export-html", (input) =>
    store.exportHtml(input),
  );
  handle<DiscardGuideSessionInput, { discarded: true }>(
    "guide:discard-session",
    async (input) => {
      await store.discardSession(input);
      return { discarded: true };
    },
  );
}
/**
 * Returns the settings store, or throws a `guide-internal-error` `GuideStoreError` when
 * none was provided.
 */
function requireAiSettingsStore(store: DeepSeekSettingsStore | undefined): DeepSeekSettingsStore {
  if (store) {
    return store;
  }
  throw new GuideStoreError("guide-internal-error", "Guide AI settings store is unavailable.");
}
/**
 * Runs `action` and converts its outcome into a `GuideIpcResult`.
 *
 * A `GuideStoreError` is passed through with its own code, message and retryable flag.
 * Any other failure is logged and reported as a non-retryable `guide-internal-error`.
 */
async function toGuideResult<TData>(action: () => Promise<TData>): Promise<GuideIpcResult<TData>> {
  let data: TData;
  try {
    data = await action();
  } catch (error) {
    if (!(error instanceof GuideStoreError)) {
      console.error("Guide IPC failed:", error);
      return {
        success: false,
        code: "guide-internal-error",
        error: error instanceof Error ? error.message : String(error),
        retryable: false,
      };
    }

    return {
      success: false,
      code: error.code,
      error: error.message,
      retryable: error.retryable,
    };
  }

  return {
    success: true,
    data,
  };
}
+57
View File
@@ -0,0 +1,57 @@
import path from "node:path";
import type { GuideRecordingIdInput } from "../../src/guide/contracts";
// Appended to the base name to form the session file name and the output directory name.
export const GUIDE_SESSION_SUFFIX = ".guide.json";
export const GUIDE_OUTPUT_DIR_SUFFIX = "-guide";
/** File-system locations derived for one recording by `resolveGuidePaths`. */
export interface GuidePaths {
// Normalized (string) recording id.
recordingId: string;
// Video file name without extension, or the default `recording-<id>` name.
baseName: string;
// Directory of the video, or the resolved recordings directory.
baseDir: string;
// `<baseDir>/<baseName>.guide.json`
guidePath: string;
// `<baseDir>/<baseName>-guide`
outputDir: string;
}
/**
 * Converts a recording id to its canonical string form.
 *
 * Finite numbers are truncated toward zero and stringified; strings are trimmed.
 * Returns `null` for non-finite numbers, blank strings and any other kind of value.
 */
export function normalizeGuideRecordingId(recordingId: GuideRecordingIdInput): string | null {
  switch (typeof recordingId) {
    case "number":
      return Number.isFinite(recordingId) ? String(Math.trunc(recordingId)) : null;
    case "string": {
      const trimmed = recordingId.trim();
      return trimmed === "" ? null : trimmed;
    }
    default:
      return null;
  }
}
/**
 * Derives the guide session file and output directory for a recording.
 *
 * With a non-blank `videoPath`, both locations sit next to the video and reuse its file
 * name without extension. Otherwise they live in `recordingsDir` under the default
 * `recording-<id>` base name.
 *
 * @returns The derived paths, or `null` when the recording id cannot be normalized.
 */
export function resolveGuidePaths(input: {
  recordingsDir: string;
  recordingId: GuideRecordingIdInput;
  videoPath?: string | null;
}): GuidePaths | null {
  const recordingId = normalizeGuideRecordingId(input.recordingId);
  if (!recordingId) {
    return null;
  }

  const trimmedVideoPath = typeof input.videoPath === "string" ? input.videoPath.trim() : "";
  const video = trimmedVideoPath ? path.parse(path.resolve(trimmedVideoPath)) : null;

  const baseName = video?.name ?? defaultGuideBaseName(recordingId);
  const baseDir = video?.dir ?? path.resolve(input.recordingsDir);
  const guidePath = path.join(baseDir, `${baseName}${GUIDE_SESSION_SUFFIX}`);
  const outputDir = path.join(baseDir, `${baseName}${GUIDE_OUTPUT_DIR_SUFFIX}`);

  return { recordingId, baseName, baseDir, guidePath, outputDir };
}
/** Prefixes the id with `recording-` unless it already starts with that prefix. */
function defaultGuideBaseName(recordingId: string): string {
  if (recordingId.startsWith("recording-")) {
    return recordingId;
  }
  return `recording-${recordingId}`;
}
+233
View File
@@ -0,0 +1,233 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { afterEach, beforeEach, describe, expect, it } from "vitest";
import { GuideStore, GuideStoreError } from "./guideStore";
// Temp directory used as the recordings directory; recreated for every test.
let recordingsDir = "";
beforeEach(async () => {
recordingsDir = await fs.mkdtemp(path.join(os.tmpdir(), "openscreen-guide-"));
});
afterEach(async () => {
// Guarded so a failed mkdtemp does not turn into an rm of the empty path.
if (recordingsDir) {
await fs.rm(recordingsDir, { recursive: true, force: true });
}
});
// End-to-end tests for GuideStore; every test works on real files inside `recordingsDir`.
describe("GuideStore", () => {
it("creates and reads an empty guide session", async () => {
const store = new GuideStore(recordingsDir);
const session = await store.startSession(123);
const readSession = await store.readSession(123);
expect(session.recordingId).toBe("123");
expect(session.status).toBe("recording");
expect(session.guidePath).toBe(path.join(recordingsDir, "recording-123.guide.json"));
expect(readSession).toEqual(session);
await expect(fs.stat(session.outputDir)).resolves.toMatchObject({
isDirectory: expect.any(Function),
});
});
// Markers are added out of order on purpose: the session must return them sorted by timeMs.
it("adds marker events in timeline order", async () => {
const store = new GuideStore(recordingsDir);
await store.startSession(456);
await store.addMarker({ recordingId: 456, kind: "manual", timeMs: 2000, label: "Later" });
const result = await store.addMarker({
recordingId: 456,
kind: "hotkey",
timeMs: 500,
label: "First",
});
expect(result.event.kind).toBe("hotkey");
expect(result.session.events.map((event) => event.timeMs)).toEqual([500, 2000]);
expect(result.session.events[0]?.source).toBe("guide-hotkey");
expect(result.session.events[1]?.source).toBe("review-ui");
});
it("finalizes a session against the saved video path", async () => {
const store = new GuideStore(recordingsDir);
await store.startSession(789);
const videoPath = path.join(recordingsDir, "recording-789.mp4");
await fs.writeFile(videoPath, "");
const session = await store.finalizeEvents({ recordingId: 789, videoPath });
expect(session.status).toBe("events-ready");
expect(session.videoPath).toBe(videoPath);
expect(session.guidePath).toBe(path.join(recordingsDir, "recording-789.guide.json"));
});
// The cursor file holds two click samples 25 ms apart at almost the same position; the
// expectations below require them to surface as a single click event at 200 ms.
it("adds cursor click events when finalizing a session", async () => {
const store = new GuideStore(recordingsDir);
await store.startSession(790);
await store.addMarker({ recordingId: 790, kind: "manual", timeMs: 250, label: "Manual" });
const videoPath = path.join(recordingsDir, "recording-790.mp4");
await fs.writeFile(videoPath, "");
await fs.writeFile(
`${videoPath}.cursor.json`,
JSON.stringify({
version: 2,
provider: "native",
assets: [],
samples: [
{ timeMs: 100, cx: 0.2, cy: 0.3, interactionType: "move" },
{ timeMs: 200, cx: 0.4, cy: 0.5, interactionType: "click" },
{ timeMs: 225, cx: 0.401, cy: 0.501, interactionType: "click" },
],
}),
"utf-8",
);
const session = await store.finalizeEvents({ recordingId: 790, videoPath });
expect(session.cursorPath).toBe(`${videoPath}.cursor.json`);
expect(session.events.map((event) => event.kind)).toEqual(["click", "manual"]);
expect(session.events[0]).toMatchObject({
timeMs: 200,
normalizedX: 0.4,
normalizedY: 0.5,
});
});
// The video path points at the parent of the recordings directory.
it("rejects guide artifacts outside the recordings directory", async () => {
const store = new GuideStore(recordingsDir);
await store.startSession(321);
const outsideVideoPath = path.join(path.dirname(recordingsDir), "outside.mp4");
await expect(
store.finalizeEvents({ recordingId: 321, videoPath: outsideVideoPath }),
).rejects.toMatchObject({
code: "guide-invalid-input",
});
});
// The session file is written by hand with an unsupported schemaVersion.
it("rejects invalid guide session schema", async () => {
const store = new GuideStore(recordingsDir);
await fs.writeFile(
path.join(recordingsDir, "recording-bad.guide.json"),
JSON.stringify({ schemaVersion: 999 }),
"utf-8",
);
await expect(store.readSession("bad")).rejects.toBeInstanceOf(GuideStoreError);
await expect(store.readSession("bad")).rejects.toMatchObject({
code: "guide-invalid-schema",
});
});
it("saves a reviewed generated guide", async () => {
const store = new GuideStore(recordingsDir);
await store.startSession(654);
const session = await store.saveGuide({
recordingId: 654,
generatedGuide: {
title: "Huong dan thao tac",
steps: [
{
id: "step-1",
order: 1,
title: "Mo cai dat",
instruction: "Nhan nut Settings.",
},
],
},
});
expect(session.status).toBe("reviewed");
expect(session.generatedGuide?.steps).toHaveLength(1);
});
// The PNG payload is only the first four bytes of the PNG signature, not a decodable image;
// the test checks that the bytes are stored verbatim.
it("writes snapshots and builds candidates without OCR", async () => {
const store = new GuideStore(recordingsDir);
await store.startSession(112);
await store.addMarker({ recordingId: 112, kind: "manual", timeMs: 500, label: "Save" });
const videoPath = path.join(recordingsDir, "recording-112.mp4");
await fs.writeFile(videoPath, "");
const eventsSession = await store.finalizeEvents({ recordingId: 112, videoPath });
const session = await store.writeSnapshot({
recordingId: 112,
eventId: eventsSession.events[0]?.id ?? "",
timeMs: 1000,
offsetMs: 500,
width: 800,
height: 600,
pngBytes: new Uint8Array([137, 80, 78, 71]).buffer,
});
expect(session.status).toBe("snapshots-ready");
expect(session.snapshots).toHaveLength(1);
expect(session.candidates[0]).toMatchObject({ targetText: "Save" });
await expect(fs.readFile(session.snapshots[0]?.path ?? "")).resolves.toEqual(
Buffer.from([137, 80, 78, 71]),
);
});
// A stub OCR client is injected, so the default OCR client is never constructed.
it("runs OCR, generates a local draft, and exports files", async () => {
const store = new GuideStore(recordingsDir, {
ocrClient: {
recognize: async (snapshot) => [
{
id: `ocr-${snapshot.id}-1`,
snapshotId: snapshot.id,
text: "Save",
confidence: 0.95,
box: { x: 0.45, y: 0.45, width: 0.15, height: 0.08 },
},
],
},
});
await store.startSession(113);
const videoPath = path.join(recordingsDir, "recording-113.mp4");
await fs.writeFile(videoPath, "");
await fs.writeFile(
`${videoPath}.cursor.json`,
JSON.stringify({
samples: [{ timeMs: 200, cx: 0.5, cy: 0.5, interactionType: "click" }],
}),
"utf-8",
);
const eventsSession = await store.finalizeEvents({ recordingId: 113, videoPath });
await store.writeSnapshot({
recordingId: 113,
eventId: eventsSession.events[0]?.id ?? "",
timeMs: 700,
offsetMs: 500,
width: 800,
height: 600,
pngBytes: new Uint8Array([1, 2, 3]).buffer,
});
const ocrSession = await store.runOcr({ recordingId: 113 });
const draftSession = await store.generateDraft({
recordingId: 113,
language: "en",
provider: "local",
});
const markdown = await store.exportMarkdown({ recordingId: 113 });
const html = await store.exportHtml({ recordingId: 113 });
expect(ocrSession.candidates[0]).toMatchObject({ targetText: "Save" });
expect(draftSession.generatedGuide?.steps[0]?.instruction).toBe('Click "Save".');
await expect(fs.readFile(markdown.path, "utf-8")).resolves.toContain("# User guide");
await expect(fs.readFile(html.path, "utf-8")).resolves.toContain("<!doctype html>");
});
// Both the session file and the output directory must be gone afterwards.
it("discards a guide session and output directory", async () => {
const store = new GuideStore(recordingsDir);
const session = await store.startSession(111);
await fs.writeFile(path.join(session.outputDir, "step-001.png"), "");
await store.discardSession({ recordingId: 111 });
await expect(fs.stat(session.guidePath)).rejects.toMatchObject({ code: "ENOENT" });
await expect(fs.stat(session.outputDir)).rejects.toMatchObject({ code: "ENOENT" });
});
});
+824
View File
@@ -0,0 +1,824 @@
import { randomUUID } from "node:crypto";
import fs from "node:fs/promises";
import path from "node:path";
import {
type AddGuideMarkerInput,
type DiscardGuideSessionInput,
type ExportGuideInput,
type ExportGuideResult,
type FinalizeGuideEventsInput,
type GeneratedGuide,
type GeneratedGuideStep,
type GenerateGuideDraftInput,
GUIDE_SCHEMA_VERSION,
type GuideErrorCode,
type GuideEvent,
type GuideEventKind,
type GuideEventSource,
type GuideSession,
type GuideSessionStatus,
type GuideSnapshot,
type GuideStepCandidate,
type OcrBlock,
type RunGuideOcrInput,
type SaveGuideInput,
type WriteGuideSnapshotInput,
} from "../../src/guide/contracts";
import { buildGuideEventsFromCursor, mergeGuideEvents } from "../../src/guide/eventBuilder";
import { exportGuideToHtml, exportGuideToMarkdown } from "../../src/guide/exporters";
import { buildLocalGuideDraft } from "../../src/guide/promptBuilder";
import { buildGuideStepCandidates } from "../../src/guide/targetMapper";
import type { CursorRecordingSample } from "../../src/native/contracts";
import {
DeepSeekGuideClient,
DeepSeekGuideClientError,
type GuideDraftClient,
} from "./ai/deepseekGuideClient";
import type { DeepSeekGuideConfigProvider } from "./ai/deepseekSettingsStore";
import { type GuidePaths, normalizeGuideRecordingId, resolveGuidePaths } from "./guidePaths";
import { createFocusedOcrSnapshot, remapFocusedOcrBlocks } from "./ocr/focusedOcrSnapshot";
import { DefaultGuideOcrClient, type GuideOcrClient } from "./ocr/paddleOcrClient";
// Accepted values for session status, event kind and event source.
// NOTE(review): presumably used to validate session files read from disk — the use site is not in view.
const VALID_SESSION_STATUSES = new Set<GuideSessionStatus>([
"recording",
"events-ready",
"snapshots-ready",
"ocr-ready",
"draft-ready",
"reviewed",
]);
const VALID_EVENT_KINDS = new Set<GuideEventKind>(["click", "hotkey", "manual"]);
const VALID_EVENT_SOURCES = new Set<GuideEventSource>([
"cursor-recording",
"guide-hotkey",
"review-ui",
]);
/**
 * Error type thrown by the guide store.
 *
 * `code` is the machine-readable failure category and `retryable` (default `false`) tells
 * the caller whether repeating the operation may succeed.
 */
export class GuideStoreError extends Error {
  readonly code: GuideErrorCode;
  readonly retryable: boolean;

  constructor(code: GuideErrorCode, message: string, retryable = false) {
    super(message);
    this.code = code;
    this.retryable = retryable;
    this.name = "GuideStoreError";
  }
}
/** Optional collaborators that can be injected into `GuideStore` (e.g. stubs in tests). */
export interface GuideStoreDependencies {
// OCR engine; when omitted, `runOcr` constructs a `DefaultGuideOcrClient`.
ocrClient?: GuideOcrClient;
// NOTE(review): presumably used by draft generation — the use site is not in view.
draftClient?: GuideDraftClient;
// NOTE(review): presumably handed to the DeepSeek client — the use site is not in view.
deepSeekConfigProvider?: DeepSeekGuideConfigProvider;
// Whether `runOcr` runs OCR on focused snapshots; when unset it is enabled only if no `ocrClient` was injected.
focusOcrSnapshots?: boolean;
}
export class GuideStore {
constructor(
private readonly recordingsDir: string,
private readonly dependencies: GuideStoreDependencies = {},
) {}
/**
 * Creates an empty guide session in the "recording" state and persists it.
 * The video path is left blank at this point.
 *
 * @returns The session that was written.
 */
async startSession(recordingIdInput: AddGuideMarkerInput["recordingId"]): Promise<GuideSession> {
  const { recordingId, guidePath, outputDir } = this.requireGuidePaths(recordingIdInput);
  const timestamp = new Date().toISOString();

  const session: GuideSession = {
    schemaVersion: GUIDE_SCHEMA_VERSION,
    recordingId,
    videoPath: "",
    guidePath,
    outputDir,
    status: "recording",
    events: [],
    snapshots: [],
    ocrBlocks: [],
    candidates: [],
    createdAt: timestamp,
    updatedAt: timestamp,
  };

  await this.writeSession(session);
  return session;
}
/** Loads the session stored at the default guide path for the given recording id. */
async readSession(recordingIdInput: AddGuideMarkerInput["recordingId"]): Promise<GuideSession> {
  const { guidePath } = this.requireGuidePaths(recordingIdInput);
  const session = await this.readSessionAtPath(guidePath);
  return session;
}
/**
 * Appends a hotkey or manual marker to the session and persists the result.
 *
 * @returns The updated session (events re-sorted) together with the new event.
 * @throws GuideStoreError `guide-invalid-input` when the recording id is missing, the kind
 *   is neither "hotkey" nor "manual", or `timeMs` is not a non-negative finite number.
 */
async addMarker(
  input: AddGuideMarkerInput,
): Promise<{ session: GuideSession; event: GuideEvent }> {
  const recordingId = normalizeGuideRecordingId(input.recordingId);
  if (!recordingId) {
    throw new GuideStoreError("guide-invalid-input", "Guide marker is missing recordingId.");
  }
  if (input.kind !== "hotkey" && input.kind !== "manual") {
    throw new GuideStoreError("guide-invalid-input", "Guide marker kind is invalid.");
  }
  const hasValidTime = Number.isFinite(input.timeMs) && input.timeMs >= 0;
  if (!hasValidTime) {
    throw new GuideStoreError("guide-invalid-input", "Guide marker timeMs must be non-negative.");
  }

  const existing = await this.readSession(recordingId);

  // Hotkey markers are attributed to the guide hotkey, manual ones to the review UI.
  const source: GuideEventSource = input.kind === "hotkey" ? "guide-hotkey" : "review-ui";
  const event: GuideEvent = {
    id: `guide-event-${randomUUID()}`,
    recordingId,
    kind: input.kind,
    source,
    timeMs: Math.max(0, input.timeMs),
    label: normalizeOptionalString(input.label),
    screenshotOffsetMs: 500,
    createdAt: new Date().toISOString(),
  };

  const events = sortGuideEvents([...existing.events, event]);
  const session = touchSession({ ...existing, events });
  await this.writeSession(session);

  return { session, event };
}
/**
 * Binds the session to its saved video and rebuilds the event list.
 *
 * Events previously derived from the cursor recording are dropped and rebuilt from the
 * cursor file (when one is resolved); all other events are kept and merged back in. The
 * session is written at the paths derived from the video, and the old session file is
 * removed when that location changed.
 *
 * @returns The updated session with status "events-ready".
 * @throws GuideStoreError `guide-invalid-input` when the recording id or video path is missing.
 */
async finalizeEvents(input: FinalizeGuideEventsInput): Promise<GuideSession> {
  const recordingId = normalizeGuideRecordingId(input.recordingId);
  if (!recordingId) {
    throw new GuideStoreError(
      "guide-invalid-input",
      "Guide finalization is missing recordingId.",
    );
  }

  const trimmedVideoPath = typeof input.videoPath === "string" ? input.videoPath.trim() : "";
  if (trimmedVideoPath.length === 0) {
    throw new GuideStoreError("guide-invalid-input", "Guide finalization is missing videoPath.");
  }

  // Resolve the trimmed value: an untrimmed path with leading whitespace would be treated as
  // relative to the working directory and no longer match the paths derived from it.
  const videoPath = path.resolve(trimmedVideoPath);
  const currentSession = await this.readSession(recordingId);
  const nextPaths = this.requireGuidePaths(recordingId, videoPath);
  const cursorPath = await this.resolveCursorPath(videoPath, input.cursorPath);
  const cursorEvents = cursorPath
    ? await this.readCursorGuideEvents(recordingId, cursorPath)
    : [];
  // Keep everything that did not come from the cursor recording.
  const manualEvents = currentSession.events.filter(
    (event) => event.source !== "cursor-recording",
  );

  const updatedSession = touchSession({
    ...currentSession,
    videoPath,
    cursorPath,
    guidePath: nextPaths.guidePath,
    outputDir: nextPaths.outputDir,
    status: "events-ready",
    events: mergeGuideEvents([...cursorEvents, ...manualEvents]),
  });

  await this.writeSession(updatedSession);
  if (path.resolve(currentSession.guidePath) !== path.resolve(updatedSession.guidePath)) {
    // Best effort: a leftover old session file is harmless, so failures are ignored.
    await fs.unlink(currentSession.guidePath).catch(() => undefined);
  }

  return updatedSession;
}
/**
 * Stores the PNG snapshot for one existing event and refreshes the derived session data.
 *
 * The file is written to `<outputDir>/step-NNN.png`, where NNN is the event's 1-based
 * position in the session. Any earlier snapshot and OCR blocks for the same event are
 * replaced, step candidates are rebuilt, and the generated guide is cleared.
 *
 * @returns The updated session with status "snapshots-ready".
 * @throws GuideStoreError `guide-invalid-input` for missing or invalid metadata, empty PNG
 *   data, non-positive dimensions, or an unknown event id.
 */
async writeSnapshot(input: WriteGuideSnapshotInput): Promise<GuideSession> {
  const invalidInput = (message: string): GuideStoreError =>
    new GuideStoreError("guide-invalid-input", message);
  const isPositive = (size: number): boolean => Number.isFinite(size) && size > 0;

  const recordingId = normalizeGuideRecordingId(input.recordingId);
  if (!recordingId) {
    throw invalidInput("Snapshot write is missing recordingId.");
  }
  if (!input.eventId || !Number.isFinite(input.timeMs) || input.timeMs < 0) {
    throw invalidInput("Snapshot metadata is invalid.");
  }
  if (!input.pngBytes || input.pngBytes.byteLength === 0) {
    throw invalidInput("Snapshot PNG data is empty.");
  }
  if (!isPositive(input.width) || !isPositive(input.height)) {
    throw invalidInput("Snapshot dimensions are invalid.");
  }

  const session = await this.readSession(recordingId);
  const eventIndex = session.events.findIndex((event) => event.id === input.eventId);
  if (eventIndex < 0) {
    throw invalidInput("Snapshot event does not exist.");
  }

  this.assertGuidePathIsAllowed(session.outputDir);
  await fs.mkdir(session.outputDir, { recursive: true });

  const stepNumber = String(eventIndex + 1).padStart(3, "0");
  const snapshotPath = path.join(session.outputDir, `step-${stepNumber}.png`);
  this.assertGuidePathIsAllowed(snapshotPath);
  await fs.writeFile(snapshotPath, Buffer.from(new Uint8Array(input.pngBytes)));

  const snapshot: GuideSnapshot = {
    id: `snapshot-${input.eventId}`,
    eventId: input.eventId,
    timeMs: Math.max(0, input.timeMs),
    offsetMs: input.offsetMs,
    path: snapshotPath,
    width: Math.round(input.width),
    height: Math.round(input.height),
  };

  // One snapshot per event: replace any previous one, then keep the list in time order.
  const snapshots = session.snapshots.filter((existing) => existing.eventId !== input.eventId);
  snapshots.push(snapshot);
  snapshots.sort((left, right) => left.timeMs - right.timeMs);

  // OCR results for the replaced snapshot no longer apply.
  const ocrBlocks = session.ocrBlocks.filter((block) => block.snapshotId !== snapshot.id);

  const updatedSession = touchSession({
    ...session,
    status: "snapshots-ready",
    snapshots,
    ocrBlocks,
    candidates: buildGuideStepCandidates({ ...session, snapshots, ocrBlocks }),
    generatedGuide: undefined,
  });

  await this.writeSession(updatedSession);
  return updatedSession;
}
/**
 * Runs OCR over the session's snapshots (or only the requested snapshot ids),
 * replaces the OCR blocks of the processed snapshots, rebuilds step candidates
 * and clears the generated guide.
 *
 * @param input Recording id plus an optional list of snapshot ids to restrict OCR to.
 * @returns The updated session with status "ocr-ready".
 * @throws GuideStoreError "guide-invalid-input" when no snapshot matches, or
 *   "guide-ocr-unavailable" (retryable) when focusing or recognition fails.
 */
async runOcr(input: RunGuideOcrInput): Promise<GuideSession> {
const session = await this.readSession(input.recordingId);
const requestedIds = new Set(input.snapshotIds ?? []);
// An empty or missing id list means "process every snapshot".
const snapshots =
requestedIds.size > 0
? session.snapshots.filter((snapshot) => requestedIds.has(snapshot.id))
: session.snapshots;
if (snapshots.length === 0) {
throw new GuideStoreError("guide-invalid-input", "No guide snapshots are available for OCR.");
}
const ocrClient = this.dependencies.ocrClient ?? new DefaultGuideOcrClient();
// Unless configured explicitly, focused crops are only used with the default client.
const shouldFocusOcrSnapshots =
this.dependencies.focusOcrSnapshots ?? this.dependencies.ocrClient === undefined;
const eventsById = new Map(session.events.map((event) => [event.id, event]));
const blocks: OcrBlock[] = [];
try {
for (const snapshot of snapshots) {
const focusedSnapshot = shouldFocusOcrSnapshots
? await createFocusedOcrSnapshot({
snapshot,
event: eventsById.get(snapshot.eventId),
outputDir: session.outputDir,
})
: { snapshot };
const recognizedBlocks = await ocrClient.recognize(focusedSnapshot.snapshot);
// Boxes recognized on a crop are mapped back to the full snapshot's coordinates.
blocks.push(...remapFocusedOcrBlocks(recognizedBlocks, focusedSnapshot.transform));
}
} catch (error) {
throw new GuideStoreError(
"guide-ocr-unavailable",
error instanceof Error ? error.message : "OCR failed.",
true,
);
}
// Keep blocks of snapshots that were not processed; replace the rest.
const snapshotIds = new Set(snapshots.map((snapshot) => snapshot.id));
const updatedOcrBlocks = [
...session.ocrBlocks.filter((block) => !snapshotIds.has(block.snapshotId)),
...blocks,
];
const draftSession = {
...session,
ocrBlocks: updatedOcrBlocks,
};
const updatedSession = touchSession({
...draftSession,
status: "ocr-ready",
candidates: buildGuideStepCandidates(draftSession),
generatedGuide: undefined,
});
await this.writeSession(updatedSession);
return updatedSession;
}
/**
 * Produces a guide draft for the session, either with the local builder
 * (provider "local") or with the configured draft client, and persists it.
 *
 * @param input Recording id, provider and target language.
 * @returns The updated session with status "draft-ready".
 * @throws GuideStoreError "guide-invalid-input" when there are no step candidates;
 *   the client's own code for DeepSeekGuideClientError; otherwise
 *   "guide-ai-request-failed" (retryable).
 */
async generateDraft(input: GenerateGuideDraftInput): Promise<GuideSession> {
const session = await this.readSession(input.recordingId);
// Reuse stored candidates when present; otherwise derive them from the session.
const candidates =
session.candidates.length > 0 ? session.candidates : buildGuideStepCandidates(session);
if (candidates.length === 0) {
throw new GuideStoreError(
"guide-invalid-input",
"No guide events are available for drafting.",
);
}
let generatedGuide: GeneratedGuide;
if (input.provider === "local") {
generatedGuide = buildLocalGuideDraft(session, candidates, input.language);
} else {
const draftClient =
this.dependencies.draftClient ??
new DeepSeekGuideClient(this.dependencies.deepSeekConfigProvider);
try {
generatedGuide = await draftClient.generate({
session,
candidates,
language: input.language,
});
} catch (error) {
// Preserve the client's error code and retryable flag when it is available.
if (error instanceof DeepSeekGuideClientError) {
throw new GuideStoreError(error.code, error.message, error.retryable);
}
throw new GuideStoreError(
"guide-ai-request-failed",
error instanceof Error ? error.message : "Guide draft generation failed.",
true,
);
}
}
const updatedSession = touchSession({
...session,
candidates,
// Falls back to the raw draft when normalization rejects it.
generatedGuide: normalizeGeneratedGuide(generatedGuide) ?? generatedGuide,
status: "draft-ready",
});
await this.writeSession(updatedSession);
return updatedSession;
}
/**
 * Stores a reviewed guide on the session and marks the session as "reviewed".
 *
 * @param input Recording id and the guide to persist.
 * @returns The updated session.
 * @throws GuideStoreError "guide-invalid-input" when the guide fails normalization.
 */
async saveGuide(input: SaveGuideInput): Promise<GuideSession> {
  const existing = await this.readSession(input.recordingId);
  const reviewedGuide = normalizeGeneratedGuide(input.generatedGuide);
  if (reviewedGuide === null) {
    throw new GuideStoreError("guide-invalid-input", "Generated guide shape is invalid.");
  }
  const reviewedSession = touchSession({
    ...existing,
    generatedGuide: reviewedGuide,
    status: "reviewed",
  });
  await this.writeSession(reviewedSession);
  return reviewedSession;
}
/** Renders the session's generated guide as Markdown into "guide.md" in the output directory. */
async exportMarkdown(input: ExportGuideInput): Promise<ExportGuideResult> {
  const session = await this.readSession(input.recordingId);
  const renderMarkdown = () => exportGuideToMarkdown(session);
  return await this.writeGuideExport(session, "guide.md", renderMarkdown);
}
/** Renders the session's generated guide as HTML into "guide.html" in the output directory. */
async exportHtml(input: ExportGuideInput): Promise<ExportGuideResult> {
  const session = await this.readSession(input.recordingId);
  const renderHtml = () => exportGuideToHtml(session);
  return await this.writeGuideExport(session, "guide.html", renderHtml);
}
/**
 * Deletes the guide session file and its output directory.
 *
 * Paths stored in the session file are preferred; when the session cannot be
 * read, the default paths for the recording id are used instead.
 *
 * @param input Recording id of the session to discard.
 * @throws GuideStoreError "guide-invalid-input" when the recording id is invalid,
 *   a path lies outside the recordings directory, or the output directory
 *   resolves to the recordings directory itself.
 */
async discardSession(input: DiscardGuideSessionInput): Promise<void> {
  const paths = this.requireGuidePaths(input.recordingId);
  // Best effort: a missing or unreadable session still allows cleanup of default paths.
  const session = await this.readSession(input.recordingId).catch(() => null);
  const guidePath = session?.guidePath ?? paths.guidePath;
  const outputDir = session?.outputDir ?? paths.outputDir;
  this.assertGuidePathIsAllowed(guidePath);
  this.assertGuidePathIsAllowed(outputDir);
  // The allow-list check accepts the recordings root itself. outputDir comes from
  // a file on disk, so refuse a recursive delete that would remove every recording.
  if (path.resolve(outputDir) === path.resolve(this.recordingsDir)) {
    throw new GuideStoreError(
      "guide-invalid-input",
      "Guide output directory must not be the recordings directory itself.",
    );
  }
  await fs.unlink(guidePath).catch(() => undefined);
  await fs.rm(outputDir, { recursive: true, force: true });
}
/**
 * Writes a rendered guide export into the session's output directory.
 *
 * @param session Session whose generated guide is exported.
 * @param fileName File name to create inside the output directory.
 * @param renderContent Produces the file content; invoked after the directory exists.
 * @returns The written path together with the session.
 * @throws GuideStoreError "guide-invalid-input" without a generated guide or for a
 *   disallowed path; "guide-export-failed" (retryable) when rendering or writing fails.
 */
private async writeGuideExport(
  session: GuideSession,
  fileName: string,
  renderContent: () => string,
): Promise<ExportGuideResult> {
  if (!session.generatedGuide) {
    throw new GuideStoreError("guide-invalid-input", "Generate a guide draft before exporting.");
  }
  const { outputDir } = session;
  const destination = path.join(outputDir, fileName);
  this.assertGuidePathIsAllowed(destination);
  try {
    await fs.mkdir(outputDir, { recursive: true });
    const content = renderContent();
    await fs.writeFile(destination, content, "utf-8");
  } catch (cause) {
    const message = cause instanceof Error ? cause.message : "Guide export failed.";
    throw new GuideStoreError("guide-export-failed", message, true);
  }
  return { path: destination, session };
}
/**
 * Validates a session and persists its normalized form to the session's guide path.
 *
 * @throws GuideStoreError "guide-invalid-schema" when normalization rejects the
 *   session; "guide-invalid-input" when a path is outside the recordings directory.
 */
async writeSession(session: GuideSession): Promise<void> {
  const validated = normalizeGuideSession(session);
  if (validated === null) {
    throw new GuideStoreError("guide-invalid-schema", "Guide session schema is invalid.");
  }
  const { guidePath, outputDir } = validated;
  this.assertGuidePathIsAllowed(guidePath);
  this.assertGuidePathIsAllowed(outputDir);
  // Both the session file's parent and the output directory must exist before writing.
  await fs.mkdir(path.dirname(guidePath), { recursive: true });
  await fs.mkdir(outputDir, { recursive: true });
  await atomicWriteJson(guidePath, validated);
}
/**
 * Reads and validates the guide session stored at `guidePath`.
 *
 * @param guidePath Location of the session JSON file; must be inside the recordings directory.
 * @returns The normalized session.
 * @throws GuideStoreError "guide-invalid-input" for a disallowed path,
 *   "guide-session-not-found" when the file does not exist, and
 *   "guide-invalid-schema" when the file is not valid JSON or fails normalization.
 *   Other I/O errors are rethrown unchanged.
 */
private async readSessionAtPath(guidePath: string): Promise<GuideSession> {
  this.assertGuidePathIsAllowed(guidePath);
  let content: string;
  try {
    content = await fs.readFile(guidePath, "utf-8");
  } catch (error) {
    // Guarded property access instead of an unchecked cast to ErrnoException.
    if (isRecord(error) && error.code === "ENOENT") {
      throw new GuideStoreError("guide-session-not-found", "Guide session was not found.");
    }
    throw error;
  }
  let parsed: unknown;
  try {
    parsed = JSON.parse(content);
  } catch {
    // A truncated or corrupted file is reported as a schema problem rather than
    // leaking a raw SyntaxError to callers.
    throw new GuideStoreError("guide-invalid-schema", "Guide session file is not valid JSON.");
  }
  const session = normalizeGuideSession(parsed);
  if (!session) {
    throw new GuideStoreError("guide-invalid-schema", "Guide session schema is invalid.");
  }
  return session;
}
/**
 * Resolves the guide file and output directory for a recording and verifies
 * that both lie inside the recordings directory.
 *
 * @throws GuideStoreError "guide-invalid-input" when the paths cannot be
 *   resolved or are not allowed.
 */
private requireGuidePaths(
  recordingIdInput: AddGuideMarkerInput["recordingId"],
  videoPath?: string | null,
): GuidePaths {
  const resolved = resolveGuidePaths({
    recordingsDir: this.recordingsDir,
    recordingId: recordingIdInput,
    videoPath,
  });
  if (!resolved) {
    throw new GuideStoreError("guide-invalid-input", "Guide recordingId is invalid.");
  }
  for (const candidate of [resolved.guidePath, resolved.outputDir]) {
    this.assertGuidePathIsAllowed(candidate);
  }
  return resolved;
}
/**
 * Throws unless `targetPath` is accepted by `isPathAllowed`.
 *
 * @throws GuideStoreError "guide-invalid-input" for a disallowed path.
 */
private assertGuidePathIsAllowed(targetPath: string): void {
  if (!this.isPathAllowed(targetPath)) {
    throw new GuideStoreError(
      "guide-invalid-input",
      "Guide artifacts must be stored inside the recordings directory.",
    );
  }
}
/**
 * Finds the cursor telemetry file for a video.
 *
 * The explicit path (when non-empty) is tried first, then the
 * `<videoPath>.cursor.json` sidecar. Candidates outside the recordings
 * directory or not accessible are skipped.
 *
 * @returns The resolved absolute path of the first usable candidate, or
 *   `undefined` when none is usable.
 */
private async resolveCursorPath(
  videoPath: string,
  explicitCursorPath?: string,
): Promise<string | undefined> {
  const explicit = normalizeOptionalString(explicitCursorPath);
  const sidecar = `${videoPath}.cursor.json`;
  const candidates = explicit === undefined ? [sidecar] : [explicit, sidecar];
  for (const candidate of candidates) {
    const absolute = path.resolve(candidate);
    if (this.isPathAllowed(absolute)) {
      // Cursor telemetry is optional for guide sessions, so access failures are ignored.
      const accessible = await fs.access(absolute).then(
        () => true,
        () => false,
      );
      if (accessible) {
        return absolute;
      }
    }
  }
  return undefined;
}
/**
 * Loads cursor telemetry from `cursorPath` and converts it into guide events.
 *
 * The file may contain either an object with a `samples` array or a bare
 * array of samples; invalid samples are discarded. Any failure (I/O, JSON
 * parsing, conversion) is logged as a warning and yields an empty list.
 */
private async readCursorGuideEvents(
recordingId: string,
cursorPath: string,
): Promise<GuideEvent[]> {
try {
const content = await fs.readFile(cursorPath, "utf-8");
const parsed = JSON.parse(content) as unknown;
// Accept both `{ samples: [...] }` and a top-level array.
const rawSamples =
isRecord(parsed) && Array.isArray(parsed.samples) ? parsed.samples : parsed;
const samples = Array.isArray(rawSamples)
? rawSamples
.map(normalizeCursorSampleForGuide)
.filter((sample): sample is CursorRecordingSample => sample !== null)
: [];
return buildGuideEventsFromCursor({ recordingId, samples });
} catch (error) {
console.warn("Failed to read cursor telemetry for guide events:", error);
return [];
}
}
/**
 * Reports whether `targetPath` is the recordings directory itself or located
 * inside it.
 *
 * Both paths are resolved first, so relative segments cannot escape the check.
 */
private isPathAllowed(targetPath: string): boolean {
  const resolvedTarget = path.resolve(targetPath);
  const resolvedRecordingsDir = path.resolve(this.recordingsDir);
  const relative = path.relative(resolvedRecordingsDir, resolvedTarget);
  if (relative === "") {
    return true;
  }
  // Only a leading ".." path segment escapes the directory. A child whose name
  // merely begins with two dots (e.g. "..cache") is still inside it.
  const escapesRoot = relative === ".." || relative.startsWith(`..${path.sep}`);
  // path.relative returns an absolute path when no relative route exists
  // (for example a different drive on Windows).
  return !escapesRoot && !path.isAbsolute(relative);
}
}
/** Returns a copy of the session whose `updatedAt` is the current time in ISO-8601 form. */
function touchSession(session: GuideSession): GuideSession {
  const updatedAt = new Date().toISOString();
  return { ...session, updatedAt };
}
/** Returns a new array of the events ordered by ascending `timeMs`; the input is not mutated. */
function sortGuideEvents(events: GuideEvent[]): GuideEvent[] {
  const ordered = events.slice();
  ordered.sort((earlier, later) => earlier.timeMs - later.timeMs);
  return ordered;
}
/**
 * Validates one raw cursor telemetry sample.
 *
 * @returns The sample, or `null` when the input is not an object or lacks a
 *   valid `timeMs`, `cx` or `cy`. Unrecognized interaction types become "move".
 */
function normalizeCursorSampleForGuide(input: unknown): CursorRecordingSample | null {
  if (!isRecord(input)) {
    return null;
  }
  const timeMs = normalizeNonNegativeNumber(input.timeMs);
  const cx = normalizeOptionalNumber(input.cx);
  const cy = normalizeOptionalNumber(input.cy);
  if (timeMs === null || cx === undefined || cy === undefined) {
    return null;
  }
  const rawType = input.interactionType;
  const interactionType = rawType === "click" || rawType === "mouseup" ? rawType : "move";
  return { timeMs, cx, cy, interactionType };
}
/**
 * Writes `value` as pretty-printed JSON to `filePath` via a temporary sibling
 * file followed by a rename, so readers never observe a partially written file.
 *
 * @param filePath Destination file.
 * @param value Value to serialize with `JSON.stringify`.
 * @throws Rethrows the underlying write or rename error after removing the temporary file.
 */
async function atomicWriteJson(filePath: string, value: unknown): Promise<void> {
  // The random suffix keeps concurrent writes within the same millisecond from
  // sharing one temporary file.
  const uniqueSuffix = Math.random().toString(36).slice(2);
  const tempPath = `${filePath}.${process.pid}.${Date.now()}.${uniqueSuffix}.tmp`;
  try {
    await fs.writeFile(tempPath, JSON.stringify(value, null, 2), "utf-8");
    await fs.rename(tempPath, filePath);
  } catch (error) {
    // Do not leave a stray temp file next to the target when the write fails.
    await fs.rm(tempPath, { force: true }).catch(() => undefined);
    throw error;
  }
}
/**
 * Validates an untrusted value (for example parsed JSON) as a guide session.
 *
 * @returns A session containing only validated fields, or `null` when the
 *   schema version differs, a required field is missing, or a present
 *   `generatedGuide` is invalid. Invalid entries inside the array fields are
 *   dropped rather than failing the whole session.
 */
function normalizeGuideSession(input: unknown): GuideSession | null {
if (!isRecord(input) || input.schemaVersion !== GUIDE_SCHEMA_VERSION) {
return null;
}
const recordingId = normalizeString(input.recordingId);
const videoPath = normalizeString(input.videoPath);
const guidePath = normalizeString(input.guidePath);
const outputDir = normalizeString(input.outputDir);
const status = normalizeSessionStatus(input.status);
const createdAt = normalizeString(input.createdAt);
const updatedAt = normalizeString(input.updatedAt);
// videoPath only has to be a string (it may be empty); the other fields must be non-empty.
if (
!recordingId ||
videoPath === null ||
!guidePath ||
!outputDir ||
!status ||
!createdAt ||
!updatedAt
) {
return null;
}
// An absent guide is fine; a present but malformed one invalidates the session.
const generatedGuide =
input.generatedGuide === undefined ? undefined : normalizeGeneratedGuide(input.generatedGuide);
if (generatedGuide === null) {
return null;
}
return {
schemaVersion: GUIDE_SCHEMA_VERSION,
recordingId,
videoPath,
cursorPath: normalizeOptionalString(input.cursorPath),
guidePath,
outputDir,
status,
events: normalizeArray(input.events, normalizeGuideEvent),
snapshots: normalizeArray(input.snapshots, normalizeGuideSnapshot),
ocrBlocks: normalizeArray(input.ocrBlocks, normalizeOcrBlock),
candidates: normalizeArray(input.candidates, normalizeGuideStepCandidate),
generatedGuide,
createdAt,
updatedAt,
};
}
/**
 * Validates an untrusted value as a guide event.
 *
 * @returns The event, or `null` when `id`, `recordingId`, `kind`, `source`,
 *   `timeMs` or `createdAt` is missing or invalid. Optional fields that fail
 *   validation become `undefined`.
 */
function normalizeGuideEvent(input: unknown): GuideEvent | null {
if (!isRecord(input)) {
return null;
}
const id = normalizeString(input.id);
const recordingId = normalizeString(input.recordingId);
// The cast only satisfies Set.has's parameter type; membership is checked at runtime.
const kind = VALID_EVENT_KINDS.has(input.kind as GuideEventKind)
? (input.kind as GuideEventKind)
: null;
const source = VALID_EVENT_SOURCES.has(input.source as GuideEventSource)
? (input.source as GuideEventSource)
: null;
const timeMs = normalizeNonNegativeNumber(input.timeMs);
const createdAt = normalizeString(input.createdAt);
if (!id || !recordingId || !kind || !source || timeMs === null || !createdAt) {
return null;
}
return {
id,
recordingId,
kind,
source,
timeMs,
x: normalizeOptionalNumber(input.x),
y: normalizeOptionalNumber(input.y),
normalizedX: normalizeOptionalNumber(input.normalizedX),
normalizedY: normalizeOptionalNumber(input.normalizedY),
// Unknown button values are dropped instead of rejecting the event.
button:
input.button === "left" ||
input.button === "right" ||
input.button === "middle" ||
input.button === "unknown"
? input.button
: undefined,
label: normalizeOptionalString(input.label),
screenshotOffsetMs: normalizeOptionalNumber(input.screenshotOffsetMs),
createdAt,
};
}
/**
 * Validates an untrusted value as a guide snapshot.
 *
 * @returns The snapshot, or `null` when any field is missing or invalid. All
 *   fields are required, including `offsetMs`.
 */
function normalizeGuideSnapshot(input: unknown): GuideSnapshot | null {
  if (!isRecord(input)) {
    return null;
  }
  const id = normalizeString(input.id);
  const eventId = normalizeString(input.eventId);
  const pathValue = normalizeString(input.path);
  if (!id || !eventId || !pathValue) {
    return null;
  }
  const timeMs = normalizeNonNegativeNumber(input.timeMs);
  const offsetMs = normalizeOptionalNumber(input.offsetMs);
  if (timeMs === null || offsetMs === undefined) {
    return null;
  }
  const width = normalizePositiveInteger(input.width);
  const height = normalizePositiveInteger(input.height);
  if (width === null || height === null) {
    return null;
  }
  return { id, eventId, timeMs, offsetMs, path: pathValue, width, height };
}
/**
 * Validates an untrusted value as an OCR block.
 *
 * @returns The block, or `null` when the value or its `box` is not an object
 *   or a field is missing or invalid. `text` may be an empty string.
 */
function normalizeOcrBlock(input: unknown): OcrBlock | null {
  if (!isRecord(input)) {
    return null;
  }
  const rawBox = input.box;
  if (!isRecord(rawBox)) {
    return null;
  }
  const id = normalizeString(input.id);
  const snapshotId = normalizeString(input.snapshotId);
  const text = normalizeString(input.text);
  if (!id || !snapshotId || text === null) {
    return null;
  }
  const confidence = normalizeOptionalNumber(input.confidence);
  if (confidence === undefined) {
    return null;
  }
  const x = normalizeOptionalNumber(rawBox.x);
  const y = normalizeOptionalNumber(rawBox.y);
  const width = normalizeOptionalNumber(rawBox.width);
  const height = normalizeOptionalNumber(rawBox.height);
  if (x === undefined || y === undefined || width === undefined || height === undefined) {
    return null;
  }
  return { id, snapshotId, text, confidence, box: { x, y, width, height } };
}
/**
 * Validates an untrusted value as a guide step candidate.
 *
 * @returns The candidate, or `null` when `id`, `eventId`, `timeMs` or
 *   `confidence` is missing or invalid. An unrecognized `action` becomes
 *   "manual" and an unrecognized `targetRole` becomes `undefined`.
 */
function normalizeGuideStepCandidate(input: unknown): GuideStepCandidate | null {
if (!isRecord(input)) {
return null;
}
const id = normalizeString(input.id);
const eventId = normalizeString(input.eventId);
const timeMs = normalizeNonNegativeNumber(input.timeMs);
const confidence = normalizeOptionalNumber(input.confidence);
// Non-string entries are removed; a missing or non-array value yields an empty list.
const nearbyText = Array.isArray(input.nearbyText)
? input.nearbyText.map(normalizeString).filter((text): text is string => text !== null)
: [];
if (!id || !eventId || timeMs === null || confidence === undefined) {
return null;
}
return {
id,
eventId,
snapshotId: normalizeOptionalString(input.snapshotId),
timeMs,
action:
input.action === "click" ||
input.action === "choose" ||
input.action === "type" ||
input.action === "wait" ||
input.action === "manual"
? input.action
: "manual",
targetText: normalizeOptionalString(input.targetText),
targetRole:
input.targetRole === "button" ||
input.targetRole === "menu" ||
input.targetRole === "tab" ||
input.targetRole === "field" ||
input.targetRole === "link" ||
input.targetRole === "unknown"
? input.targetRole
: undefined,
nearbyText,
confidence,
};
}
/**
 * Validates an untrusted value as a generated guide.
 *
 * @returns The guide, or `null` when the value is not an object, `title` is
 *   missing or empty, or `steps` is not an array. Individual invalid steps are
 *   dropped, so the resulting `steps` array may be empty.
 */
function normalizeGeneratedGuide(input: unknown): GeneratedGuide | null {
if (!isRecord(input)) {
return null;
}
const title = normalizeString(input.title);
if (!title || !Array.isArray(input.steps)) {
return null;
}
const steps = input.steps
.map((step): GeneratedGuideStep | null => {
if (!isRecord(step)) {
return null;
}
const id = normalizeString(step.id);
const order = normalizePositiveInteger(step.order);
const stepTitle = normalizeString(step.title);
const instruction = normalizeString(step.instruction);
// A step needs a non-empty id, title and instruction plus a positive order.
if (!id || order === null || !stepTitle || !instruction) {
return null;
}
return {
id,
order,
title: stepTitle,
instruction,
screenshotPath: normalizeOptionalString(step.screenshotPath),
sourceCandidateId: normalizeOptionalString(step.sourceCandidateId),
};
})
.filter((step): step is GeneratedGuide["steps"][number] => step !== null);
return {
title,
summary: normalizeOptionalString(input.summary),
steps,
};
}
/**
 * Applies `normalize` to every element of `input` and keeps the non-null results.
 *
 * @returns The normalized elements in order, or an empty array when `input` is not an array.
 */
function normalizeArray<T>(input: unknown, normalize: (value: unknown) => T | null): T[] {
  if (!Array.isArray(input)) {
    return [];
  }
  return input.reduce<T[]>((kept, entry) => {
    const normalized = normalize(entry);
    if (normalized !== null) {
      kept.push(normalized);
    }
    return kept;
  }, []);
}
/** Returns `value` when it is a member of `VALID_SESSION_STATUSES`, otherwise `null`. */
function normalizeSessionStatus(value: unknown): GuideSessionStatus | null {
  if (VALID_SESSION_STATUSES.has(value as GuideSessionStatus)) {
    return value as GuideSessionStatus;
  }
  return null;
}
/** Returns `value` unchanged when it is a string (including the empty string), otherwise `null`. */
function normalizeString(value: unknown): string | null {
  if (typeof value === "string") {
    return value;
  }
  return null;
}
/** Returns `value` when it is a non-empty string, otherwise `undefined`. */
function normalizeOptionalString(value: unknown): string | undefined {
  const text = normalizeString(value);
  if (text === null || text === "") {
    return undefined;
  }
  return text;
}
/** Returns `value` when it is a finite number greater than or equal to zero, otherwise `null`. */
function normalizeNonNegativeNumber(value: unknown): number | null {
  if (typeof value !== "number") {
    return null;
  }
  return Number.isFinite(value) && value >= 0 ? value : null;
}
/** Returns `value` when it is a finite number, otherwise `undefined`. */
function normalizeOptionalNumber(value: unknown): number | undefined {
  if (typeof value === "number" && Number.isFinite(value)) {
    return value;
  }
  return undefined;
}
/**
 * Rounds a finite number to the nearest integer and returns it when the result
 * is at least 1.
 *
 * @returns The rounded integer, or `null` for non-numbers, non-finite values and
 *   values whose rounded result is not positive (for example 0.4, which rounds to 0).
 */
function normalizePositiveInteger(value: unknown): number | null {
  if (typeof value !== "number" || !Number.isFinite(value)) {
    return null;
  }
  // Check after rounding: a value in (0, 0.5) is > 0 but rounds to 0.
  const rounded = Math.round(value);
  return rounded >= 1 ? rounded : null;
}
/** Type guard for any non-null object (arrays included), enabling keyed property access. */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null) {
    return false;
  }
  return typeof value === "object";
}
+232
View File
@@ -0,0 +1,232 @@
import { type ChildProcessWithoutNullStreams, spawn } from "node:child_process";
import fs from "node:fs/promises";
import path from "node:path";
import { app } from "electron";
// Base URL probed for the OCR service's /health endpoint when the caller supplies none.
const DEFAULT_OCR_BASE_URL = "http://127.0.0.1:8866";
// Port handed to the spawned service (OPENSCREEN_OCR_PORT) and accepted by shouldManageOcrService.
const DEFAULT_OCR_PORT = "8866";
// File name of the bundled service executable searched for in the candidate directories.
const SERVICE_EXE_NAME = "openscreen-ocr-service.exe";
// Abort timeout for a single health request, in milliseconds.
const HEALTH_TIMEOUT_MS = 1000;
// Maximum time to wait for the spawned service to report healthy, in milliseconds.
const STARTUP_TIMEOUT_MS = 90000;
// Model directories copied from the bundled cache into the user-data cache before start-up.
const PADDLEX_MODEL_NAMES = ["PP-OCRv5_mobile_det", "latin_PP-OCRv5_mobile_rec"];
// Child process started by this module, or null when none is tracked.
let ocrProcess: ChildProcessWithoutNullStreams | null = null;
// In-flight start-up shared by concurrent callers; reset once it settles.
let startupPromise: Promise<void> | null = null;
// Ensures the before-quit handler is registered only once.
let quitHookRegistered = false;
/**
 * Starts the bundled OCR service when `baseUrl` targets the locally managed
 * endpoint and no healthy service answers there yet.
 *
 * Returns without doing anything when the URL is not managed, the service is
 * already healthy, or no bundled executable can be found. Concurrent callers
 * share a single start-up attempt.
 *
 * @param baseUrl Base URL of the OCR service.
 */
export async function ensureBundledOcrServiceRunning(
  baseUrl = DEFAULT_OCR_BASE_URL,
): Promise<void> {
  if (!shouldManageOcrService(baseUrl)) {
    return;
  }
  const alreadyHealthy = await isOcrServiceHealthy(baseUrl, HEALTH_TIMEOUT_MS);
  if (alreadyHealthy) {
    return;
  }
  const executablePath = await findBundledOcrServiceExecutable();
  if (!executablePath) {
    return;
  }
  if (startupPromise === null) {
    const pendingStartup = startAndWaitForOcrService(executablePath, baseUrl).finally(() => {
      startupPromise = null;
    });
    startupPromise = pendingStartup;
  }
  await startupPromise;
}
/**
 * Reports whether `baseUrl` is an http(s) URL on 127.0.0.1 or localhost whose
 * port is either omitted or equal to `DEFAULT_OCR_PORT`. Unparseable URLs
 * yield `false`.
 */
function shouldManageOcrService(baseUrl: string): boolean {
  let parsed: URL;
  try {
    parsed = new URL(baseUrl);
  } catch {
    return false;
  }
  const host = parsed.hostname.toLowerCase();
  const isHttp = parsed.protocol === "http:" || parsed.protocol === "https:";
  const isLoopbackHost = host === "127.0.0.1" || host === "localhost";
  const isManagedPort = parsed.port === "" || parsed.port === DEFAULT_OCR_PORT;
  return isHttp && isLoopbackHost && isManagedPort;
}
/**
 * Locates the OCR service executable.
 *
 * Candidates are checked in order: the `OPENSCREEN_GUIDE_OCR_EXE` environment
 * variable, two locations under `process.resourcesPath`, and the development
 * build output under the current working directory.
 *
 * @returns The first candidate that is an existing regular file, or `null`.
 */
async function findBundledOcrServiceExecutable(): Promise<string | null> {
  const resourcesDir = path.join(process.resourcesPath, "ocr-service");
  const rawCandidates = [
    process.env.OPENSCREEN_GUIDE_OCR_EXE,
    path.join(resourcesDir, SERVICE_EXE_NAME),
    path.join(resourcesDir, "openscreen-ocr-service", SERVICE_EXE_NAME),
    path.resolve(process.cwd(), "tools", "ocr", "dist", "openscreen-ocr-service", SERVICE_EXE_NAME),
  ];
  for (const candidate of rawCandidates) {
    if (typeof candidate !== "string" || candidate.length === 0) {
      continue;
    }
    // A failing stat just means this candidate is unavailable; try the next one.
    const isFile = await fs.stat(candidate).then(
      (stats) => stats.isFile(),
      () => false,
    );
    if (isFile) {
      return candidate;
    }
  }
  return null;
}
/**
 * Prepares the runtime directories, spawns the service unless a tracked child
 * is still running, and waits until the health endpoint responds.
 */
async function startAndWaitForOcrService(executablePath: string, baseUrl: string): Promise<void> {
  const runtimePaths = await prepareOcrRuntimePaths();
  const tracked = ocrProcess;
  const stillRunning = tracked !== null && tracked.exitCode === null && !tracked.killed;
  if (!stillRunning) {
    startOcrServiceProcess(executablePath, runtimePaths);
  }
  await waitForOcrServiceHealth(baseUrl, STARTUP_TIMEOUT_MS);
}
/**
 * Computes the model cache locations under the app's user-data directory and
 * seeds the PaddleX cache from the bundled models.
 */
async function prepareOcrRuntimePaths(): Promise<{
  modelCachePath: string;
  paddlexCachePath: string;
}> {
  const userDataDir = app.getPath("userData");
  const modelCachePath = path.join(userDataDir, "ocr-models");
  const paddlexCachePath = path.join(modelCachePath, "paddlex");
  await seedBundledPaddlexModels(paddlexCachePath);
  return { modelCachePath, paddlexCachePath };
}
/**
 * Copies each bundled PaddleX model listed in `PADDLEX_MODEL_NAMES` into
 * `destinationCachePath/official_models`, skipping models that are missing
 * from the bundle or already present at the destination.
 *
 * Does nothing when no bundled model cache is found.
 */
async function seedBundledPaddlexModels(destinationCachePath: string): Promise<void> {
  const bundledCachePath = await findBundledPaddlexModelCache();
  if (!bundledCachePath) {
    return;
  }
  const bundledModelsDir = path.join(bundledCachePath, "official_models");
  const targetModelsDir = path.join(destinationCachePath, "official_models");
  await fs.mkdir(targetModelsDir, { recursive: true });
  for (const modelName of PADDLEX_MODEL_NAMES) {
    const bundledModelPath = path.join(bundledModelsDir, modelName);
    const targetModelPath = path.join(targetModelsDir, modelName);
    const isBundled = await pathExists(bundledModelPath);
    if (!isBundled) {
      continue;
    }
    const alreadySeeded = await pathExists(targetModelPath);
    if (alreadySeeded) {
      continue;
    }
    await fs.cp(bundledModelPath, targetModelPath, {
      recursive: true,
      errorOnExist: false,
      force: false,
    });
  }
}
/**
 * Locates the bundled PaddleX model cache, checking the packaged resources
 * directory first and the development tree second.
 *
 * @returns The first candidate that is an existing directory, or `null`.
 */
async function findBundledPaddlexModelCache(): Promise<string | null> {
  const packagedCache = path.join(process.resourcesPath, "ocr-models", "paddlex");
  const developmentCache = path.resolve(process.cwd(), "tools", "ocr", "models", "paddlex");
  for (const candidate of [packagedCache, developmentCache]) {
    // A failing stat just means this candidate is unavailable; try the next one.
    const isDirectory = await fs.stat(candidate).then(
      (stats) => stats.isDirectory(),
      () => false,
    );
    if (isDirectory) {
      return candidate;
    }
  }
  return null;
}
/** Resolves to `true` when `fs.access` succeeds for the path and `false` when it fails. */
async function pathExists(value: string): Promise<boolean> {
  return fs.access(value).then(
    () => true,
    () => false,
  );
}
/**
 * Spawns the OCR service executable and tracks it in `ocrProcess`.
 *
 * The child inherits the current environment plus OCR defaults; PaddleOCR
 * variables already set in the environment take precedence over those
 * defaults. Output of the child is forwarded to the console.
 *
 * @param executablePath Path of the service executable; its directory becomes the working directory.
 * @param runtimePaths Cache directories handed to the service through environment variables.
 */
function startOcrServiceProcess(
  executablePath: string,
  runtimePaths: { modelCachePath: string; paddlexCachePath: string },
): void {
  registerQuitHook();
  const child = spawn(executablePath, [], {
    cwd: path.dirname(executablePath),
    env: {
      ...process.env,
      OPENSCREEN_OCR_HOST: "127.0.0.1",
      OPENSCREEN_OCR_PORT: DEFAULT_OCR_PORT,
      PADDLEOCR_DEVICE: process.env.PADDLEOCR_DEVICE ?? "cpu",
      PADDLEOCR_ENABLE_MKLDNN: process.env.PADDLEOCR_ENABLE_MKLDNN ?? "0",
      PADDLEOCR_LANG: process.env.PADDLEOCR_LANG ?? "latin",
      PADDLEOCR_USE_MOBILE: process.env.PADDLEOCR_USE_MOBILE ?? "1",
      PADDLE_PDX_ENABLE_MKLDNN_BYDEFAULT: process.env.PADDLE_PDX_ENABLE_MKLDNN_BYDEFAULT ?? "False",
      PADDLE_PDX_CACHE_HOME: process.env.PADDLE_PDX_CACHE_HOME ?? runtimePaths.paddlexCachePath,
      PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK:
        process.env.PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK ?? "True",
      PADDLE_HOME: process.env.PADDLE_HOME ?? path.join(runtimePaths.modelCachePath, "paddle"),
      PADDLEOCR_HOME:
        process.env.PADDLEOCR_HOME ?? path.join(runtimePaths.modelCachePath, "paddleocr"),
      PYTHONUTF8: "1",
    },
    windowsHide: true,
  });
  ocrProcess = child;
  // Stop tracking this child only if it is still the tracked one, so a late
  // event from an old child cannot clear a newer process.
  const releaseIfCurrent = (): void => {
    if (ocrProcess === child) {
      ocrProcess = null;
    }
  };
  child.stdout.on("data", (chunk) => {
    console.info(`[guide-ocr-service] ${chunk.toString().trim()}`);
  });
  child.stderr.on("data", (chunk) => {
    console.warn(`[guide-ocr-service] ${chunk.toString().trim()}`);
  });
  // Without an 'error' listener a spawn failure (missing or non-executable file)
  // is raised as an unhandled 'error' event on the child.
  child.on("error", (error) => {
    console.warn("[guide-ocr-service] failed to start", error);
    releaseIfCurrent();
  });
  child.on("exit", (code, signal) => {
    console.info("[guide-ocr-service] exited", { code, signal });
    releaseIfCurrent();
  });
}
/** Registers, at most once, a before-quit handler that kills the tracked OCR process. */
function registerQuitHook(): void {
  if (!quitHookRegistered) {
    quitHookRegistered = true;
    app.once("before-quit", () => {
      const running = ocrProcess;
      // Clear the reference first so nothing else treats the process as live.
      ocrProcess = null;
      if (running !== null) {
        running.kill();
      }
    });
  }
}
/**
 * Polls the OCR service's health endpoint until it responds successfully.
 *
 * @param baseUrl Base URL of the service.
 * @param timeoutMs Maximum time to keep polling, in milliseconds.
 * @throws Error when the tracked service process is gone or has exited before
 *   becoming healthy, or when the timeout elapses.
 */
async function waitForOcrServiceHealth(baseUrl: string, timeoutMs: number): Promise<void> {
  const startedAt = Date.now();
  while (Date.now() - startedAt < timeoutMs) {
    if (await isOcrServiceHealthy(baseUrl, HEALTH_TIMEOUT_MS)) {
      return;
    }
    // The process's exit handler resets `ocrProcess` to null, so a missing
    // process means the service died; fail now instead of polling until timeout.
    const child = ocrProcess;
    if (child === null) {
      throw new Error("Bundled OCR service exited before it became healthy.");
    }
    if (child.exitCode !== null) {
      throw new Error(`Bundled OCR service exited with code ${child.exitCode}.`);
    }
    await sleep(750);
  }
  throw new Error("Timed out waiting for bundled OCR service to start.");
}
/**
 * Requests `<baseUrl>/health` and reports whether the response status is OK.
 * The request is aborted after `timeoutMs`; any failure yields `false`.
 */
async function isOcrServiceHealthy(baseUrl: string, timeoutMs: number): Promise<boolean> {
  const abortController = new AbortController();
  const abortTimer = setTimeout(() => abortController.abort(), timeoutMs);
  try {
    // A single trailing slash is removed so the URL never contains "//health".
    const healthUrl = `${baseUrl.replace(/\/$/, "")}/health`;
    const { ok } = await fetch(healthUrl, { signal: abortController.signal });
    return ok;
  } catch {
    return false;
  } finally {
    clearTimeout(abortTimer);
  }
}
/** Returns a promise that resolves after roughly `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
@@ -0,0 +1,33 @@
import { describe, expect, it } from "vitest";
import type { OcrBlock } from "../../../src/guide/contracts";
import { remapFocusedOcrBlocks } from "./focusedOcrSnapshot";
// Verifies that boxes normalized to a focused crop are converted to coordinates
// normalized to the original snapshot.
describe("remapFocusedOcrBlocks", () => {
it("maps boxes from a focused crop back to the original snapshot coordinates", () => {
const blocks: OcrBlock[] = [
{
id: "ocr-1",
snapshotId: "snapshot-1",
text: "Settings",
confidence: 0.9,
box: { x: 0.25, y: 0.5, width: 0.2, height: 0.1 },
},
];
// A 640x360 crop at offset (320, 180) inside a 1280x720 snapshot.
const remapped = remapFocusedOcrBlocks(blocks, {
cropX: 320,
cropY: 180,
cropWidth: 640,
cropHeight: 360,
originalWidth: 1280,
originalHeight: 720,
});
// Example for x: (320 + 0.25 * 640) / 1280 = 0.375.
expect(remapped[0]?.box).toEqual({
x: 0.375,
y: 0.5,
width: 0.1,
height: 0.05,
});
});
});
+225
View File
@@ -0,0 +1,225 @@
import { execFile } from "node:child_process";
import fs from "node:fs/promises";
import path from "node:path";
import { promisify } from "node:util";
import type { GuideEvent, GuideSnapshot, OcrBlock } from "../../../src/guide/contracts";
// Promise-returning variant of child_process.execFile, used to run PowerShell.
const execFileAsync = promisify(execFile);
/**
 * Describes where a focused crop sits inside the original snapshot; used to map
 * crop-relative OCR boxes back to the full image.
 */
interface FocusTransform {
// Left and top offset of the crop within the original image.
cropX: number;
cropY: number;
// Size of the cropped region.
cropWidth: number;
cropHeight: number;
// Size of the original snapshot.
originalWidth: number;
originalHeight: number;
}
/** Snapshot to hand to OCR, plus the crop transform when a focused crop was produced. */
export interface FocusedOcrSnapshot {
snapshot: GuideSnapshot;
// Absent when the original snapshot is used unchanged.
transform?: FocusTransform;
}
/**
 * Creates a cropped, 2x-enlarged copy of a snapshot around the event's pointer
 * position so OCR can concentrate on the area of interest.
 *
 * The original snapshot is returned unchanged (without a transform) when not
 * running on Windows, when the event has no usable position, when the crop
 * would cover the whole image, or when writing the crop fails.
 *
 * @param input Snapshot, its optional source event and the directory for generated files.
 * @returns The snapshot to run OCR on and, for crops, the transform needed to map boxes back.
 */
export async function createFocusedOcrSnapshot(input: {
snapshot: GuideSnapshot;
event?: GuideEvent;
outputDir: string;
}): Promise<FocusedOcrSnapshot> {
// The crop is produced by a PowerShell script, so it is only attempted on Windows.
if (process.platform !== "win32") {
return { snapshot: input.snapshot };
}
const click = getEventPoint(input.event, input.snapshot);
if (!click) {
return { snapshot: input.snapshot };
}
const crop = calculateFocusCrop(input.snapshot, click);
if (
!crop ||
(crop.cropWidth === input.snapshot.width && crop.cropHeight === input.snapshot.height)
) {
return { snapshot: input.snapshot };
}
const focusDir = path.join(input.outputDir, "ocr-focus");
await fs.mkdir(focusDir, { recursive: true });
const focusPath = path.join(focusDir, `${path.parse(input.snapshot.path).name}-focus.png`);
const zoom = 2;
// Same id and eventId as the source snapshot, but pointing at the enlarged crop.
const focusedSnapshot: GuideSnapshot = {
...input.snapshot,
path: focusPath,
width: crop.cropWidth * zoom,
height: crop.cropHeight * zoom,
};
try {
await writeFocusedPng({
sourcePath: input.snapshot.path,
outputPath: focusPath,
cropX: crop.cropX,
cropY: crop.cropY,
cropWidth: crop.cropWidth,
cropHeight: crop.cropHeight,
outputWidth: focusedSnapshot.width,
outputHeight: focusedSnapshot.height,
});
return { snapshot: focusedSnapshot, transform: crop };
} catch {
// Cropping is an optimization; fall back to the full snapshot on any failure.
return { snapshot: input.snapshot };
}
}
/**
 * Converts OCR boxes normalized to a focused crop into boxes normalized to the
 * original snapshot, clamping every coordinate to [0, 1].
 *
 * @param blocks Blocks recognized on the (possibly cropped) snapshot.
 * @param transform Crop description; when absent the blocks are returned as-is.
 */
export function remapFocusedOcrBlocks(
  blocks: OcrBlock[],
  transform: FocusedOcrSnapshot["transform"],
): OcrBlock[] {
  if (!transform) {
    return blocks;
  }
  const { cropX, cropY, cropWidth, cropHeight, originalWidth, originalHeight } = transform;
  return blocks.map((block) => {
    const { box } = block;
    const x = clamp01((cropX + box.x * cropWidth) / originalWidth);
    const y = clamp01((cropY + box.y * cropHeight) / originalHeight);
    const width = clamp01((box.width * cropWidth) / originalWidth);
    const height = clamp01((box.height * cropHeight) / originalHeight);
    return { ...block, box: { x, y, width, height } };
  });
}
/**
 * Derives the event's pointer position as fractions of the snapshot size.
 *
 * Preference order: `normalizedX`/`normalizedY`; then `x`/`y` when both already
 * lie in [0, 1]; then `x`/`y` treated as pixel coordinates inside the snapshot.
 *
 * @returns The position, or `null` when the event is missing or has no usable coordinates.
 */
function getEventPoint(
  event: GuideEvent | undefined,
  snapshot: GuideSnapshot,
): { x: number; y: number } | null {
  if (!event) {
    return null;
  }
  const { normalizedX, normalizedY, x, y } = event;
  if (isNormalizedNumber(normalizedX) && isNormalizedNumber(normalizedY)) {
    return { x: normalizedX, y: normalizedY };
  }
  if (isNormalizedNumber(x) && isNormalizedNumber(y)) {
    return { x, y };
  }
  if (typeof x !== "number" || typeof y !== "number") {
    return null;
  }
  const insideSnapshot = x >= 0 && y >= 0 && x <= snapshot.width && y <= snapshot.height;
  if (!insideSnapshot) {
    return null;
  }
  return { x: clamp01(x / snapshot.width), y: clamp01(y / snapshot.height) };
}
/**
 * Computes a crop rectangle centered on the click and kept inside the snapshot.
 *
 * The crop is 42% of each snapshot dimension, bounded to 360–720 px wide and
 * 240–520 px high, and never larger than the snapshot itself.
 *
 * @param click Pointer position as fractions of the snapshot size.
 * @returns The crop transform, or `null` when the snapshot has no area.
 */
function calculateFocusCrop(
  snapshot: GuideSnapshot,
  click: { x: number; y: number },
): FocusTransform | null {
  const { width, height } = snapshot;
  if (width <= 0 || height <= 0) {
    return null;
  }
  const cropWidth = clampInteger(
    Math.round(width * 0.42),
    Math.min(360, width),
    Math.min(720, width),
  );
  const cropHeight = clampInteger(
    Math.round(height * 0.42),
    Math.min(240, height),
    Math.min(520, height),
  );
  const centerX = Math.round(clamp01(click.x) * width);
  const centerY = Math.round(clamp01(click.y) * height);
  // Shift the rectangle back inside the image when the click is near an edge.
  const cropX = clampInteger(Math.round(centerX - cropWidth / 2), 0, width - cropWidth);
  const cropY = clampInteger(Math.round(centerY - cropHeight / 2), 0, height - cropHeight);
  return {
    cropX,
    cropY,
    cropWidth,
    cropHeight,
    originalWidth: width,
    originalHeight: height,
  };
}
/**
 * Runs the generated crop script in PowerShell to write the cropped, resized PNG.
 *
 * The script is passed through `-EncodedCommand` as base64 of its UTF-16LE
 * bytes. Rejects when PowerShell fails or the 30 s timeout is exceeded.
 */
async function writeFocusedPng(input: {
  sourcePath: string;
  outputPath: string;
  cropX: number;
  cropY: number;
  cropWidth: number;
  cropHeight: number;
  outputWidth: number;
  outputHeight: number;
}): Promise<void> {
  const encodedCommand = Buffer.from(buildCropScript(input), "utf16le").toString("base64");
  const powershellArgs = [
    "-NoProfile",
    "-ExecutionPolicy",
    "Bypass",
    "-EncodedCommand",
    encodedCommand,
  ];
  const execOptions = {
    timeout: 30000,
    maxBuffer: 1024 * 1024,
    windowsHide: true,
  };
  await execFileAsync("powershell.exe", powershellArgs, execOptions);
}
/**
 * Builds the PowerShell script that crops `sourcePath` to the given rectangle,
 * scales it to the output size with System.Drawing and saves it as a PNG at
 * `outputPath`.
 *
 * Both paths are embedded as base64 and decoded inside the script, so
 * characters in a path cannot terminate or alter the script text. The numeric
 * values are interpolated directly.
 */
function buildCropScript(input: {
sourcePath: string;
outputPath: string;
cropX: number;
cropY: number;
cropWidth: number;
cropHeight: number;
outputWidth: number;
outputHeight: number;
}): string {
const sourcePathBase64 = Buffer.from(input.sourcePath, "utf8").toString("base64");
const outputPathBase64 = Buffer.from(input.outputPath, "utf8").toString("base64");
// The script disposes the graphics object and both images in its finally block.
return `
$ErrorActionPreference = "Stop"
$sourcePath = [System.Text.Encoding]::UTF8.GetString([Convert]::FromBase64String("${sourcePathBase64}"))
$outputPath = [System.Text.Encoding]::UTF8.GetString([Convert]::FromBase64String("${outputPathBase64}"))
Add-Type -AssemblyName System.Drawing
$source = [System.Drawing.Image]::FromFile($sourcePath)
$target = [System.Drawing.Bitmap]::new(${input.outputWidth}, ${input.outputHeight})
$graphics = [System.Drawing.Graphics]::FromImage($target)
try {
$graphics.Clear([System.Drawing.Color]::White)
$graphics.InterpolationMode = [System.Drawing.Drawing2D.InterpolationMode]::HighQualityBicubic
$graphics.SmoothingMode = [System.Drawing.Drawing2D.SmoothingMode]::HighQuality
$graphics.PixelOffsetMode = [System.Drawing.Drawing2D.PixelOffsetMode]::HighQuality
$sourceRect = [System.Drawing.Rectangle]::new(${input.cropX}, ${input.cropY}, ${input.cropWidth}, ${input.cropHeight})
$targetRect = [System.Drawing.Rectangle]::new(0, 0, ${input.outputWidth}, ${input.outputHeight})
$graphics.DrawImage($source, $targetRect, $sourceRect, [System.Drawing.GraphicsUnit]::Pixel)
$target.Save($outputPath, [System.Drawing.Imaging.ImageFormat]::Png)
} finally {
$graphics.Dispose()
$target.Dispose()
$source.Dispose()
}
`;
}
/** Type guard for a finite number within the inclusive range [0, 1]. */
function isNormalizedNumber(value: unknown): value is number {
  if (typeof value !== "number") {
    return false;
  }
  return Number.isFinite(value) && value >= 0 && value <= 1;
}
/**
 * Clamps `value` to [min, max] and rounds the result to the nearest integer.
 * When the range is inverted (`max < min`), `min` is returned as given.
 */
function clampInteger(value: number, min: number, max: number): number {
  if (max < min) {
    return min;
  }
  const raisedToMin = Math.max(min, value);
  const loweredToMax = Math.min(max, raisedToMin);
  return Math.round(loweredToMax);
}
/** Clamps `value` to the range [0, 1]; non-finite inputs (NaN, ±Infinity) yield 0. */
function clamp01(value: number): number {
  return Number.isFinite(value) ? Math.min(1, Math.max(0, value)) : 0;
}
+110
View File
@@ -0,0 +1,110 @@
import { describe, expect, it } from "vitest";
import type { GuideSnapshot, OcrBlock } from "../../../src/guide/contracts";
import {
DefaultGuideOcrClient,
normalizeOcrResponse,
parseWindowsOcrPayload,
} from "./paddleOcrClient";
// Shared snapshot fixture for the tests below. The 1000x800 size is what the
// expected normalized boxes are computed against (pixel value / width or height).
const snapshot: GuideSnapshot = {
id: "snapshot-1",
eventId: "event-1",
timeMs: 1000,
offsetMs: 500,
path: "/tmp/step-001.png",
width: 1000,
height: 800,
};
// Covers both accepted payload shapes: an object with a `blocks` array of
// pixel boxes, and a bare array whose entries carry a four-point polygon.
describe("normalizeOcrResponse", () => {
	it("normalizes pixel boxes into guide OCR blocks", () => {
		// Confidence is given as a percentage and the box in pixels.
		const pixelPayload = {
			blocks: [
				{
					text: "Save",
					confidence: 92,
					box: { x: 400, y: 320, width: 120, height: 40 },
				},
			],
		};
		const expectedBlocks = [
			{
				id: "ocr-snapshot-1-1",
				snapshotId: "snapshot-1",
				text: "Save",
				confidence: 0.92,
				box: { x: 0.4, y: 0.4, width: 0.12, height: 0.05 },
			},
		];

		expect(normalizeOcrResponse(pixelPayload, snapshot)).toEqual(expectedBlocks);
	});

	it("normalizes polygon responses", () => {
		// Four corner points; the expected box is their axis-aligned bounds.
		const polygonPayload = [
			{
				text: "Next",
				score: 0.8,
				bbox: [
					[100, 200],
					[300, 200],
					[300, 260],
					[100, 260],
				],
			},
		];

		const [firstBlock] = normalizeOcrResponse(polygonPayload, snapshot);

		expect(firstBlock).toMatchObject({
			text: "Next",
			confidence: 0.8,
			box: { x: 0.1, y: 0.25, width: 0.2, height: 0.075 },
		});
	});
});
// Verifies that a failing primary client hands over to the secondary client.
describe("DefaultGuideOcrClient", () => {
	it("falls back when the HTTP OCR service is unavailable", async () => {
		const fallbackBlock: OcrBlock = {
			id: "ocr-snapshot-1-1",
			snapshotId: "snapshot-1",
			text: "Save",
			confidence: 0.75,
			box: { x: 0.1, y: 0.2, width: 0.3, height: 0.4 },
		};
		// Primary stub always rejects; secondary stub resolves with one block.
		const failingHttpClient = {
			recognize: async () => {
				throw new Error("HTTP down");
			},
		};
		const workingFallbackClient = {
			recognize: async () => [fallbackBlock],
		};

		const client = new DefaultGuideOcrClient(failingHttpClient, workingFallbackClient);
		const recognition = client.recognize(snapshot);

		await expect(recognition).resolves.toEqual([fallbackBlock]);
	});
});
// The `\u0001` escape below puts a raw U+0001 inside the JSON string value,
// which strict JSON.parse rejects; the parser is expected to recover from it.
describe("parseWindowsOcrPayload", () => {
	it("recovers from raw control characters in OCR text", () => {
		const stdoutWithControlCharacter =
			'{"blocks":[{"text":"Save\u0001now","confidence":0.75,"box":{"x":1,"y":2,"width":3,"height":4}}]}';
		const expectedBlock = {
			text: "Save now",
			confidence: 0.75,
			box: { x: 1, y: 2, width: 3, height: 4 },
		};

		const payload = parseWindowsOcrPayload(stdoutWithControlCharacter);

		expect(payload).toEqual({ blocks: [expectedBlock] });
	});
});
+372
View File
@@ -0,0 +1,372 @@
import { execFile } from "node:child_process";
import fs from "node:fs/promises";
import { promisify } from "node:util";
import type { GuideSnapshot, OcrBlock } from "../../../src/guide/contracts";
import { ensureBundledOcrServiceRunning } from "./bundledOcrService";
/** Promise-returning form of `child_process.execFile`, created with `util.promisify`. */
const execFileAsync = promisify(execFile);
/** Contract implemented by every OCR backend in this module. */
export interface GuideOcrClient {
/** Runs OCR for the given snapshot and resolves with the recognized blocks. */
recognize(snapshot: GuideSnapshot): Promise<OcrBlock[]>;
}
/**
 * Loose shape of one raw block in an OCR response. Every field is `unknown`
 * and optional; values are validated when the block is normalized.
 */
interface PaddleOcrResponseBlock {
/** Recognized text; blocks without a non-empty string here are dropped. */
text?: unknown;
/** Confidence value; read in preference to `score`. */
confidence?: unknown;
/** Alternative confidence field, used when `confidence` is null or undefined. */
score?: unknown;
/** Bounding box; read in preference to `bbox`. */
box?: unknown;
/** Alternative bounding-box field, used when `box` is null or undefined. */
bbox?: unknown;
}
/**
 * OCR client that posts a snapshot image to an HTTP OCR service at `<baseUrl>/ocr`.
 *
 * Constructor defaults come from `OPENSCREEN_GUIDE_OCR_URL` and
 * `OPENSCREEN_GUIDE_OCR_LANGUAGE`. `requestTimeoutMs` bounds the whole HTTP
 * exchange (request plus body read) so an unresponsive service rejects
 * instead of blocking the caller indefinitely.
 */
export class PaddleOcrHttpClient implements GuideOcrClient {
	constructor(
		private readonly baseUrl = process.env.OPENSCREEN_GUIDE_OCR_URL ?? "http://127.0.0.1:8866",
		private readonly language = process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE ?? "vi,en",
		private readonly requestTimeoutMs = 60000,
	) {}

	/**
	 * Sends the snapshot image to the OCR service and normalizes its reply.
	 *
	 * @param snapshot - Snapshot whose `path` is read from disk and uploaded as base64.
	 * @returns OCR blocks normalized against the snapshot.
	 * @throws Error when the service cannot be reached or times out, answers with a
	 *   non-2xx status, or returns a body that cannot be read as JSON. Errors from
	 *   `ensureBundledOcrServiceRunning` and from reading the image propagate unchanged.
	 */
	async recognize(snapshot: GuideSnapshot): Promise<OcrBlock[]> {
		// NOTE(review): presumably starts or waits for the bundled service — confirm in bundledOcrService.
		await ensureBundledOcrServiceRunning(this.baseUrl);
		const imageBase64 = await fs.readFile(snapshot.path, "base64");

		const controller = new AbortController();
		const timeoutHandle = setTimeout(() => controller.abort(), this.requestTimeoutMs);
		// An abort surfaces as a generic error from fetch/json; report it as a timeout.
		const describeFailure = (error: unknown): string => {
			if (controller.signal.aborted) {
				return `request timed out after ${this.requestTimeoutMs} ms`;
			}
			return error instanceof Error ? error.message : String(error);
		};

		try {
			let response: Response;
			try {
				response = await fetch(`${this.baseUrl.replace(/\/$/, "")}/ocr`, {
					method: "POST",
					headers: { "content-type": "application/json" },
					body: JSON.stringify({
						imageBase64,
						path: snapshot.path,
						language: this.language,
					}),
					signal: controller.signal,
				});
			} catch (error) {
				throw new Error(`OCR service is unavailable: ${describeFailure(error)}`);
			}
			if (!response.ok) {
				throw new Error(`OCR service returned HTTP ${response.status}.`);
			}
			let payload: unknown;
			try {
				payload = await response.json();
			} catch (error) {
				throw new Error(`OCR service returned an unreadable response: ${describeFailure(error)}`);
			}
			return normalizeOcrResponse(payload, snapshot);
		} finally {
			// Always release the timer so it cannot fire after the call has settled.
			clearTimeout(timeoutHandle);
		}
	}
}
/**
 * OCR client that runs a generated PowerShell script through `powershell.exe`
 * and parses the JSON it prints. Only usable when `process.platform` is `win32`.
 */
export class WindowsOcrClient implements GuideOcrClient {
	constructor(private readonly language = process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE ?? "vi,en") {}

	/**
	 * Runs OCR for the snapshot image.
	 *
	 * @param snapshot - Snapshot whose `path` is handed to the OCR script.
	 * @returns OCR blocks normalized against the snapshot.
	 * @throws Error on non-Windows platforms, when the PowerShell process fails,
	 *   or when its output cannot be parsed as JSON.
	 */
	async recognize(snapshot: GuideSnapshot): Promise<OcrBlock[]> {
		if (process.platform !== "win32") {
			throw new Error("Windows OCR fallback is only available on Windows.");
		}
		const rawOutput = await this.runOcrScript(snapshot.path);
		const payload = this.decodeOutput(rawOutput);
		return normalizeOcrResponse(payload, snapshot);
	}

	/** Executes the OCR script for `imagePath` and resolves with the process stdout. */
	private async runOcrScript(imagePath: string): Promise<string> {
		const script = buildWindowsOcrScript(imagePath, this.language);
		// -EncodedCommand takes the script as base64 of its UTF-16LE bytes.
		const encodedCommand = Buffer.from(script, "utf16le").toString("base64");
		const powershellArgs = [
			"-NoProfile",
			"-ExecutionPolicy",
			"Bypass",
			"-EncodedCommand",
			encodedCommand,
		];
		try {
			const { stdout } = await execFileAsync("powershell.exe", powershellArgs, {
				maxBuffer: 8 * 1024 * 1024,
				timeout: 30000,
				windowsHide: true,
			});
			return stdout;
		} catch (error) {
			const reason = error instanceof Error ? error.message : String(error);
			throw new Error(`Windows OCR failed: ${reason}`);
		}
	}

	/** Parses script output, rewrapping parse failures with a descriptive message. */
	private decodeOutput(rawOutput: string): unknown {
		try {
			return parseWindowsOcrPayload(rawOutput);
		} catch (error) {
			const reason = error instanceof Error ? error.message : String(error);
			throw new Error(`Windows OCR returned invalid JSON: ${reason}`);
		}
	}
}
/**
 * OCR client that tries the HTTP client first and falls back to the Windows
 * client when the HTTP attempt throws.
 *
 * Both collaborators are typed as `GuideOcrClient` rather than the concrete
 * classes, so any implementation of the interface (including plain-object
 * test doubles) can be injected. The concrete classes carry private members,
 * which a structural stand-in could not otherwise satisfy.
 */
export class DefaultGuideOcrClient implements GuideOcrClient {
	constructor(
		private readonly httpClient: GuideOcrClient = new PaddleOcrHttpClient(),
		private readonly windowsClient: GuideOcrClient = new WindowsOcrClient(),
	) {}

	/**
	 * Recognizes text using the primary client, then the fallback.
	 *
	 * @param snapshot - Snapshot to run OCR on.
	 * @returns Blocks from whichever client succeeds first.
	 * @throws Error carrying both failure messages, joined by a space, when both clients fail.
	 */
	async recognize(snapshot: GuideSnapshot): Promise<OcrBlock[]> {
		try {
			return await this.httpClient.recognize(snapshot);
		} catch (httpError) {
			try {
				return await this.windowsClient.recognize(snapshot);
			} catch (fallbackError) {
				// Keep both reasons so the caller can see why each backend failed.
				throw new Error(
					[
						httpError instanceof Error ? httpError.message : String(httpError),
						fallbackError instanceof Error ? fallbackError.message : String(fallbackError),
					].join(" "),
				);
			}
		}
	}
}
/**
 * Parses the JSON text printed by the Windows OCR script.
 *
 * A leading byte-order mark and surrounding whitespace are removed first. If
 * strict parsing fails, the text is parsed again with raw control characters
 * replaced by spaces.
 *
 * @param stdout - Raw standard output of the OCR process.
 * @returns The parsed JSON value.
 * @throws SyntaxError when the text is still not valid JSON after the second attempt.
 */
export function parseWindowsOcrPayload(stdout: string): unknown {
	const jsonText = stdout.replace(/^\uFEFF/, "").trim();
	let parsed: unknown;
	try {
		parsed = JSON.parse(jsonText);
	} catch {
		// Second chance: unescaped control characters are what strict parsing rejects.
		parsed = JSON.parse(replaceRawJsonControlCharacters(jsonText));
	}
	return parsed;
}
/**
 * Replaces every control character (code below 32, or 127) with a single space.
 *
 * @param value - Text to sanitize.
 * @returns The text with each control character swapped for `" "`.
 */
function replaceRawJsonControlCharacters(value: string): string {
	// Array.from walks the string by code point, so surrogate pairs stay intact.
	return Array.from(value, (symbol) => {
		const codeUnit = symbol.charCodeAt(0);
		const isControl = codeUnit < 32 || codeUnit === 127;
		return isControl ? " " : symbol;
	}).join("");
}
/**
 * Converts a raw OCR payload into guide OCR blocks, dropping entries that
 * cannot be normalized.
 *
 * @param payload - Parsed OCR response of unknown shape.
 * @param snapshot - Snapshot the response belongs to.
 * @returns The usable blocks in response order. Each id carries the entry's
 *   1-based position in the raw list, so dropped entries leave gaps.
 */
export function normalizeOcrResponse(payload: unknown, snapshot: GuideSnapshot): OcrBlock[] {
	const normalized: OcrBlock[] = [];
	extractRawBlocks(payload).forEach((candidate, position) => {
		const block = normalizeBlock(candidate, snapshot, position);
		if (block !== null) {
			normalized.push(block);
		}
	});
	return normalized;
}
/**
 * Locates the list of raw blocks inside an OCR payload.
 *
 * Accepts either a bare array or an object holding the array under `blocks`,
 * `results`, or `data` (checked in that order).
 *
 * @param payload - Parsed OCR response of unknown shape.
 * @returns The raw block list, or an empty array when none is found.
 */
function extractRawBlocks(payload: unknown): PaddleOcrResponseBlock[] {
	if (Array.isArray(payload)) {
		return payload as PaddleOcrResponseBlock[];
	}
	if (!isRecord(payload)) {
		return [];
	}
	for (const key of ["blocks", "results", "data"]) {
		const candidate = payload[key];
		if (Array.isArray(candidate)) {
			return candidate as PaddleOcrResponseBlock[];
		}
	}
	return [];
}
/**
 * Turns one raw OCR entry into an `OcrBlock`.
 *
 * @param raw - Raw entry from the OCR response.
 * @param snapshot - Snapshot used for the block id and box normalization.
 * @param index - Zero-based position of the entry; the id uses `index + 1`.
 * @returns The normalized block, or `null` when the entry is not an object,
 *   has no non-empty text, or has no usable box.
 */
function normalizeBlock(
	raw: PaddleOcrResponseBlock,
	snapshot: GuideSnapshot,
	index: number,
): OcrBlock | null {
	if (!isRecord(raw) || typeof raw.text !== "string") {
		return null;
	}
	const trimmedText = raw.text.trim();
	if (trimmedText.length === 0) {
		return null;
	}
	const normalizedBox = normalizeBox(raw.box ?? raw.bbox, snapshot);
	if (normalizedBox === null) {
		return null;
	}
	return {
		id: `ocr-${snapshot.id}-${index + 1}`,
		snapshotId: snapshot.id,
		text: trimmedText,
		confidence: normalizeConfidence(raw.confidence ?? raw.score),
		box: normalizedBox,
	};
}
/**
 * Maps a raw confidence value onto [0, 1].
 *
 * @param value - Raw confidence of unknown type.
 * @returns 0.5 when `value` is not a finite number. Otherwise values above 1
 *   are treated as percentages (divided by 100), and the result is clamped to [0, 1].
 */
function normalizeConfidence(value: unknown): number {
	if (typeof value !== "number" || !Number.isFinite(value)) {
		return 0.5;
	}
	const fraction = value > 1 ? value / 100 : value;
	// `fraction` is finite here, so a plain min/max clamp is sufficient.
	return Math.min(1, Math.max(0, fraction));
}
/**
 * Normalizes a raw bounding box given either as an array or as an object.
 *
 * Arrays are delegated to `normalizeArrayBox`. Objects must provide finite
 * numeric `x`, `y`, and a width/height under `width`/`height` or the short
 * aliases `w`/`h`.
 *
 * @param value - Raw box of unknown shape.
 * @param snapshot - Snapshot whose dimensions are used for scaling.
 * @returns The normalized box, or `null` when the input cannot be interpreted.
 */
function normalizeBox(
	value: unknown,
	snapshot: GuideSnapshot,
): { x: number; y: number; width: number; height: number } | null {
	if (Array.isArray(value)) {
		return normalizeArrayBox(value, snapshot);
	}
	if (!isRecord(value)) {
		return null;
	}
	const left = normalizeNumber(value.x);
	const top = normalizeNumber(value.y);
	const boxWidth = normalizeNumber(value.width ?? value.w);
	const boxHeight = normalizeNumber(value.height ?? value.h);
	if (left !== null && top !== null && boxWidth !== null && boxHeight !== null) {
		return normalizeBoxDimensions(
			{ x: left, y: top, width: boxWidth, height: boxHeight },
			snapshot,
		);
	}
	return null;
}
/**
 * Normalizes a bounding box supplied as a (possibly nested) array of numbers.
 *
 * The array is flattened up to two levels and non-number entries are ignored.
 * With eight or more numbers, the first eight are read as four `x, y` corner
 * pairs and reduced to their axis-aligned bounds. With four to seven numbers,
 * the first four are read as `x, y, width, height`.
 *
 * @param value - Raw array box.
 * @param snapshot - Snapshot whose dimensions are used for scaling.
 * @returns The normalized box, or `null` when fewer than four numbers are present.
 */
function normalizeArrayBox(
	value: unknown[],
	snapshot: GuideSnapshot,
): { x: number; y: number; width: number; height: number } | null {
	const coordinates = value.flat(2).filter((entry): entry is number => typeof entry === "number");
	if (coordinates.length < 4) {
		return null;
	}
	if (coordinates.length < 8) {
		const [x = 0, y = 0, width = 0, height = 0] = coordinates;
		return normalizeBoxDimensions({ x, y, width, height }, snapshot);
	}
	// Even positions hold x values and odd positions hold y values.
	const corners = coordinates.slice(0, 8);
	const cornerXs = corners.filter((_, position) => position % 2 === 0);
	const cornerYs = corners.filter((_, position) => position % 2 === 1);
	const left = Math.min(...cornerXs);
	const right = Math.max(...cornerXs);
	const top = Math.min(...cornerYs);
	const bottom = Math.max(...cornerYs);
	return normalizeBoxDimensions(
		{ x: left, y: top, width: right - left, height: bottom - top },
		snapshot,
	);
}
/**
 * Scales a box into normalized [0, 1] coordinates.
 *
 * The box is treated as pixel-based when any of `x`, `y`, `width`, `height`,
 * `x + width`, or `y + height` exceeds 1; it is then divided by the snapshot
 * dimensions. Otherwise it is taken as already normalized. Every component is
 * clamped to [0, 1] in both cases.
 *
 * @param box - Box in pixels or normalized units.
 * @param snapshot - Snapshot providing `width` and `height` for pixel scaling.
 * @returns The clamped, normalized box.
 */
function normalizeBoxDimensions(
	box: { x: number; y: number; width: number; height: number },
	snapshot: GuideSnapshot,
): { x: number; y: number; width: number; height: number } {
	const { x, y, width, height } = box;
	const looksLikePixels = [x, y, width, height, x + width, y + height].some(
		(measure) => measure > 1,
	);
	const divisorX = looksLikePixels ? snapshot.width : 1;
	const divisorY = looksLikePixels ? snapshot.height : 1;
	return {
		x: clamp01(x / divisorX),
		y: clamp01(y / divisorY),
		width: clamp01(width / divisorX),
		height: clamp01(height / divisorY),
	};
}
/**
 * Accepts only finite numbers.
 *
 * @param value - Arbitrary input.
 * @returns `value` when it is a finite number, otherwise `null`.
 */
function normalizeNumber(value: unknown): number | null {
	if (typeof value !== "number") {
		return null;
	}
	return Number.isFinite(value) ? value : null;
}
/**
 * Limits a number to the range [0, 1].
 *
 * @param value - Number to clamp.
 * @returns 0 for non-finite input or values at or below 0, 1 for values at or
 *   above 1, and `value` itself otherwise.
 */
function clamp01(value: number): number {
	if (!Number.isFinite(value) || value <= 0) {
		return 0;
	}
	return value >= 1 ? 1 : value;
}
/**
 * Type guard for any non-null object, arrays included.
 *
 * @param value - Arbitrary input.
 * @returns `true` when `value` is an object other than `null`.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
	if (value === null) {
		return false;
	}
	return typeof value === "object";
}
/**
 * Builds the PowerShell script that runs Windows OCR on one image.
 *
 * The image path and language setting are embedded as base64 and decoded
 * inside the script, so their characters never need PowerShell quoting.
 *
 * The generated script:
 * - switches console output to UTF-8 without a byte-order mark;
 * - creates an OCR engine from the first usable tag in the comma-separated
 *   language setting (`vi` and `en` are expanded to `vi-VN` and `en-US`),
 *   then from the user-profile languages, then from `en-US`;
 * - prints one compressed JSON object `{ blocks: [...] }` with an entry per
 *   recognized word: `text` (control characters replaced by spaces), a fixed
 *   `confidence` of 0.75, and `box` taken from the word's bounding rectangle.
 *
 * @param imagePath - Path of the image to recognize.
 * @param language - Comma-separated language tags, e.g. `"vi,en"`.
 * @returns The PowerShell script source.
 */
function buildWindowsOcrScript(imagePath: string, language: string): string {
	const imagePathBase64 = Buffer.from(imagePath, "utf8").toString("base64");
	const languageBase64 = Buffer.from(language, "utf8").toString("base64");
	// The AsTask lookup also filters on the parameter type IAsyncOperation`1.
	// Other generic single-parameter AsTask overloads exist (for example the
	// IAsyncActionWithProgress one), so "generic with one parameter" alone would
	// make the chosen overload depend on reflection order.
	return `
$ErrorActionPreference = "Stop"
[Console]::OutputEncoding = [System.Text.UTF8Encoding]::new($false)
$OutputEncoding = [System.Text.UTF8Encoding]::new($false)
$imagePath = [System.Text.Encoding]::UTF8.GetString([Convert]::FromBase64String("${imagePathBase64}"))
$languageSetting = [System.Text.Encoding]::UTF8.GetString([Convert]::FromBase64String("${languageBase64}"))
Add-Type -AssemblyName System.Runtime.WindowsRuntime
[void][Windows.Storage.StorageFile, Windows.Storage, ContentType=WindowsRuntime]
[void][Windows.Storage.FileAccessMode, Windows.Storage, ContentType=WindowsRuntime]
[void][Windows.Graphics.Imaging.BitmapDecoder, Windows.Graphics.Imaging, ContentType=WindowsRuntime]
[void][Windows.Graphics.Imaging.SoftwareBitmap, Windows.Graphics.Imaging, ContentType=WindowsRuntime]
[void][Windows.Media.Ocr.OcrEngine, Windows.Foundation, ContentType=WindowsRuntime]
[void][Windows.Globalization.Language, Windows.Globalization, ContentType=WindowsRuntime]
$asTaskGeneric = ([System.WindowsRuntimeSystemExtensions].GetMethods() | Where-Object {
$_.Name -eq "AsTask" -and $_.IsGenericMethodDefinition -and $_.GetParameters().Count -eq 1 -and $_.GetParameters()[0].ParameterType.Name -eq 'IAsyncOperation\`1'
})[0]
function Await-WinRt($operation, [Type]$resultType) {
$asTask = $asTaskGeneric.MakeGenericMethod($resultType)
$task = $asTask.Invoke($null, @($operation))
$task.Wait()
return $task.Result
}
function New-OcrEngine($languageSetting) {
$languageTags = @()
foreach ($item in $languageSetting.Split(",")) {
$tag = $item.Trim()
if ($tag -eq "vi") { $tag = "vi-VN" }
if ($tag -eq "en") { $tag = "en-US" }
if ($tag.Length -gt 0) { $languageTags += $tag }
}
foreach ($tag in $languageTags) {
try {
$language = [Windows.Globalization.Language]::new($tag)
$engine = [Windows.Media.Ocr.OcrEngine]::TryCreateFromLanguage($language)
if ($null -ne $engine) { return $engine }
} catch {}
}
$profileEngine = [Windows.Media.Ocr.OcrEngine]::TryCreateFromUserProfileLanguages()
if ($null -ne $profileEngine) { return $profileEngine }
return [Windows.Media.Ocr.OcrEngine]::TryCreateFromLanguage([Windows.Globalization.Language]::new("en-US"))
}
function Normalize-OcrText($value) {
if ($null -eq $value) { return "" }
$text = [string]$value
$text = [System.Text.RegularExpressions.Regex]::Replace($text, "[\\x00-\\x1F\\x7F]", " ")
return $text.Trim()
}
$file = Await-WinRt ([Windows.Storage.StorageFile]::GetFileFromPathAsync($imagePath)) ([Windows.Storage.StorageFile])
$stream = Await-WinRt ($file.OpenAsync([Windows.Storage.FileAccessMode]::Read)) ([Windows.Storage.Streams.IRandomAccessStream])
$decoder = Await-WinRt ([Windows.Graphics.Imaging.BitmapDecoder]::CreateAsync($stream)) ([Windows.Graphics.Imaging.BitmapDecoder])
$bitmap = Await-WinRt ($decoder.GetSoftwareBitmapAsync()) ([Windows.Graphics.Imaging.SoftwareBitmap])
$engine = New-OcrEngine $languageSetting
if ($null -eq $engine) { throw "No Windows OCR engine is available." }
$result = Await-WinRt ($engine.RecognizeAsync($bitmap)) ([Windows.Media.Ocr.OcrResult])
$blocks = @()
$index = 0
foreach ($line in $result.Lines) {
foreach ($word in $line.Words) {
$rect = $word.BoundingRect
$text = Normalize-OcrText $word.Text
if ($text.Length -gt 0) {
$index += 1
$blocks += [PSCustomObject]@{
text = $text
confidence = 0.75
box = [PSCustomObject]@{
x = [double]$rect.X
y = [double]$rect.Y
width = [double]$rect.Width
height = [double]$rect.Height
}
}
}
}
}
[PSCustomObject]@{ blocks = $blocks } | ConvertTo-Json -Depth 6 -Compress
`;
}
+11
View File
@@ -35,6 +35,9 @@ import type {
ProjectFileResult,
ProjectPathResult,
} from "../../src/native/contracts";
import { DeepSeekSettingsStore } from "../guide/ai/deepseekSettingsStore";
import { registerGuideIpcHandlers } from "../guide/guideIpc";
import { GuideStore } from "../guide/guideStore";
import { mainT } from "../i18n";
import { RECORDINGS_DIR } from "../main";
import { createCursorRecordingSession } from "../native-bridge/cursor/recording/factory";
@@ -2172,6 +2175,14 @@ export function registerIpcHandlers(
// never buffers the full video in memory (the #616 fix).
const recordingStreams = new RecordingStreamRegistry();
registerRecordingStreamHandlers(ipcMain, recordingStreams, resolveRecordingOutputPath);
const guideAiSettingsStore = new DeepSeekSettingsStore(
path.join(app.getPath("userData"), "guide-ai-settings.json"),
);
registerGuideIpcHandlers(
ipcMain,
new GuideStore(RECORDINGS_DIR, { deepSeekConfigProvider: guideAiSettingsStore }),
guideAiSettingsStore,
);
ipcMain.handle("store-recorded-session", async (_, payload: StoreRecordedSessionInput) => {
try {
+2 -2
View File
@@ -632,8 +632,8 @@ int main(int argc, char* argv[]) {
(webcamOutputFrameIndex * 10'000'000ULL) / std::max(1, webcamCapture.fps()));
if (!webcamEncoder.writeBgraFrame(webcamFrame, webcamTimestampHns)) {
encodeFailed = true;
stopRequested = true;
cv.notify_all();
control.stopRequested = true;
control.cv.notify_all();
return;
}
lastWrittenWebcamSequence = latestWebcamSequence;
+52
View File
@@ -1,4 +1,15 @@
import { contextBridge, ipcRenderer } from "electron";
import type {
AddGuideMarkerInput,
DiscardGuideSessionInput,
ExportGuideInput,
FinalizeGuideEventsInput,
GenerateGuideDraftInput,
RunGuideOcrInput,
SaveGuideAiSettingsInput,
SaveGuideInput,
WriteGuideSnapshotInput,
} from "../src/guide/contracts";
import type { NativeMacRecordingRequest } from "../src/lib/nativeMacRecording";
import type { NativeWindowsRecordingRequest } from "../src/lib/nativeWindowsRecording";
import type { RecordingSession, StoreRecordedSessionInput } from "../src/lib/recordingSession";
@@ -16,6 +27,47 @@ contextBridge.exposeInMainWorld("electronAPI", {
invokeNativeBridge: <TData>(request: NativeBridgeRequest) => {
return ipcRenderer.invoke(NATIVE_BRIDGE_CHANNEL, request) as Promise<TData>;
},
// Guide API exposed to the renderer. Every method forwards its argument
// unchanged to the main process via ipcRenderer.invoke on a "guide:*" channel
// and returns the resulting promise.
guide: {
// Session lifecycle, keyed by recording id.
startSession: (recordingId: string | number) => {
return ipcRenderer.invoke("guide:start-session", recordingId);
},
readSession: (recordingId: string | number) => {
return ipcRenderer.invoke("guide:read-session", recordingId);
},
// Markers, events and snapshots.
addMarker: (input: AddGuideMarkerInput) => {
return ipcRenderer.invoke("guide:add-marker", input);
},
finalizeEvents: (input: FinalizeGuideEventsInput) => {
return ipcRenderer.invoke("guide:finalize-events", input);
},
writeSnapshot: (input: WriteGuideSnapshotInput) => {
return ipcRenderer.invoke("guide:write-snapshot", input);
},
// OCR and draft generation.
runOcr: (input: RunGuideOcrInput) => {
return ipcRenderer.invoke("guide:run-ocr", input);
},
generateDraft: (input: GenerateGuideDraftInput) => {
return ipcRenderer.invoke("guide:generate-draft", input);
},
// AI settings.
getAiSettings: () => {
return ipcRenderer.invoke("guide:get-ai-settings");
},
saveAiSettings: (input: SaveGuideAiSettingsInput) => {
return ipcRenderer.invoke("guide:save-ai-settings", input);
},
// Saving, exporting and discarding.
saveGuide: (input: SaveGuideInput) => {
return ipcRenderer.invoke("guide:save-guide", input);
},
exportMarkdown: (input: ExportGuideInput) => {
return ipcRenderer.invoke("guide:export-markdown", input);
},
exportHtml: (input: ExportGuideInput) => {
return ipcRenderer.invoke("guide:export-html", input);
},
discardSession: (input: DiscardGuideSessionInput) => {
return ipcRenderer.invoke("guide:discard-session", input);
},
},
hudOverlayHide: () => {
ipcRenderer.send("hud-overlay-hide");
},