3 Commits

Author SHA1 Message Date
huanld cce81dd7c4 Add Windows OCR service installer 2026-05-28 19:01:34 +07:00
huanld 7823507a18 Fix Windows native capture state and monitor adapter 2026-05-28 13:22:24 +07:00
huanld 0b78ff6f7d Release OpenScreen 1.4.4 2026-05-28 12:25:23 +07:00
25 changed files with 1583 additions and 147 deletions
+17
View File
@@ -0,0 +1,17 @@
; customInstall: installer hook that (re)registers the OpenScreenOCR Windows
; service and starts it.
; NOTE(review): each nsExec::ExecToLog call is documented to push its exit code
; onto the NSIS stack; none of them are popped or checked here, so failures are
; only visible in the install log - confirm this is intended.
!macro customInstall
DetailPrint "Installing OpenScreen OCR Windows service"
; Remove any previously registered instance before creating the service again.
nsExec::ExecToLog '"$SYSDIR\sc.exe" stop OpenScreenOCR'
nsExec::ExecToLog '"$SYSDIR\sc.exe" delete OpenScreenOCR'
; Presumably gives the service manager time to finish the delete before the
; create below - confirm one second is sufficient.
Sleep 1000
; Expand %ProgramData% into $0 and make sure the runtime data directory exists.
ExpandEnvStrings $0 "%ProgramData%\OpenScreen\ocr-runtime"
CreateDirectory "$0"
; Register the wrapper executable as an auto-start service. The wrapper receives
; the OCR service executable (--exe), the resources directory (--resources) and
; the data directory created above (--data) as arguments.
nsExec::ExecToLog '"$SYSDIR\sc.exe" create OpenScreenOCR binPath= "\"$INSTDIR\resources\electron\native\bin\win32-x64\openscreen-ocr-service-wrapper.exe\" --service --exe \"$INSTDIR\resources\ocr-service\openscreen-ocr-service.exe\" --resources \"$INSTDIR\resources\" --data \"$0\"" start= auto DisplayName= "OpenScreen OCR Service"'
nsExec::ExecToLog '"$SYSDIR\sc.exe" description OpenScreenOCR "Local OCR service used by OpenScreen guide capture."'
; Start the service immediately instead of waiting for the next boot.
nsExec::ExecToLog '"$SYSDIR\sc.exe" start OpenScreenOCR'
!macroend
; customUnInstall: uninstaller hook that stops and unregisters the OpenScreenOCR
; Windows service. The sc.exe results are written to the log and not checked.
!macro customUnInstall
DetailPrint "Removing OpenScreen OCR Windows service"
; Stop first, then delete the service registration.
nsExec::ExecToLog '"$SYSDIR\sc.exe" stop OpenScreenOCR'
nsExec::ExecToLog '"$SYSDIR\sc.exe" delete OpenScreenOCR'
!macroend
+8 -4
View File
@@ -6,7 +6,7 @@ OpenScreen calls OCR through a local HTTP service. The default endpoint is:
http://127.0.0.1:8866/ocr http://127.0.0.1:8866/ocr
``` ```
The app sends either `imageBase64` or `path` and expects OCR blocks: The app sends either `imageBase64` or `path`, plus optional `language` and `profile`, and expects OCR blocks:
```json ```json
{ {
@@ -38,7 +38,7 @@ If `paddle` is still missing after installing `paddleocr`, install the CPU Paddl
```powershell ```powershell
.\.venv-ocr\Scripts\Activate.ps1 .\.venv-ocr\Scripts\Activate.ps1
$env:PADDLEOCR_DEVICE="cpu" $env:PADDLEOCR_DEVICE="cpu"
$env:PADDLEOCR_LANG="latin" $env:OPENSCREEN_OCR_PROFILE="vietnamese"
npm run ocr:paddle npm run ocr:paddle
``` ```
@@ -58,7 +58,8 @@ Expected healthy environment:
"paddleocrInstalled": true, "paddleocrInstalled": true,
"paddleInstalled": true, "paddleInstalled": true,
"engineReady": false, "engineReady": false,
"defaultLanguage": "latin" "defaultLanguage": "vi,en",
"defaultProfile": "vietnamese"
} }
``` ```
@@ -67,7 +68,10 @@ Expected healthy environment:
## Configuration ## Configuration
- `PADDLEOCR_DEVICE`: `cpu`, `gpu:0`, or another PaddleOCR device string. - `PADDLEOCR_DEVICE`: `cpu`, `gpu:0`, or another PaddleOCR device string.
- `PADDLEOCR_LANG`: defaults to `latin`; this is preferred for Vietnamese UI text because it uses a Latin-script recognition model. - `OPENSCREEN_OCR_PROFILE`: `fast`, `vietnamese`, or `hybrid`. The default `vietnamese` profile upscales and sharpens focused UI screenshots before OCR.
- `OPENSCREEN_GUIDE_OCR_LANGUAGE`: defaults to `vi,en`.
- `PADDLEOCR_LANG`: optional hard override. Leave unset for the app profile/language settings to work.
- `PADDLEOCR_VERSION`: defaults to `PP-OCRv5`. - `PADDLEOCR_VERSION`: defaults to `PP-OCRv5`.
- `PADDLEOCR_USE_MOBILE`: defaults to `1`; set to `0` to use the default/server models. - `PADDLEOCR_USE_MOBILE`: defaults to `1`; set to `0` to use the default/server models.
- `PADDLEOCR_REC_MODEL`: optional recognizer model override. The bundled profile uses `latin_PP-OCRv5_mobile_rec`, which supports Vietnamese Latin-script text.
- `OPENSCREEN_GUIDE_OCR_URL`: OpenScreen OCR endpoint override; defaults to `http://127.0.0.1:8866`. - `OPENSCREEN_GUIDE_OCR_URL`: OpenScreen OCR endpoint override; defaults to `http://127.0.0.1:8866`.
+8 -5
View File
@@ -79,6 +79,7 @@
"nsis" "nsis"
], ],
"icon": "icons/icons/win/icon.ico", "icon": "icons/icons/win/icon.ico",
"requestedExecutionLevel": "requireAdministrator",
"signAndEditExecutable": false, "signAndEditExecutable": false,
"signExts": ["!.exe"], "signExts": ["!.exe"],
"extraResources": [ "extraResources": [
@@ -99,8 +100,10 @@
} }
] ]
}, },
"nsis": { "nsis": {
"oneClick": false, "oneClick": false,
"allowToChangeInstallationDirectory": true "allowToChangeInstallationDirectory": true,
} "perMachine": true,
} "include": "build/installer.nsh"
}
}
@@ -0,0 +1,66 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { afterEach, beforeEach, describe, expect, it } from "vitest";
import { DeepSeekSettingsStore } from "./deepseekSettingsStore";
// Temporary directories created by the tests in this file; emptied and removed after each test.
const tempDirs: string[] = [];
// Values of the OCR environment overrides at module load, restored after each test.
const originalOcrProfile = process.env.OPENSCREEN_GUIDE_OCR_PROFILE;
const originalOcrLanguage = process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE;

// Every test starts without the OCR environment overrides set.
beforeEach(() => {
  for (const name of ["OPENSCREEN_GUIDE_OCR_PROFILE", "OPENSCREEN_GUIDE_OCR_LANGUAGE"]) {
    delete process.env[name];
  }
});

// Put the environment back and delete every temp directory registered so far.
afterEach(async () => {
  restoreEnv("OPENSCREEN_GUIDE_OCR_PROFILE", originalOcrProfile);
  restoreEnv("OPENSCREEN_GUIDE_OCR_LANGUAGE", originalOcrLanguage);
  const dirsToRemove = tempDirs.splice(0);
  const removals = dirsToRemove.map((dir) => fs.rm(dir, { recursive: true, force: true }));
  await Promise.all(removals);
});
/**
 * Restores one environment variable to a previously captured value.
 *
 * @param name - Environment variable name.
 * @param value - Value to restore; `undefined` removes the variable.
 */
function restoreEnv(name: string, value: string | undefined): void {
  if (value !== undefined) {
    process.env[name] = value;
  } else {
    delete process.env[name];
  }
}
/**
 * Creates a settings store backed by a file path inside a fresh temporary
 * directory. The directory is recorded in `tempDirs` for later removal.
 */
async function createStore(): Promise<DeepSeekSettingsStore> {
  const prefix = path.join(os.tmpdir(), "openscreen-guide-settings-");
  const dir = await fs.mkdtemp(prefix);
  tempDirs.push(dir);
  const settingsFile = path.join(dir, "guide-ai-settings.json");
  return new DeepSeekSettingsStore(settingsFile);
}
// Covers the OCR part of the settings store: built-in defaults and persistence through save().
describe("DeepSeekSettingsStore OCR settings", () => {
  it("defaults to the Vietnamese enhanced OCR profile", async () => {
    const expectedDefaults = { profile: "vietnamese", language: "vi,en" };
    const store = await createStore();

    await expect(store.getOcrConfig()).resolves.toEqual(expectedDefaults);
  });

  it("persists OCR profile changes alongside DeepSeek settings", async () => {
    const expectedOcr = { profile: "hybrid", language: "vi,en" };
    const store = await createStore();

    const status = await store.save({
      deepseekApiKeyEnvName: "DEEPSEEK_API_KEY",
      baseUrl: "https://api.deepseek.com",
      model: "deepseek-chat",
      ocrProfile: "hybrid",
      ocrLanguage: "vi,en",
    });

    // The status returned by save() and a later read must both reflect the saved profile.
    expect(status.ocr).toMatchObject(expectedOcr);
    await expect(store.getOcrConfig()).resolves.toEqual(expectedOcr);
  });
});
+68 -2
View File
@@ -1,6 +1,10 @@
import fs from "node:fs/promises"; import fs from "node:fs/promises";
import path from "node:path"; import path from "node:path";
import type { GuideAiSettings, SaveGuideAiSettingsInput } from "../../../src/guide/contracts"; import type {
GuideAiSettings,
GuideOcrProfile,
SaveGuideAiSettingsInput,
} from "../../../src/guide/contracts";
export interface DeepSeekGuideConfig { export interface DeepSeekGuideConfig {
apiKey?: string; apiKey?: string;
@@ -12,8 +16,22 @@ export interface DeepSeekGuideConfigProvider {
getDeepSeekConfig(): Promise<DeepSeekGuideConfig>; getDeepSeekConfig(): Promise<DeepSeekGuideConfig>;
} }
/** Resolved OCR settings handed to OCR clients. */
export interface GuideOcrConfig {
/** OCR profile; the normalizer in this file accepts `fast`, `vietnamese` or `hybrid`. */
profile: GuideOcrProfile;
/** Comma-separated language codes, e.g. `vi,en`. */
language: string;
}
/** Source of the OCR configuration; implemented by `DeepSeekSettingsStore` in this file. */
export interface GuideOcrConfigProvider {
/** Resolves the OCR profile and language to use. */
getOcrConfig(): Promise<GuideOcrConfig>;
}
interface PersistedGuideAiSettings { interface PersistedGuideAiSettings {
schemaVersion: 1; schemaVersion: 1;
ocr?: {
profile?: GuideOcrProfile;
language?: string;
updatedAt?: string;
};
deepseek?: { deepseek?: {
apiKeyEnvName?: string; apiKeyEnvName?: string;
baseUrl?: string; baseUrl?: string;
@@ -25,8 +43,10 @@ interface PersistedGuideAiSettings {
const DEFAULT_DEEPSEEK_API_KEY_ENV_NAME = "DEEPSEEK_API_KEY"; const DEFAULT_DEEPSEEK_API_KEY_ENV_NAME = "DEEPSEEK_API_KEY";
const DEFAULT_DEEPSEEK_BASE_URL = "https://api.deepseek.com"; const DEFAULT_DEEPSEEK_BASE_URL = "https://api.deepseek.com";
const DEFAULT_DEEPSEEK_MODEL = "deepseek-chat"; const DEFAULT_DEEPSEEK_MODEL = "deepseek-chat";
const DEFAULT_OCR_PROFILE: GuideOcrProfile = "vietnamese";
const DEFAULT_OCR_LANGUAGE = "vi,en";
export class DeepSeekSettingsStore implements DeepSeekGuideConfigProvider { export class DeepSeekSettingsStore implements DeepSeekGuideConfigProvider, GuideOcrConfigProvider {
constructor(private readonly filePath: string) {} constructor(private readonly filePath: string) {}
async getStatus(): Promise<GuideAiSettings> { async getStatus(): Promise<GuideAiSettings> {
@@ -35,6 +55,13 @@ export class DeepSeekSettingsStore implements DeepSeekGuideConfigProvider {
const activeApiKey = process.env[apiKeyEnvName]; const activeApiKey = process.env[apiKeyEnvName];
return { return {
ocr: {
profile: normalizeOcrProfile(raw?.ocr?.profile ?? process.env.OPENSCREEN_GUIDE_OCR_PROFILE),
language: normalizeOcrLanguage(
raw?.ocr?.language ?? process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE,
),
updatedAt: raw?.ocr?.updatedAt,
},
deepseek: { deepseek: {
hasApiKey: Boolean(activeApiKey), hasApiKey: Boolean(activeApiKey),
apiKeyEnvName, apiKeyEnvName,
@@ -49,7 +76,14 @@ export class DeepSeekSettingsStore implements DeepSeekGuideConfigProvider {
async save(input: SaveGuideAiSettingsInput): Promise<GuideAiSettings> { async save(input: SaveGuideAiSettingsInput): Promise<GuideAiSettings> {
const current = (await this.readSettings()) ?? { schemaVersion: 1 }; const current = (await this.readSettings()) ?? { schemaVersion: 1 };
const currentOcr = current.ocr ?? {};
const currentDeepSeek = current.deepseek ?? {}; const currentDeepSeek = current.deepseek ?? {};
const nextOcr = {
...currentOcr,
profile: normalizeOcrProfile(input.ocrProfile ?? currentOcr.profile),
language: normalizeOcrLanguage(input.ocrLanguage ?? currentOcr.language),
updatedAt: new Date().toISOString(),
};
const nextDeepSeek = { const nextDeepSeek = {
...currentDeepSeek, ...currentDeepSeek,
baseUrl: normalizeBaseUrl(input.baseUrl ?? currentDeepSeek.baseUrl), baseUrl: normalizeBaseUrl(input.baseUrl ?? currentDeepSeek.baseUrl),
@@ -65,6 +99,7 @@ export class DeepSeekSettingsStore implements DeepSeekGuideConfigProvider {
await this.writeSettings({ await this.writeSettings({
schemaVersion: 1, schemaVersion: 1,
ocr: nextOcr,
deepseek: nextDeepSeek, deepseek: nextDeepSeek,
}); });
return await this.getStatus(); return await this.getStatus();
@@ -80,6 +115,16 @@ export class DeepSeekSettingsStore implements DeepSeekGuideConfigProvider {
}; };
} }
/**
 * Resolves the OCR profile and language: the persisted value is used when
 * present, otherwise the `OPENSCREEN_GUIDE_OCR_*` environment variable, and
 * the result is passed through the normalizers (which supply the defaults).
 *
 * NOTE(review): `normalizePersistedSettings` appears to always fill in
 * `ocr.profile` / `ocr.language`; if `readSettings` goes through it, the
 * environment fallback only applies when no settings file exists - confirm.
 */
async getOcrConfig(): Promise<GuideOcrConfig> {
  const persistedOcr = (await this.readSettings())?.ocr;
  const profile = normalizeOcrProfile(
    persistedOcr?.profile ?? process.env.OPENSCREEN_GUIDE_OCR_PROFILE,
  );
  const language = normalizeOcrLanguage(
    persistedOcr?.language ?? process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE,
  );
  return { profile, language };
}
private async readSettings(): Promise<PersistedGuideAiSettings | null> { private async readSettings(): Promise<PersistedGuideAiSettings | null> {
try { try {
const content = await fs.readFile(this.filePath, "utf-8"); const content = await fs.readFile(this.filePath, "utf-8");
@@ -120,6 +165,11 @@ function normalizePersistedSettings(input: unknown): PersistedGuideAiSettings |
} }
return { return {
schemaVersion: 1, schemaVersion: 1,
ocr: {
profile: normalizeOcrProfile(raw.ocr?.profile),
language: normalizeOcrLanguage(raw.ocr?.language),
updatedAt: raw.ocr?.updatedAt,
},
deepseek: { deepseek: {
apiKeyEnvName: normalizeEnvName(raw.deepseek?.apiKeyEnvName), apiKeyEnvName: normalizeEnvName(raw.deepseek?.apiKeyEnvName),
baseUrl: raw.deepseek?.baseUrl, baseUrl: raw.deepseek?.baseUrl,
@@ -155,3 +205,19 @@ function normalizeBaseUrl(value: string | undefined): string {
function normalizeModel(value: string | undefined): string { function normalizeModel(value: string | undefined): string {
return value?.trim() || DEFAULT_DEEPSEEK_MODEL; return value?.trim() || DEFAULT_DEEPSEEK_MODEL;
} }
/**
 * Maps an arbitrary profile value onto a supported OCR profile.
 *
 * Surrounding whitespace and letter case are ignored, matching how
 * `normalizeOcrLanguage` treats its input, so values such as `" Hybrid "`
 * coming from an environment variable are accepted. Anything unrecognized,
 * including non-string values, yields `DEFAULT_OCR_PROFILE`.
 *
 * @param value - Candidate profile name, or `undefined`.
 * @returns `fast`, `vietnamese` or `hybrid`; the default profile otherwise.
 */
function normalizeOcrProfile(value: string | undefined): GuideOcrProfile {
  // Guard the type at runtime: persisted JSON is not guaranteed to hold a string here.
  const candidate = typeof value === "string" ? value.trim().toLowerCase() : undefined;
  if (candidate === "fast" || candidate === "vietnamese" || candidate === "hybrid") {
    return candidate;
  }
  return DEFAULT_OCR_PROFILE;
}
/**
 * Normalizes a comma-separated language list: each entry is trimmed and
 * lower-cased, empty entries are dropped, and the rest are re-joined with `,`.
 *
 * @param value - Raw language list, or `undefined`.
 * @returns The cleaned list, or `DEFAULT_OCR_LANGUAGE` when nothing usable remains.
 */
function normalizeOcrLanguage(value: string | undefined): string {
  // Persisted JSON may hold a non-string here; treat it as missing instead of throwing on `.split`.
  if (typeof value !== "string") {
    return DEFAULT_OCR_LANGUAGE;
  }
  const normalized = value
    .split(",")
    .map((part) => part.trim().toLowerCase())
    .filter(Boolean)
    .join(",");
  return normalized || DEFAULT_OCR_LANGUAGE;
}
+4
View File
@@ -168,6 +168,7 @@ describe("GuideStore", () => {
width: 800, width: 800,
height: 600, height: 600,
pngBytes: new Uint8Array([137, 80, 78, 71]).buffer, pngBytes: new Uint8Array([137, 80, 78, 71]).buffer,
markedPngBytes: new Uint8Array([137, 80, 78, 71, 1]).buffer,
}); });
expect(session.status).toBe("snapshots-ready"); expect(session.status).toBe("snapshots-ready");
@@ -176,6 +177,9 @@ describe("GuideStore", () => {
await expect(fs.readFile(session.snapshots[0]?.path ?? "")).resolves.toEqual( await expect(fs.readFile(session.snapshots[0]?.path ?? "")).resolves.toEqual(
Buffer.from([137, 80, 78, 71]), Buffer.from([137, 80, 78, 71]),
); );
await expect(fs.readFile(session.snapshots[0]?.markedPath ?? "")).resolves.toEqual(
Buffer.from([137, 80, 78, 71, 1]),
);
}); });
it("runs OCR, generates a local draft, and exports files", async () => { it("runs OCR, generates a local draft, and exports files", async () => {
+21 -4
View File
@@ -34,7 +34,10 @@ import {
DeepSeekGuideClientError, DeepSeekGuideClientError,
type GuideDraftClient, type GuideDraftClient,
} from "./ai/deepseekGuideClient"; } from "./ai/deepseekGuideClient";
import type { DeepSeekGuideConfigProvider } from "./ai/deepseekSettingsStore"; import type {
DeepSeekGuideConfigProvider,
GuideOcrConfigProvider,
} from "./ai/deepseekSettingsStore";
import { type GuidePaths, normalizeGuideRecordingId, resolveGuidePaths } from "./guidePaths"; import { type GuidePaths, normalizeGuideRecordingId, resolveGuidePaths } from "./guidePaths";
import { createFocusedOcrSnapshot, remapFocusedOcrBlocks } from "./ocr/focusedOcrSnapshot"; import { createFocusedOcrSnapshot, remapFocusedOcrBlocks } from "./ocr/focusedOcrSnapshot";
import { DefaultGuideOcrClient, type GuideOcrClient } from "./ocr/paddleOcrClient"; import { DefaultGuideOcrClient, type GuideOcrClient } from "./ocr/paddleOcrClient";
@@ -70,6 +73,7 @@ export interface GuideStoreDependencies {
ocrClient?: GuideOcrClient; ocrClient?: GuideOcrClient;
draftClient?: GuideDraftClient; draftClient?: GuideDraftClient;
deepSeekConfigProvider?: DeepSeekGuideConfigProvider; deepSeekConfigProvider?: DeepSeekGuideConfigProvider;
ocrConfigProvider?: GuideOcrConfigProvider;
focusOcrSnapshots?: boolean; focusOcrSnapshots?: boolean;
} }
@@ -209,10 +213,19 @@ export class GuideStore {
this.assertGuidePathIsAllowed(session.outputDir); this.assertGuidePathIsAllowed(session.outputDir);
await fs.mkdir(session.outputDir, { recursive: true }); await fs.mkdir(session.outputDir, { recursive: true });
const fileName = `step-${String(eventIndex + 1).padStart(3, "0")}.png`; const fileBaseName = `step-${String(eventIndex + 1).padStart(3, "0")}`;
const fileName = `${fileBaseName}.png`;
const snapshotPath = path.join(session.outputDir, fileName); const snapshotPath = path.join(session.outputDir, fileName);
const markedSnapshotPath = path.join(session.outputDir, `${fileBaseName}-marked.png`);
this.assertGuidePathIsAllowed(snapshotPath); this.assertGuidePathIsAllowed(snapshotPath);
this.assertGuidePathIsAllowed(markedSnapshotPath);
await fs.writeFile(snapshotPath, Buffer.from(new Uint8Array(input.pngBytes))); await fs.writeFile(snapshotPath, Buffer.from(new Uint8Array(input.pngBytes)));
const hasMarkedSnapshot = Boolean(input.markedPngBytes?.byteLength);
if (hasMarkedSnapshot && input.markedPngBytes) {
await fs.writeFile(markedSnapshotPath, Buffer.from(new Uint8Array(input.markedPngBytes)));
} else {
await fs.unlink(markedSnapshotPath).catch(() => undefined);
}
const snapshot: GuideSnapshot = { const snapshot: GuideSnapshot = {
id: `snapshot-${input.eventId}`, id: `snapshot-${input.eventId}`,
@@ -220,6 +233,7 @@ export class GuideStore {
timeMs: Math.max(0, input.timeMs), timeMs: Math.max(0, input.timeMs),
offsetMs: input.offsetMs, offsetMs: input.offsetMs,
path: snapshotPath, path: snapshotPath,
markedPath: hasMarkedSnapshot ? markedSnapshotPath : undefined,
width: Math.round(input.width), width: Math.round(input.width),
height: Math.round(input.height), height: Math.round(input.height),
}; };
@@ -255,7 +269,9 @@ export class GuideStore {
throw new GuideStoreError("guide-invalid-input", "No guide snapshots are available for OCR."); throw new GuideStoreError("guide-invalid-input", "No guide snapshots are available for OCR.");
} }
const ocrClient = this.dependencies.ocrClient ?? new DefaultGuideOcrClient(); const ocrClient =
this.dependencies.ocrClient ??
DefaultGuideOcrClient.fromConfig(await this.dependencies.ocrConfigProvider?.getOcrConfig());
const shouldFocusOcrSnapshots = const shouldFocusOcrSnapshots =
this.dependencies.focusOcrSnapshots ?? this.dependencies.ocrClient === undefined; this.dependencies.focusOcrSnapshots ?? this.dependencies.ocrClient === undefined;
const eventsById = new Map(session.events.map((event) => [event.id, event])); const eventsById = new Map(session.events.map((event) => [event.id, event]));
@@ -662,6 +678,7 @@ function normalizeGuideSnapshot(input: unknown): GuideSnapshot | null {
const id = normalizeString(input.id); const id = normalizeString(input.id);
const eventId = normalizeString(input.eventId); const eventId = normalizeString(input.eventId);
const pathValue = normalizeString(input.path); const pathValue = normalizeString(input.path);
const markedPath = normalizeOptionalString(input.markedPath);
const timeMs = normalizeNonNegativeNumber(input.timeMs); const timeMs = normalizeNonNegativeNumber(input.timeMs);
const offsetMs = normalizeOptionalNumber(input.offsetMs); const offsetMs = normalizeOptionalNumber(input.offsetMs);
const width = normalizePositiveInteger(input.width); const width = normalizePositiveInteger(input.width);
@@ -677,7 +694,7 @@ function normalizeGuideSnapshot(input: unknown): GuideSnapshot | null {
) { ) {
return null; return null;
} }
return { id, eventId, timeMs, offsetMs, path: pathValue, width, height }; return { id, eventId, timeMs, offsetMs, path: pathValue, markedPath, width, height };
} }
function normalizeOcrBlock(input: unknown): OcrBlock | null { function normalizeOcrBlock(input: unknown): OcrBlock | null {
+46 -2
View File
@@ -1,14 +1,17 @@
import { type ChildProcessWithoutNullStreams, spawn } from "node:child_process"; import { type ChildProcessWithoutNullStreams, execFile, spawn } from "node:child_process";
import fs from "node:fs/promises"; import fs from "node:fs/promises";
import path from "node:path"; import path from "node:path";
import { promisify } from "node:util";
import { app } from "electron"; import { app } from "electron";
const DEFAULT_OCR_BASE_URL = "http://127.0.0.1:8866"; const DEFAULT_OCR_BASE_URL = "http://127.0.0.1:8866";
const DEFAULT_OCR_PORT = "8866"; const DEFAULT_OCR_PORT = "8866";
const WINDOWS_SERVICE_NAME = "OpenScreenOCR";
const SERVICE_EXE_NAME = "openscreen-ocr-service.exe"; const SERVICE_EXE_NAME = "openscreen-ocr-service.exe";
const HEALTH_TIMEOUT_MS = 1000; const HEALTH_TIMEOUT_MS = 1000;
const STARTUP_TIMEOUT_MS = 90000; const STARTUP_TIMEOUT_MS = 90000;
const PADDLEX_MODEL_NAMES = ["PP-OCRv5_mobile_det", "latin_PP-OCRv5_mobile_rec"]; const PADDLEX_MODEL_NAMES = ["PP-OCRv5_mobile_det", "latin_PP-OCRv5_mobile_rec"];
const execFileAsync = promisify(execFile);
let ocrProcess: ChildProcessWithoutNullStreams | null = null; let ocrProcess: ChildProcessWithoutNullStreams | null = null;
let startupPromise: Promise<void> | null = null; let startupPromise: Promise<void> | null = null;
@@ -24,6 +27,11 @@ export async function ensureBundledOcrServiceRunning(
return; return;
} }
if (process.platform === "win32" && (await startInstalledWindowsOcrService())) {
await waitForOcrServiceHealth(baseUrl, STARTUP_TIMEOUT_MS);
return;
}
const executablePath = await findBundledOcrServiceExecutable(); const executablePath = await findBundledOcrServiceExecutable();
if (!executablePath) { if (!executablePath) {
return; return;
@@ -51,6 +59,39 @@ function shouldManageOcrService(baseUrl: string): boolean {
} }
} }
/**
 * Makes sure the installed `OpenScreenOCR` Windows service is running.
 *
 * @returns `false` when the service query fails; `true` when the service
 * already reports RUNNING, or when the start command succeeds or reports that
 * the service is already running (error 1056).
 */
async function startInstalledWindowsOcrService(): Promise<boolean> {
  const { success: queried, output: queryOutput } = await runSc(["query", WINDOWS_SERVICE_NAME]);
  if (!queried) {
    return false;
  }

  const alreadyRunning = /\bRUNNING\b/i.test(queryOutput);
  if (alreadyRunning) {
    return true;
  }

  const startResult = await runSc(["start", WINDOWS_SERVICE_NAME]);
  if (startResult.success) {
    return true;
  }
  // A failed start still counts when sc.exe says an instance is already running.
  return /\b1056\b/.test(startResult.output) || /already running/i.test(startResult.output);
}
/**
 * Runs the Windows service control tool with the given arguments.
 *
 * The executable is resolved under `%SystemRoot%\System32` when that variable
 * is available, so a stray `sc.exe` in the working directory or on `PATH` is
 * not picked up; this mirrors the installer, which calls `$SYSDIR\sc.exe`.
 * Without `SystemRoot`/`windir` it falls back to the bare name.
 *
 * @param args - Arguments passed to `sc.exe`, e.g. `["query", name]`.
 * @returns `success` is `true` when the process exits cleanly; `output` is
 * stdout and stderr joined by a newline. Never rejects.
 */
async function runSc(args: string[]): Promise<{ success: boolean; output: string }> {
  const systemRoot = process.env.SystemRoot ?? process.env.windir;
  const scPath = systemRoot ? path.join(systemRoot, "System32", "sc.exe") : "sc.exe";
  try {
    const result = await execFileAsync(scPath, args, {
      windowsHide: true,
      timeout: 10000,
      maxBuffer: 512 * 1024,
    });
    return {
      success: true,
      output: `${result.stdout ?? ""}\n${result.stderr ?? ""}`,
    };
  } catch (error: unknown) {
    // execFile rejections carry whatever stdout/stderr was captured; keep it so
    // callers can inspect sc.exe error codes in the text.
    const failed = (error ?? {}) as { stdout?: string; stderr?: string };
    return {
      success: false,
      output: `${failed.stdout ?? ""}\n${failed.stderr ?? ""}`,
    };
  }
}
async function findBundledOcrServiceExecutable(): Promise<string | null> { async function findBundledOcrServiceExecutable(): Promise<string | null> {
const candidates = [ const candidates = [
process.env.OPENSCREEN_GUIDE_OCR_EXE, process.env.OPENSCREEN_GUIDE_OCR_EXE,
@@ -156,8 +197,11 @@ function startOcrServiceProcess(
OPENSCREEN_OCR_PORT: DEFAULT_OCR_PORT, OPENSCREEN_OCR_PORT: DEFAULT_OCR_PORT,
PADDLEOCR_DEVICE: process.env.PADDLEOCR_DEVICE ?? "cpu", PADDLEOCR_DEVICE: process.env.PADDLEOCR_DEVICE ?? "cpu",
PADDLEOCR_ENABLE_MKLDNN: process.env.PADDLEOCR_ENABLE_MKLDNN ?? "0", PADDLEOCR_ENABLE_MKLDNN: process.env.PADDLEOCR_ENABLE_MKLDNN ?? "0",
PADDLEOCR_LANG: process.env.PADDLEOCR_LANG ?? "latin", PADDLEOCR_LANG: process.env.PADDLEOCR_LANG ?? "",
PADDLEOCR_USE_MOBILE: process.env.PADDLEOCR_USE_MOBILE ?? "1", PADDLEOCR_USE_MOBILE: process.env.PADDLEOCR_USE_MOBILE ?? "1",
OPENSCREEN_OCR_PROFILE:
process.env.OPENSCREEN_OCR_PROFILE ?? process.env.OPENSCREEN_GUIDE_OCR_PROFILE ?? "",
OPENSCREEN_OCR_WARMUP: process.env.OPENSCREEN_OCR_WARMUP ?? "1",
PADDLE_PDX_ENABLE_MKLDNN_BYDEFAULT: process.env.PADDLE_PDX_ENABLE_MKLDNN_BYDEFAULT ?? "False", PADDLE_PDX_ENABLE_MKLDNN_BYDEFAULT: process.env.PADDLE_PDX_ENABLE_MKLDNN_BYDEFAULT ?? "False",
PADDLE_PDX_CACHE_HOME: process.env.PADDLE_PDX_CACHE_HOME ?? runtimePaths.paddlexCachePath, PADDLE_PDX_CACHE_HOME: process.env.PADDLE_PDX_CACHE_HOME ?? runtimePaths.paddlexCachePath,
PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK: PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK:
+38 -1
View File
@@ -1,8 +1,12 @@
import { describe, expect, it } from "vitest"; import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { afterEach, describe, expect, it, vi } from "vitest";
import type { GuideSnapshot, OcrBlock } from "../../../src/guide/contracts"; import type { GuideSnapshot, OcrBlock } from "../../../src/guide/contracts";
import { import {
DefaultGuideOcrClient, DefaultGuideOcrClient,
normalizeOcrResponse, normalizeOcrResponse,
PaddleOcrHttpClient,
parseWindowsOcrPayload, parseWindowsOcrPayload,
} from "./paddleOcrClient"; } from "./paddleOcrClient";
@@ -16,6 +20,10 @@ const snapshot: GuideSnapshot = {
height: 800, height: 800,
}; };
afterEach(() => {
vi.unstubAllGlobals();
});
describe("normalizeOcrResponse", () => { describe("normalizeOcrResponse", () => {
it("normalizes pixel boxes into guide OCR blocks", () => { it("normalizes pixel boxes into guide OCR blocks", () => {
const blocks = normalizeOcrResponse( const blocks = normalizeOcrResponse(
@@ -67,6 +75,35 @@ describe("normalizeOcrResponse", () => {
}); });
}); });
// Verifies that the HTTP client forwards the configured language and profile in the request body.
describe("PaddleOcrHttpClient", () => {
  it("sends the selected OCR profile to the local service", async () => {
    const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "openscreen-ocr-client-"));
    // Clean up in `finally` so a failing assertion does not leave the directory behind.
    try {
      const imagePath = path.join(tempDir, "step.png");
      await fs.writeFile(imagePath, Buffer.from([137, 80, 78, 71]));
      const requests: unknown[] = [];
      // Replace global fetch with a stub that records each JSON body and returns an empty result.
      vi.stubGlobal(
        "fetch",
        vi.fn(async (_url: string, init?: RequestInit) => {
          requests.push(JSON.parse(String(init?.body ?? "{}")));
          return new Response(JSON.stringify({ blocks: [] }), {
            status: 200,
            headers: { "content-type": "application/json" },
          });
        }),
      );

      const client = new PaddleOcrHttpClient("https://ocr.example.test", "vi,en", "hybrid");
      await client.recognize({ ...snapshot, path: imagePath });

      expect(requests[0]).toMatchObject({
        language: "vi,en",
        profile: "hybrid",
        path: imagePath,
      });
    } finally {
      await fs.rm(tempDir, { recursive: true, force: true });
    }
  });
});
describe("DefaultGuideOcrClient", () => { describe("DefaultGuideOcrClient", () => {
it("falls back when the HTTP OCR service is unavailable", async () => { it("falls back when the HTTP OCR service is unavailable", async () => {
const fallbackBlock: OcrBlock = { const fallbackBlock: OcrBlock = {
+45 -3
View File
@@ -1,7 +1,7 @@
import { execFile } from "node:child_process"; import { execFile } from "node:child_process";
import fs from "node:fs/promises"; import fs from "node:fs/promises";
import { promisify } from "node:util"; import { promisify } from "node:util";
import type { GuideSnapshot, OcrBlock } from "../../../src/guide/contracts"; import type { GuideOcrProfile, GuideSnapshot, OcrBlock } from "../../../src/guide/contracts";
import { ensureBundledOcrServiceRunning } from "./bundledOcrService"; import { ensureBundledOcrServiceRunning } from "./bundledOcrService";
const execFileAsync = promisify(execFile); const execFileAsync = promisify(execFile);
@@ -10,6 +10,11 @@ export interface GuideOcrClient {
recognize(snapshot: GuideSnapshot): Promise<OcrBlock[]>; recognize(snapshot: GuideSnapshot): Promise<OcrBlock[]>;
} }
/** Settings used to build OCR clients through `DefaultGuideOcrClient.fromConfig`. */
export interface GuideOcrClientConfig {
/** OCR profile sent to the HTTP OCR service with each request. */
profile: GuideOcrProfile;
/** Comma-separated language codes, e.g. `vi,en`. */
language: string;
}
interface PaddleOcrResponseBlock { interface PaddleOcrResponseBlock {
text?: unknown; text?: unknown;
confidence?: unknown; confidence?: unknown;
@@ -21,7 +26,8 @@ interface PaddleOcrResponseBlock {
export class PaddleOcrHttpClient implements GuideOcrClient { export class PaddleOcrHttpClient implements GuideOcrClient {
constructor( constructor(
private readonly baseUrl = process.env.OPENSCREEN_GUIDE_OCR_URL ?? "http://127.0.0.1:8866", private readonly baseUrl = process.env.OPENSCREEN_GUIDE_OCR_URL ?? "http://127.0.0.1:8866",
private readonly language = process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE ?? "vi,en", private readonly language = normalizeOcrLanguage(process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE),
private readonly profile = normalizeOcrProfile(process.env.OPENSCREEN_GUIDE_OCR_PROFILE),
) {} ) {}
async recognize(snapshot: GuideSnapshot): Promise<OcrBlock[]> { async recognize(snapshot: GuideSnapshot): Promise<OcrBlock[]> {
@@ -36,6 +42,7 @@ export class PaddleOcrHttpClient implements GuideOcrClient {
imageBase64, imageBase64,
path: snapshot.path, path: snapshot.path,
language: this.language, language: this.language,
profile: this.profile,
}), }),
}); });
} catch (error) { } catch (error) {
@@ -54,7 +61,9 @@ export class PaddleOcrHttpClient implements GuideOcrClient {
} }
export class WindowsOcrClient implements GuideOcrClient { export class WindowsOcrClient implements GuideOcrClient {
constructor(private readonly language = process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE ?? "vi,en") {} constructor(
private readonly language = normalizeOcrLanguage(process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE),
) {}
async recognize(snapshot: GuideSnapshot): Promise<OcrBlock[]> { async recognize(snapshot: GuideSnapshot): Promise<OcrBlock[]> {
if (process.platform !== "win32") { if (process.platform !== "win32") {
@@ -96,6 +105,14 @@ export class WindowsOcrClient implements GuideOcrClient {
} }
export class DefaultGuideOcrClient implements GuideOcrClient { export class DefaultGuideOcrClient implements GuideOcrClient {
/**
 * Builds a client from optional OCR settings. Missing fields are filled in by
 * `normalizeOcrClientConfig`. The HTTP client keeps its default base URL and
 * receives both language and profile; the Windows client receives the language.
 */
static fromConfig(config?: Partial<GuideOcrClientConfig>): DefaultGuideOcrClient {
  const { language, profile } = normalizeOcrClientConfig(config);
  const httpClient = new PaddleOcrHttpClient(undefined, language, profile);
  const windowsClient = new WindowsOcrClient(language);
  return new DefaultGuideOcrClient(httpClient, windowsClient);
}
constructor( constructor(
private readonly httpClient = new PaddleOcrHttpClient(), private readonly httpClient = new PaddleOcrHttpClient(),
private readonly windowsClient = new WindowsOcrClient(), private readonly windowsClient = new WindowsOcrClient(),
@@ -119,6 +136,31 @@ export class DefaultGuideOcrClient implements GuideOcrClient {
} }
} }
/**
 * Completes a partial OCR client configuration. Each missing field falls back
 * to its `OPENSCREEN_GUIDE_OCR_*` environment variable, and the result is run
 * through the profile and language normalizers.
 */
function normalizeOcrClientConfig(
  config: Partial<GuideOcrClientConfig> | undefined,
): GuideOcrClientConfig {
  const rawProfile = config?.profile ?? process.env.OPENSCREEN_GUIDE_OCR_PROFILE;
  const rawLanguage = config?.language ?? process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE;
  const profile = normalizeOcrProfile(rawProfile);
  const language = normalizeOcrLanguage(rawLanguage);
  return { profile, language };
}
/**
 * Maps an arbitrary profile value onto a supported OCR profile.
 *
 * Surrounding whitespace and letter case are ignored, matching how
 * `normalizeOcrLanguage` treats its input, so values such as `" Hybrid "`
 * coming from an environment variable are accepted. Anything unrecognized
 * yields `"vietnamese"`.
 *
 * @param value - Candidate profile name, or `undefined`.
 * @returns `fast`, `vietnamese` or `hybrid`.
 */
function normalizeOcrProfile(value: string | undefined): GuideOcrProfile {
  const candidate = typeof value === "string" ? value.trim().toLowerCase() : undefined;
  if (candidate === "fast" || candidate === "vietnamese" || candidate === "hybrid") {
    return candidate;
  }
  return "vietnamese";
}
/**
 * Normalizes a comma-separated language list: entries are trimmed and
 * lower-cased, blank entries are discarded, and the remainder is joined with
 * `,`. Returns `"vi,en"` when the input is missing or contains no entries.
 */
function normalizeOcrLanguage(value: string | undefined): string {
  const codes: string[] = [];
  for (const rawPart of (value ?? "").split(",")) {
    const code = rawPart.trim().toLowerCase();
    if (code) {
      codes.push(code);
    }
  }
  return codes.length > 0 ? codes.join(",") : "vi,en";
}
export function parseWindowsOcrPayload(stdout: string): unknown { export function parseWindowsOcrPayload(stdout: string): unknown {
const normalized = stdout.replace(/^\uFEFF/, "").trim(); const normalized = stdout.replace(/^\uFEFF/, "").trim();
try { try {
+325 -27
View File
@@ -1,10 +1,11 @@
import { type ChildProcessWithoutNullStreams, spawn } from "node:child_process"; import { type ChildProcessWithoutNullStreams, execFile, spawn } from "node:child_process";
import { EventEmitter } from "node:events"; import { EventEmitter } from "node:events";
import { constants as fsConstants } from "node:fs"; import { constants as fsConstants } from "node:fs";
import fs from "node:fs/promises"; import fs from "node:fs/promises";
import os from "node:os"; import os from "node:os";
import path from "node:path"; import path from "node:path";
import { fileURLToPath, pathToFileURL } from "node:url"; import { fileURLToPath, pathToFileURL } from "node:url";
import { promisify } from "node:util";
import type { DesktopCapturerSource, Rectangle } from "electron"; import type { DesktopCapturerSource, Rectangle } from "electron";
import { import {
app, app,
@@ -17,7 +18,7 @@ import {
shell, shell,
systemPreferences, systemPreferences,
} from "electron"; } from "electron";
import type { GuideMarkerCapturedPayload } from "../../src/guide/contracts"; import type { GuideEvent, GuideMarkerCapturedPayload } from "../../src/guide/contracts";
import type { NativeMacRecordingRequest } from "../../src/lib/nativeMacRecording"; import type { NativeMacRecordingRequest } from "../../src/lib/nativeMacRecording";
import type { NativeWindowsRecordingRequest } from "../../src/lib/nativeWindowsRecording"; import type { NativeWindowsRecordingRequest } from "../../src/lib/nativeWindowsRecording";
import { import {
@@ -56,6 +57,7 @@ const RECORDING_SESSION_SUFFIX = ".session.json";
const ALLOWED_IMPORT_VIDEO_EXTENSIONS = new Set([".webm", ".mp4", ".mov", ".avi", ".mkv"]); const ALLOWED_IMPORT_VIDEO_EXTENSIONS = new Set([".webm", ".mp4", ".mov", ".avi", ".mkv"]);
const PREVIEW_AUDIO_DIR = path.join(app.getPath("userData"), "preview-audio"); const PREVIEW_AUDIO_DIR = path.join(app.getPath("userData"), "preview-audio");
const nativeMacCaptureEvents = new EventEmitter(); const nativeMacCaptureEvents = new EventEmitter();
const execFileAsync = promisify(execFile);
/** /**
* Paths explicitly approved by the user via file picker dialogs or project loads. * Paths explicitly approved by the user via file picker dialogs or project loads.
@@ -426,6 +428,7 @@ let nativeWindowsCursorRecordingStartMs = 0;
let nativeWindowsPauseStartedAtMs: number | null = null; let nativeWindowsPauseStartedAtMs: number | null = null;
let nativeWindowsPauseRanges: Array<{ startMs: number; endMs: number }> = []; let nativeWindowsPauseRanges: Array<{ startMs: number; endMs: number }> = [];
let nativeWindowsIsPaused = false; let nativeWindowsIsPaused = false;
let nativeWindowsCaptureStopping = false;
const NATIVE_WINDOWS_CAPTURE_STOP_TIMEOUT_MS = 15_000; const NATIVE_WINDOWS_CAPTURE_STOP_TIMEOUT_MS = 15_000;
let nativeMacCaptureProcess: ChildProcessWithoutNullStreams | null = null; let nativeMacCaptureProcess: ChildProcessWithoutNullStreams | null = null;
let nativeMacCaptureOutput = ""; let nativeMacCaptureOutput = "";
@@ -453,6 +456,7 @@ let activeGuideHotkeyRecording: GuideHotkeyRecordingState | null = null;
let activeGuideHotkeySessionId: number | null = null; let activeGuideHotkeySessionId: number | null = null;
let guideMarkerHotkeyRegistered = false; let guideMarkerHotkeyRegistered = false;
let lastGuideHotkeyCaptureAtMs = 0; let lastGuideHotkeyCaptureAtMs = 0;
const guideHotkeyBackgroundJobs = new Map<string, Promise<void>>();
const GUIDE_HOTKEY_CAPTURE_DEBOUNCE_MS = 250; const GUIDE_HOTKEY_CAPTURE_DEBOUNCE_MS = 250;
function normalizeCursorSample(sample: unknown): CursorRecordingSample | null { function normalizeCursorSample(sample: unknown): CursorRecordingSample | null {
@@ -807,6 +811,203 @@ function clampGuideHotkey01(value: number): number {
return Math.min(1, Math.max(0, value)); return Math.min(1, Math.max(0, value));
} }
/**
 * Captures a live thumbnail of the screen being recorded, optionally produces a copy with
 * the hotkey position marked on it, and queues a background job that stores the snapshot
 * and runs OCR on it.
 *
 * Every failure is logged and swallowed, so the returned promise does not reject.
 *
 * @param guideStore Store used to persist the snapshot and run OCR.
 * @param event Guide event the snapshot belongs to (supplies recordingId, id and timeMs).
 * @param boundsInput Bounds of the recorded area; sanitized before use.
 * @param point Hotkey position as fractions of the captured image width/height.
 */
async function captureGuideHotkeySnapshotAndRunOcr(
	guideStore: GuideStore,
	event: GuideEvent,
	boundsInput: GuideHotkeyBounds,
	point: { normalizedX: number; normalizedY: number },
) {
	try {
		const captureBounds = sanitizeGuideHotkeyBounds(boundsInput);
		// Request a thumbnail as large as the recorded area (at least 1x1).
		const thumbnailSize = {
			width: Math.max(1, Math.round(captureBounds.width)),
			height: Math.max(1, Math.round(captureBounds.height)),
		};
		const screenSources = await desktopCapturer.getSources({
			types: ["screen"],
			thumbnailSize,
		});
		const matchedSource = findScreenSourceForGuideBounds(screenSources, captureBounds);
		if (!matchedSource || matchedSource.thumbnail.isEmpty()) {
			console.warn("[guide-hotkey] no live screen thumbnail was available for OCR");
			return;
		}

		const rawPng = matchedSource.thumbnail.toPNG();
		const { width, height } = matchedSource.thumbnail.getSize();

		// The marked copy is optional: when it cannot be produced only the raw PNG is stored.
		let markedPng: Buffer | undefined;
		try {
			markedPng = await createMarkedGuideSnapshotPng(rawPng, {
				width,
				height,
				x: point.normalizedX * width,
				y: point.normalizedY * height,
			});
		} catch (error) {
			console.warn("[guide-hotkey] failed to create marked live snapshot:", error);
		}

		const persistAndRecognize = async (): Promise<void> => {
			const session = await guideStore.writeSnapshot({
				recordingId: event.recordingId,
				eventId: event.id,
				timeMs: event.timeMs,
				offsetMs: 0,
				pngBytes: bufferToArrayBuffer(rawPng),
				markedPngBytes: markedPng ? bufferToArrayBuffer(markedPng) : undefined,
				width,
				height,
			});
			const storedSnapshot = session.snapshots.find((item) => item.eventId === event.id);
			if (!storedSnapshot) {
				return;
			}
			await guideStore.runOcr({
				recordingId: event.recordingId,
				snapshotIds: [storedSnapshot.id],
			});
			console.info("[guide-hotkey] live snapshot OCR completed", {
				recordingId: event.recordingId,
				eventId: event.id,
				snapshotId: storedSnapshot.id,
			});
		};
		// Jobs for the same recording are chained by the queue helper.
		enqueueGuideHotkeyBackgroundJob(event.recordingId, persistAndRecognize);
	} catch (error) {
		console.warn("[guide-hotkey] live snapshot OCR failed:", error);
	}
}
/**
 * Appends `job` to the chain of background jobs registered for `recordingId`, so it
 * starts only after the previously queued job for that recording has settled.
 *
 * A failing job is logged and does not stop later jobs. The map entry is removed once
 * the most recently queued job has finished.
 *
 * @param recordingId Key under which jobs are serialized.
 * @param job Work to run after the previous job for the same key has settled.
 */
function enqueueGuideHotkeyBackgroundJob(recordingId: string, job: () => Promise<void>) {
	const inFlight = guideHotkeyBackgroundJobs.get(recordingId);
	// Ignore the predecessor's outcome; only its completion matters for ordering.
	const predecessor = inFlight ? inFlight.catch(() => undefined) : Promise.resolve();

	const logFailure = (error: unknown) => {
		console.warn("[guide-hotkey] background OCR job failed:", error);
	};
	const releaseSlot = () => {
		// Only the tail of the chain may clear the entry; a newer job may have replaced it.
		if (guideHotkeyBackgroundJobs.get(recordingId) === queued) {
			guideHotkeyBackgroundJobs.delete(recordingId);
		}
	};

	const queued = predecessor.then(job).catch(logFailure).finally(releaseSlot);
	guideHotkeyBackgroundJobs.set(recordingId, queued);
}
/**
 * Picks the desktop-capturer source that corresponds to the display whose bounds equal
 * `bounds` (after rounding).
 *
 * Lookup order: source whose `display_id` equals the display id, then source whose parsed
 * screen index equals the display's position in `screen.getAllDisplays()`, then the first
 * source whose id starts with "screen:", then the first source of any kind.
 *
 * @returns The chosen source, or undefined when `sources` is empty.
 */
function findScreenSourceForGuideBounds(
	sources: DesktopCapturerSource[],
	bounds: GuideHotkeyBounds,
): DesktopCapturerSource | undefined {
	const allDisplays = screen.getAllDisplays();
	const matchIndex = allDisplays.findIndex((candidate) =>
		rectMatchesGuideBounds(candidate.bounds, bounds),
	);
	const matchedDisplay = matchIndex < 0 ? undefined : allDisplays[matchIndex];

	if (matchedDisplay) {
		const exactMatch =
			sources.find((source) => Number(source.display_id) === matchedDisplay.id) ??
			sources.find((source) => parseDesktopCapturerScreenIndex(source.id) === matchIndex);
		if (exactMatch) {
			return exactMatch;
		}
	}

	// No display-specific match: fall back to any screen source, then to anything at all.
	const anyScreen = sources.find((source) => source.id.startsWith("screen:"));
	return anyScreen ?? sources[0];
}
/**
 * Reports whether two rectangles coincide once each of x, y, width and height has been
 * rounded to the nearest integer.
 */
function rectMatchesGuideBounds(rect: Rectangle, bounds: GuideHotkeyBounds): boolean {
	const edges = ["x", "y", "width", "height"] as const;
	return edges.every((edge) => Math.round(rect[edge]) === Math.round(bounds[edge]));
}
/**
 * Produces a copy of `pngBuffer` with a marker drawn at `marker.x`/`marker.y` by running
 * a generated PowerShell script against temporary files.
 *
 * The temporary directory is removed afterwards whether or not the script succeeded;
 * removal errors are ignored.
 *
 * @param pngBuffer PNG bytes of the unmarked snapshot.
 * @param marker Image size and marker position passed to the script builder.
 * @returns PNG bytes of the marked image.
 * @throws When writing the input, running powershell.exe, or reading the output fails.
 */
async function createMarkedGuideSnapshotPng(
	pngBuffer: Buffer,
	marker: { width: number; height: number; x: number; y: number },
): Promise<Buffer> {
	const workDir = await fs.mkdtemp(path.join(os.tmpdir(), "openscreen-guide-marker-"));
	const inputFile = path.join(workDir, "source.png");
	const markedFile = path.join(workDir, "marked.png");

	try {
		await fs.writeFile(inputFile, pngBuffer);

		const encodedScript = buildMarkerScript(inputFile, markedFile, marker);
		const powershellArgs = [
			"-NoProfile",
			"-ExecutionPolicy",
			"Bypass",
			"-EncodedCommand",
			encodedScript,
		];
		const execOptions = {
			timeout: 30000,
			windowsHide: true,
			maxBuffer: 1024 * 1024,
		};
		await execFileAsync("powershell.exe", powershellArgs, execOptions);

		return await fs.readFile(markedFile);
	} finally {
		try {
			await fs.rm(workDir, { recursive: true, force: true });
		} catch {
			// Best-effort cleanup: a leftover temp directory must not mask the real outcome.
		}
	}
}
/**
 * Builds the PowerShell script that draws a click marker (translucent halo, ring and dot)
 * onto an image, and returns it base64-encoded from its UTF-16LE bytes, the form the
 * caller hands to `powershell.exe -EncodedCommand`.
 *
 * The two file paths are embedded as base64 and decoded inside the script, so they never
 * need PowerShell quoting. The marker coordinates are embedded with four decimals and
 * clamped to the image inside the script. `marker.width`/`marker.height` are not used
 * here; the script reads the size from the loaded image.
 *
 * @param sourcePath Path of the PNG to read.
 * @param outputPath Path the marked PNG is written to.
 * @param marker Marker position in image pixels (plus the unused image size).
 * @returns Base64 text of the UTF-16LE encoded script.
 */
function buildMarkerScript(
	sourcePath: string,
	outputPath: string,
	marker: { width: number; height: number; x: number; y: number },
): string {
	const encodeUtf8 = (text: string): string => Buffer.from(text, "utf8").toString("base64");
	const encodedSource = encodeUtf8(sourcePath);
	const encodedOutput = encodeUtf8(outputPath);
	const markerX = marker.x.toFixed(4);
	const markerY = marker.y.toFixed(4);

	const powershellSource = `
$ErrorActionPreference = "Stop"
$sourcePath = [System.Text.Encoding]::UTF8.GetString([Convert]::FromBase64String("${encodedSource}"))
$outputPath = [System.Text.Encoding]::UTF8.GetString([Convert]::FromBase64String("${encodedOutput}"))
Add-Type -AssemblyName System.Drawing
$source = [System.Drawing.Image]::FromFile($sourcePath)
$bitmap = [System.Drawing.Bitmap]::new($source.Width, $source.Height)
$graphics = [System.Drawing.Graphics]::FromImage($bitmap)
try {
$graphics.SmoothingMode = [System.Drawing.Drawing2D.SmoothingMode]::AntiAlias
$graphics.DrawImage($source, 0, 0, $source.Width, $source.Height)
$shortSide = [Math]::Max(1, [Math]::Min($source.Width, $source.Height))
$haloRadius = [Math]::Min(14, [Math]::Max(8, [Math]::Round($shortSide * 0.012)))
$dotRadius = [Math]::Min(6, [Math]::Max(3, [Math]::Round($shortSide * 0.0045)))
$lineWidth = [Math]::Max(1, [Math]::Round($shortSide * 0.0015))
$x = [Math]::Min($source.Width, [Math]::Max(0, ${markerX}))
$y = [Math]::Min($source.Height, [Math]::Max(0, ${markerY}))
$haloBrush = [System.Drawing.SolidBrush]::new([System.Drawing.Color]::FromArgb(87, 250, 204, 21))
$ringPen = [System.Drawing.Pen]::new([System.Drawing.Color]::FromArgb(184, 239, 68, 68), $lineWidth)
$dotBrush = [System.Drawing.SolidBrush]::new([System.Drawing.Color]::FromArgb(235, 220, 38, 38))
try {
$graphics.FillEllipse($haloBrush, $x - $haloRadius, $y - $haloRadius, $haloRadius * 2, $haloRadius * 2)
$graphics.DrawEllipse($ringPen, $x - $haloRadius, $y - $haloRadius, $haloRadius * 2, $haloRadius * 2)
$graphics.FillEllipse($dotBrush, $x - $dotRadius, $y - $dotRadius, $dotRadius * 2, $dotRadius * 2)
} finally {
$haloBrush.Dispose()
$ringPen.Dispose()
$dotBrush.Dispose()
}
$bitmap.Save($outputPath, [System.Drawing.Imaging.ImageFormat]::Png)
} finally {
$graphics.Dispose()
$bitmap.Dispose()
$source.Dispose()
}
`;
	return Buffer.from(powershellSource, "utf16le").toString("base64");
}
/**
 * Copies exactly the bytes viewed by `buffer` into a standalone ArrayBuffer.
 *
 * A Buffer can be a window onto a larger backing store, so the copy is limited to
 * `byteOffset .. byteOffset + byteLength` of that store.
 */
function bufferToArrayBuffer(buffer: Buffer): ArrayBuffer {
	const { buffer: backingStore, byteOffset, byteLength } = buffer;
	const end = byteOffset + byteLength;
	return backingStore.slice(byteOffset, end) as ArrayBuffer;
}
async function captureGuideHotkeyMarker( async function captureGuideHotkeyMarker(
guideStore: GuideStore, guideStore: GuideStore,
trigger: GuideMarkerTrigger = "global-shortcut", trigger: GuideMarkerTrigger = "global-shortcut",
@@ -853,6 +1054,7 @@ async function captureGuideHotkeyMarker(
rawY: point.rawY, rawY: point.rawY,
bounds: point.bounds, bounds: point.bounds,
}); });
void captureGuideHotkeySnapshotAndRunOcr(guideStore, result.event, recording.bounds, point);
return { captured: true, ...result }; return { captured: true, ...result };
} catch (error) { } catch (error) {
const message = error instanceof Error ? error.message : String(error); const message = error instanceof Error ? error.message : String(error);
@@ -1337,6 +1539,81 @@ function completeNativeWindowsCursorPauseRange(endMs = Date.now()) {
nativeWindowsPauseStartedAtMs = null; nativeWindowsPauseStartedAtMs = null;
} }
/**
 * Returns all module-level bookkeeping for the native Windows capture to its idle values
 * and clears the guide hotkey recording.
 *
 * It does not touch the helper process itself; callers stop or kill it separately.
 */
function resetNativeWindowsCaptureState() {
	// Process handle and the stop-in-progress flag.
	nativeWindowsCaptureProcess = null;
	nativeWindowsCaptureStopping = false;

	// Identity and output locations of the recording.
	nativeWindowsCaptureRecordingId = null;
	nativeWindowsCaptureTargetPath = null;
	nativeWindowsCaptureWebcamTargetPath = null;

	// Cursor capture bookkeeping.
	nativeWindowsCursorCaptureMode = "editable-overlay";
	nativeWindowsCursorOffsetMs = 0;
	nativeWindowsCursorRecordingStartMs = 0;

	// Pause tracking.
	nativeWindowsIsPaused = false;
	nativeWindowsPauseStartedAtMs = null;
	nativeWindowsPauseRanges = [];

	clearGuideHotkeyRecording();
}
/**
 * Tells whether a native Windows capture helper is still running.
 *
 * When a process object is still recorded but has already exited or been killed, the
 * stale state is logged and reset, and false is returned.
 */
function hasActiveNativeWindowsCaptureProcess() {
	const helper = nativeWindowsCaptureProcess;
	if (!helper) {
		return false;
	}

	const { exitCode, killed } = helper;
	const stillRunning = exitCode === null && !killed;
	if (!stillRunning) {
		console.warn("[native-wgc] clearing stale Windows capture process state", {
			exitCode,
			killed,
		});
		resetNativeWindowsCaptureState();
	}
	return stillRunning;
}
/**
 * Watches the capture helper for an exit or error that happens before a stop was
 * requested, and cleans up the recording state when that occurs.
 *
 * Whichever of "close"/"error" fires first removes the other listener, so cleanup runs at
 * most once. Nothing happens when `proc` is no longer the tracked process or a stop is
 * already in progress.
 *
 * @param proc Helper process to observe.
 * @param sourceName Name reported to `onRecordingStateChange`.
 * @param onRecordingStateChange Optional callback told that recording has ended.
 */
function attachNativeWindowsCaptureLifecycle(
	proc: ChildProcessWithoutNullStreams,
	sourceName: string,
	onRecordingStateChange?: (recording: boolean, sourceName: string) => void,
) {
	// True when the event belongs to an older process or to a stop that was asked for.
	const isExpectedOrForeign = () =>
		nativeWindowsCaptureProcess !== proc || nativeWindowsCaptureStopping;

	async function cleanupAfterUnexpectedExit() {
		try {
			await stopCursorRecording();
		} catch (error) {
			console.warn("[native-wgc] failed to stop cursor recording after helper exit", error);
		}
		pendingCursorRecordingData = null;
		resetNativeWindowsCaptureState();
		if (onRecordingStateChange) {
			onRecordingStateChange(false, sourceName);
		}
	}

	const handleClose = (code: number | null, signal: NodeJS.Signals | null) => {
		proc.off("error", handleError);
		if (isExpectedOrForeign()) {
			return;
		}
		console.warn("[native-wgc] Windows capture helper exited before stop was requested", {
			code,
			signal,
			output: nativeWindowsCaptureOutput.trim(),
		});
		void cleanupAfterUnexpectedExit();
	};

	const handleError = (error: Error) => {
		proc.off("close", handleClose);
		if (isExpectedOrForeign()) {
			return;
		}
		console.warn("[native-wgc] Windows capture helper errored before stop was requested", error);
		void cleanupAfterUnexpectedExit();
	};

	proc.once("close", handleClose);
	proc.once("error", handleError);
}
function waitForNativeWindowsCaptureStart(proc: ChildProcessWithoutNullStreams) { function waitForNativeWindowsCaptureStart(proc: ChildProcessWithoutNullStreams) {
return new Promise<void>((resolve, reject) => { return new Promise<void>((resolve, reject) => {
const timer = setTimeout(() => { const timer = setTimeout(() => {
@@ -1732,7 +2009,7 @@ export function registerIpcHandlers(
const sources = await desktopCapturer.getSources(opts); const sources = await desktopCapturer.getSources(opts);
lastEnumeratedSources = new Map(sources.map((source) => [source.id, source])); lastEnumeratedSources = new Map(sources.map((source) => [source.id, source]));
let screenSourceIndex = 0; let screenSourceIndex = 0;
return sources.map((source) => { const processedSources = sources.map((source) => {
const isScreenSource = source.id.startsWith("screen:"); const isScreenSource = source.id.startsWith("screen:");
const sourceIndex = isScreenSource const sourceIndex = isScreenSource
? (parseDesktopCapturerScreenIndex(source.id) ?? screenSourceIndex) ? (parseDesktopCapturerScreenIndex(source.id) ?? screenSourceIndex)
@@ -1760,6 +2037,43 @@ export function registerIpcHandlers(
bounds, bounds,
}; };
}); });
const screenDisplays = screen.getAllDisplays();
const mappedDisplayIds = new Set(
processedSources
.filter((source) => source.id.startsWith("screen:") && typeof source.displayId === "number")
.map((source) => source.displayId),
);
const fallbackScreenSources = screenDisplays
.map((display, displayIndex) => ({ display, displayIndex }))
.filter(({ display }) => !mappedDisplayIds.has(display.id))
.map(({ display, displayIndex }) => {
const bounds = toSourceBounds(display.bounds);
return {
id: `screen:${displayIndex}:fallback:${display.id}`,
name: `Screen ${displayIndex + 1}`,
display_id: String(display.id),
thumbnail: null,
appIcon: null,
displayId: display.id,
displayIndex,
screenIndex: displayIndex,
displayLabel: `Display ${displayIndex + 1} - ${bounds.width}x${bounds.height} @ ${bounds.x},${bounds.y}`,
bounds,
};
});
if (fallbackScreenSources.length > 0) {
console.warn("[desktop-capturer] added fallback display sources", {
capturerScreens: processedSources.filter((source) => source.id.startsWith("screen:"))
.length,
electronDisplays: screenDisplays.length,
fallbackScreens: fallbackScreenSources.map((source) => ({
id: source.id,
displayId: source.displayId,
bounds: source.bounds,
})),
});
}
return [...processedSources, ...fallbackScreenSources];
}); });
ipcMain.handle("select-source", async (_, source: SelectedSource) => { ipcMain.handle("select-source", async (_, source: SelectedSource) => {
@@ -1964,7 +2278,7 @@ export function registerIpcHandlers(
error: "Windows Graphics Capture requires Windows 10 build 19041 or newer.", error: "Windows Graphics Capture requires Windows 10 build 19041 or newer.",
}; };
} }
if (nativeWindowsCaptureProcess) { if (hasActiveNativeWindowsCaptureProcess()) {
return { success: false, error: "Native Windows capture is already running." }; return { success: false, error: "Native Windows capture is already running." };
} }
@@ -2113,6 +2427,7 @@ export function registerIpcHandlers(
}); });
const source = selectedSource || { name: "Screen" }; const source = selectedSource || { name: "Screen" };
attachNativeWindowsCaptureLifecycle(proc, source.name, onRecordingStateChange);
startGuideHotkeyRecording(recordingId, bounds); startGuideHotkeyRecording(recordingId, bounds);
if (onRecordingStateChange) { if (onRecordingStateChange) {
onRecordingStateChange(true, source.name); onRecordingStateChange(true, source.name);
@@ -2127,17 +2442,7 @@ export function registerIpcHandlers(
} catch (error) { } catch (error) {
console.error("Failed to start native Windows recording:", error); console.error("Failed to start native Windows recording:", error);
nativeWindowsCaptureProcess?.kill(); nativeWindowsCaptureProcess?.kill();
nativeWindowsCaptureProcess = null; resetNativeWindowsCaptureState();
nativeWindowsCaptureTargetPath = null;
nativeWindowsCaptureWebcamTargetPath = null;
nativeWindowsCaptureRecordingId = null;
nativeWindowsCursorOffsetMs = 0;
nativeWindowsCursorCaptureMode = "editable-overlay";
nativeWindowsCursorRecordingStartMs = 0;
nativeWindowsPauseStartedAtMs = null;
nativeWindowsPauseRanges = [];
nativeWindowsIsPaused = false;
clearGuideHotkeyRecording();
await stopCursorRecording(); await stopCursorRecording();
return { success: false, error: String(error) }; return { success: false, error: String(error) };
} }
@@ -2396,11 +2701,13 @@ export function registerIpcHandlers(
const recordingId = nativeWindowsCaptureRecordingId ?? Date.now(); const recordingId = nativeWindowsCaptureRecordingId ?? Date.now();
const cursorCaptureMode = nativeWindowsCursorCaptureMode; const cursorCaptureMode = nativeWindowsCursorCaptureMode;
if (!proc) { if (!proc || proc.exitCode !== null || proc.killed) {
resetNativeWindowsCaptureState();
return { success: false, error: "Native Windows capture is not running." }; return { success: false, error: "Native Windows capture is not running." };
} }
try { try {
nativeWindowsCaptureStopping = true;
completeNativeWindowsCursorPauseRange(); completeNativeWindowsCursorPauseRange();
const stoppedPathPromise = waitForNativeWindowsCaptureStop(proc); const stoppedPathPromise = waitForNativeWindowsCaptureStop(proc);
proc.stdin.write("stop\n"); proc.stdin.write("stop\n");
@@ -2462,17 +2769,7 @@ export function registerIpcHandlers(
await stopCursorRecording(); await stopCursorRecording();
return { success: false, error: String(error) }; return { success: false, error: String(error) };
} finally { } finally {
nativeWindowsCaptureProcess = null; resetNativeWindowsCaptureState();
nativeWindowsCaptureTargetPath = null;
nativeWindowsCaptureWebcamTargetPath = null;
nativeWindowsCaptureRecordingId = null;
nativeWindowsCursorOffsetMs = 0;
nativeWindowsCursorCaptureMode = "editable-overlay";
nativeWindowsCursorRecordingStartMs = 0;
nativeWindowsPauseStartedAtMs = null;
nativeWindowsPauseRanges = [];
nativeWindowsIsPaused = false;
clearGuideHotkeyRecording();
const source = selectedSource || { name: "Screen" }; const source = selectedSource || { name: "Screen" };
if (onRecordingStateChange) { if (onRecordingStateChange) {
onRecordingStateChange(false, source.name); onRecordingStateChange(false, source.name);
@@ -2637,6 +2934,7 @@ export function registerIpcHandlers(
); );
const guideStore = new GuideStore(RECORDINGS_DIR, { const guideStore = new GuideStore(RECORDINGS_DIR, {
deepSeekConfigProvider: guideAiSettingsStore, deepSeekConfigProvider: guideAiSettingsStore,
ocrConfigProvider: guideAiSettingsStore,
}); });
registerGuideMarkerHotkey(guideStore); registerGuideMarkerHotkey(guideStore);
registerGuideIpcHandlers(ipcMain, guideStore, guideAiSettingsStore, { registerGuideIpcHandlers(ipcMain, guideStore, guideAiSettingsStore, {
@@ -81,3 +81,21 @@ target_compile_options(guide-hotkey-listener PRIVATE /EHsc /W4 /utf-8)
target_link_libraries(guide-hotkey-listener PRIVATE target_link_libraries(guide-hotkey-listener PRIVATE
user32 user32
) )
# Executable that hosts the OCR process as a Windows service (src/ocr-service-wrapper.cpp).
add_executable(openscreen-ocr-service-wrapper
src/ocr-service-wrapper.cpp
)
# NOMINMAX / WIN32_LEAN_AND_MEAN trim what Windows.h pulls in, UNICODE / _UNICODE select the
# wide-character Win32 API variants, and _WIN32_WINNT=0x0A00 targets the Windows 10 API level.
target_compile_definitions(openscreen-ocr-service-wrapper PRIVATE
NOMINMAX
WIN32_LEAN_AND_MEAN
UNICODE
_UNICODE
_WIN32_WINNT=0x0A00
)
# MSVC flags: standard C++ exception handling, warning level 4, UTF-8 source and execution charsets.
target_compile_options(openscreen-ocr-service-wrapper PRIVATE /EHsc /W4 /utf-8)
# advapi32 supplies the service control APIs used by the wrapper.
target_link_libraries(openscreen-ocr-service-wrapper PRIVATE
advapi32
)
@@ -0,0 +1,263 @@
#include <Windows.h>
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>
namespace {
// Service name used in the dispatcher table and when registering the control handler.
constexpr const wchar_t* SERVICE_NAME = L"OpenScreenOCR";
// Paths taken from the wrapper's command line (--exe, --resources, --data).
struct ServiceConfig {
// Executable to launch as the OCR process (--exe).
std::wstring exePath;
// Resources directory (--resources); when empty a default is derived from exePath.
std::wstring resourcesPath;
// Writable data directory (--data); when empty a default is derived from exePath.
std::wstring dataPath;
};
// Handle returned by RegisterServiceCtrlHandlerExW; null when not running as a service.
SERVICE_STATUS_HANDLE g_statusHandle = nullptr;
// Status structure reused for every SetServiceStatus call.
SERVICE_STATUS g_status{};
// Manual-reset event signalled when the service is asked to stop.
HANDLE g_stopEvent = nullptr;
// Process and thread handles of the launched OCR process.
PROCESS_INFORMATION g_childProcess{};
// Configuration parsed in wmain and read by serviceMain.
ServiceConfig g_config;
// Wraps `value` in double quotes so that it is parsed back as a single argument under the
// Microsoft C runtime / CommandLineToArgvW rules.
//
// Under those rules a run of backslashes is literal unless it is directly followed by a
// double quote. Backslashes that precede an embedded quote, or the closing quote added
// here, are therefore doubled, and an embedded quote is additionally escaped with one
// backslash. Without that doubling a value ending in '\' would escape the closing quote.
std::wstring quoteArg(const std::wstring& value) {
    std::wstring result = L"\"";
    size_t pendingBackslashes = 0;
    for (wchar_t ch : value) {
        if (ch == L'\\') {
            // Defer: how these are written depends on what follows them.
            ++pendingBackslashes;
            continue;
        }
        if (ch == L'"') {
            // 2n backslashes yield n literal ones, plus one more to escape the quote.
            result.append(pendingBackslashes * 2 + 1, L'\\');
        } else {
            result.append(pendingBackslashes, L'\\');
        }
        pendingBackslashes = 0;
        result.push_back(ch);
    }
    // Trailing backslashes sit right before the closing quote, so they are doubled too.
    result.append(pendingBackslashes * 2, L'\\');
    result.push_back(L'"');
    return result;
}
// Returns everything before the last '\' or '/' in `path`, or "." when the path contains
// no separator at all.
std::wstring directoryName(const std::wstring& path) {
    const std::wstring::size_type separator = path.find_last_of(L"\\/");
    if (separator == std::wstring::npos) {
        return L".";
    }
    return std::wstring(path, 0, separator);
}
// Creates `path` and each of its parent directories, ignoring the result of every
// CreateDirectoryW call.
//
// Prefixes of three characters or fewer (such as "C:\") are skipped.
void createDirectoryRecursive(const std::wstring& path) {
    if (path.empty()) {
        return;
    }

    for (size_t index = 0; index < path.size(); ++index) {
        const bool isSeparator = path[index] == L'\\' || path[index] == L'/';
        const size_t prefixLength = index + 1;  // prefix includes the separator itself
        if (isSeparator && prefixLength > 3) {
            const std::wstring prefix = path.substr(0, prefixLength);
            CreateDirectoryW(prefix.c_str(), nullptr);
        }
    }

    CreateDirectoryW(path.c_str(), nullptr);
}
// Sets environment variable `name` for this process. An empty `value` is passed on as a
// null pointer, which removes the variable.
void setEnv(const wchar_t* name, const std::wstring& value) {
    const wchar_t* rawValue = nullptr;
    if (!value.empty()) {
        rawValue = value.c_str();
    }
    SetEnvironmentVariableW(name, rawValue);
}
// Reports `state` to the service control manager. Does nothing when no status handle has
// been registered.
//
// Stop and shutdown controls are accepted only while running. The checkpoint advances on
// every pending state and is reported as 0 otherwise.
void setServiceStatus(DWORD state, DWORD win32ExitCode = NO_ERROR, DWORD waitHint = 0) {
    if (g_statusHandle == nullptr) {
        return;
    }

    static DWORD checkpoint = 1;
    const bool isPending = state == SERVICE_START_PENDING || state == SERVICE_STOP_PENDING;

    DWORD acceptedControls = 0;
    if (state == SERVICE_RUNNING) {
        acceptedControls = SERVICE_ACCEPT_STOP | SERVICE_ACCEPT_SHUTDOWN;
    }

    DWORD reportedCheckpoint = 0;
    if (isPending) {
        reportedCheckpoint = checkpoint++;
    }

    g_status.dwServiceType = SERVICE_WIN32_OWN_PROCESS;
    g_status.dwCurrentState = state;
    g_status.dwWin32ExitCode = win32ExitCode;
    g_status.dwWaitHint = waitHint;
    g_status.dwControlsAccepted = acceptedControls;
    g_status.dwCheckPoint = reportedCheckpoint;
    SetServiceStatus(g_statusHandle, &g_status);
}
// Opens (creating it if needed) `<dataPath>\logs\ocr-service.log` for appending and
// returns an inheritable handle, or INVALID_HANDLE_VALUE when the file cannot be opened.
HANDLE openServiceLog(const std::wstring& dataPath) {
    std::wstring logPath = dataPath + L"\\logs";
    createDirectoryRecursive(logPath);
    logPath += L"\\ocr-service.log";

    // The handle must be inheritable so a child process can use it for its output.
    SECURITY_ATTRIBUTES inheritable{};
    inheritable.nLength = sizeof(inheritable);
    inheritable.bInheritHandle = TRUE;

    const HANDLE logHandle = CreateFileW(
        logPath.c_str(),
        FILE_APPEND_DATA,
        FILE_SHARE_READ | FILE_SHARE_WRITE,
        &inheritable,
        OPEN_ALWAYS,
        FILE_ATTRIBUTE_NORMAL,
        nullptr);
    if (logHandle == INVALID_HANDLE_VALUE) {
        return logHandle;
    }

    SetFilePointer(logHandle, 0, nullptr, FILE_END);
    return logHandle;
}
// Launches the OCR executable named by config.exePath with the environment it needs and
// stores its process/thread handles in g_childProcess.
//
// Returns false when no executable path was given or CreateProcessW fails.
bool startOcrProcess(const ServiceConfig& config) {
if (config.exePath.empty()) {
return false;
}
// Defaults when not supplied: data goes to <exe dir>\ocr-runtime, resources are taken to
// be the parent of the executable's directory.
const std::wstring dataPath = config.dataPath.empty()
? directoryName(config.exePath) + L"\\ocr-runtime"
: config.dataPath;
const std::wstring resourcesPath = config.resourcesPath.empty()
? directoryName(directoryName(config.exePath))
: config.resourcesPath;
const std::wstring modelCachePath = dataPath + L"\\ocr-models";
const std::wstring paddlexCachePath = resourcesPath + L"\\ocr-models\\paddlex";
createDirectoryRecursive(dataPath);
createDirectoryRecursive(modelCachePath);
// These variables are set on the wrapper's own process; the child receives them because
// CreateProcessW is called below with a null environment block.
setEnv(L"OPENSCREEN_OCR_HOST", L"127.0.0.1");
setEnv(L"OPENSCREEN_OCR_PORT", L"8866");
setEnv(L"PADDLEOCR_DEVICE", L"cpu");
setEnv(L"PADDLEOCR_ENABLE_MKLDNN", L"0");
// An empty value makes setEnv pass a null pointer, which removes PADDLEOCR_LANG.
setEnv(L"PADDLEOCR_LANG", L"");
setEnv(L"PADDLEOCR_USE_MOBILE", L"1");
setEnv(L"OPENSCREEN_OCR_PROFILE", L"vietnamese");
setEnv(L"OPENSCREEN_OCR_WARMUP", L"1");
setEnv(L"PADDLE_PDX_ENABLE_MKLDNN_BYDEFAULT", L"False");
setEnv(L"PADDLE_PDX_CACHE_HOME", paddlexCachePath);
setEnv(L"PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK", L"True");
setEnv(L"PADDLE_HOME", modelCachePath + L"\\paddle");
setEnv(L"PADDLEOCR_HOME", modelCachePath + L"\\paddleocr");
setEnv(L"PYTHONUTF8", L"1");
STARTUPINFOW startupInfo{};
startupInfo.cb = sizeof(startupInfo);
// Send the child's stdout and stderr to the log file when it could be opened.
HANDLE logFile = openServiceLog(dataPath);
if (logFile != INVALID_HANDLE_VALUE) {
startupInfo.dwFlags |= STARTF_USESTDHANDLES;
startupInfo.hStdOutput = logFile;
startupInfo.hStdError = logFile;
startupInfo.hStdInput = GetStdHandle(STD_INPUT_HANDLE);
}
// The command line consists of the quoted executable path only; it is kept in a mutable
// buffer because CreateProcessW takes a writable command-line pointer.
std::wstring commandLine = quoteArg(config.exePath);
const std::wstring cwd = directoryName(config.exePath);
ZeroMemory(&g_childProcess, sizeof(g_childProcess));
// Handle inheritance is enabled (TRUE) so the child can use the log file handle.
const BOOL created = CreateProcessW(
config.exePath.c_str(),
commandLine.data(),
nullptr,
nullptr,
TRUE,
CREATE_NO_WINDOW,
nullptr,
cwd.c_str(),
&startupInfo,
&g_childProcess);
// The wrapper's own copy of the log handle is no longer needed once the child exists.
if (logFile != INVALID_HANDLE_VALUE) {
CloseHandle(logFile);
}
return created == TRUE;
}
void stopOcrProcess() {
if (g_childProcess.hProcess) {
TerminateProcess(g_childProcess.hProcess, 0);
WaitForSingleObject(g_childProcess.hProcess, 10000);
CloseHandle(g_childProcess.hProcess);
g_childProcess.hProcess = nullptr;
}
if (g_childProcess.hThread) {
CloseHandle(g_childProcess.hThread);
g_childProcess.hThread = nullptr;
}
}
// Control handler registered with the service control manager.
//
// On stop/shutdown it reports SERVICE_STOP_PENDING and signals g_stopEvent. It does not
// terminate the child itself: serviceMain waits on that event and performs the shutdown,
// so the process handles in g_childProcess are terminated and closed by one thread only.
// Calling stopOcrProcess() here as well would let both threads close the same handles.
//
// Returns NO_ERROR for stop, shutdown and interrogate, and ERROR_CALL_NOT_IMPLEMENTED for
// any control this service does not handle.
DWORD WINAPI serviceControlHandler(DWORD control, DWORD, LPVOID, LPVOID) {
    switch (control) {
    case SERVICE_CONTROL_STOP:
    case SERVICE_CONTROL_SHUTDOWN:
        setServiceStatus(SERVICE_STOP_PENDING, NO_ERROR, 10000);
        if (g_stopEvent) {
            // Wakes serviceMain, which stops the child and reports SERVICE_STOPPED.
            SetEvent(g_stopEvent);
        }
        return NO_ERROR;
    case SERVICE_CONTROL_INTERROGATE:
        // Interrogate must succeed even though no extra work is done for it.
        return NO_ERROR;
    default:
        return ERROR_CALL_NOT_IMPLEMENTED;
    }
}
// Service entry point: registers the control handler, starts the OCR process, then blocks
// until either a stop is requested or the OCR process exits, and finally reports
// SERVICE_STOPPED.
void WINAPI serviceMain(DWORD, LPWSTR*) {
g_statusHandle = RegisterServiceCtrlHandlerExW(SERVICE_NAME, serviceControlHandler, nullptr);
if (!g_statusHandle) {
// Without a status handle nothing can be reported, so just return.
return;
}
setServiceStatus(SERVICE_START_PENDING, NO_ERROR, 30000);
// Manual-reset event, initially unsignalled; signalled by the control handler on stop.
g_stopEvent = CreateEventW(nullptr, TRUE, FALSE, nullptr);
if (!g_stopEvent || !startOcrProcess(g_config)) {
setServiceStatus(SERVICE_STOPPED, ERROR_SERVICE_SPECIFIC_ERROR);
return;
}
setServiceStatus(SERVICE_RUNNING);
// Wake on whichever happens first: a stop request or the child process exiting.
HANDLE waitHandles[] = {g_stopEvent, g_childProcess.hProcess};
WaitForMultipleObjects(2, waitHandles, FALSE, INFINITE);
stopOcrProcess();
if (g_stopEvent) {
CloseHandle(g_stopEvent);
g_stopEvent = nullptr;
}
setServiceStatus(SERVICE_STOPPED);
}
// Extracts the values following --exe, --resources and --data from the command line.
//
// A flag that is the last argument has no value and leaves its field untouched; a consumed
// value is never interpreted as a flag. Unknown arguments are ignored.
ServiceConfig parseConfig(int argc, wchar_t* argv[]) {
    ServiceConfig config;

    int index = 1;
    while (index < argc) {
        const std::wstring flag = argv[index];

        std::wstring* destination = nullptr;
        if (flag == L"--exe") {
            destination = &config.exePath;
        } else if (flag == L"--resources") {
            destination = &config.resourcesPath;
        } else if (flag == L"--data") {
            destination = &config.dataPath;
        }

        const bool hasValue = index + 1 < argc;
        if (destination != nullptr && hasValue) {
            ++index;  // step onto the value so the next iteration starts after it
            *destination = argv[index];
        }
        ++index;
    }

    return config;
}
// Reports whether any argument after the program name is exactly "--service".
bool hasServiceFlag(int argc, wchar_t* argv[]) {
    if (argc <= 1) {
        return false;
    }
    const std::wstring serviceFlag = L"--service";
    return std::any_of(argv + 1, argv + argc, [&serviceFlag](const wchar_t* argument) {
        return serviceFlag == argument;
    });
}
} // namespace
int wmain(int argc, wchar_t* argv[]) {
g_config = parseConfig(argc, argv);
if (hasServiceFlag(argc, argv)) {
SERVICE_TABLE_ENTRYW serviceTable[] = {
{const_cast<LPWSTR>(SERVICE_NAME), serviceMain},
{nullptr, nullptr},
};
return StartServiceCtrlDispatcherW(serviceTable) ? 0 : 1;
}
if (!startOcrProcess(g_config)) {
std::wcerr << L"Failed to start OCR service process." << std::endl;
return 1;
}
WaitForSingleObject(g_childProcess.hProcess, INFINITE);
stopOcrProcess();
return 0;
}
@@ -28,6 +28,60 @@ bool succeeded(HRESULT hr, const char* label) {
return false; return false;
} }
// Finds the hardware DXGI adapter that has an output attached to `monitor`.
//
// Returns null when `monitor` is null, the DXGI factory cannot be created, or no
// non-software adapter reports an output for that monitor; the warnings written here say
// the default adapter is used in that case. On success a "display-adapter-resolved" JSON
// event is written to stdout.
//
// Both enumerations stop on the first failure of any kind, not only on
// DXGI_ERROR_NOT_FOUND. The loops have no upper bound, so skipping a persistently failing
// index would otherwise walk through every UINT value.
Microsoft::WRL::ComPtr<IDXGIAdapter1> findAdapterForMonitor(HMONITOR monitor) {
    if (!monitor) {
        return nullptr;
    }

    Microsoft::WRL::ComPtr<IDXGIFactory1> factory;
    HRESULT hr = CreateDXGIFactory1(IID_PPV_ARGS(&factory));
    if (FAILED(hr) || !factory) {
        std::cerr << "WARNING: CreateDXGIFactory1 failed while resolving monitor adapter (hr=0x"
                  << std::hex << hr << std::dec << ")" << std::endl;
        return nullptr;
    }

    for (UINT adapterIndex = 0;; ++adapterIndex) {
        Microsoft::WRL::ComPtr<IDXGIAdapter1> adapter;
        hr = factory->EnumAdapters1(adapterIndex, adapter.GetAddressOf());
        if (FAILED(hr) || !adapter) {
            // DXGI_ERROR_NOT_FOUND marks the end of the list; any other failure also ends
            // the search instead of being retried at the next index.
            break;
        }

        // Software adapters are not considered.
        DXGI_ADAPTER_DESC1 adapterDesc{};
        if (SUCCEEDED(adapter->GetDesc1(&adapterDesc)) &&
            (adapterDesc.Flags & DXGI_ADAPTER_FLAG_SOFTWARE) != 0) {
            continue;
        }

        for (UINT outputIndex = 0;; ++outputIndex) {
            Microsoft::WRL::ComPtr<IDXGIOutput> output;
            hr = adapter->EnumOutputs(outputIndex, output.GetAddressOf());
            if (FAILED(hr) || !output) {
                // End of this adapter's outputs, or an error: move on to the next adapter.
                break;
            }

            DXGI_OUTPUT_DESC outputDesc{};
            if (SUCCEEDED(output->GetDesc(&outputDesc)) && outputDesc.Monitor == monitor) {
                std::cout << "{\"event\":\"display-adapter-resolved\",\"schemaVersion\":2,"
                          << "\"adapterIndex\":" << adapterIndex
                          << ",\"outputIndex\":" << outputIndex << "}" << std::endl;
                return adapter;
            }
        }
    }

    std::cerr << "WARNING: Could not resolve DXGI adapter for selected monitor; using default adapter"
              << std::endl;
    return nullptr;
}
int64_t timeSpanToHns(wf::TimeSpan const& value) { int64_t timeSpanToHns(wf::TimeSpan const& value) {
return value.count(); return value.count();
} }
@@ -38,7 +92,7 @@ WgcSession::~WgcSession() {
stop(); stop();
} }
bool WgcSession::createD3DDevice() { bool WgcSession::createD3DDevice(IDXGIAdapter* adapter) {
UINT flags = D3D11_CREATE_DEVICE_BGRA_SUPPORT; UINT flags = D3D11_CREATE_DEVICE_BGRA_SUPPORT;
#if defined(_DEBUG) #if defined(_DEBUG)
flags |= D3D11_CREATE_DEVICE_DEBUG; flags |= D3D11_CREATE_DEVICE_DEBUG;
@@ -53,8 +107,8 @@ bool WgcSession::createD3DDevice() {
D3D_FEATURE_LEVEL featureLevel{}; D3D_FEATURE_LEVEL featureLevel{};
HRESULT hr = D3D11CreateDevice( HRESULT hr = D3D11CreateDevice(
nullptr, adapter,
D3D_DRIVER_TYPE_HARDWARE, adapter ? D3D_DRIVER_TYPE_UNKNOWN : D3D_DRIVER_TYPE_HARDWARE,
nullptr, nullptr,
flags, flags,
featureLevels, featureLevels,
@@ -67,6 +121,23 @@ bool WgcSession::createD3DDevice() {
#if defined(_DEBUG) #if defined(_DEBUG)
if (FAILED(hr)) { if (FAILED(hr)) {
flags &= ~D3D11_CREATE_DEVICE_DEBUG; flags &= ~D3D11_CREATE_DEVICE_DEBUG;
hr = D3D11CreateDevice(
adapter,
adapter ? D3D_DRIVER_TYPE_UNKNOWN : D3D_DRIVER_TYPE_HARDWARE,
nullptr,
flags,
featureLevels,
ARRAYSIZE(featureLevels),
D3D11_SDK_VERSION,
&d3dDevice_,
&featureLevel,
&d3dContext_);
}
#endif
if (FAILED(hr) && adapter) {
std::cerr << "WARNING: D3D11CreateDevice failed for selected monitor adapter (hr=0x"
<< std::hex << hr << std::dec << "); retrying default adapter" << std::endl;
hr = D3D11CreateDevice( hr = D3D11CreateDevice(
nullptr, nullptr,
D3D_DRIVER_TYPE_HARDWARE, D3D_DRIVER_TYPE_HARDWARE,
@@ -79,7 +150,6 @@ bool WgcSession::createD3DDevice() {
&featureLevel, &featureLevel,
&d3dContext_); &d3dContext_);
} }
#endif
if (!succeeded(hr, "D3D11CreateDevice")) { if (!succeeded(hr, "D3D11CreateDevice")) {
return false; return false;
@@ -100,6 +170,11 @@ bool WgcSession::createD3DDevice() {
return true; return true;
} }
// Creates the D3D11 device on the adapter resolved for `monitor`. A null adapter is passed
// through to createD3DDevice when none could be resolved.
bool WgcSession::createD3DDeviceForMonitor(HMONITOR monitor) {
    const Microsoft::WRL::ComPtr<IDXGIAdapter1> monitorAdapter = findAdapterForMonitor(monitor);
    IDXGIAdapter* rawAdapter = monitorAdapter.Get();
    return createD3DDevice(rawAdapter);
}
bool WgcSession::createCaptureItem(HMONITOR monitor) { bool WgcSession::createCaptureItem(HMONITOR monitor) {
auto factory = winrt::get_activation_factory<wgcap::GraphicsCaptureItem>(); auto factory = winrt::get_activation_factory<wgcap::GraphicsCaptureItem>();
auto interop = factory.as<IGraphicsCaptureItemInterop>(); auto interop = factory.as<IGraphicsCaptureItemInterop>();
@@ -188,7 +263,7 @@ bool WgcSession::applySessionOptions(bool captureCursor) {
bool WgcSession::initialize(HMONITOR monitor, int fps, bool captureCursor) { bool WgcSession::initialize(HMONITOR monitor, int fps, bool captureCursor) {
fps_ = fps > 0 ? fps : 60; fps_ = fps > 0 ? fps : 60;
if (!createD3DDevice()) { if (!createD3DDeviceForMonitor(monitor)) {
return false; return false;
} }
if (!createCaptureItem(monitor)) { if (!createCaptureItem(monitor)) {
@@ -2,6 +2,7 @@
#include <Windows.h> #include <Windows.h>
#include <d3d11.h> #include <d3d11.h>
#include <dxgi.h>
#include <windows.graphics.capture.h> #include <windows.graphics.capture.h>
#include <windows.graphics.directx.direct3d11.interop.h> #include <windows.graphics.directx.direct3d11.interop.h>
#include <winrt/Windows.Foundation.h> #include <winrt/Windows.Foundation.h>
@@ -34,7 +35,8 @@ public:
ID3D11DeviceContext* context() const; ID3D11DeviceContext* context() const;
private: private:
bool createD3DDevice(); bool createD3DDevice(IDXGIAdapter* adapter = nullptr);
bool createD3DDeviceForMonitor(HMONITOR monitor);
bool createCaptureItem(HMONITOR monitor); bool createCaptureItem(HMONITOR monitor);
bool createCaptureItem(HWND window); bool createCaptureItem(HWND window);
bool applySessionOptions(bool captureCursor); bool applySessionOptions(bool captureCursor);
+2 -2
View File
@@ -1,12 +1,12 @@
{ {
"name": "openscreen", "name": "openscreen",
"version": "1.4.2", "version": "1.4.8",
"lockfileVersion": 3, "lockfileVersion": 3,
"requires": true, "requires": true,
"packages": { "packages": {
"": { "": {
"name": "openscreen", "name": "openscreen",
"version": "1.4.2", "version": "1.4.8",
"dependencies": { "dependencies": {
"@fix-webm-duration/fix": "^1.0.1", "@fix-webm-duration/fix": "^1.0.1",
"@pixi/filter-drop-shadow": "^5.2.0", "@pixi/filter-drop-shadow": "^5.2.0",
+1 -1
View File
@@ -1,7 +1,7 @@
{ {
"name": "openscreen", "name": "openscreen",
"private": true, "private": true,
"version": "1.4.2", "version": "1.4.8",
"type": "module", "type": "module",
"packageManager": "npm@10.9.4", "packageManager": "npm@10.9.4",
"engines": { "engines": {
+10
View File
@@ -131,6 +131,11 @@ if (!fs.existsSync(guideHotkeyListenerOutputPath)) {
throw new Error(`WGC helper build completed but ${guideHotkeyListenerOutputPath} was not found.`); throw new Error(`WGC helper build completed but ${guideHotkeyListenerOutputPath} was not found.`);
} }
// The OCR service wrapper is expected in BUILD_DIR; abort if it is not there.
const ocrServiceWrapperOutputPath = path.join(BUILD_DIR, "openscreen-ocr-service-wrapper.exe");
if (!fs.existsSync(ocrServiceWrapperOutputPath)) {
throw new Error(`WGC helper build completed but ${ocrServiceWrapperOutputPath} was not found.`);
}
fs.mkdirSync(BIN_DIR, { recursive: true }); fs.mkdirSync(BIN_DIR, { recursive: true });
const distributablePath = path.join(BIN_DIR, "wgc-capture.exe"); const distributablePath = path.join(BIN_DIR, "wgc-capture.exe");
fs.copyFileSync(outputPath, distributablePath); fs.copyFileSync(outputPath, distributablePath);
@@ -141,9 +146,14 @@ fs.copyFileSync(cursorSamplerOutputPath, cursorSamplerDistributablePath);
const guideHotkeyListenerDistributablePath = path.join(BIN_DIR, "guide-hotkey-listener.exe"); const guideHotkeyListenerDistributablePath = path.join(BIN_DIR, "guide-hotkey-listener.exe");
fs.copyFileSync(guideHotkeyListenerOutputPath, guideHotkeyListenerDistributablePath); fs.copyFileSync(guideHotkeyListenerOutputPath, guideHotkeyListenerDistributablePath);
const ocrServiceWrapperDistributablePath = path.join(BIN_DIR, "openscreen-ocr-service-wrapper.exe");
fs.copyFileSync(ocrServiceWrapperOutputPath, ocrServiceWrapperDistributablePath);
console.log(`Built ${outputPath}`); console.log(`Built ${outputPath}`);
console.log(`Copied ${distributablePath}`); console.log(`Copied ${distributablePath}`);
console.log(`Built ${cursorSamplerOutputPath}`); console.log(`Built ${cursorSamplerOutputPath}`);
console.log(`Copied ${cursorSamplerDistributablePath}`); console.log(`Copied ${cursorSamplerDistributablePath}`);
console.log(`Built ${guideHotkeyListenerOutputPath}`); console.log(`Built ${guideHotkeyListenerOutputPath}`);
console.log(`Copied ${guideHotkeyListenerDistributablePath}`); console.log(`Copied ${guideHotkeyListenerDistributablePath}`);
console.log(`Built ${ocrServiceWrapperOutputPath}`);
console.log(`Copied ${ocrServiceWrapperDistributablePath}`);
+18 -6
View File
@@ -65,7 +65,13 @@ export function SourceSelector() {
fetchSources(); fetchSources();
}, []); }, []);
const screenSources = sources.filter((s) => s.id.startsWith("screen:")); const screenSources = sources
.filter((s) => s.id.startsWith("screen:"))
.sort(
(left, right) =>
(left.displayIndex ?? left.screenIndex ?? Number.MAX_SAFE_INTEGER) -
(right.displayIndex ?? right.screenIndex ?? Number.MAX_SAFE_INTEGER),
);
const windowSources = sources.filter((s) => s.id.startsWith("window:")); const windowSources = sources.filter((s) => s.id.startsWith("window:"));
const handleSourceSelect = (source: DesktopSource) => setSelectedSource(source); const handleSourceSelect = (source: DesktopSource) => setSelectedSource(source);
@@ -96,11 +102,17 @@ export function SourceSelector() {
onClick={() => handleSourceSelect(source)} onClick={() => handleSourceSelect(source)}
> >
<div className="relative mb-1.5 overflow-hidden rounded-lg border border-white/[0.06] bg-black/30"> <div className="relative mb-1.5 overflow-hidden rounded-lg border border-white/[0.06] bg-black/30">
<img {source.thumbnail ? (
src={source.thumbnail || ""} <img
alt={source.name} src={source.thumbnail}
className="w-full aspect-video object-cover" alt={source.name}
/> className="w-full aspect-video object-cover"
/>
) : (
<div className="flex aspect-video w-full items-center justify-center bg-zinc-950 text-center text-[11px] font-medium text-zinc-400">
{source.displayLabel ?? source.name}
</div>
)}
{isSelected && ( {isSelected && (
<div className="absolute right-1.5 top-1.5"> <div className="absolute right-1.5 top-1.5">
<div className={styles.checkBadge}> <div className={styles.checkBadge}>
@@ -7,6 +7,7 @@ import type {
GuideAiProvider, GuideAiProvider,
GuideAiSettings, GuideAiSettings,
GuideLanguage, GuideLanguage,
GuideOcrProfile,
GuideSession, GuideSession,
} from "@/guide/contracts"; } from "@/guide/contracts";
import { captureGuideSnapshots } from "@/guide/snapshot/extractGuideSnapshots"; import { captureGuideSnapshots } from "@/guide/snapshot/extractGuideSnapshots";
@@ -42,13 +43,19 @@ const COPY = {
captureStep: "Capture step", captureStep: "Capture step",
captureLabel: "Manual capture", captureLabel: "Manual capture",
settings: "Settings", settings: "Settings",
guideSettings: "Guide settings",
apiKey: "API key env", apiKey: "API key env",
apiKeyPlaceholder: "DEEPSEEK_API_KEY", apiKeyPlaceholder: "DEEPSEEK_API_KEY",
baseUrl: "Base URL", baseUrl: "Base URL",
model: "Model", model: "Model",
ocrProfile: "OCR profile",
ocrLanguage: "OCR languages",
ocrFast: "Fast Latin",
ocrVietnamese: "Vietnamese Enhanced",
ocrHybrid: "Hybrid Vi + Latin",
saveSettings: "Save", saveSettings: "Save",
clearKey: "Reset env", clearKey: "Reset env",
keySaved: "DeepSeek settings saved.", settingsSaved: "Guide settings saved.",
keyMissing: "Set a DeepSeek API key environment variable before generating with DeepSeek.", keyMissing: "Set a DeepSeek API key environment variable before generating with DeepSeek.",
keyConfigured: "Env ready", keyConfigured: "Env ready",
keyNotConfigured: "Env value missing", keyNotConfigured: "Env value missing",
@@ -78,13 +85,19 @@ const COPY = {
captureStep: "Chụp bước", captureStep: "Chụp bước",
captureLabel: "Chụp thủ công", captureLabel: "Chụp thủ công",
settings: "Cài đặt", settings: "Cài đặt",
guideSettings: "Guide settings",
apiKey: "API key env", apiKey: "API key env",
apiKeyPlaceholder: "DEEPSEEK_API_KEY", apiKeyPlaceholder: "DEEPSEEK_API_KEY",
baseUrl: "Base URL", baseUrl: "Base URL",
model: "Model", model: "Model",
ocrProfile: "OCR profile",
ocrLanguage: "OCR languages",
ocrFast: "Fast Latin",
ocrVietnamese: "Vietnamese Enhanced",
ocrHybrid: "Hybrid Vi + Latin",
saveSettings: "Lưu", saveSettings: "Lưu",
clearKey: "Reset env", clearKey: "Reset env",
keySaved: "Đã lưu cài đặt DeepSeek.", settingsSaved: "Da luu cai dat guide.",
keyMissing: "Hãy set biến môi trường DeepSeek API key trước khi tạo draft bằng DeepSeek.", keyMissing: "Hãy set biến môi trường DeepSeek API key trước khi tạo draft bằng DeepSeek.",
keyConfigured: "Env ready", keyConfigured: "Env ready",
keyNotConfigured: "Chưa thấy giá trị env", keyNotConfigured: "Chưa thấy giá trị env",
@@ -108,6 +121,8 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
const [deepSeekApiKeyEnvName, setDeepSeekApiKeyEnvName] = useState("DEEPSEEK_API_KEY"); const [deepSeekApiKeyEnvName, setDeepSeekApiKeyEnvName] = useState("DEEPSEEK_API_KEY");
const [deepSeekBaseUrl, setDeepSeekBaseUrl] = useState("https://api.deepseek.com"); const [deepSeekBaseUrl, setDeepSeekBaseUrl] = useState("https://api.deepseek.com");
const [deepSeekModel, setDeepSeekModel] = useState("deepseek-chat"); const [deepSeekModel, setDeepSeekModel] = useState("deepseek-chat");
const [ocrProfile, setOcrProfile] = useState<GuideOcrProfile>("vietnamese");
const [ocrLanguage, setOcrLanguage] = useState("vi,en");
const [message, setMessage] = useState<string | null>(null); const [message, setMessage] = useState<string | null>(null);
const isBusy = busyAction !== null; const isBusy = busyAction !== null;
@@ -138,6 +153,8 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
setDeepSeekBaseUrl(result.data.deepseek.baseUrl); setDeepSeekBaseUrl(result.data.deepseek.baseUrl);
setDeepSeekModel(result.data.deepseek.model); setDeepSeekModel(result.data.deepseek.model);
setDeepSeekApiKeyEnvName(result.data.deepseek.apiKeyEnvName); setDeepSeekApiKeyEnvName(result.data.deepseek.apiKeyEnvName);
setOcrProfile(result.data.ocr.profile);
setOcrLanguage(result.data.ocr.language);
}, []); }, []);
useEffect(() => { useEffect(() => {
@@ -269,6 +286,8 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
deepseekApiKeyEnvName: deepSeekApiKeyEnvName, deepseekApiKeyEnvName: deepSeekApiKeyEnvName,
baseUrl: deepSeekBaseUrl, baseUrl: deepSeekBaseUrl,
model: deepSeekModel, model: deepSeekModel,
ocrProfile,
ocrLanguage,
}); });
if (!result.success) { if (!result.success) {
throw new Error(result.error); throw new Error(result.error);
@@ -277,7 +296,9 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
setDeepSeekApiKeyEnvName(result.data.deepseek.apiKeyEnvName); setDeepSeekApiKeyEnvName(result.data.deepseek.apiKeyEnvName);
setDeepSeekBaseUrl(result.data.deepseek.baseUrl); setDeepSeekBaseUrl(result.data.deepseek.baseUrl);
setDeepSeekModel(result.data.deepseek.model); setDeepSeekModel(result.data.deepseek.model);
toast.success(copy.keySaved); setOcrProfile(result.data.ocr.profile);
setOcrLanguage(result.data.ocr.language);
toast.success(copy.settingsSaved);
} catch (error) { } catch (error) {
const text = error instanceof Error ? error.message : String(error); const text = error instanceof Error ? error.message : String(error);
setMessage(text); setMessage(text);
@@ -285,7 +306,14 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
} finally { } finally {
setSettingsBusy(false); setSettingsBusy(false);
} }
}, [copy.keySaved, deepSeekApiKeyEnvName, deepSeekBaseUrl, deepSeekModel]); }, [
copy.settingsSaved,
deepSeekApiKeyEnvName,
deepSeekBaseUrl,
deepSeekModel,
ocrLanguage,
ocrProfile,
]);
const handleClearDeepSeekKey = useCallback(async () => { const handleClearDeepSeekKey = useCallback(async () => {
if (!window.electronAPI?.guide?.saveAiSettings) { if (!window.electronAPI?.guide?.saveAiSettings) {
@@ -298,13 +326,17 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
clearDeepseekApiKeyEnvName: true, clearDeepseekApiKeyEnvName: true,
baseUrl: deepSeekBaseUrl, baseUrl: deepSeekBaseUrl,
model: deepSeekModel, model: deepSeekModel,
ocrProfile,
ocrLanguage,
}); });
if (!result.success) { if (!result.success) {
throw new Error(result.error); throw new Error(result.error);
} }
setAiSettings(result.data); setAiSettings(result.data);
setDeepSeekApiKeyEnvName(result.data.deepseek.apiKeyEnvName); setDeepSeekApiKeyEnvName(result.data.deepseek.apiKeyEnvName);
toast.success(copy.keySaved); setOcrProfile(result.data.ocr.profile);
setOcrLanguage(result.data.ocr.language);
toast.success(copy.settingsSaved);
} catch (error) { } catch (error) {
const text = error instanceof Error ? error.message : String(error); const text = error instanceof Error ? error.message : String(error);
setMessage(text); setMessage(text);
@@ -312,7 +344,7 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
} finally { } finally {
setSettingsBusy(false); setSettingsBusy(false);
} }
}, [copy.keySaved, deepSeekBaseUrl, deepSeekModel]); }, [copy.settingsSaved, deepSeekBaseUrl, deepSeekModel, ocrLanguage, ocrProfile]);
const handleGenerateGuide = useCallback(() => { const handleGenerateGuide = useCallback(() => {
void runAction("generate", async () => { void runAction("generate", async () => {
@@ -455,7 +487,7 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
<div className="flex items-center justify-between gap-2"> <div className="flex items-center justify-between gap-2">
<div className="min-w-0"> <div className="min-w-0">
<div className="truncate text-[11px] font-semibold text-slate-100"> <div className="truncate text-[11px] font-semibold text-slate-100">
{copy.deepseek} {copy.settings} {copy.guideSettings}
</div> </div>
<div className="truncate text-[10px] text-slate-500"> <div className="truncate text-[10px] text-slate-500">
{aiSettings?.deepseek.hasApiKey {aiSettings?.deepseek.hasApiKey
@@ -470,6 +502,33 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
</span> </span>
</div> </div>
<div className="grid grid-cols-2 gap-1.5">
<label className="block min-w-0 text-[10px] font-medium text-slate-400">
{copy.ocrProfile}
<select
value={ocrProfile}
onChange={(event) => setOcrProfile(event.target.value as GuideOcrProfile)}
disabled={settingsBusy}
className="mt-1 h-8 w-full rounded-md border border-white/[0.08] bg-black/20 px-2 text-[11px] text-slate-100 outline-none"
>
<option value="vietnamese">{copy.ocrVietnamese}</option>
<option value="hybrid">{copy.ocrHybrid}</option>
<option value="fast">{copy.ocrFast}</option>
</select>
</label>
<label className="block min-w-0 text-[10px] font-medium text-slate-400">
{copy.ocrLanguage}
<input
type="text"
value={ocrLanguage}
onChange={(event) => setOcrLanguage(event.target.value)}
placeholder="vi,en"
disabled={settingsBusy}
className="mt-1 h-8 w-full rounded-md border border-white/[0.08] bg-black/20 px-2 text-[11px] text-slate-100 outline-none placeholder:text-slate-600"
/>
</label>
</div>
<label className="block text-[10px] font-medium text-slate-400"> <label className="block text-[10px] font-medium text-slate-400">
{copy.apiKey} {copy.apiKey}
<input <input
+10
View File
@@ -9,6 +9,7 @@ export type GuideTargetRole = "button" | "menu" | "tab" | "field" | "link" | "un
export type GuideLanguage = "vi" | "en"; export type GuideLanguage = "vi" | "en";
export type GuideAiProvider = "deepseek" | "local"; export type GuideAiProvider = "deepseek" | "local";
export type GuideSecretStorage = "environment" | "none"; export type GuideSecretStorage = "environment" | "none";
export type GuideOcrProfile = "fast" | "vietnamese" | "hybrid";
export type GuideSessionStatus = export type GuideSessionStatus =
| "recording" | "recording"
@@ -54,6 +55,7 @@ export interface GuideSnapshot {
timeMs: number; timeMs: number;
offsetMs: number; offsetMs: number;
path: string; path: string;
markedPath?: string;
width: number; width: number;
height: number; height: number;
} }
@@ -162,6 +164,7 @@ export interface WriteGuideSnapshotInput {
timeMs: number; timeMs: number;
offsetMs: number; offsetMs: number;
pngBytes: ArrayBuffer; pngBytes: ArrayBuffer;
markedPngBytes?: ArrayBuffer;
width: number; width: number;
height: number; height: number;
} }
@@ -178,6 +181,11 @@ export interface GenerateGuideDraftInput {
} }
export interface GuideAiSettings { export interface GuideAiSettings {
ocr: {
profile: GuideOcrProfile;
language: string;
updatedAt?: string;
};
deepseek: { deepseek: {
hasApiKey: boolean; hasApiKey: boolean;
apiKeyEnvName: string; apiKeyEnvName: string;
@@ -194,6 +202,8 @@ export interface SaveGuideAiSettingsInput {
clearDeepseekApiKeyEnvName?: boolean; clearDeepseekApiKeyEnvName?: boolean;
baseUrl?: string; baseUrl?: string;
model?: string; model?: string;
ocrProfile?: GuideOcrProfile;
ocrLanguage?: string;
} }
export interface SaveGuideInput { export interface SaveGuideInput {
+21 -7
View File
@@ -29,6 +29,7 @@ const session: GuideSession = {
timeMs: 1500, timeMs: 1500,
offsetMs: 500, offsetMs: 500,
path: "/tmp/recording-guide/step-001.png", path: "/tmp/recording-guide/step-001.png",
markedPath: "/tmp/recording-guide/step-001-marked.png",
width: 1280, width: 1280,
height: 720, height: 720,
}, },
@@ -71,7 +72,7 @@ describe("guide exporters", () => {
expect(markdown).toContain("# User guide"); expect(markdown).toContain("# User guide");
expect(markdown).toContain("## 1. Open Settings"); expect(markdown).toContain("## 1. Open Settings");
expect(markdown).toContain("](step-001.png)"); expect(markdown).toContain("](step-001-marked.png)");
}); });
it("exports escaped HTML", () => { it("exports escaped HTML", () => {
@@ -79,12 +80,11 @@ describe("guide exporters", () => {
expect(html).toContain("<!doctype html>"); expect(html).toContain("<!doctype html>");
expect(html).toContain("<h1>User guide</h1>"); expect(html).toContain("<h1>User guide</h1>");
expect(html).toContain('src="step-001.png"'); expect(html).toContain('src="step-001-marked.png"');
expect(html).toContain("click-marker"); expect(html).not.toContain("click-marker");
expect(html).toContain("left: 25.00%; top: 75.00%;");
}); });
it("draws click markers for hotkey events with coordinates", () => { it("uses marker snapshots for hotkey events with coordinates", () => {
const hotkeySession: GuideSession = { const hotkeySession: GuideSession = {
...session, ...session,
events: [ events: [
@@ -98,7 +98,21 @@ describe("guide exporters", () => {
const html = exportGuideToHtml(hotkeySession); const html = exportGuideToHtml(hotkeySession);
expect(html).toContain("click-marker"); expect(html).toContain('src="step-001-marked.png"');
expect(html).toContain("left: 25.00%; top: 75.00%;"); expect(html).not.toContain("click-marker");
});
it("falls back to the unmarked screenshot when no marker snapshot exists", () => {
const unmarkedSession: GuideSession = {
...session,
snapshots: session.snapshots.map((snapshot) => ({
...snapshot,
markedPath: undefined,
})),
};
const markdown = exportGuideToMarkdown(unmarkedSession);
expect(markdown).toContain("](step-001.png)");
}); });
}); });
+25 -51
View File
@@ -10,8 +10,9 @@ export function exportGuideToMarkdown(session: GuideSession): string {
for (const step of guide.steps) { for (const step of guide.steps) {
lines.push(`## ${step.order}. ${step.title}`, "", step.instruction, ""); lines.push(`## ${step.order}. ${step.title}`, "", step.instruction, "");
if (step.screenshotPath) { const screenshotPath = resolveStepScreenshotPath(step, session);
lines.push(`![${escapeMarkdownAlt(step.title)}](${path.basename(step.screenshotPath)})`, ""); if (screenshotPath) {
lines.push(`![${escapeMarkdownAlt(step.title)}](${path.basename(screenshotPath)})`, "");
} }
} }
@@ -36,10 +37,8 @@ export function exportGuideToHtml(session: GuideSession): string {
.step { border-top: 1px solid #e5e7eb; padding: 22px 0; } .step { border-top: 1px solid #e5e7eb; padding: 22px 0; }
.step h2 { font-size: 18px; margin: 0 0 8px; } .step h2 { font-size: 18px; margin: 0 0 8px; }
.step p { margin: 0 0 12px; } .step p { margin: 0 0 12px; }
.shot { display: inline-block; position: relative; max-width: 100%; margin: 0; } .shot { display: inline-block; max-width: 100%; margin: 0; }
img { display: block; max-width: 100%; border: 1px solid #e5e7eb; border-radius: 6px; } img { display: block; max-width: 100%; border: 1px solid #e5e7eb; border-radius: 6px; }
.click-marker { position: absolute; width: 26px; height: 26px; border: 3px solid #ef4444; border-radius: 999px; box-shadow: 0 0 0 4px rgba(239, 68, 68, 0.18), 0 2px 8px rgba(17, 24, 39, 0.28); transform: translate(-50%, -50%); pointer-events: none; }
.click-marker::after { content: ""; position: absolute; left: 50%; top: 50%; width: 6px; height: 6px; border-radius: 999px; background: #ef4444; transform: translate(-50%, -50%); }
</style> </style>
</head> </head>
<body> <body>
@@ -54,12 +53,9 @@ export function exportGuideToHtml(session: GuideSession): string {
} }
function renderStepHtml(step: GeneratedGuideStep, session: GuideSession): string { function renderStepHtml(step: GeneratedGuideStep, session: GuideSession): string {
const clickPoint = resolveStepClickPoint(step, session); const screenshotPath = resolveStepScreenshotPath(step, session);
const marker = clickPoint const image = screenshotPath
? `<span class="click-marker" style="left: ${formatPercent(clickPoint.x)}%; top: ${formatPercent(clickPoint.y)}%;" aria-label="Click position"></span>` ? `<figure class="shot"><img src="${escapeHtml(path.basename(screenshotPath))}" alt="${escapeHtml(step.title)}"></figure>`
: "";
const image = step.screenshotPath
? `<figure class="shot"><img src="${escapeHtml(path.basename(step.screenshotPath))}" alt="${escapeHtml(step.title)}">${marker}</figure>`
: ""; : "";
return `<section class="step"> return `<section class="step">
<h2>${step.order}. ${escapeHtml(step.title)}</h2> <h2>${step.order}. ${escapeHtml(step.title)}</h2>
@@ -88,54 +84,32 @@ function escapeHtml(value: string): string {
.replace(/'/g, "&#39;"); .replace(/'/g, "&#39;");
} }
function resolveStepClickPoint( function resolveStepScreenshotPath(
step: GeneratedGuideStep, step: GeneratedGuideStep,
session: GuideSession, session: GuideSession,
): { x: number; y: number } | null { ): string | undefined {
const snapshot = resolveStepSnapshot(step, session);
return snapshot?.markedPath ?? step.screenshotPath ?? snapshot?.path;
}
function resolveStepSnapshot(step: GeneratedGuideStep, session: GuideSession) {
const candidate = step.sourceCandidateId const candidate = step.sourceCandidateId
? session.candidates.find((item) => item.id === step.sourceCandidateId) ? session.candidates.find((item) => item.id === step.sourceCandidateId)
: undefined; : undefined;
const eventId = candidate?.eventId;
const event = eventId ? session.events.find((item) => item.id === eventId) : undefined;
if (!event || (event.kind !== "click" && event.kind !== "hotkey")) {
return null;
}
if (isNormalizedNumber(event.normalizedX) && isNormalizedNumber(event.normalizedY)) {
return { x: clamp01(event.normalizedX), y: clamp01(event.normalizedY) };
}
const screenshotFileName = step.screenshotPath ? path.basename(step.screenshotPath) : undefined; const screenshotFileName = step.screenshotPath ? path.basename(step.screenshotPath) : undefined;
const snapshot = return (
(candidate?.snapshotId (candidate?.snapshotId
? session.snapshots.find((item) => item.id === candidate.snapshotId) ? session.snapshots.find((item) => item.id === candidate.snapshotId)
: undefined) ?? : undefined) ??
(candidate?.eventId
? session.snapshots.find((item) => item.eventId === candidate.eventId)
: undefined) ??
(screenshotFileName (screenshotFileName
? session.snapshots.find((item) => path.basename(item.path) === screenshotFileName) ? session.snapshots.find(
: undefined); (item) =>
if ( path.basename(item.path) === screenshotFileName ||
!snapshot || (item.markedPath ? path.basename(item.markedPath) === screenshotFileName : false),
typeof event.x !== "number" || )
typeof event.y !== "number" || : undefined)
snapshot.width <= 0 || );
snapshot.height <= 0
) {
return null;
}
return {
x: clamp01(event.x / snapshot.width),
y: clamp01(event.y / snapshot.height),
};
}
function formatPercent(value: number): string {
return (clamp01(value) * 100).toFixed(2);
}
function isNormalizedNumber(value: unknown): value is number {
return typeof value === "number" && Number.isFinite(value) && value >= 0 && value <= 1;
}
function clamp01(value: number): number {
return Math.min(1, Math.max(0, value));
} }
@@ -35,18 +35,29 @@ export async function captureGuideSnapshots(
canvas.height = Math.max(1, Math.round(sourceHeight * scale)); canvas.height = Math.max(1, Math.round(sourceHeight * scale));
let latestSession = input.session; let latestSession = input.session;
const existingSnapshotsByEventId = new Set(
input.session.snapshots.map((snapshot) => snapshot.eventId),
);
for (const event of events) { for (const event of events) {
if (existingSnapshotsByEventId.has(event.id)) {
continue;
}
const offsetMs = event.screenshotOffsetMs ?? 500; const offsetMs = event.screenshotOffsetMs ?? 500;
const timeMs = getSnapshotTimeMs(event, offsetMs, video.duration); const timeMs = getSnapshotTimeMs(event, offsetMs, video.duration);
await seekVideo(video, timeMs / 1000); await seekVideo(video, timeMs / 1000);
context.drawImage(video, 0, 0, canvas.width, canvas.height); context.drawImage(video, 0, 0, canvas.width, canvas.height);
const pngBytes = await canvasToPngBytes(canvas); const pngBytes = await canvasToPngBytes(canvas);
const markerPoint = getSnapshotMarkerPoint(event, canvas.width, canvas.height);
const markedPngBytes = markerPoint
? await canvasToMarkedPngBytes(canvas, markerPoint)
: undefined;
const result = await window.electronAPI.guide.writeSnapshot({ const result = await window.electronAPI.guide.writeSnapshot({
recordingId: input.session.recordingId, recordingId: input.session.recordingId,
eventId: event.id, eventId: event.id,
timeMs, timeMs,
offsetMs, offsetMs,
pngBytes, pngBytes,
markedPngBytes,
width: canvas.width, width: canvas.width,
height: canvas.height, height: canvas.height,
}); });
@@ -143,3 +154,85 @@ function canvasToPngBytes(canvas: HTMLCanvasElement): Promise<ArrayBuffer> {
}, "image/png"); }, "image/png");
}); });
} }
/**
 * Produces PNG bytes for a copy of `canvas` with a click marker painted at `point`.
 *
 * The source canvas is left untouched: its pixels are copied onto a freshly
 * created canvas of the same size and the marker is drawn on that copy.
 *
 * @param canvas - Canvas holding the already-rendered snapshot frame.
 * @param point - Marker centre, in the pixel coordinates of `canvas`.
 * @returns PNG-encoded bytes of the marked copy.
 * @throws Error when a 2D rendering context cannot be obtained for the copy.
 */
async function canvasToMarkedPngBytes(
  canvas: HTMLCanvasElement,
  point: { x: number; y: number },
): Promise<ArrayBuffer> {
  const overlayCanvas = document.createElement("canvas");
  overlayCanvas.width = canvas.width;
  overlayCanvas.height = canvas.height;

  const overlayContext = overlayCanvas.getContext("2d");
  if (!overlayContext) {
    throw new Error("Canvas 2D context is unavailable.");
  }

  // Copy the frame first so the marker ends up on top of it.
  overlayContext.drawImage(canvas, 0, 0);
  drawSnapshotMarker(overlayContext, overlayCanvas, point);

  const encodedBytes = await canvasToPngBytes(overlayCanvas);
  return encodedBytes;
}
/**
 * Paints the click marker: a translucent yellow halo with a red outline and a
 * solid red dot in its centre.
 *
 * Sizes are derived from the shorter canvas edge and clamped, so the marker
 * stays within a fixed pixel range regardless of canvas resolution.
 *
 * @param context - 2D context to draw into.
 * @param canvas - Canvas whose dimensions determine the marker size.
 * @param point - Marker centre in canvas pixel coordinates.
 */
function drawSnapshotMarker(
  context: CanvasRenderingContext2D,
  canvas: HTMLCanvasElement,
  point: { x: number; y: number },
) {
  const fullCircle = Math.PI * 2;
  const shortestEdge = Math.max(1, Math.min(canvas.width, canvas.height));
  const haloRadius = clampNumber(Math.round(shortestEdge * 0.012), 8, 14);
  const dotRadius = clampNumber(Math.round(shortestEdge * 0.0045), 3, 6);
  const outlineWidth = Math.max(1, Math.round(shortestEdge * 0.0015));

  // Starts a new circular path at the marker centre and fills it.
  const fillDisc = (radius: number, fill: string): void => {
    context.beginPath();
    context.arc(point.x, point.y, radius, 0, fullCircle);
    context.fillStyle = fill;
    context.fill();
  };

  // Halo: filled first, then outlined while its path is still current.
  fillDisc(haloRadius, "rgba(250, 204, 21, 0.34)");
  context.lineWidth = outlineWidth;
  context.strokeStyle = "rgba(239, 68, 68, 0.72)";
  context.stroke();

  // Centre dot drawn last so it sits on top of the halo.
  fillDisc(dotRadius, "rgba(220, 38, 38, 0.92)");
}
/**
 * Resolves where the click marker should be drawn for an event, in pixels of a
 * `width` x `height` canvas.
 *
 * Resolution order:
 * 1. `normalizedX`/`normalizedY` when both lie in [0, 1], scaled by the canvas size.
 * 2. `x`/`y` when both lie in [0, 1], treated as normalized and scaled the same way.
 * 3. `x`/`y` when both are finite numbers, used as pixel coordinates as-is.
 *
 * Every result is clamped to the canvas bounds.
 *
 * @param event - Guide event; only "click" and "hotkey" kinds produce a marker.
 * @param width - Canvas width in pixels.
 * @param height - Canvas height in pixels.
 * @returns The marker centre, or `null` when the event has no usable position.
 */
function getSnapshotMarkerPoint(
  event: GuideEvent,
  width: number,
  height: number,
): { x: number; y: number } | null {
  if (event.kind !== "click" && event.kind !== "hotkey") {
    return null;
  }

  // Scales a coordinate pair and keeps it inside the canvas.
  const toCanvasPoint = (rawX: number, rawY: number, scaleX: number, scaleY: number) => ({
    x: clampNumber(rawX * scaleX, 0, width),
    y: clampNumber(rawY * scaleY, 0, height),
  });

  const { normalizedX, normalizedY, x: rawX, y: rawY } = event;

  if (isNormalizedNumber(normalizedX) && isNormalizedNumber(normalizedY)) {
    return toCanvasPoint(normalizedX, normalizedY, width, height);
  }

  if (isNormalizedNumber(rawX) && isNormalizedNumber(rawY)) {
    return toCanvasPoint(rawX, rawY, width, height);
  }

  const hasFinitePixelPosition =
    typeof rawX === "number" &&
    typeof rawY === "number" &&
    Number.isFinite(rawX) &&
    Number.isFinite(rawY);
  if (hasFinitePixelPosition) {
    // Already pixel coordinates: a scale of 1 leaves them unchanged before clamping.
    return toCanvasPoint(rawX, rawY, 1, 1);
  }

  return null;
}
/**
 * Type guard: true only for a finite number inside the closed interval [0, 1].
 *
 * @param value - Any value, typically an optional coordinate field.
 * @returns Whether `value` can be used as a normalized coordinate.
 */
function isNormalizedNumber(value: unknown): value is number {
  if (typeof value !== "number" || !Number.isFinite(value)) {
    return false;
  }
  return !(value < 0 || value > 1);
}
/**
 * Restricts `value` to the range [`min`, `max`].
 *
 * The lower bound is applied first and the upper bound second, so when
 * `min > max` the result is `max`.
 *
 * @param value - Number to restrict.
 * @param min - Lower bound (defaults to 0).
 * @param max - Upper bound (defaults to positive infinity).
 * @returns The restricted value.
 */
function clampNumber(value: number, min = 0, max = Number.POSITIVE_INFINITY): number {
  const raisedToMin = Math.max(min, value);
  return Math.min(max, raisedToMin);
}
+327 -19
View File
@@ -5,8 +5,9 @@ import importlib.util
import os import os
import sys import sys
import tempfile import tempfile
from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from threading import Lock from threading import Lock, Thread
from typing import Any from typing import Any
from fastapi import FastAPI, HTTPException from fastapi import FastAPI, HTTPException
@@ -17,6 +18,67 @@ app = FastAPI(title="OpenScreen PaddleOCR service")
_engines: dict[str, Any] = {} _engines: dict[str, Any] = {}
_engine_lock = Lock() _engine_lock = Lock()
_warmup_lock = Lock()
_warmup_started = False
_LATIN_RECOGNITION_LANGS = {
"af",
"az",
"bs",
"ca",
"cs",
"cy",
"da",
"de",
"en",
"es",
"et",
"eu",
"fi",
"fr",
"ga",
"gl",
"hr",
"hu",
"id",
"is",
"it",
"ku",
"la",
"latin",
"lb",
"lt",
"lv",
"mi",
"ms",
"mt",
"nl",
"no",
"oc",
"pi",
"pl",
"pt",
"qu",
"rm",
"ro",
"rs_latin",
"rslatin",
"sk",
"sl",
"sq",
"sv",
"sw",
"tl",
"tr",
"uz",
"vi",
}
@dataclass(frozen=True)
class PreparedImage:
    """Immutable description of the image file that is handed to the OCR engine."""

    # Filesystem path of the image to run OCR on.
    path: str
    # Factor by which the image at `path` was enlarged (1.0 means not enlarged).
    scale: float = 1.0
    # True when `path` is a temporary file that should be removed after use.
    should_delete: bool = False
class OcrRequest(BaseModel): class OcrRequest(BaseModel):
@@ -24,6 +86,21 @@ class OcrRequest(BaseModel):
path: str | None = None path: str | None = None
imagePath: str | None = None imagePath: str | None = None
language: str | None = None language: str | None = None
profile: str | None = None
@app.on_event("startup")
def start_ocr_warmup() -> None:
    """Start the background OCR engine warmup once, if it has been enabled.

    Warmup only runs when the OPENSCREEN_OCR_WARMUP environment variable is
    exactly "1". The started flag is checked and set under `_warmup_lock`, so
    at most one warmup thread is ever launched by this process.
    """
    global _warmup_started

    warmup_requested = os.getenv("OPENSCREEN_OCR_WARMUP", "0") == "1"
    if not warmup_requested:
        return

    with _warmup_lock:
        if not _warmup_started:
            _warmup_started = True
            # Daemon thread: it must not keep the process alive on shutdown.
            worker = Thread(
                target=_warmup_default_engines,
                name="openscreen-ocr-warmup",
                daemon=True,
            )
            worker.start()
@app.get("/health") @app.get("/health")
@@ -33,16 +110,31 @@ def health() -> dict[str, Any]:
"paddleocrInstalled": importlib.util.find_spec("paddleocr") is not None, "paddleocrInstalled": importlib.util.find_spec("paddleocr") is not None,
"paddleInstalled": importlib.util.find_spec("paddle") is not None, "paddleInstalled": importlib.util.find_spec("paddle") is not None,
"engineReady": bool(_engines), "engineReady": bool(_engines),
"defaultLanguage": os.getenv("PADDLEOCR_LANG", "latin"), "defaultLanguage": os.getenv("PADDLEOCR_LANG") or "vi,en",
"defaultProfile": os.getenv("OPENSCREEN_OCR_PROFILE") or "vietnamese",
"loadedEngines": sorted(_engines.keys()),
} }
def _warmup_default_engines() -> None:
    """Load the OCR engines used when a request specifies no profile or language.

    Best effort: any failure is reported on stderr and otherwise ignored, so a
    failed warmup never propagates out of the warmup thread.
    """
    try:
        default_profile = _resolve_ocr_profile(None)
        default_languages = _resolve_paddle_languages(None, default_profile)
        for default_language in default_languages:
            _get_engine(default_language)
    except Exception as error:
        print(f"OpenScreen OCR warmup failed: {error}", file=sys.stderr, flush=True)
@app.post("/ocr") @app.post("/ocr")
async def ocr(request: OcrRequest) -> dict[str, Any]: async def ocr(request: OcrRequest) -> dict[str, Any]:
image_path, should_delete = _resolve_image_path(request) image_path, should_delete = _resolve_image_path(request)
try: try:
engine = _get_engine(request.language) blocks = await run_in_threadpool(
blocks = await run_in_threadpool(_recognize_blocks, engine, image_path) _recognize_profile_blocks,
image_path,
request.language,
request.profile,
)
return {"blocks": blocks} return {"blocks": blocks}
finally: finally:
if should_delete: if should_delete:
@@ -73,8 +165,7 @@ def _resolve_image_path(request: OcrRequest) -> tuple[str, bool]:
return handle.name, True return handle.name, True
def _get_engine(language: str | None) -> Any: def _get_engine(paddle_lang: str) -> Any:
paddle_lang = _resolve_paddle_language(language)
cache_key = f"{paddle_lang}|{os.getenv('PADDLEOCR_DEVICE', 'cpu')}" cache_key = f"{paddle_lang}|{os.getenv('PADDLEOCR_DEVICE', 'cpu')}"
with _engine_lock: with _engine_lock:
if cache_key not in _engines: if cache_key not in _engines:
@@ -105,13 +196,17 @@ def _create_engine(paddle_lang: str) -> Any:
"enable_mkldnn": os.getenv("PADDLEOCR_ENABLE_MKLDNN", "0") == "1", "enable_mkldnn": os.getenv("PADDLEOCR_ENABLE_MKLDNN", "0") == "1",
"use_doc_orientation_classify": False, "use_doc_orientation_classify": False,
"use_doc_unwarping": False, "use_doc_unwarping": False,
"use_textline_orientation": False, "use_textline_orientation": os.getenv("PADDLEOCR_USE_TEXTLINE_ORIENTATION", "0") == "1",
} }
if os.getenv("PADDLEOCR_USE_MOBILE", "1") != "0": if os.getenv("PADDLEOCR_USE_MOBILE", "1") != "0":
modern_kwargs.update( modern_kwargs.update(
{ {
"text_detection_model_name": "PP-OCRv5_mobile_det", "text_detection_model_name": os.getenv(
"text_recognition_model_name": _mobile_recognition_model(paddle_lang), "PADDLEOCR_DET_MODEL",
"PP-OCRv5_mobile_det",
),
"text_recognition_model_name": os.getenv("PADDLEOCR_REC_MODEL")
or _mobile_recognition_model(paddle_lang),
} }
) )
@@ -150,23 +245,236 @@ def _patch_paddlex_frozen_ocr_extra_gate() -> None:
deps._openscreen_ocr_extra_patch = True deps._openscreen_ocr_extra_patch = True
def _resolve_paddle_language(language: str | None) -> str: def _recognize_profile_blocks(
explicit = os.getenv("PADDLEOCR_LANG") image_path: str,
language: str | None,
profile: str | None,
) -> list[dict[str, Any]]:
ocr_profile = _resolve_ocr_profile(profile)
languages = _resolve_paddle_languages(language, ocr_profile)
prepared = _prepare_image_for_profile(image_path, ocr_profile)
try:
blocks: list[dict[str, Any]] = []
for paddle_lang in languages:
engine = _get_engine(paddle_lang)
recognized = _recognize_blocks(engine, prepared.path)
blocks.extend(_scale_blocks(recognized, prepared.scale))
return _merge_blocks(blocks)
finally:
if prepared.should_delete:
Path(prepared.path).unlink(missing_ok=True)
def _resolve_ocr_profile(profile: str | None) -> str:
explicit = (os.getenv("OPENSCREEN_OCR_PROFILE") or "").strip().lower()
value = explicit or (profile or "").strip().lower()
if value in {"fast", "vietnamese", "hybrid"}:
return value
return "vietnamese"
def _resolve_paddle_languages(language: str | None, profile: str) -> list[str]:
explicit = (os.getenv("PADDLEOCR_LANG") or "").strip().lower()
if explicit: if explicit:
return explicit return [explicit]
language_value = (language or "vi,en").lower() language_value = (language or "vi,en").lower()
if "vi" in language_value or "latin" in language_value: has_vietnamese = "vi" in _split_language_tags(language_value)
if profile == "fast":
return [_resolve_primary_paddle_language(language_value, prefer_vietnamese=False)]
if profile == "hybrid":
languages = ["vi"] if has_vietnamese else []
languages.append("latin")
return _dedupe_languages(languages)
return [_resolve_primary_paddle_language(language_value, prefer_vietnamese=True)]
def _split_language_tags(language: str) -> set[str]:
return {part.strip().lower() for part in language.split(",") if part.strip()}
def _dedupe_languages(languages: list[str]) -> list[str]:
seen: set[str] = set()
result: list[str] = []
for language in languages:
if language not in seen:
seen.add(language)
result.append(language)
return result
def _resolve_primary_paddle_language(language_value: str, *, prefer_vietnamese: bool) -> str:
tags = _split_language_tags(language_value)
if prefer_vietnamese and "vi" in tags:
return "vi"
if "latin" in tags or "vi" in tags or "en" in tags:
return "latin" return "latin"
if "en" in language_value: for tag in tags:
return "en" return tag
return language_value.split(",")[0].strip() or "latin" return "latin"
def _prepare_image_for_profile(image_path: str, profile: str) -> PreparedImage:
    """Return the image to feed to OCR, upscaled and sharpened for non-fast profiles.

    Falls back to the untouched ``image_path`` when the profile is ``"fast"``,
    Pillow cannot be imported, the image cannot be opened, the resolved scale
    is not above 1, or writing the enhanced copy fails. On success the result
    points at a temporary PNG and carries the scale with ``should_delete=True``.
    """
    if profile == "fast":
        return PreparedImage(image_path)

    # Enhancement is best-effort: a missing Pillow or an unreadable image both
    # mean OCR runs on the original file.
    try:
        from PIL import Image, ImageEnhance, ImageOps

        with Image.open(image_path) as source:
            image = source.convert("RGB")
    except Exception:
        return PreparedImage(image_path)

    scale = _resolve_enhancement_scale(image.width, image.height)
    if scale <= 1:
        return PreparedImage(image_path)

    # Use Image.Resampling when Pillow provides it, otherwise the Image module itself.
    resampling_namespace = getattr(Image, "Resampling", Image)
    target_size = (round(image.width * scale), round(image.height * scale))
    enhanced = image.resize(target_size, resampling_namespace.LANCZOS)
    enhanced = ImageOps.autocontrast(enhanced)
    for enhancer, factor in (
        (ImageEnhance.Contrast, 1.25),
        (ImageEnhance.Sharpness, 1.35),
    ):
        enhanced = enhancer(enhanced).enhance(factor)

    handle = tempfile.NamedTemporaryFile(
        prefix="openscreen-ocr-enhanced-",
        suffix=".png",
        delete=False,
    )
    output_path = handle.name
    try:
        # The handle is closed before Pillow writes to the same path.
        handle.close()
        enhanced.save(output_path, format="PNG")
        return PreparedImage(output_path, scale=scale, should_delete=True)
    except Exception:
        Path(output_path).unlink(missing_ok=True)
        return PreparedImage(image_path)
def _resolve_enhancement_scale(width: int, height: int) -> float:
try:
requested_scale = float(os.getenv("OPENSCREEN_OCR_ENHANCE_SCALE", "2"))
except ValueError:
requested_scale = 2.0
scale = max(1.0, min(3.0, requested_scale))
try:
max_side = int(os.getenv("OPENSCREEN_OCR_ENHANCE_MAX_SIDE", "2400"))
except ValueError:
max_side = 2400
largest_side = max(width, height)
if largest_side <= 0:
return 1.0
return max(1.0, min(scale, max_side / largest_side))
def _scale_blocks(blocks: list[dict[str, Any]], scale: float) -> list[dict[str, Any]]:
    """Divide pixel-based block boxes by ``scale``.

    When ``scale`` is at most 1 the input list is returned as-is. Blocks whose
    ``box`` is not a dict, or whose box ``_box_uses_pixels`` rejects, are passed
    through untouched; all others are copied with a rescaled ``box``.
    """
    if scale <= 1:
        return blocks

    def rescale(block: dict[str, Any]) -> dict[str, Any]:
        box = block.get("box")
        if not (isinstance(box, dict) and _box_uses_pixels(box)):
            return block
        shrunk = {
            key: float(box[key]) / scale for key in ("x", "y", "width", "height")
        }
        return {**block, "box": shrunk}

    return [rescale(block) for block in blocks]
def _box_uses_pixels(box: dict[str, Any]) -> bool:
try:
x = float(box["x"])
y = float(box["y"])
width = float(box["width"])
height = float(box["height"])
except (KeyError, TypeError, ValueError):
return False
return x > 1 or y > 1 or width > 1 or height > 1 or x + width > 1 or y + height > 1
def _merge_blocks(blocks: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Collapse overlapping OCR blocks, keeping the highest-quality one per region.

    Blocks are visited from best to worst ``_block_quality``. A block is kept
    only if its box overlaps no already-kept box with IoU >= 0.62. Blocks whose
    ``box`` is not a dict are dropped. The survivors are returned ordered by
    ``_box_sort_key``.
    """
    kept: list[dict[str, Any]] = []
    # Candidates arrive in descending quality, so a later block can never beat a
    # kept block it overlaps; the old "replace if better" branch was unreachable.
    for block in sorted(blocks, key=_block_quality, reverse=True):
        box = block.get("box")
        if not isinstance(box, dict):
            continue
        if any(_box_iou(box, existing.get("box")) >= 0.62 for existing in kept):
            continue
        kept.append(block)
    return sorted(kept, key=lambda block: _box_sort_key(block.get("box")))
def _block_quality(block: dict[str, Any]) -> float:
    """Score a block for merge ranking.

    Starts from the block's ``confidence`` as converted by ``_score_to_float``,
    adds 0.08 when the text contains Vietnamese diacritics, and adds a length
    bonus of 0.002 per character (capped at 0.04) for text of two or more
    characters.
    """
    label = str(block.get("text") or "")
    quality = _score_to_float(block.get("confidence"))
    if _has_vietnamese_diacritics(label):
        quality += 0.08
    length = len(label)
    if length >= 2:
        quality += min(0.04, length * 0.002)
    return quality
def _has_vietnamese_diacritics(text: str) -> bool:
return any(
character
in "ăâđêôơưĂÂĐÊÔƠƯáàảãạắằẳẵặấầẩẫậéèẻẽẹếềểễệíìỉĩịóòỏõọốồổỗộớờởỡợúùủũụứừửữựýỳỷỹỵ"
for character in text
)
def _box_iou(left: Any, right: Any) -> float:
if not isinstance(left, dict) or not isinstance(right, dict):
return 0.0
try:
left_x = float(left["x"])
left_y = float(left["y"])
left_width = float(left["width"])
left_height = float(left["height"])
right_x = float(right["x"])
right_y = float(right["y"])
right_width = float(right["width"])
right_height = float(right["height"])
except (KeyError, TypeError, ValueError):
return 0.0
intersection_left = max(left_x, right_x)
intersection_top = max(left_y, right_y)
intersection_right = min(left_x + left_width, right_x + right_width)
intersection_bottom = min(left_y + left_height, right_y + right_height)
intersection_width = max(0.0, intersection_right - intersection_left)
intersection_height = max(0.0, intersection_bottom - intersection_top)
intersection_area = intersection_width * intersection_height
if intersection_area <= 0:
return 0.0
union_area = left_width * left_height + right_width * right_height - intersection_area
return intersection_area / union_area if union_area > 0 else 0.0
def _box_sort_key(box: Any) -> tuple[float, float]:
if not isinstance(box, dict):
return (0.0, 0.0)
try:
return (float(box["y"]), float(box["x"]))
except (KeyError, TypeError, ValueError):
return (0.0, 0.0)
def _mobile_recognition_model(paddle_lang: str) -> str:
    """Return the PP-OCRv5 mobile recognition model name for ``paddle_lang``.

    Languages listed in ``_LATIN_RECOGNITION_LANGS`` use the Latin model; every
    other language uses the generic mobile model.
    """
    uses_latin_model = paddle_lang in _LATIN_RECOGNITION_LANGS
    return "latin_PP-OCRv5_mobile_rec" if uses_latin_model else "PP-OCRv5_mobile_rec"