4 Commits

Author SHA1 Message Date
huanld 7823507a18 Fix Windows native capture state and monitor adapter 2026-05-28 13:22:24 +07:00
huanld 0b78ff6f7d Release OpenScreen 1.4.4 2026-05-28 12:25:23 +07:00
huanld 198dc022b0 Release OpenScreen 1.4.2 2026-05-28 10:01:22 +07:00
huanld 69804c41c7 Release OpenScreen 1.4.1 2026-05-28 08:52:11 +07:00
43 changed files with 2331 additions and 127 deletions
+8
View File
@@ -0,0 +1,8 @@
# Copy to .env.signing.local for a local signing machine. Do not commit real values.
AZURE_TENANT_ID=
AZURE_CLIENT_ID=
AZURE_CLIENT_SECRET=
AZURE_TRUSTED_SIGNING_ENDPOINT=https://<region>.codesigning.azure.net/
AZURE_TRUSTED_SIGNING_ACCOUNT_NAME=
AZURE_TRUSTED_SIGNING_CERTIFICATE_PROFILE_NAME=
AZURE_TRUSTED_SIGNING_PUBLISHER_NAME=
+1
View File
@@ -13,6 +13,7 @@ dist-electron
dist-ssr
*.local
.env
.env.signing.local
# Native helper build outputs
/electron/native/wgc-capture/build/
+8 -4
View File
@@ -6,7 +6,7 @@ OpenScreen calls OCR through a local HTTP service. The default endpoint is:
http://127.0.0.1:8866/ocr
```
The app sends either `imageBase64` or `path` and expects OCR blocks:
The app sends either `imageBase64` or `path`, plus optional `language` and `profile`, and expects OCR blocks:
```json
{
@@ -38,7 +38,7 @@ If `paddle` is still missing after installing `paddleocr`, install the CPU Paddl
```powershell
.\.venv-ocr\Scripts\Activate.ps1
$env:PADDLEOCR_DEVICE="cpu"
$env:PADDLEOCR_LANG="latin"
$env:OPENSCREEN_OCR_PROFILE="vietnamese"
npm run ocr:paddle
```
@@ -58,7 +58,8 @@ Expected healthy environment:
"paddleocrInstalled": true,
"paddleInstalled": true,
"engineReady": false,
"defaultLanguage": "latin"
"defaultLanguage": "vi,en",
"defaultProfile": "vietnamese"
}
```
@@ -67,7 +68,10 @@ Expected healthy environment:
## Configuration
- `PADDLEOCR_DEVICE`: `cpu`, `gpu:0`, or another PaddleOCR device string.
- `PADDLEOCR_LANG`: defaults to `latin`; this is preferred for Vietnamese UI text because it uses a Latin-script recognition model.
- `OPENSCREEN_OCR_PROFILE`: `fast`, `vietnamese`, or `hybrid`. The default `vietnamese` profile upscales and sharpens focused UI screenshots before OCR.
- `OPENSCREEN_GUIDE_OCR_LANGUAGE`: defaults to `vi,en`.
- `PADDLEOCR_LANG`: optional hard override. Leave unset for the app profile/language settings to work.
- `PADDLEOCR_VERSION`: defaults to `PP-OCRv5`.
- `PADDLEOCR_USE_MOBILE`: defaults to `1`; set to `0` to use the default/server models.
- `PADDLEOCR_REC_MODEL`: optional recognizer model override. The bundled profile uses `latin_PP-OCRv5_mobile_rec`, which supports Vietnamese Latin-script text.
- `OPENSCREEN_GUIDE_OCR_URL`: OpenScreen OCR endpoint override; defaults to `http://127.0.0.1:8866`.
@@ -0,0 +1,84 @@
# Windows Private Trust Signing
OpenScreen supports Microsoft Trusted Signing private trust profiles for Windows
builds. Secrets and signing resource names are read from environment variables;
no certificate, client secret, or API key should be committed.
For a local signing machine, copy `.env.signing.example` to
`.env.signing.local` and fill in values there. `.env.signing.local` is ignored
by Git. Variables that are already set to a non-empty value in the shell take precedence over values in that local file.
## Required Azure Resource Variables
Set these values for the Trusted Signing account and certificate profile:
```powershell
$env:AZURE_TRUSTED_SIGNING_ENDPOINT = "https://<region>.codesigning.azure.net/"
$env:AZURE_TRUSTED_SIGNING_ACCOUNT_NAME = "<trusted-signing-account-name>"
$env:AZURE_TRUSTED_SIGNING_CERTIFICATE_PROFILE_NAME = "<private-trust-profile-name>"
$env:AZURE_TRUSTED_SIGNING_PUBLISHER_NAME = "<certificate-common-name>"
```
`AZURE_TRUSTED_SIGNING_CERTIFICATE_PROFILE_NAME` must point to a certificate
profile created with the `PrivateTrust` profile type.
## Required Azure Auth Variables
Electron Builder uses Azure environment credentials. Set the tenant and client:
```powershell
$env:AZURE_TENANT_ID = "<tenant-id>"
$env:AZURE_CLIENT_ID = "<app-registration-client-id>"
```
Then set one authentication mode. Service principal secret is the simplest for
local signing:
```powershell
$env:AZURE_CLIENT_SECRET = "<client-secret>"
```
Certificate auth is also supported:
```powershell
$env:AZURE_CLIENT_CERTIFICATE_PATH = "C:\secure\signing-auth.pfx"
$env:AZURE_CLIENT_CERTIFICATE_PASSWORD = "<pfx-password>"
```
## Sign Existing Installer
This signs the installer already built at
`release/<version>/Openscreen Setup <version>.exe`:
```powershell
npm run sign:win:private-trust
```
To sign a specific file:
```powershell
npm run sign:win:private-trust -- --file "D:\Code\OpenScreen\release\1.4.0\Openscreen Setup 1.4.0.exe"
```
## Build And Sign
This signs the packaged app executable, bundled OCR service executable, and NSIS
installer during the Windows build:
```powershell
npm run build:win:private-trust
```
The regular `npm run build:win` remains unsigned for local development builds.
## Verification
After signing:
```powershell
Get-AuthenticodeSignature "release\1.4.0\Openscreen Setup 1.4.0.exe" | Format-List
```
Private trust signatures are valid only on machines that trust the private trust
certificate chain/publisher. For public downloads that must be trusted on any
Windows machine, use a public trust certificate profile instead.
+73
View File
@@ -0,0 +1,73 @@
const fs = require("node:fs");
const path = require("node:path");
const JSON5 = require("json5");
function loadLocalSigningEnv() {
const envPath = path.join(__dirname, ".env.signing.local");
if (!fs.existsSync(envPath)) {
return;
}
const lines = fs.readFileSync(envPath, "utf8").split(/\r?\n/);
for (const line of lines) {
const trimmed = line.trim();
if (!trimmed || trimmed.startsWith("#")) {
continue;
}
const match = trimmed.match(/^([A-Za-z_][A-Za-z0-9_]*)=(.*)$/);
if (!match || process.env[match[1]]) {
continue;
}
process.env[match[1]] = match[2].replace(/^['"]|['"]$/g, "");
}
}
/** Read and parse the `electron-builder.json5` file that sits next to this module. */
function readBaseConfig() {
	const rawConfig = fs.readFileSync(path.join(__dirname, "electron-builder.json5"), "utf8");
	return JSON5.parse(rawConfig);
}
/**
 * Return the trimmed value of environment variable `name`.
 *
 * @throws {Error} when the variable is unset or contains only whitespace.
 */
function requireEnv(name) {
	const trimmedValue = (process.env[name] ?? "").trim();
	if (trimmedValue) {
		return trimmedValue;
	}

	throw new Error(`Missing required environment variable: ${name}`);
}
/**
 * Return the trimmed value of the first variable in `names` that is set to
 * something other than whitespace.
 *
 * @throws {Error} listing every accepted name when none of them is usable.
 */
function requireAnyEnv(names) {
	const firstUsable = Array.from(names, (name) => (process.env[name] ?? "").trim()).find(Boolean);
	if (firstUsable) {
		return firstUsable;
	}

	throw new Error(`Missing required environment variable: ${names.join(" or ")}`);
}
// Populate process.env from the local signing file before any lookup below.
loadLocalSigningEnv();

const config = readBaseConfig();

// Optional settings: use the trimmed environment value, or the given fallback
// when the variable is unset or blank.
const optionalEnv = (name, fallback) => process.env[name]?.trim() || fallback;

// Required values are resolved first so a missing variable aborts the build
// before any configuration is exported.
const azureSignOptions = {
	publisherName: requireAnyEnv([
		"AZURE_TRUSTED_SIGNING_PUBLISHER_NAME",
		"OPENSCREEN_SIGNING_PUBLISHER_NAME",
	]),
	endpoint: requireEnv("AZURE_TRUSTED_SIGNING_ENDPOINT"),
	certificateProfileName: requireEnv("AZURE_TRUSTED_SIGNING_CERTIFICATE_PROFILE_NAME"),
	codeSigningAccountName: requireEnv("AZURE_TRUSTED_SIGNING_ACCOUNT_NAME"),
	fileDigest: optionalEnv("AZURE_TRUSTED_SIGNING_FILE_DIGEST", "SHA256"),
	timestampRfc3161: optionalEnv(
		"AZURE_TRUSTED_SIGNING_TIMESTAMP_RFC3161",
		"http://timestamp.acs.microsoft.com",
	),
	timestampDigest: optionalEnv("AZURE_TRUSTED_SIGNING_TIMESTAMP_DIGEST", "SHA256"),
};

// Layer the signing options over whatever the base config declares for Windows.
config.win = {
	...config.win,
	signAndEditExecutable: true,
	azureSignOptions,
};

// Drop any `signExts` setting carried over from the base config.
delete config.win.signExts;

module.exports = config;
+18
View File
@@ -48,6 +48,14 @@ interface Window {
event: import("../src/guide/contracts").GuideEvent;
}>
>;
capturePointerMarker: () => Promise<
import("../src/guide/contracts").GuideIpcResult<
import("../src/guide/contracts").CaptureGuidePointerMarkerResult
>
>;
onMarkerCaptured: (
callback: (payload: import("../src/guide/contracts").GuideMarkerCapturedPayload) => void,
) => () => void;
finalizeEvents: (
input: import("../src/guide/contracts").FinalizeGuideEventsInput,
) => Promise<
@@ -371,6 +379,16 @@ interface ProcessedDesktopSource {
display_id: string;
thumbnail: string | null;
appIcon: string | null;
displayId?: number;
displayIndex?: number;
screenIndex?: number;
displayLabel?: string;
bounds?: {
x: number;
y: number;
width: number;
height: number;
};
}
interface CursorTelemetryPoint {
@@ -0,0 +1,66 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { afterEach, beforeEach, describe, expect, it } from "vitest";
import { DeepSeekSettingsStore } from "./deepseekSettingsStore";
// Temporary directories created by createStore(); emptied and deleted after each test.
const tempDirs: string[] = [];
// Values the OCR override variables had when this module loaded, so they can be put back.
const originalOcrProfile = process.env.OPENSCREEN_GUIDE_OCR_PROFILE;
const originalOcrLanguage = process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE;

// Each test starts with both OCR override variables unset.
beforeEach(() => {
	delete process.env.OPENSCREEN_GUIDE_OCR_PROFILE;
	delete process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE;
});

// Restore the original environment, then remove every temp directory the test created.
afterEach(async () => {
	restoreEnv("OPENSCREEN_GUIDE_OCR_PROFILE", originalOcrProfile);
	restoreEnv("OPENSCREEN_GUIDE_OCR_LANGUAGE", originalOcrLanguage);
	// splice(0) empties the shared list while handing back the entries to delete.
	await Promise.all(tempDirs.splice(0).map((dir) => fs.rm(dir, { recursive: true, force: true })));
});
/**
 * Put environment variable `name` back to `value`; `undefined` means the
 * variable should not exist at all, so it is deleted instead of assigned.
 */
function restoreEnv(name: string, value: string | undefined): void {
	if (value !== undefined) {
		process.env[name] = value;
	} else {
		delete process.env[name];
	}
}
/**
 * Build a settings store backed by `guide-ai-settings.json` inside a fresh
 * temp directory. The directory is recorded in `tempDirs` for later cleanup.
 */
async function createStore(): Promise<DeepSeekSettingsStore> {
	const prefix = path.join(os.tmpdir(), "openscreen-guide-settings-");
	const dir = await fs.mkdtemp(prefix);
	tempDirs.push(dir);

	const settingsPath = path.join(dir, "guide-ai-settings.json");
	return new DeepSeekSettingsStore(settingsPath);
}
// OCR-related behaviour of the settings store, run against a throwaway settings file.
describe("DeepSeekSettingsStore OCR settings", () => {
	// With no settings file and no env overrides the built-in defaults apply.
	it("defaults to the Vietnamese enhanced OCR profile", async () => {
		const store = await createStore();

		await expect(store.getOcrConfig()).resolves.toEqual({
			profile: "vietnamese",
			language: "vi,en",
		});
	});

	// Saving must report the new OCR values and a later read must return them.
	it("persists OCR profile changes alongside DeepSeek settings", async () => {
		const store = await createStore();

		const status = await store.save({
			deepseekApiKeyEnvName: "DEEPSEEK_API_KEY",
			baseUrl: "https://api.deepseek.com",
			model: "deepseek-chat",
			ocrProfile: "hybrid",
			ocrLanguage: "vi,en",
		});

		// toMatchObject: the status also carries an updatedAt timestamp that is not pinned here.
		expect(status.ocr).toMatchObject({
			profile: "hybrid",
			language: "vi,en",
		});
		await expect(store.getOcrConfig()).resolves.toEqual({
			profile: "hybrid",
			language: "vi,en",
		});
	});
});
+68 -2
View File
@@ -1,6 +1,10 @@
import fs from "node:fs/promises";
import path from "node:path";
import type { GuideAiSettings, SaveGuideAiSettingsInput } from "../../../src/guide/contracts";
import type {
GuideAiSettings,
GuideOcrProfile,
SaveGuideAiSettingsInput,
} from "../../../src/guide/contracts";
export interface DeepSeekGuideConfig {
apiKey?: string;
@@ -12,8 +16,22 @@ export interface DeepSeekGuideConfigProvider {
getDeepSeekConfig(): Promise<DeepSeekGuideConfig>;
}
/** Effective OCR settings handed to the guide OCR client. */
export interface GuideOcrConfig {
	/** OCR profile to use. */
	profile: GuideOcrProfile;
	/** Comma-separated language list, e.g. `vi,en`. */
	language: string;
}
/** Source of the OCR settings; implemented by the settings store in this file. */
export interface GuideOcrConfigProvider {
	/** Resolve the OCR profile and language to use. */
	getOcrConfig(): Promise<GuideOcrConfig>;
}
interface PersistedGuideAiSettings {
schemaVersion: 1;
ocr?: {
profile?: GuideOcrProfile;
language?: string;
updatedAt?: string;
};
deepseek?: {
apiKeyEnvName?: string;
baseUrl?: string;
@@ -25,8 +43,10 @@ interface PersistedGuideAiSettings {
const DEFAULT_DEEPSEEK_API_KEY_ENV_NAME = "DEEPSEEK_API_KEY";
const DEFAULT_DEEPSEEK_BASE_URL = "https://api.deepseek.com";
const DEFAULT_DEEPSEEK_MODEL = "deepseek-chat";
const DEFAULT_OCR_PROFILE: GuideOcrProfile = "vietnamese";
const DEFAULT_OCR_LANGUAGE = "vi,en";
export class DeepSeekSettingsStore implements DeepSeekGuideConfigProvider {
export class DeepSeekSettingsStore implements DeepSeekGuideConfigProvider, GuideOcrConfigProvider {
constructor(private readonly filePath: string) {}
async getStatus(): Promise<GuideAiSettings> {
@@ -35,6 +55,13 @@ export class DeepSeekSettingsStore implements DeepSeekGuideConfigProvider {
const activeApiKey = process.env[apiKeyEnvName];
return {
ocr: {
profile: normalizeOcrProfile(raw?.ocr?.profile ?? process.env.OPENSCREEN_GUIDE_OCR_PROFILE),
language: normalizeOcrLanguage(
raw?.ocr?.language ?? process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE,
),
updatedAt: raw?.ocr?.updatedAt,
},
deepseek: {
hasApiKey: Boolean(activeApiKey),
apiKeyEnvName,
@@ -49,7 +76,14 @@ export class DeepSeekSettingsStore implements DeepSeekGuideConfigProvider {
async save(input: SaveGuideAiSettingsInput): Promise<GuideAiSettings> {
const current = (await this.readSettings()) ?? { schemaVersion: 1 };
const currentOcr = current.ocr ?? {};
const currentDeepSeek = current.deepseek ?? {};
const nextOcr = {
...currentOcr,
profile: normalizeOcrProfile(input.ocrProfile ?? currentOcr.profile),
language: normalizeOcrLanguage(input.ocrLanguage ?? currentOcr.language),
updatedAt: new Date().toISOString(),
};
const nextDeepSeek = {
...currentDeepSeek,
baseUrl: normalizeBaseUrl(input.baseUrl ?? currentDeepSeek.baseUrl),
@@ -65,6 +99,7 @@ export class DeepSeekSettingsStore implements DeepSeekGuideConfigProvider {
await this.writeSettings({
schemaVersion: 1,
ocr: nextOcr,
deepseek: nextDeepSeek,
});
return await this.getStatus();
@@ -80,6 +115,16 @@ export class DeepSeekSettingsStore implements DeepSeekGuideConfigProvider {
};
}
/**
 * Resolve the OCR profile and language to use.
 *
 * For each field the persisted value is preferred, then the matching
 * `OPENSCREEN_GUIDE_OCR_*` environment variable; the normalisers supply the
 * default when neither yields a usable value.
 *
 * NOTE(review): if readSettings() already fills missing OCR fields with
 * defaults, the environment fallback only applies when no settings file
 * exists — confirm.
 */
async getOcrConfig(): Promise<GuideOcrConfig> {
	const persistedOcr = (await this.readSettings())?.ocr;
	const profile = normalizeOcrProfile(
		persistedOcr?.profile ?? process.env.OPENSCREEN_GUIDE_OCR_PROFILE,
	);
	const language = normalizeOcrLanguage(
		persistedOcr?.language ?? process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE,
	);

	return { profile, language };
}
private async readSettings(): Promise<PersistedGuideAiSettings | null> {
try {
const content = await fs.readFile(this.filePath, "utf-8");
@@ -120,6 +165,11 @@ function normalizePersistedSettings(input: unknown): PersistedGuideAiSettings |
}
return {
schemaVersion: 1,
ocr: {
profile: normalizeOcrProfile(raw.ocr?.profile),
language: normalizeOcrLanguage(raw.ocr?.language),
updatedAt: raw.ocr?.updatedAt,
},
deepseek: {
apiKeyEnvName: normalizeEnvName(raw.deepseek?.apiKeyEnvName),
baseUrl: raw.deepseek?.baseUrl,
@@ -155,3 +205,19 @@ function normalizeBaseUrl(value: string | undefined): string {
function normalizeModel(value: string | undefined): string {
return value?.trim() || DEFAULT_DEEPSEEK_MODEL;
}
/**
 * Coerce a raw profile value to one of the supported OCR profiles.
 *
 * Surrounding whitespace and letter case are ignored, because the value can
 * come from an environment variable or a hand-edited settings file. This
 * matches how the OCR language list is normalised. Anything that is not a
 * recognised profile name, including non-string input, yields the default.
 *
 * @param value Raw profile value, possibly missing.
 * @returns `"fast"`, `"vietnamese"` or `"hybrid"`.
 */
function normalizeOcrProfile(value: string | undefined): GuideOcrProfile {
	// Persisted JSON is not type-checked at runtime, hence the typeof guard.
	const candidate = typeof value === "string" ? value.trim().toLowerCase() : undefined;
	if (candidate === "fast" || candidate === "vietnamese" || candidate === "hybrid") {
		return candidate;
	}
	return DEFAULT_OCR_PROFILE;
}
/**
 * Normalise a comma-separated OCR language list.
 *
 * Each entry is trimmed and lower-cased, empty entries are dropped, and the
 * remainder is re-joined with commas. A missing, empty or non-string value
 * yields the default list.
 *
 * @param value Raw language list, possibly missing.
 * @returns A canonical comma-separated list such as `vi,en`.
 */
function normalizeOcrLanguage(value: string | undefined): string {
	// Optional chaining only guards null/undefined; a number or object read
	// from the settings file would otherwise throw on `.split`.
	if (typeof value !== "string") {
		return DEFAULT_OCR_LANGUAGE;
	}

	const normalized = value
		.split(",")
		.map((part) => part.trim().toLowerCase())
		.filter(Boolean)
		.join(",");
	return normalized || DEFAULT_OCR_LANGUAGE;
}
+22 -4
View File
@@ -18,15 +18,25 @@ import type {
import type { DeepSeekSettingsStore } from "./ai/deepseekSettingsStore";
import { GuideStore, GuideStoreError } from "./guideStore";
/** Optional hooks invoked by the guide IPC handlers after an operation succeeds. */
export interface GuideIpcLifecycle {
	/** Called with the new session after `guide:start-session` succeeds. */
	onSessionStarted?: (session: GuideSession) => void;
	/**
	 * Called with the request's recording id after `guide:finalize-events` or
	 * `guide:discard-session` succeeds.
	 */
	onSessionEnded?: (recordingId: unknown) => void;
}
export function registerGuideIpcHandlers(
ipcMain: IpcMain,
store: GuideStore,
aiSettingsStore?: DeepSeekSettingsStore,
lifecycle: GuideIpcLifecycle = {},
): void {
ipcMain.handle(
"guide:start-session",
async (_, recordingId): Promise<GuideIpcResult<GuideSession>> => {
return await toGuideResult(() => store.startSession(recordingId));
const result = await toGuideResult(() => store.startSession(recordingId));
if (result.success) {
lifecycle.onSessionStarted?.(result.data);
}
return result;
},
);
@@ -50,7 +60,11 @@ export function registerGuideIpcHandlers(
ipcMain.handle(
"guide:finalize-events",
async (_, input: FinalizeGuideEventsInput): Promise<GuideIpcResult<GuideSession>> => {
return await toGuideResult(() => store.finalizeEvents(input));
const result = await toGuideResult(() => store.finalizeEvents(input));
if (result.success) {
lifecycle.onSessionEnded?.(input.recordingId);
}
return result;
},
);
@@ -110,10 +124,14 @@ export function registerGuideIpcHandlers(
ipcMain.handle(
"guide:discard-session",
async (_, input: DiscardGuideSessionInput): Promise<GuideIpcResult<{ discarded: true }>> => {
return await toGuideResult(async () => {
const result = await toGuideResult(async () => {
await store.discardSession(input);
return { discarded: true };
return { discarded: true as const };
});
if (result.success) {
lifecycle.onSessionEnded?.(input.recordingId);
}
return result;
},
);
}
+76
View File
@@ -42,9 +42,17 @@ describe("GuideStore", () => {
kind: "hotkey",
timeMs: 500,
label: "First",
normalizedX: 0.25,
normalizedY: 0.75,
});
expect(result.event.kind).toBe("hotkey");
expect(result.event).toMatchObject({
x: 0.25,
y: 0.75,
normalizedX: 0.25,
normalizedY: 0.75,
});
expect(result.session.events.map((event) => event.timeMs)).toEqual([500, 2000]);
expect(result.session.events[0]?.source).toBe("guide-hotkey");
expect(result.session.events[1]?.source).toBe("review-ui");
@@ -220,6 +228,74 @@ describe("GuideStore", () => {
await expect(fs.readFile(html.path, "utf-8")).resolves.toContain("<!doctype html>");
});
// End-to-end check: a draft step that only echoes the hotkey label is rewritten
// from the OCR target text and linked to its candidate and screenshot.
it("repairs generic hotkey marker text and attaches AI draft artifacts", async () => {
	const store = new GuideStore(recordingsDir, {
		// Stub OCR: a single "Save" block whose box contains the marker point (0.5, 0.5).
		ocrClient: {
			recognize: async (snapshot) => [
				{
					id: `ocr-${snapshot.id}-1`,
					snapshotId: snapshot.id,
					text: "Save",
					confidence: 0.95,
					box: { x: 0.45, y: 0.45, width: 0.15, height: 0.08 },
				},
			],
		},
		// Stub draft client: returns step text that still carries the generic hotkey label.
		draftClient: {
			generate: async () => ({
				title: "Guide",
				steps: [
					{
						id: "guide-step-1",
						order: 1,
						title: "Step 1: Click Ctrl+F12 marker",
						instruction: "Click Ctrl+F12 marker.",
					},
				],
			}),
		},
	});
	await store.startSession(114);
	await store.addMarker({
		recordingId: 114,
		kind: "hotkey",
		timeMs: 200,
		label: "Ctrl+F12 marker",
		normalizedX: 0.5,
		normalizedY: 0.5,
	});
	// An empty file stands in for the recorded video.
	const videoPath = path.join(recordingsDir, "recording-114.mp4");
	await fs.writeFile(videoPath, "");
	const eventsSession = await store.finalizeEvents({ recordingId: 114, videoPath });
	await store.writeSnapshot({
		recordingId: 114,
		eventId: eventsSession.events[0]?.id ?? "",
		timeMs: 700,
		offsetMs: 500,
		width: 800,
		height: 600,
		pngBytes: new Uint8Array([1, 2, 3]).buffer,
	});
	await store.runOcr({ recordingId: 114 });

	const draftSession = await store.generateDraft({
		recordingId: 114,
		language: "en",
		provider: "deepseek",
	});

	// The candidate picks up the OCR text and the marker position as percentages.
	expect(draftSession.candidates[0]).toMatchObject({
		targetText: "Save",
		position: { xPercent: 50, yPercent: 50 },
	});
	// The generic title and instruction are replaced, and the step is linked to its artifacts.
	expect(draftSession.generatedGuide?.steps[0]).toMatchObject({
		title: "Step 1: Save",
		instruction: 'Click "Save".',
		sourceCandidateId: draftSession.candidates[0]?.id,
		screenshotPath: draftSession.snapshots[0]?.path,
	});
});
it("discards a guide session and output directory", async () => {
const store = new GuideStore(recordingsDir);
const session = await store.startSession(111);
+161 -3
View File
@@ -34,7 +34,10 @@ import {
DeepSeekGuideClientError,
type GuideDraftClient,
} from "./ai/deepseekGuideClient";
import type { DeepSeekGuideConfigProvider } from "./ai/deepseekSettingsStore";
import type {
DeepSeekGuideConfigProvider,
GuideOcrConfigProvider,
} from "./ai/deepseekSettingsStore";
import { type GuidePaths, normalizeGuideRecordingId, resolveGuidePaths } from "./guidePaths";
import { createFocusedOcrSnapshot, remapFocusedOcrBlocks } from "./ocr/focusedOcrSnapshot";
import { DefaultGuideOcrClient, type GuideOcrClient } from "./ocr/paddleOcrClient";
@@ -70,6 +73,7 @@ export interface GuideStoreDependencies {
ocrClient?: GuideOcrClient;
draftClient?: GuideDraftClient;
deepSeekConfigProvider?: DeepSeekGuideConfigProvider;
ocrConfigProvider?: GuideOcrConfigProvider;
focusOcrSnapshots?: boolean;
}
@@ -127,6 +131,7 @@ export class GuideStore {
kind: input.kind,
source: input.kind === "hotkey" ? "guide-hotkey" : "review-ui",
timeMs: Math.max(0, input.timeMs),
...normalizeMarkerPoint(input),
label: normalizeOptionalString(input.label),
screenshotOffsetMs: 500,
createdAt: new Date().toISOString(),
@@ -254,7 +259,9 @@ export class GuideStore {
throw new GuideStoreError("guide-invalid-input", "No guide snapshots are available for OCR.");
}
const ocrClient = this.dependencies.ocrClient ?? new DefaultGuideOcrClient();
const ocrClient =
this.dependencies.ocrClient ??
DefaultGuideOcrClient.fromConfig(await this.dependencies.ocrConfigProvider?.getOcrConfig());
const shouldFocusOcrSnapshots =
this.dependencies.focusOcrSnapshots ?? this.dependencies.ocrClient === undefined;
const eventsById = new Map(session.events.map((event) => [event.id, event]));
@@ -335,10 +342,11 @@ export class GuideStore {
}
}
const normalizedGuide = normalizeGeneratedGuide(generatedGuide) ?? generatedGuide;
const updatedSession = touchSession({
...session,
candidates,
generatedGuide: normalizeGeneratedGuide(generatedGuide) ?? generatedGuide,
generatedGuide: enrichGeneratedGuide(normalizedGuide, session, candidates, input.language),
status: "draft-ready",
});
await this.writeSession(updatedSession);
@@ -742,11 +750,41 @@ function normalizeGuideStepCandidate(input: unknown): GuideStepCandidate | null
input.targetRole === "unknown"
? input.targetRole
: undefined,
position: normalizeGuideCandidatePosition(input.position),
nearbyText,
confidence,
};
}
/**
 * Validate a persisted candidate position.
 *
 * The position is all-or-nothing: unless the input is a record with both
 * normalised coordinates, both percentages and a non-empty description, the
 * result is `undefined`.
 */
function normalizeGuideCandidatePosition(
	input: unknown,
): GuideStepCandidate["position"] | undefined {
	if (!isRecord(input)) {
		return undefined;
	}

	const normalizedX = normalizeOptionalNormalizedNumber(input.normalizedX);
	const normalizedY = normalizeOptionalNormalizedNumber(input.normalizedY);
	const xPercent = normalizeOptionalNumber(input.xPercent);
	const yPercent = normalizeOptionalNumber(input.yPercent);
	const description = normalizeOptionalString(input.description);

	if (
		normalizedX !== undefined &&
		normalizedY !== undefined &&
		xPercent !== undefined &&
		yPercent !== undefined &&
		description
	) {
		return { normalizedX, normalizedY, xPercent, yPercent, description };
	}

	return undefined;
}
function normalizeGeneratedGuide(input: unknown): GeneratedGuide | null {
if (!isRecord(input)) {
return null;
@@ -784,6 +822,101 @@ function normalizeGeneratedGuide(input: unknown): GeneratedGuide | null {
};
}
/**
 * Attach source-candidate and screenshot information to every generated step
 * and repair steps whose text only echoes the generic hotkey marker label.
 *
 * A step is paired with the candidate named by its `sourceCandidateId`, or,
 * failing that, with the candidate at the same index in time order. The
 * screenshot is looked up by the candidate's snapshot id first and by its
 * event id second. An existing `screenshotPath` on the step is never replaced.
 */
function enrichGeneratedGuide(
	guide: GeneratedGuide,
	session: GuideSession,
	candidates: GuideStepCandidate[],
	language: GenerateGuideDraftInput["language"],
): GeneratedGuide {
	const candidatesInTimeOrder = candidates.slice().sort((a, b) => a.timeMs - b.timeMs);
	const candidateLookup = new Map(candidates.map((candidate) => [candidate.id, candidate]));
	const snapshotLookup = new Map(session.snapshots.map((snapshot) => [snapshot.id, snapshot]));
	const eventSnapshotLookup = new Map(
		session.snapshots.map((snapshot) => [snapshot.eventId, snapshot]),
	);

	const findSnapshot = (candidate: GuideStepCandidate) => {
		const bySnapshotId = candidate.snapshotId
			? snapshotLookup.get(candidate.snapshotId)
			: undefined;
		return bySnapshotId ?? eventSnapshotLookup.get(candidate.eventId);
	};

	const steps = guide.steps.map((step, position) => {
		const explicitCandidate = step.sourceCandidateId
			? candidateLookup.get(step.sourceCandidateId)
			: undefined;
		const candidate = explicitCandidate ?? candidatesInTimeOrder[position];
		const snapshot = candidate ? findSnapshot(candidate) : undefined;
		const repairedStep = repairGenericMarkerStep(step, candidate, language);

		return {
			...repairedStep,
			sourceCandidateId: candidate?.id ?? repairedStep.sourceCandidateId,
			screenshotPath: repairedStep.screenshotPath ?? snapshot?.path,
		};
	});

	return { ...guide, steps };
}
/**
 * Rewrite a step's title and instruction from its candidate when either text
 * contains the generic hotkey marker label. Steps without a candidate, or
 * without generic text, are returned unchanged.
 */
function repairGenericMarkerStep(
	step: GeneratedGuideStep,
	candidate: GuideStepCandidate | undefined,
	language: GenerateGuideDraftInput["language"],
): GeneratedGuideStep {
	if (!candidate) {
		return step;
	}

	const looksGeneric =
		containsGenericMarkerText(step.title) || containsGenericMarkerText(step.instruction);
	if (!looksGeneric) {
		return step;
	}

	const title = buildRepairedStepTitle(candidate, step.order, language);
	const instruction = buildRepairedStepInstruction(candidate, language);
	return { ...step, title, instruction };
}
/**
 * Report whether `value` mentions the generic hotkey marker label, i.e.
 * "ctrl"/"control" (optionally followed by "+ F12") and then the word
 * "marker", case-insensitively.
 */
function containsGenericMarkerText(value: string): boolean {
	const genericMarkerPattern = /\b(?:ctrl|control)(?:\s*\+\s*f12)?\s+marker\b/i;
	return genericMarkerPattern.test(value);
}
/**
 * Build a replacement step title in the requested language.
 *
 * Preference order: the candidate's target text, then its position in
 * percent, then the bare numbered fallback.
 */
function buildRepairedStepTitle(
	candidate: GuideStepCandidate,
	order: number,
	language: GenerateGuideDraftInput["language"],
): string {
	const isVietnamese = language === "vi";
	const { targetText, position } = candidate;

	if (targetText) {
		if (isVietnamese) {
			return `Bước ${order}: ${targetText}`;
		}
		return `Step ${order}: ${targetText}`;
	}

	if (position) {
		const { xPercent, yPercent } = position;
		if (isVietnamese) {
			return `Bước ${order}: Vị trí x ${xPercent}%, y ${yPercent}%`;
		}
		return `Step ${order}: Position x ${xPercent}%, y ${yPercent}%`;
	}

	return stepTitleFallback(order, language);
}
/**
 * Build a replacement step instruction in the requested language.
 *
 * Preference order: the candidate's target text, then its described position,
 * then a generic "perform the recorded action" sentence.
 */
function buildRepairedStepInstruction(
	candidate: GuideStepCandidate,
	language: GenerateGuideDraftInput["language"],
): string {
	const isVietnamese = language === "vi";
	const { targetText, position } = candidate;

	if (targetText) {
		if (isVietnamese) {
			return `Nhấn vào "${targetText}".`;
		}
		return `Click "${targetText}".`;
	}

	if (position) {
		const { description, xPercent, yPercent } = position;
		if (isVietnamese) {
			return `Nhấn tại vùng ${description} (x ${xPercent}%, y ${yPercent}%).`;
		}
		return `Click the ${description} area (x ${xPercent}%, y ${yPercent}%).`;
	}

	if (isVietnamese) {
		return "Thực hiện thao tác tại mốc đã ghi.";
	}
	return "Perform the recorded action.";
}
/** Bare numbered step title, in Vietnamese for `"vi"` and English otherwise. */
function stepTitleFallback(order: number, language: GenerateGuideDraftInput["language"]): string {
	if (language === "vi") {
		return `Bước ${order}`;
	}
	return `Step ${order}`;
}
function normalizeArray<T>(input: unknown, normalize: (value: unknown) => T | null): T[] {
return Array.isArray(input)
? input.map((value) => normalize(value)).filter((value): value is T => value !== null)
@@ -813,6 +946,31 @@ function normalizeOptionalNumber(value: unknown): number | undefined {
return typeof value === "number" && Number.isFinite(value) ? value : undefined;
}
/**
 * Extract the marker's pointer position from the input.
 *
 * `normalizedX`/`normalizedY` are preferred, with `x`/`y` as fallbacks. The
 * point is returned only when both axes are usable; it is then written to
 * both the `x`/`y` and `normalizedX`/`normalizedY` fields. Otherwise an empty
 * object is returned so that spreading it adds nothing.
 */
function normalizeMarkerPoint(
	input: AddGuideMarkerInput,
): Pick<GuideEvent, "x" | "y" | "normalizedX" | "normalizedY"> {
	const pointX = normalizeOptionalNormalizedNumber(input.normalizedX ?? input.x);
	const pointY = normalizeOptionalNormalizedNumber(input.normalizedY ?? input.y);

	if (pointX !== undefined && pointY !== undefined) {
		return {
			x: pointX,
			y: pointY,
			normalizedX: pointX,
			normalizedY: pointY,
		};
	}

	return {};
}
/**
 * Clamp a finite number into the range 0..1.
 *
 * @returns the clamped number, or `undefined` for anything that is not a
 * finite number (other types, `NaN`, `±Infinity`).
 */
function normalizeOptionalNormalizedNumber(value: unknown): number | undefined {
	if (typeof value === "number" && Number.isFinite(value)) {
		return Math.min(1, Math.max(0, value));
	}
	return undefined;
}
function normalizePositiveInteger(value: unknown): number | null {
return typeof value === "number" && Number.isFinite(value) && value > 0
? Math.round(value)
+3 -1
View File
@@ -156,8 +156,10 @@ function startOcrServiceProcess(
OPENSCREEN_OCR_PORT: DEFAULT_OCR_PORT,
PADDLEOCR_DEVICE: process.env.PADDLEOCR_DEVICE ?? "cpu",
PADDLEOCR_ENABLE_MKLDNN: process.env.PADDLEOCR_ENABLE_MKLDNN ?? "0",
PADDLEOCR_LANG: process.env.PADDLEOCR_LANG ?? "latin",
PADDLEOCR_LANG: process.env.PADDLEOCR_LANG ?? "",
PADDLEOCR_USE_MOBILE: process.env.PADDLEOCR_USE_MOBILE ?? "1",
OPENSCREEN_OCR_PROFILE:
process.env.OPENSCREEN_OCR_PROFILE ?? process.env.OPENSCREEN_GUIDE_OCR_PROFILE ?? "",
PADDLE_PDX_ENABLE_MKLDNN_BYDEFAULT: process.env.PADDLE_PDX_ENABLE_MKLDNN_BYDEFAULT ?? "False",
PADDLE_PDX_CACHE_HOME: process.env.PADDLE_PDX_CACHE_HOME ?? runtimePaths.paddlexCachePath,
PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK:
+38 -1
View File
@@ -1,8 +1,12 @@
import { describe, expect, it } from "vitest";
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { afterEach, describe, expect, it, vi } from "vitest";
import type { GuideSnapshot, OcrBlock } from "../../../src/guide/contracts";
import {
DefaultGuideOcrClient,
normalizeOcrResponse,
PaddleOcrHttpClient,
parseWindowsOcrPayload,
} from "./paddleOcrClient";
@@ -16,6 +20,10 @@ const snapshot: GuideSnapshot = {
height: 800,
};
// Undo any vi.stubGlobal() replacement (the tests below stub `fetch`) so it cannot leak between tests.
afterEach(() => {
	vi.unstubAllGlobals();
});
describe("normalizeOcrResponse", () => {
it("normalizes pixel boxes into guide OCR blocks", () => {
const blocks = normalizeOcrResponse(
@@ -67,6 +75,35 @@ describe("normalizeOcrResponse", () => {
});
});
// Verifies the JSON body the HTTP client posts to the OCR service.
describe("PaddleOcrHttpClient", () => {
	it("sends the selected OCR profile to the local service", async () => {
		const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "openscreen-ocr-client-"));
		try {
			// Four bytes are written as a stand-in image; the stubbed fetch never decodes them.
			const imagePath = path.join(tempDir, "step.png");
			await fs.writeFile(imagePath, Buffer.from([137, 80, 78, 71]));

			// Capture every request body instead of touching the network.
			const requests: unknown[] = [];
			vi.stubGlobal(
				"fetch",
				vi.fn(async (_url: string, init?: RequestInit) => {
					requests.push(JSON.parse(String(init?.body ?? "{}")));
					return new Response(JSON.stringify({ blocks: [] }), {
						status: 200,
						headers: { "content-type": "application/json" },
					});
				}),
			);

			const client = new PaddleOcrHttpClient("https://ocr.example.test", "vi,en", "hybrid");
			await client.recognize({ ...snapshot, path: imagePath });

			expect(requests[0]).toMatchObject({
				language: "vi,en",
				profile: "hybrid",
				path: imagePath,
			});
		} finally {
			// Remove the fixture even when an assertion above throws.
			await fs.rm(tempDir, { recursive: true, force: true });
		}
	});
});
describe("DefaultGuideOcrClient", () => {
it("falls back when the HTTP OCR service is unavailable", async () => {
const fallbackBlock: OcrBlock = {
+45 -3
View File
@@ -1,7 +1,7 @@
import { execFile } from "node:child_process";
import fs from "node:fs/promises";
import { promisify } from "node:util";
import type { GuideSnapshot, OcrBlock } from "../../../src/guide/contracts";
import type { GuideOcrProfile, GuideSnapshot, OcrBlock } from "../../../src/guide/contracts";
import { ensureBundledOcrServiceRunning } from "./bundledOcrService";
const execFileAsync = promisify(execFile);
@@ -10,6 +10,11 @@ export interface GuideOcrClient {
recognize(snapshot: GuideSnapshot): Promise<OcrBlock[]>;
}
/** Settings used to construct the OCR clients. */
export interface GuideOcrClientConfig {
	/** OCR profile sent to the HTTP OCR service with each request. */
	profile: GuideOcrProfile;
	/** Comma-separated language list, e.g. `vi,en`. */
	language: string;
}
interface PaddleOcrResponseBlock {
text?: unknown;
confidence?: unknown;
@@ -21,7 +26,8 @@ interface PaddleOcrResponseBlock {
export class PaddleOcrHttpClient implements GuideOcrClient {
constructor(
private readonly baseUrl = process.env.OPENSCREEN_GUIDE_OCR_URL ?? "http://127.0.0.1:8866",
private readonly language = process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE ?? "vi,en",
private readonly language = normalizeOcrLanguage(process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE),
private readonly profile = normalizeOcrProfile(process.env.OPENSCREEN_GUIDE_OCR_PROFILE),
) {}
async recognize(snapshot: GuideSnapshot): Promise<OcrBlock[]> {
@@ -36,6 +42,7 @@ export class PaddleOcrHttpClient implements GuideOcrClient {
imageBase64,
path: snapshot.path,
language: this.language,
profile: this.profile,
}),
});
} catch (error) {
@@ -54,7 +61,9 @@ export class PaddleOcrHttpClient implements GuideOcrClient {
}
export class WindowsOcrClient implements GuideOcrClient {
constructor(private readonly language = process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE ?? "vi,en") {}
constructor(
private readonly language = normalizeOcrLanguage(process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE),
) {}
async recognize(snapshot: GuideSnapshot): Promise<OcrBlock[]> {
if (process.platform !== "win32") {
@@ -96,6 +105,14 @@ export class WindowsOcrClient implements GuideOcrClient {
}
export class DefaultGuideOcrClient implements GuideOcrClient {
/**
 * Create a client whose HTTP and Windows OCR backends use the given profile
 * and language. Missing fields are filled in by `normalizeOcrClientConfig`;
 * the HTTP client keeps its default base URL.
 */
static fromConfig(config?: Partial<GuideOcrClientConfig>): DefaultGuideOcrClient {
	const { language, profile } = normalizeOcrClientConfig(config);
	const httpClient = new PaddleOcrHttpClient(undefined, language, profile);
	const windowsClient = new WindowsOcrClient(language);
	return new DefaultGuideOcrClient(httpClient, windowsClient);
}
constructor(
private readonly httpClient = new PaddleOcrHttpClient(),
private readonly windowsClient = new WindowsOcrClient(),
@@ -119,6 +136,31 @@ export class DefaultGuideOcrClient implements GuideOcrClient {
}
}
/**
 * Fill in a partial OCR client config.
 *
 * Each field falls back to its `OPENSCREEN_GUIDE_OCR_*` environment variable
 * and is then passed through the matching normaliser.
 */
function normalizeOcrClientConfig(
	config: Partial<GuideOcrClientConfig> | undefined,
): GuideOcrClientConfig {
	const rawProfile = config?.profile ?? process.env.OPENSCREEN_GUIDE_OCR_PROFILE;
	const rawLanguage = config?.language ?? process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE;

	const profile = normalizeOcrProfile(rawProfile);
	const language = normalizeOcrLanguage(rawLanguage);
	return { profile, language };
}
/**
 * Coerce a raw profile value to one of the supported OCR profiles.
 *
 * Surrounding whitespace and letter case are ignored, because the value may
 * come straight from an environment variable. This matches how the OCR
 * language list is normalised. Anything that is not a recognised profile
 * name, including non-string input, yields `"vietnamese"`.
 *
 * @param value Raw profile value, possibly missing.
 * @returns `"fast"`, `"vietnamese"` or `"hybrid"`.
 */
function normalizeOcrProfile(value: string | undefined): GuideOcrProfile {
	const candidate = typeof value === "string" ? value.trim().toLowerCase() : undefined;
	if (candidate === "fast" || candidate === "vietnamese" || candidate === "hybrid") {
		return candidate;
	}
	return "vietnamese";
}
/**
 * Normalise a comma-separated OCR language list: trim and lower-case every
 * entry, drop empty ones, and fall back to `vi,en` when nothing is left.
 */
function normalizeOcrLanguage(value: string | undefined): string {
	const languages = value?.split(",").flatMap((part) => {
		const code = part.trim().toLowerCase();
		return code ? [code] : [];
	});
	const joined = languages?.join(",");
	return joined || "vi,en";
}
export function parseWindowsOcrPayload(stdout: string): unknown {
const normalized = stdout.replace(/^\uFEFF/, "").trim();
try {
+623 -48
View File
@@ -5,17 +5,19 @@ import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { fileURLToPath, pathToFileURL } from "node:url";
import type { DesktopCapturerSource } from "electron";
import type { DesktopCapturerSource, Rectangle } from "electron";
import {
app,
BrowserWindow,
desktopCapturer,
dialog,
globalShortcut,
ipcMain,
screen,
shell,
systemPreferences,
} from "electron";
import type { GuideMarkerCapturedPayload } from "../../src/guide/contracts";
import type { NativeMacRecordingRequest } from "../../src/lib/nativeMacRecording";
import type { NativeWindowsRecordingRequest } from "../../src/lib/nativeWindowsRecording";
import {
@@ -343,9 +345,16 @@ type SelectedSource = {
name: string;
id?: string;
display_id?: string;
displayId?: number;
displayIndex?: number;
screenIndex?: number;
displayLabel?: string;
bounds?: SourceBounds;
[key: string]: unknown;
};
type SourceBounds = { x: number; y: number; width: number; height: number };
type AttachNativeMacWebcamRecordingInput = {
screenVideoPath?: string;
recordingId?: number;
@@ -417,6 +426,7 @@ let nativeWindowsCursorRecordingStartMs = 0;
let nativeWindowsPauseStartedAtMs: number | null = null;
let nativeWindowsPauseRanges: Array<{ startMs: number; endMs: number }> = [];
let nativeWindowsIsPaused = false;
let nativeWindowsCaptureStopping = false;
const NATIVE_WINDOWS_CAPTURE_STOP_TIMEOUT_MS = 15_000;
let nativeMacCaptureProcess: ChildProcessWithoutNullStreams | null = null;
let nativeMacCaptureOutput = "";
@@ -428,6 +438,23 @@ let nativeMacCursorRecordingStartMs = 0;
let nativeMacPauseStartedAtMs: number | null = null;
let nativeMacPauseRanges: Array<{ startMs: number; endMs: number }> = [];
let nativeMacIsPaused = false;
// Child process running the native guide hotkey listener; null while no listener is tracked.
let guideHotkeyListenerProcess: ChildProcessWithoutNullStreams | null = null;
// Accelerator string handed to globalShortcut.register for the marker hotkey.
const GUIDE_MARKER_HOTKEY = "Control+F12";
// Human-readable form of the accelerator, used in log messages.
const GUIDE_MARKER_HOTKEY_LABEL = "Ctrl+F12";
// Origin of a captured marker, as declared by the shared guide contract.
type GuideMarkerTrigger = GuideMarkerCapturedPayload["trigger"];
// Rectangle used to turn the cursor position into normalized marker coordinates.
type GuideHotkeyBounds = { x: number; y: number; width: number; height: number };
// Timing and geometry tracked for the recording that hotkey markers attach to.
type GuideHotkeyRecordingState = {
recordingId: number;
// Date.now() value taken when tracking started.
startedAtMs: number;
// Total time spent in completed pauses.
accumulatedPausedMs: number;
// Date.now() value of the pause in progress, or null when not paused.
pausedAtMs: number | null;
bounds: GuideHotkeyBounds;
};
// Recording currently tracked for hotkey markers, or null when none is active.
let activeGuideHotkeyRecording: GuideHotkeyRecordingState | null = null;
// Recording id of the active guide session; markers are captured only when it matches the tracked recording.
let activeGuideHotkeySessionId: number | null = null;
// Result of the last globalShortcut.register call for the marker hotkey.
let guideMarkerHotkeyRegistered = false;
// Date.now() value of the most recent capture request that passed the debounce check.
let lastGuideHotkeyCaptureAtMs = 0;
// Capture requests arriving less than this many milliseconds after the previous accepted one are ignored.
const GUIDE_HOTKEY_CAPTURE_DEBOUNCE_MS = 250;
function normalizeCursorSample(sample: unknown): CursorRecordingSample | null {
if (!sample || typeof sample !== "object") {
@@ -576,26 +603,389 @@ function resolveAssetBasePath() {
}
}
/**
 * Extracts the numeric index from a desktopCapturer screen source id of the
 * form "screen:<index>" or "screen:<index>:<rest>".
 *
 * @returns The index, or null when the id is missing, is not a screen id, or
 *   its index segment is not a run of decimal digits.
 */
function parseDesktopCapturerScreenIndex(sourceId?: string | null): number | null {
	if (!sourceId) {
		return null;
	}
	// The index segment must be entirely digits and end at ":" or end of string.
	const match = /^screen:(\d+)(?::|$)/.exec(sourceId);
	if (!match) {
		return null;
	}
	const index = Number(match[1]);
	if (!Number.isInteger(index) || index < 0) {
		return null;
	}
	return index;
}
/**
 * Validates an untyped bounds value and returns it with every field rounded
 * to an integer.
 *
 * @returns undefined when `input` is not an object, when any field does not
 *   convert to a finite number, or when width/height is not positive.
 */
function normalizeSourceBounds(input: unknown): SourceBounds | undefined {
	if (typeof input !== "object" || input === null) {
		return undefined;
	}
	const candidate = input as Partial<SourceBounds>;
	const left = Number(candidate.x);
	const top = Number(candidate.y);
	const w = Number(candidate.width);
	const h = Number(candidate.height);

	const allFinite = [left, top, w, h].every((value) => Number.isFinite(value));
	if (!allFinite || w <= 0 || h <= 0) {
		return undefined;
	}

	return {
		x: Math.round(left),
		y: Math.round(top),
		width: Math.round(w),
		height: Math.round(h),
	};
}
/** Copies a rectangle into a SourceBounds with every field rounded to an integer. */
function toSourceBounds(bounds: Rectangle): SourceBounds {
	const { x, y, width, height } = bounds;
	return {
		x: Math.round(x),
		y: Math.round(y),
		width: Math.round(width),
		height: Math.round(height),
	};
}
/**
 * Maps a desktopCapturer screen source to an Electron display.
 *
 * The source's `display_id` is tried first. If no display has that id, the
 * index parsed from the source id (or, failing that, `screenSourceIndex`) is
 * used as a position in `screen.getAllDisplays()`.
 *
 * @returns The matched display with its index, or
 *   `{ display: null, displayIndex: undefined }` when nothing matches.
 */
function findDisplayForSource(
	source: Pick<DesktopCapturerSource, "id" | "display_id">,
	screenSourceIndex?: number,
) {
	const allDisplays = screen.getAllDisplays();

	const numericDisplayId = Number(source.display_id);
	if (Number.isFinite(numericDisplayId)) {
		const matchIndex = allDisplays.findIndex((candidate) => candidate.id === numericDisplayId);
		if (matchIndex !== -1) {
			return { display: allDisplays[matchIndex], displayIndex: matchIndex };
		}
	}

	const fallbackIndex = parseDesktopCapturerScreenIndex(source.id) ?? screenSourceIndex;
	if (fallbackIndex != null && fallbackIndex < allDisplays.length) {
		return { display: allDisplays[fallbackIndex], displayIndex: fallbackIndex };
	}

	return { display: null, displayIndex: undefined };
}
/**
 * Resolves the Electron display that belongs to the currently selected source.
 *
 * Lookup order:
 *  1. by display id (`displayId` when numeric, otherwise `display_id`);
 *  2. by position in `screen.getAllDisplays()`, using `displayIndex`, then
 *     `screenIndex`, then the index parsed from the source id.
 *
 * @returns The matching display, or null when none can be resolved.
 */
function getSelectedSourceDisplay() {
	const displays = screen.getAllDisplays();
	const explicitDisplayId =
		typeof selectedSource?.displayId === "number"
			? selectedSource.displayId
			: Number(selectedSource?.display_id);
	const displayById = Number.isFinite(explicitDisplayId)
		? displays.find((display) => display.id === explicitDisplayId)
		: undefined;
	if (displayById) {
		return displayById;
	}

	const sourceIndex =
		typeof selectedSource?.displayIndex === "number"
			? selectedSource.displayIndex
			: typeof selectedSource?.screenIndex === "number"
				? selectedSource.screenIndex
				: parseDesktopCapturerScreenIndex(selectedSource?.id);
	// The selected source is supplied over IPC, so the index may be negative or
	// fractional. Indexing with such a value would yield `undefined` rather than
	// a display, so only non-negative integers inside the array are accepted.
	if (
		typeof sourceIndex === "number" &&
		Number.isInteger(sourceIndex) &&
		sourceIndex >= 0 &&
		sourceIndex < displays.length
	) {
		return displays[sourceIndex];
	}

	return null;
}
/**
 * Returns the bounds to use for the currently selected source.
 *
 * Preference order:
 *  1. valid bounds stored on the selected source itself;
 *  2. the bounds of the display resolved by `getSelectedSourceDisplay()`;
 *  3. the bounds of the display nearest to the cursor.
 */
function getSelectedSourceBounds() {
	const selectedBounds = normalizeSourceBounds(selectedSource?.bounds);
	if (selectedBounds) {
		return selectedBounds;
	}

	// A single lookup: the earlier display_id-only search declared a second
	// `sourceDisplay` in the same scope and is covered by getSelectedSourceDisplay().
	const sourceDisplay = getSelectedSourceDisplay();
	if (sourceDisplay) {
		return sourceDisplay.bounds;
	}

	return screen.getDisplayNearestPoint(screen.getCursorScreenPoint()).bounds;
}
/**
 * Converts a recording id given as a number or a numeric string into an
 * integer by truncating any fractional part.
 *
 * @returns null for blank strings, non-finite values, and any other type.
 */
function normalizeGuideHotkeyRecordingId(recordingId: unknown): number | null {
	let numeric: number;
	if (typeof recordingId === "number") {
		numeric = recordingId;
	} else if (typeof recordingId === "string" && recordingId.trim()) {
		numeric = Number(recordingId);
	} else {
		return null;
	}
	return Number.isFinite(numeric) ? Math.trunc(numeric) : null;
}
/**
 * Returns bounds that are safe to divide by: a non-finite x/y becomes 0, and
 * a width/height that is non-finite or not positive becomes 1.
 */
function sanitizeGuideHotkeyBounds(bounds: GuideHotkeyBounds): GuideHotkeyBounds {
	const finiteOr = (value: number, fallback: number) =>
		Number.isFinite(value) ? value : fallback;
	const positiveOr = (value: number, fallback: number) =>
		Number.isFinite(value) && value > 0 ? value : fallback;

	return {
		x: finiteOr(bounds.x, 0),
		y: finiteOr(bounds.y, 0),
		width: positiveOr(bounds.width, 1),
		height: positiveOr(bounds.height, 1),
	};
}
/**
 * Begins tracking a recording for hotkey markers, replacing any recording
 * tracked before. Does nothing when the id cannot be normalized to a number.
 *
 * @param recordingIdInput Recording id as a number or numeric string.
 * @param bounds Capture rectangle; defaults to the selected source bounds.
 */
function startGuideHotkeyRecording(
	recordingIdInput: unknown,
	bounds: GuideHotkeyBounds = getSelectedSourceBounds(),
) {
	const recordingId = normalizeGuideHotkeyRecordingId(recordingIdInput);
	if (recordingId !== null) {
		const startedAtMs = Date.now();
		activeGuideHotkeyRecording = {
			recordingId,
			startedAtMs,
			accumulatedPausedMs: 0,
			pausedAtMs: null,
			bounds: sanitizeGuideHotkeyBounds(bounds),
		};
	}
}
/** Drops both the active guide session id and the tracked hotkey recording. */
function clearGuideHotkeyRecording() {
	activeGuideHotkeySessionId = null;
	activeGuideHotkeyRecording = null;
}
/**
 * Marks the given recording as the active guide session. An id that cannot
 * be normalized leaves the current session untouched.
 */
function activateGuideHotkeySession(recordingIdInput: unknown) {
	const recordingId = normalizeGuideHotkeyRecordingId(recordingIdInput);
	if (recordingId === null) {
		return;
	}
	activeGuideHotkeySessionId = recordingId;
}
/**
 * Clears the active guide session when the given id matches it, or when the
 * id cannot be normalized. A valid id for a different session is ignored.
 */
function deactivateGuideHotkeySession(recordingIdInput: unknown) {
	const recordingId = normalizeGuideHotkeyRecordingId(recordingIdInput);
	const refersToOtherSession = recordingId !== null && activeGuideHotkeySessionId !== recordingId;
	if (refersToOtherSession) {
		return;
	}
	activeGuideHotkeySessionId = null;
}
/** Records the pause start time on the tracked recording, unless it is already paused. */
function pauseGuideHotkeyRecording() {
	const recording = activeGuideHotkeyRecording;
	if (!recording || recording.pausedAtMs !== null) {
		return;
	}
	recording.pausedAtMs = Date.now();
}
/**
 * Ends the pause in progress on the tracked recording and adds its length
 * (never negative) to the accumulated paused time. No-op when not paused.
 */
function resumeGuideHotkeyRecording() {
	const recording = activeGuideHotkeyRecording;
	if (recording && recording.pausedAtMs !== null) {
		const pausedForMs = Date.now() - recording.pausedAtMs;
		recording.accumulatedPausedMs += Math.max(0, pausedForMs);
		recording.pausedAtMs = null;
	}
}
/**
 * Returns the recording's elapsed time in milliseconds with paused time
 * excluded. While paused, the clock is frozen at the pause start. The result
 * is never negative.
 */
function getGuideHotkeyRecordingTimeMs(recording: GuideHotkeyRecordingState): number {
	const referenceMs = recording.pausedAtMs != null ? recording.pausedAtMs : Date.now();
	const elapsedMs = referenceMs - recording.startedAtMs - recording.accumulatedPausedMs;
	return Math.max(0, elapsedMs);
}
/**
 * Reads the current cursor position and expresses it relative to the given
 * bounds.
 *
 * @returns The cursor position normalized to [0, 1] on each axis, the raw
 *   screen coordinates, and the sanitized bounds used for the division.
 */
function getGuideHotkeyPoint(boundsInput: GuideHotkeyBounds) {
	const bounds = sanitizeGuideHotkeyBounds(boundsInput);
	const { x: rawX, y: rawY } = screen.getCursorScreenPoint();
	const normalizedX = clampGuideHotkey01((rawX - bounds.x) / bounds.width);
	const normalizedY = clampGuideHotkey01((rawY - bounds.y) / bounds.height);
	return { normalizedX, normalizedY, rawX, rawY, bounds };
}
/** Clamps a number to the range [0, 1]; non-finite input yields 0. */
function clampGuideHotkey01(value: number): number {
	return Number.isFinite(value) ? Math.max(0, Math.min(1, value)) : 0;
}
/**
 * Adds a "hotkey" marker at the current cursor position to the tracked
 * recording and broadcasts it to every open window.
 *
 * Nothing is captured when no recording is tracked, when the tracked
 * recording is not the active guide session, or when the previous accepted
 * request was less than GUIDE_HOTKEY_CAPTURE_DEBOUNCE_MS ago.
 *
 * @param guideStore Store that persists the marker.
 * @param trigger What initiated the capture; forwarded in the notification.
 * @returns `{ captured: true, ...result }` on success, `{ captured: false }`
 *   when skipped, or `{ captured: false, error }` when the store call throws.
 */
async function captureGuideHotkeyMarker(
guideStore: GuideStore,
trigger: GuideMarkerTrigger = "global-shortcut",
) {
const recording = activeGuideHotkeyRecording;
// Markers are only meaningful while the tracked recording is the active guide session.
if (!recording || activeGuideHotkeySessionId !== recording.recordingId) {
return { captured: false };
}
const captureRequestedAtMs = Date.now();
if (captureRequestedAtMs - lastGuideHotkeyCaptureAtMs < GUIDE_HOTKEY_CAPTURE_DEBOUNCE_MS) {
return { captured: false };
}
// Updated before the store write, so a failed capture still counts toward the debounce window.
lastGuideHotkeyCaptureAtMs = captureRequestedAtMs;
const point = getGuideHotkeyPoint(recording.bounds);
try {
const result = await guideStore.addMarker({
recordingId: recording.recordingId,
kind: "hotkey",
timeMs: getGuideHotkeyRecordingTimeMs(recording),
// x/y are given the same normalized values as normalizedX/normalizedY.
x: point.normalizedX,
y: point.normalizedY,
normalizedX: point.normalizedX,
normalizedY: point.normalizedY,
});
// The notification uses the stored event's values, plus the raw cursor position.
notifyGuideMarkerCaptured({
recordingId: result.event.recordingId,
eventId: result.event.id,
timeMs: result.event.timeMs,
trigger,
normalizedX: result.event.normalizedX,
normalizedY: result.event.normalizedY,
rawX: point.rawX,
rawY: point.rawY,
});
console.info("[guide-hotkey] marker captured", {
recordingId: recording.recordingId,
timeMs: result.event.timeMs,
trigger,
normalizedX: result.event.normalizedX,
normalizedY: result.event.normalizedY,
rawX: point.rawX,
rawY: point.rawY,
bounds: point.bounds,
});
return { captured: true, ...result };
} catch (error) {
// Failures are reported in the return value instead of being rethrown.
const message = error instanceof Error ? error.message : String(error);
console.warn("[guide-hotkey] failed to capture marker:", message);
return { captured: false, error: message };
}
}
/** Sends "guide:marker-captured" with the payload to every window that has not been destroyed. */
function notifyGuideMarkerCaptured(payload: GuideMarkerCapturedPayload) {
	for (const targetWindow of BrowserWindow.getAllWindows()) {
		if (targetWindow.isDestroyed()) {
			continue;
		}
		targetWindow.webContents.send("guide:marker-captured", payload);
	}
}
/**
 * Handles one line of output from the native hotkey listener.
 *
 * Blank lines are ignored. A "ready" event is logged; a "guide-hotkey" event
 * for the control key going down triggers a marker capture. Anything that
 * fails to parse is logged as invalid JSON.
 */
function handleGuideHotkeyListenerLine(line: string, guideStore: GuideStore) {
	const text = line.trim();
	if (text.length === 0) {
		return;
	}

	try {
		const { event, key, state } = JSON.parse(text) as {
			event?: unknown;
			key?: unknown;
			state?: unknown;
		};
		if (event === "ready") {
			console.info("[guide-hotkey] native Ctrl listener ready");
			return;
		}
		const isControlDown = event === "guide-hotkey" && key === "control" && state === "down";
		if (isControlDown) {
			void captureGuideHotkeyMarker(guideStore, "global-control");
		}
	} catch {
		console.warn("[guide-hotkey] native listener emitted invalid JSON:", text);
	}
}
/**
 * Spawns the native guide hotkey listener (Windows only) and feeds each line
 * of its stdout to `handleGuideHotkeyListenerLine`.
 *
 * Does nothing on other platforms, when a listener is already tracked, or
 * when no helper executable can be found.
 *
 * @param guideStore Store passed through to the line handler.
 */
async function startNativeGuideHotkeyListener(guideStore: GuideStore) {
	if (process.platform !== "win32" || guideHotkeyListenerProcess) {
		return;
	}

	const helperPath = await findNativeGuideHotkeyListenerPath();
	if (!helperPath) {
		console.warn("[guide-hotkey] native Ctrl listener is unavailable");
		return;
	}

	// Another call may have spawned the helper while the path lookup was
	// pending; check again so two listeners are never started.
	if (guideHotkeyListenerProcess) {
		return;
	}

	const proc = spawn(helperPath, [], {
		cwd: path.dirname(helperPath),
		stdio: ["pipe", "pipe", "pipe"],
		windowsHide: true,
	});
	// Nothing is ever written to the helper, so its stdin is closed right away.
	proc.stdin.end();
	guideHotkeyListenerProcess = proc;

	// stdout is newline-delimited; keep the trailing partial line until the next chunk.
	let stdoutBuffer = "";
	proc.stdout.setEncoding("utf-8");
	proc.stdout.on("data", (chunk: string) => {
		stdoutBuffer += chunk;
		const lines = stdoutBuffer.split(/\r?\n/);
		stdoutBuffer = lines.pop() ?? "";
		for (const line of lines) {
			handleGuideHotkeyListenerLine(line, guideStore);
		}
	});

	proc.stderr.setEncoding("utf-8");
	proc.stderr.on("data", (chunk: string) => {
		const message = chunk.trim();
		if (message) {
			console.warn("[guide-hotkey] native listener:", message);
		}
	});

	// Only clear the tracked process if it is still this one.
	proc.once("error", (error) => {
		console.warn("[guide-hotkey] failed to start native Ctrl listener:", error);
		if (guideHotkeyListenerProcess === proc) {
			guideHotkeyListenerProcess = null;
		}
	});
	proc.once("exit", (code, signal) => {
		if (guideHotkeyListenerProcess === proc) {
			guideHotkeyListenerProcess = null;
		}
		if (code !== 0 && code !== null) {
			console.warn("[guide-hotkey] native Ctrl listener exited", { code, signal });
		}
	});
}
/** Stops tracking the native hotkey listener and kills it unless it was already killed. */
function stopNativeGuideHotkeyListener() {
	const listener = guideHotkeyListenerProcess;
	guideHotkeyListenerProcess = null;
	if (!listener || listener.killed) {
		return;
	}
	listener.kill();
}
/**
 * Starts the native hotkey listener and registers the global marker shortcut.
 *
 * Cleanup on "will-quit" is installed before the shortcut is registered, so
 * the native listener is stopped at quit even when shortcut registration
 * fails. Returns immediately when the shortcut is already registered.
 *
 * @param guideStore Store that receives captured markers.
 */
function registerGuideMarkerHotkey(guideStore: GuideStore) {
	if (guideMarkerHotkeyRegistered) {
		return;
	}

	// The start is fire-and-forget; log a failure instead of leaving an
	// unhandled promise rejection.
	startNativeGuideHotkeyListener(guideStore).catch((error) => {
		console.warn("[guide-hotkey] failed to start native Ctrl listener:", error);
	});

	// Both cleanup steps are safe to run more than once, so an extra handler
	// from a repeated call after a failed registration is harmless.
	app.once("will-quit", () => {
		if (guideMarkerHotkeyRegistered) {
			globalShortcut.unregister(GUIDE_MARKER_HOTKEY);
			guideMarkerHotkeyRegistered = false;
		}
		stopNativeGuideHotkeyListener();
	});

	guideMarkerHotkeyRegistered = globalShortcut.register(GUIDE_MARKER_HOTKEY, () => {
		void captureGuideHotkeyMarker(guideStore, "global-shortcut");
	});
	if (!guideMarkerHotkeyRegistered) {
		console.warn(`[guide-hotkey] failed to register ${GUIDE_MARKER_HOTKEY_LABEL}`);
	}
}
/** Returns the selected source's id when it is a string, otherwise null. */
function getSelectedSourceId() {
	const id = selectedSource?.id;
	if (typeof id === "string") {
		return id;
	}
	return null;
}
/**
 * Returns the display for the currently selected source, or null when none
 * can be resolved.
 *
 * Delegates to `getSelectedSourceDisplay()` so that the numeric `displayId`
 * and the index-based fallbacks are honoured, not only `display_id`.
 */
function getSelectedDisplay() {
	return getSelectedSourceDisplay();
}
function resolveUnpackedAppPath(...segments: string[]) {
@@ -634,6 +1024,19 @@ function getNativeWindowsCaptureHelperCandidates() {
].filter((candidate): candidate is string => Boolean(candidate));
}
/**
 * Lists the paths to probe for the guide hotkey listener executable, in
 * priority order: the OPENSCREEN_GUIDE_HOTKEY_LISTENER_EXE override, the
 * local build outputs, then the per-architecture bin directories. Entries
 * that resolve to an empty value are dropped.
 */
function getNativeGuideHotkeyListenerCandidates() {
	const helperName = "guide-hotkey-listener.exe";
	const archTag = process.arch === "arm64" ? "win32-arm64" : "win32-x64";
	const buildDir = ["electron", "native", "wgc-capture", "build"];
	const binDir = ["electron", "native", "bin", archTag];

	const candidates = [
		process.env.OPENSCREEN_GUIDE_HOTKEY_LISTENER_EXE?.trim(),
		resolveUnpackedAppPath(...buildDir, "Release", helperName),
		resolveUnpackedAppPath(...buildDir, helperName),
		resolveUnpackedAppPath(...binDir, helperName),
		resolvePackagedResourcePath(...binDir, helperName),
	];
	return candidates.filter((candidate): candidate is string => Boolean(candidate));
}
async function findNativeWindowsCaptureHelperPath() {
if (process.platform !== "win32") {
return null;
@@ -651,6 +1054,23 @@ async function findNativeWindowsCaptureHelperPath() {
return null;
}
/**
 * Returns the first candidate helper path that passes an execute-access
 * check, probing candidates one at a time in priority order.
 *
 * @returns The path, or null off Windows or when no candidate is accessible.
 */
async function findNativeGuideHotkeyListenerPath() {
	if (process.platform === "win32") {
		for (const candidate of getNativeGuideHotkeyListenerCandidates()) {
			// An inaccessible candidate just means the next location is tried.
			const accessible = await fs.access(candidate, fsConstants.X_OK).then(
				() => true,
				() => false,
			);
			if (accessible) {
				return candidate;
			}
		}
	}
	return null;
}
function getNativeMacCaptureHelperCandidates() {
const envPath = process.env.OPENSCREEN_SCK_CAPTURE_EXE?.trim();
const archTag = process.arch === "arm64" ? "darwin-arm64" : "darwin-x64";
@@ -918,6 +1338,81 @@ function completeNativeWindowsCursorPauseRange(endMs = Date.now()) {
nativeWindowsPauseStartedAtMs = null;
}
/**
 * Returns all module-level native Windows capture state to its idle values
 * and clears the guide hotkey recording.
 */
function resetNativeWindowsCaptureState() {
	// Guide hotkey tracking.
	clearGuideHotkeyRecording();

	// Pause bookkeeping and the stop-in-progress flag.
	nativeWindowsCaptureStopping = false;
	nativeWindowsIsPaused = false;
	nativeWindowsPauseRanges = [];
	nativeWindowsPauseStartedAtMs = null;

	// Cursor capture settings.
	nativeWindowsCursorRecordingStartMs = 0;
	nativeWindowsCursorCaptureMode = "editable-overlay";
	nativeWindowsCursorOffsetMs = 0;

	// Process handle, recording id and output targets.
	nativeWindowsCaptureRecordingId = null;
	nativeWindowsCaptureWebcamTargetPath = null;
	nativeWindowsCaptureTargetPath = null;
	nativeWindowsCaptureProcess = null;
}
/**
 * Reports whether the tracked Windows capture helper is still running.
 *
 * A tracked process that has an exit code or has been killed is treated as
 * stale: a warning is logged, the capture state is reset, and false is
 * returned.
 */
function hasActiveNativeWindowsCaptureProcess() {
	const tracked = nativeWindowsCaptureProcess;
	if (!tracked) {
		return false;
	}

	const { exitCode, killed } = tracked;
	const stillRunning = exitCode === null && !killed;
	if (!stillRunning) {
		console.warn("[native-wgc] clearing stale Windows capture process state", {
			exitCode,
			killed,
		});
		resetNativeWindowsCaptureState();
	}
	return stillRunning;
}
/**
 * Watches a Windows capture helper for a "close" or "error" event that
 * happens before a stop was requested, and cleans up when one does.
 *
 * Whichever event fires first removes the other listener, so cleanup runs at
 * most once. Events are ignored when `proc` is no longer the tracked process
 * or when a stop is already in progress.
 *
 * @param proc The helper process to watch.
 * @param sourceName Name passed to `onRecordingStateChange`.
 * @param onRecordingStateChange Optional callback invoked with `false` after cleanup.
 */
function attachNativeWindowsCaptureLifecycle(
proc: ChildProcessWithoutNullStreams,
sourceName: string,
onRecordingStateChange?: (recording: boolean, sourceName: string) => void,
) {
// Stops cursor recording (logging any failure), discards pending cursor data,
// resets capture state and reports that recording has ended.
const cleanupAfterUnexpectedExit = async () => {
try {
await stopCursorRecording();
} catch (error) {
console.warn("[native-wgc] failed to stop cursor recording after helper exit", error);
}
pendingCursorRecordingData = null;
resetNativeWindowsCaptureState();
onRecordingStateChange?.(false, sourceName);
};
function onClose(code: number | null, signal: NodeJS.Signals | null) {
proc.off("error", onError);
// Skip when another process is now tracked or the stop path is handling this exit.
if (nativeWindowsCaptureProcess !== proc || nativeWindowsCaptureStopping) {
return;
}
console.warn("[native-wgc] Windows capture helper exited before stop was requested", {
code,
signal,
output: nativeWindowsCaptureOutput.trim(),
});
// Fire-and-forget: the cleanup promise is intentionally not awaited.
void cleanupAfterUnexpectedExit();
}
function onError(error: Error) {
proc.off("close", onClose);
if (nativeWindowsCaptureProcess !== proc || nativeWindowsCaptureStopping) {
return;
}
console.warn("[native-wgc] Windows capture helper errored before stop was requested", error);
void cleanupAfterUnexpectedExit();
}
proc.once("close", onClose);
proc.once("error", onError);
}
function waitForNativeWindowsCaptureStart(proc: ChildProcessWithoutNullStreams) {
return new Promise<void>((resolve, reject) => {
const timer = setTimeout(() => {
@@ -1312,17 +1807,79 @@ export function registerIpcHandlers(
ipcMain.handle("get-sources", async (_, opts) => {
const sources = await desktopCapturer.getSources(opts);
lastEnumeratedSources = new Map(sources.map((source) => [source.id, source]));
return sources.map((source) => ({
id: source.id,
name: source.name,
display_id: source.display_id,
thumbnail: source.thumbnail ? source.thumbnail.toDataURL() : null,
appIcon: source.appIcon ? source.appIcon.toDataURL() : null,
}));
let screenSourceIndex = 0;
const processedSources = sources.map((source) => {
const isScreenSource = source.id.startsWith("screen:");
const sourceIndex = isScreenSource
? (parseDesktopCapturerScreenIndex(source.id) ?? screenSourceIndex)
: undefined;
const { display, displayIndex } = isScreenSource
? findDisplayForSource(source, screenSourceIndex)
: { display: null, displayIndex: undefined };
if (isScreenSource) {
screenSourceIndex += 1;
}
const bounds = display ? toSourceBounds(display.bounds) : undefined;
const displayLabel = bounds
? `Display ${(displayIndex ?? sourceIndex ?? 0) + 1} - ${bounds.width}x${bounds.height} @ ${bounds.x},${bounds.y}`
: undefined;
return {
id: source.id,
name: source.name,
display_id: source.display_id,
thumbnail: source.thumbnail ? source.thumbnail.toDataURL() : null,
appIcon: source.appIcon ? source.appIcon.toDataURL() : null,
displayId: display?.id,
displayIndex,
screenIndex: sourceIndex,
displayLabel,
bounds,
};
});
const screenDisplays = screen.getAllDisplays();
const mappedDisplayIds = new Set(
processedSources
.filter((source) => source.id.startsWith("screen:") && typeof source.displayId === "number")
.map((source) => source.displayId),
);
const fallbackScreenSources = screenDisplays
.map((display, displayIndex) => ({ display, displayIndex }))
.filter(({ display }) => !mappedDisplayIds.has(display.id))
.map(({ display, displayIndex }) => {
const bounds = toSourceBounds(display.bounds);
return {
id: `screen:${displayIndex}:fallback:${display.id}`,
name: `Screen ${displayIndex + 1}`,
display_id: String(display.id),
thumbnail: null,
appIcon: null,
displayId: display.id,
displayIndex,
screenIndex: displayIndex,
displayLabel: `Display ${displayIndex + 1} - ${bounds.width}x${bounds.height} @ ${bounds.x},${bounds.y}`,
bounds,
};
});
if (fallbackScreenSources.length > 0) {
console.warn("[desktop-capturer] added fallback display sources", {
capturerScreens: processedSources.filter((source) => source.id.startsWith("screen:"))
.length,
electronDisplays: screenDisplays.length,
fallbackScreens: fallbackScreenSources.map((source) => ({
id: source.id,
displayId: source.displayId,
bounds: source.bounds,
})),
});
}
return [...processedSources, ...fallbackScreenSources];
});
ipcMain.handle("select-source", async (_, source: SelectedSource) => {
selectedSource = source;
selectedSource = {
...source,
bounds: normalizeSourceBounds(source.bounds),
};
// Reuse the exact source object returned during enumeration to avoid
// Windows window-source id mismatches across separate getSources() calls.
selectedDesktopSource =
@@ -1520,7 +2077,7 @@ export function registerIpcHandlers(
error: "Windows Graphics Capture requires Windows 10 build 19041 or newer.",
};
}
if (nativeWindowsCaptureProcess) {
if (hasActiveNativeWindowsCaptureProcess()) {
return { success: false, error: "Native Windows capture is already running." };
}
@@ -1545,16 +2102,19 @@ export function registerIpcHandlers(
RECORDINGS_DIR,
`${RECORDING_FILE_PREFIX}${recordingId}-webcam.mp4`,
);
const requestBounds = normalizeSourceBounds(request.source.bounds);
const sourceDisplay =
request.source.type === "display" && typeof request.source.displayId === "number"
? (screen.getAllDisplays().find((display) => display.id === request.source.displayId) ??
null)
: getSelectedDisplay();
const bounds = sourceDisplay?.bounds ?? getSelectedSourceBounds();
const bounds = requestBounds ?? sourceDisplay?.bounds ?? getSelectedSourceBounds();
const displayId =
typeof request.source.displayId === "number" && Number.isFinite(request.source.displayId)
? request.source.displayId
: Number(selectedSource?.display_id);
: typeof selectedSource?.displayId === "number"
? selectedSource.displayId
: Number(selectedSource?.display_id);
const webcamDirectShowClsid = request.webcam.enabled
? await resolveDirectShowWebcamClsid(request.webcam.deviceName)
: null;
@@ -1666,6 +2226,8 @@ export function registerIpcHandlers(
});
const source = selectedSource || { name: "Screen" };
attachNativeWindowsCaptureLifecycle(proc, source.name, onRecordingStateChange);
startGuideHotkeyRecording(recordingId, bounds);
if (onRecordingStateChange) {
onRecordingStateChange(true, source.name);
}
@@ -1679,16 +2241,7 @@ export function registerIpcHandlers(
} catch (error) {
console.error("Failed to start native Windows recording:", error);
nativeWindowsCaptureProcess?.kill();
nativeWindowsCaptureProcess = null;
nativeWindowsCaptureTargetPath = null;
nativeWindowsCaptureWebcamTargetPath = null;
nativeWindowsCaptureRecordingId = null;
nativeWindowsCursorOffsetMs = 0;
nativeWindowsCursorCaptureMode = "editable-overlay";
nativeWindowsCursorRecordingStartMs = 0;
nativeWindowsPauseStartedAtMs = null;
nativeWindowsPauseRanges = [];
nativeWindowsIsPaused = false;
resetNativeWindowsCaptureState();
await stopCursorRecording();
return { success: false, error: String(error) };
}
@@ -1811,6 +2364,7 @@ export function registerIpcHandlers(
: 0;
const source = selectedSource || { name: "Screen" };
startGuideHotkeyRecording(recordingId, bounds);
if (onRecordingStateChange) {
onRecordingStateChange(true, source.name);
}
@@ -1833,6 +2387,7 @@ export function registerIpcHandlers(
nativeMacPauseStartedAtMs = null;
nativeMacPauseRanges = [];
nativeMacIsPaused = false;
clearGuideHotkeyRecording();
await stopCursorRecording();
return { success: false, error: error instanceof Error ? error.message : String(error) };
}
@@ -1858,6 +2413,7 @@ export function registerIpcHandlers(
proc.stdin.write("pause\n");
nativeMacIsPaused = true;
nativeMacPauseStartedAtMs = Date.now();
pauseGuideHotkeyRecording();
return { success: true };
} catch (error) {
return { success: false, error: error instanceof Error ? error.message : String(error) };
@@ -1884,6 +2440,7 @@ export function registerIpcHandlers(
proc.stdin.write("resume\n");
completeNativeMacCursorPauseRange();
nativeMacIsPaused = false;
resumeGuideHotkeyRecording();
return { success: true };
} catch (error) {
return { success: false, error: error instanceof Error ? error.message : String(error) };
@@ -1906,6 +2463,7 @@ export function registerIpcHandlers(
proc.stdin.write("pause\n");
nativeWindowsIsPaused = true;
nativeWindowsPauseStartedAtMs = Date.now();
pauseGuideHotkeyRecording();
return { success: true };
} catch (error) {
return { success: false, error: error instanceof Error ? error.message : String(error) };
@@ -1928,6 +2486,7 @@ export function registerIpcHandlers(
proc.stdin.write("resume\n");
completeNativeWindowsCursorPauseRange();
nativeWindowsIsPaused = false;
resumeGuideHotkeyRecording();
return { success: true };
} catch (error) {
return { success: false, error: error instanceof Error ? error.message : String(error) };
@@ -1941,11 +2500,13 @@ export function registerIpcHandlers(
const recordingId = nativeWindowsCaptureRecordingId ?? Date.now();
const cursorCaptureMode = nativeWindowsCursorCaptureMode;
if (!proc) {
if (!proc || proc.exitCode !== null || proc.killed) {
resetNativeWindowsCaptureState();
return { success: false, error: "Native Windows capture is not running." };
}
try {
nativeWindowsCaptureStopping = true;
completeNativeWindowsCursorPauseRange();
const stoppedPathPromise = waitForNativeWindowsCaptureStop(proc);
proc.stdin.write("stop\n");
@@ -2007,16 +2568,7 @@ export function registerIpcHandlers(
await stopCursorRecording();
return { success: false, error: String(error) };
} finally {
nativeWindowsCaptureProcess = null;
nativeWindowsCaptureTargetPath = null;
nativeWindowsCaptureWebcamTargetPath = null;
nativeWindowsCaptureRecordingId = null;
nativeWindowsCursorOffsetMs = 0;
nativeWindowsCursorCaptureMode = "editable-overlay";
nativeWindowsCursorRecordingStartMs = 0;
nativeWindowsPauseStartedAtMs = null;
nativeWindowsPauseRanges = [];
nativeWindowsIsPaused = false;
resetNativeWindowsCaptureState();
const source = selectedSource || { name: "Screen" };
if (onRecordingStateChange) {
onRecordingStateChange(false, source.name);
@@ -2102,6 +2654,7 @@ export function registerIpcHandlers(
nativeMacPauseStartedAtMs = null;
nativeMacPauseRanges = [];
nativeMacIsPaused = false;
clearGuideHotkeyRecording();
const source = selectedSource || { name: "Screen" };
if (onRecordingStateChange) {
onRecordingStateChange(false, source.name);
@@ -2178,11 +2731,28 @@ export function registerIpcHandlers(
const guideAiSettingsStore = new DeepSeekSettingsStore(
path.join(app.getPath("userData"), "guide-ai-settings.json"),
);
registerGuideIpcHandlers(
ipcMain,
new GuideStore(RECORDINGS_DIR, { deepSeekConfigProvider: guideAiSettingsStore }),
guideAiSettingsStore,
);
const guideStore = new GuideStore(RECORDINGS_DIR, {
deepSeekConfigProvider: guideAiSettingsStore,
ocrConfigProvider: guideAiSettingsStore,
});
registerGuideMarkerHotkey(guideStore);
registerGuideIpcHandlers(ipcMain, guideStore, guideAiSettingsStore, {
onSessionStarted: (session) => activateGuideHotkeySession(session.recordingId),
onSessionEnded: (recordingId) => deactivateGuideHotkeySession(recordingId),
});
ipcMain.handle("guide:capture-pointer-marker", async () => {
const result = await captureGuideHotkeyMarker(guideStore, "button");
if (result.error) {
return {
success: false,
code: "guide-internal-error",
error: result.error,
retryable: true,
};
}
return { success: true, data: result };
});
ipcMain.handle("store-recorded-session", async (_, payload: StoreRecordedSessionInput) => {
try {
@@ -2315,6 +2885,11 @@ export function registerIpcHandlers(
} else {
await stopCursorRecording();
}
if (recording) {
startGuideHotkeyRecording(recordingId, getSelectedSourceBounds());
} else {
clearGuideHotkeyRecording();
}
const source = selectedSource || { name: "Screen" };
if (onRecordingStateChange) {
+1 -1
View File
@@ -46,7 +46,7 @@ Build the Windows helper with:
npm run build:native:win
```
The build writes the CMake output to `electron/native/wgc-capture/build/wgc-capture.exe` and copies the redistributable binary to `electron/native/bin/win32-x64/wgc-capture.exe`.
The build writes the CMake output to `electron/native/wgc-capture/build/wgc-capture.exe` and copies the redistributable binary to `electron/native/bin/win32-x64/wgc-capture.exe`. It also builds `cursor-sampler.exe` for editable cursor telemetry and `guide-hotkey-listener.exe` for the Guide Mode global Ctrl capture hook.
The helper contract is process-based: the app starts the process with one JSON argument and sends commands on stdin. `stop\n` finalizes the recording. During migration the helper prints both newline-delimited JSON events and the legacy text messages `Recording started` / `Recording stopped. Output path: <path>`.
@@ -65,3 +65,19 @@ target_link_libraries(cursor-sampler PRIVATE
gdi32
gdiplus
)
# Helper executable built from the guide hotkey listener source file.
add_executable(guide-hotkey-listener
src/guide-hotkey-listener.cpp
)
# NOMINMAX and WIN32_LEAN_AND_MEAN trim what <windows.h> pulls in;
# _WIN32_WINNT=0x0A00 selects the Windows 10 API level.
target_compile_definitions(guide-hotkey-listener PRIVATE
NOMINMAX
WIN32_LEAN_AND_MEAN
_WIN32_WINNT=0x0A00
)
# MSVC flags: C++ exception handling, warning level 4, UTF-8 source and execution charsets.
target_compile_options(guide-hotkey-listener PRIVATE /EHsc /W4 /utf-8)
# user32 is the only library this target links explicitly.
target_link_libraries(guide-hotkey-listener PRIVATE
user32
)
@@ -0,0 +1,91 @@
#include <windows.h>
#include <atomic>
#include <chrono>
#include <cstdint>
#include <iostream>
#include <mutex>
#include <string>
static HHOOK g_keyboardHook = nullptr;
static DWORD g_mainThreadId = 0;
static std::atomic<bool> g_ctrlDown{false};
static std::mutex g_stdoutMutex;
// Returns the system clock's current time as milliseconds since its epoch.
static int64_t nowMs() {
    using namespace std::chrono;
    const auto sinceEpoch = system_clock::now().time_since_epoch();
    const auto wholeMilliseconds = duration_cast<milliseconds>(sinceEpoch);
    return static_cast<int64_t>(wholeMilliseconds.count());
}
// Writes one line to stdout and flushes it, holding g_stdoutMutex for the
// duration so concurrent writers cannot interleave their output.
static void writeJsonLine(const std::string& json) {
    const std::lock_guard<std::mutex> guard(g_stdoutMutex);
    std::cout << json << '\n' << std::flush;
}
// Returns true for the generic, left and right Control virtual-key codes.
static bool isCtrlKey(DWORD vkCode) {
    switch (vkCode) {
    case VK_CONTROL:
    case VK_LCONTROL:
    case VK_RCONTROL:
        return true;
    default:
        return false;
    }
}
// Keyboard hook callback: emits one "guide-hotkey" JSON line each time a
// Control key goes from up to down, then passes the event to the next hook.
static LRESULT CALLBACK LowLevelKeyboardProc(int nCode, WPARAM wParam, LPARAM lParam) {
// Events with a negative nCode are forwarded without being inspected.
if (nCode >= 0) {
const auto* event = reinterpret_cast<KBDLLHOOKSTRUCT*>(lParam);
if (event && isCtrlKey(event->vkCode)) {
if (wParam == WM_KEYDOWN || wParam == WM_SYSKEYDOWN) {
// exchange() returns the previous state, so repeated key-down messages
// while the key is held do not produce additional lines.
const bool wasDown = g_ctrlDown.exchange(true, std::memory_order_acq_rel);
if (!wasDown) {
writeJsonLine(
"{\"event\":\"guide-hotkey\",\"key\":\"control\",\"state\":\"down\",\"timeMs\":" +
std::to_string(nowMs()) + "}");
}
} else if (wParam == WM_KEYUP || wParam == WM_SYSKEYUP) {
// One flag covers every Control key, so any Control key-up re-arms the next key-down.
g_ctrlDown.store(false, std::memory_order_release);
}
}
}
// The event is never swallowed; it always continues down the hook chain.
return CallNextHookEx(g_keyboardHook, nCode, wParam, lParam);
}
// Console control handler: for Ctrl+C, Ctrl+Break, close, logoff and shutdown
// signals, posts WM_QUIT to the main thread and reports the signal as
// handled. Any other signal is left unhandled.
static BOOL WINAPI consoleCtrlHandler(DWORD signal) {
    switch (signal) {
    case CTRL_C_EVENT:
    case CTRL_BREAK_EVENT:
    case CTRL_CLOSE_EVENT:
    case CTRL_LOGOFF_EVENT:
    case CTRL_SHUTDOWN_EVENT:
        PostThreadMessage(g_mainThreadId, WM_QUIT, 0, 0);
        return TRUE;
    default:
        return FALSE;
    }
}
// Entry point: installs the low-level keyboard hook, announces readiness on
// stdout, and pumps messages until WM_QUIT arrives or GetMessageW fails.
// Returns 1 when the hook cannot be installed, otherwise 0.
int main() {
// Recorded so consoleCtrlHandler can post WM_QUIT to this thread.
g_mainThreadId = GetCurrentThreadId();
SetConsoleCtrlHandler(consoleCtrlHandler, TRUE);
g_keyboardHook = SetWindowsHookExW(WH_KEYBOARD_LL, LowLevelKeyboardProc, GetModuleHandleW(nullptr), 0);
if (!g_keyboardHook) {
std::cerr << "Failed to install guide hotkey keyboard hook. error=" << GetLastError() << std::endl;
return 1;
}
// Signals the parent process that the hook is installed.
writeJsonLine("{\"event\":\"ready\"}");
MSG msg{};
// "> 0" ends the loop on both WM_QUIT (0) and an error (-1).
while (GetMessageW(&msg, nullptr, 0, 0) > 0) {
TranslateMessage(&msg);
DispatchMessageW(&msg);
}
// Remove the hook before exiting.
if (g_keyboardHook) {
UnhookWindowsHookEx(g_keyboardHook);
g_keyboardHook = nullptr;
}
return 0;
}
+1
View File
@@ -400,6 +400,7 @@ int main(int argc, char* argv[]) {
if (config.sourceType == "display") {
HMONITOR monitor = findMonitorForCapture(
config.displayId,
config.sourceId,
config.hasDisplayBounds ? &config.bounds : nullptr);
if (!monitor) {
std::cerr << "ERROR: Could not resolve monitor" << std::endl;
@@ -2,6 +2,7 @@
#include <algorithm>
#include <cmath>
#include <string>
#include <vector>
namespace {
@@ -43,9 +44,36 @@ int64_t overlapArea(const RECT& rect, const MonitorBounds& bounds) {
return static_cast<int64_t>(right - left) * static_cast<int64_t>(bottom - top);
}
// Extracts the zero-based index from a source id of the form
// "screen:<index>" or "screen:<index>:<rest>".
//
// The index segment must consist solely of decimal digits. std::stoi on its
// own would also accept leading whitespace and a sign (" 5", "+5", "-0"), so
// the digits are checked explicitly before converting.
//
// Returns -1 when the prefix is missing, the segment is empty or not all
// digits, or the value does not fit in an int.
int parseScreenSourceIndex(const std::string& sourceId) {
    constexpr char prefix[] = "screen:";
    if (sourceId.rfind(prefix, 0) != 0) {
        return -1;
    }

    const size_t start = sizeof(prefix) - 1;
    const size_t end = sourceId.find(':', start);
    const std::string indexText = sourceId.substr(
        start,
        end == std::string::npos ? std::string::npos : end - start);
    if (indexText.empty()) {
        return -1;
    }

    for (const char ch : indexText) {
        if (ch < '0' || ch > '9') {
            return -1;
        }
    }

    try {
        // All characters are digits, so the result is non-negative; a value
        // too large for int throws and is reported as -1.
        return std::stoi(indexText);
    } catch (...) {
        return -1;
    }
}
} // namespace
HMONITOR findMonitorForCapture(int64_t displayId, const MonitorBounds* bounds) {
HMONITOR findMonitorForCapture(
int64_t displayId,
const std::string& sourceId,
const MonitorBounds* bounds) {
const auto monitors = enumerateMonitors();
if (monitors.empty()) {
return MonitorFromPoint({0, 0}, MONITOR_DEFAULTTOPRIMARY);
@@ -84,5 +112,10 @@ HMONITOR findMonitorForCapture(int64_t displayId, const MonitorBounds* bounds) {
}
}
const int sourceIndex = parseScreenSourceIndex(sourceId);
if (sourceIndex >= 0 && static_cast<size_t>(sourceIndex) < monitors.size()) {
return monitors[static_cast<size_t>(sourceIndex)].monitor;
}
return MonitorFromPoint({0, 0}, MONITOR_DEFAULTTOPRIMARY);
}
@@ -3,6 +3,7 @@
#include <Windows.h>
#include <cstdint>
#include <string>
struct MonitorBounds {
int x = 0;
@@ -11,4 +12,7 @@ struct MonitorBounds {
int height = 0;
};
HMONITOR findMonitorForCapture(int64_t displayId, const MonitorBounds* bounds);
HMONITOR findMonitorForCapture(
int64_t displayId,
const std::string& sourceId,
const MonitorBounds* bounds);
@@ -28,6 +28,60 @@ bool succeeded(HRESULT hr, const char* label) {
return false;
}
// Finds the DXGI adapter that owns the output attached to `monitor`.
//
// Walks every adapter from IDXGIFactory1::EnumAdapters1 (skipping adapters
// flagged DXGI_ADAPTER_FLAG_SOFTWARE) and every output of each adapter, and
// returns the first adapter that has an output whose DXGI_OUTPUT_DESC::Monitor
// equals `monitor`. On a match a "display-adapter-resolved" JSON event is
// written to stdout.
//
// Returns nullptr when `monitor` is null, the DXGI factory cannot be created
// (warning on stderr), or no output matches (warning on stderr).
Microsoft::WRL::ComPtr<IDXGIAdapter1> findAdapterForMonitor(HMONITOR monitor) {
    if (!monitor) {
        return nullptr;
    }

    Microsoft::WRL::ComPtr<IDXGIFactory1> factory;
    HRESULT hr = CreateDXGIFactory1(IID_PPV_ARGS(&factory));
    if (FAILED(hr) || !factory) {
        std::cerr << "WARNING: CreateDXGIFactory1 failed while resolving monitor adapter (hr=0x"
                  << std::hex << hr << std::dec << ")" << std::endl;
        return nullptr;
    }

    for (UINT adapterIndex = 0;; ++adapterIndex) {
        Microsoft::WRL::ComPtr<IDXGIAdapter1> adapter;
        hr = factory->EnumAdapters1(adapterIndex, adapter.GetAddressOf());
        // Stop on DXGI_ERROR_NOT_FOUND (end of list) and on any other failure:
        // retrying the next index after a persistent error would otherwise
        // spin through the whole UINT range.
        if (FAILED(hr) || !adapter) {
            break;
        }

        DXGI_ADAPTER_DESC1 adapterDesc{};
        if (SUCCEEDED(adapter->GetDesc1(&adapterDesc)) &&
            (adapterDesc.Flags & DXGI_ADAPTER_FLAG_SOFTWARE) != 0) {
            continue;
        }

        for (UINT outputIndex = 0;; ++outputIndex) {
            Microsoft::WRL::ComPtr<IDXGIOutput> output;
            hr = adapter->EnumOutputs(outputIndex, output.GetAddressOf());
            // Same reasoning as above: any failure ends this adapter's output
            // list and moves on to the next adapter.
            if (FAILED(hr) || !output) {
                break;
            }

            DXGI_OUTPUT_DESC outputDesc{};
            if (SUCCEEDED(output->GetDesc(&outputDesc)) && outputDesc.Monitor == monitor) {
                std::cout << "{\"event\":\"display-adapter-resolved\",\"schemaVersion\":2,"
                          << "\"adapterIndex\":" << adapterIndex
                          << ",\"outputIndex\":" << outputIndex << "}" << std::endl;
                return adapter;
            }
        }
    }

    std::cerr << "WARNING: Could not resolve DXGI adapter for selected monitor; using default adapter"
              << std::endl;
    return nullptr;
}
int64_t timeSpanToHns(wf::TimeSpan const& value) {
return value.count();
}
@@ -38,7 +92,7 @@ WgcSession::~WgcSession() {
stop();
}
bool WgcSession::createD3DDevice() {
bool WgcSession::createD3DDevice(IDXGIAdapter* adapter) {
UINT flags = D3D11_CREATE_DEVICE_BGRA_SUPPORT;
#if defined(_DEBUG)
flags |= D3D11_CREATE_DEVICE_DEBUG;
@@ -53,8 +107,8 @@ bool WgcSession::createD3DDevice() {
D3D_FEATURE_LEVEL featureLevel{};
HRESULT hr = D3D11CreateDevice(
nullptr,
D3D_DRIVER_TYPE_HARDWARE,
adapter,
adapter ? D3D_DRIVER_TYPE_UNKNOWN : D3D_DRIVER_TYPE_HARDWARE,
nullptr,
flags,
featureLevels,
@@ -67,6 +121,23 @@ bool WgcSession::createD3DDevice() {
#if defined(_DEBUG)
if (FAILED(hr)) {
flags &= ~D3D11_CREATE_DEVICE_DEBUG;
hr = D3D11CreateDevice(
adapter,
adapter ? D3D_DRIVER_TYPE_UNKNOWN : D3D_DRIVER_TYPE_HARDWARE,
nullptr,
flags,
featureLevels,
ARRAYSIZE(featureLevels),
D3D11_SDK_VERSION,
&d3dDevice_,
&featureLevel,
&d3dContext_);
}
#endif
if (FAILED(hr) && adapter) {
std::cerr << "WARNING: D3D11CreateDevice failed for selected monitor adapter (hr=0x"
<< std::hex << hr << std::dec << "); retrying default adapter" << std::endl;
hr = D3D11CreateDevice(
nullptr,
D3D_DRIVER_TYPE_HARDWARE,
@@ -79,7 +150,6 @@ bool WgcSession::createD3DDevice() {
&featureLevel,
&d3dContext_);
}
#endif
if (!succeeded(hr, "D3D11CreateDevice")) {
return false;
@@ -100,6 +170,11 @@ bool WgcSession::createD3DDevice() {
return true;
}
// Creates the D3D11 device for capturing `monitor`, using the adapter that
// findAdapterForMonitor resolves for it. Returns createD3DDevice's result.
bool WgcSession::createD3DDeviceForMonitor(HMONITOR monitor) {
    const Microsoft::WRL::ComPtr<IDXGIAdapter1> monitorAdapter = findAdapterForMonitor(monitor);
    // May be null when no adapter was resolved; createD3DDevice receives it as-is.
    IDXGIAdapter* const rawAdapter = monitorAdapter.Get();
    return createD3DDevice(rawAdapter);
}
bool WgcSession::createCaptureItem(HMONITOR monitor) {
auto factory = winrt::get_activation_factory<wgcap::GraphicsCaptureItem>();
auto interop = factory.as<IGraphicsCaptureItemInterop>();
@@ -188,7 +263,7 @@ bool WgcSession::applySessionOptions(bool captureCursor) {
bool WgcSession::initialize(HMONITOR monitor, int fps, bool captureCursor) {
fps_ = fps > 0 ? fps : 60;
if (!createD3DDevice()) {
if (!createD3DDeviceForMonitor(monitor)) {
return false;
}
if (!createCaptureItem(monitor)) {
@@ -2,6 +2,7 @@
#include <Windows.h>
#include <d3d11.h>
#include <dxgi.h>
#include <windows.graphics.capture.h>
#include <windows.graphics.directx.direct3d11.interop.h>
#include <winrt/Windows.Foundation.h>
@@ -34,7 +35,8 @@ public:
ID3D11DeviceContext* context() const;
private:
bool createD3DDevice();
bool createD3DDevice(IDXGIAdapter* adapter = nullptr);
bool createD3DDeviceForMonitor(HMONITOR monitor);
bool createCaptureItem(HMONITOR monitor);
bool createCaptureItem(HWND window);
bool applySessionOptions(bool captureCursor);
+14
View File
@@ -1,10 +1,12 @@
import { contextBridge, ipcRenderer } from "electron";
import type {
AddGuideMarkerInput,
CaptureGuidePointerMarkerResult,
DiscardGuideSessionInput,
ExportGuideInput,
FinalizeGuideEventsInput,
GenerateGuideDraftInput,
GuideMarkerCapturedPayload,
RunGuideOcrInput,
SaveGuideAiSettingsInput,
SaveGuideInput,
@@ -37,6 +39,18 @@ contextBridge.exposeInMainWorld("electronAPI", {
addMarker: (input: AddGuideMarkerInput) => {
return ipcRenderer.invoke("guide:add-marker", input);
},
capturePointerMarker: () => {
return ipcRenderer.invoke("guide:capture-pointer-marker") as Promise<
import("../src/guide/contracts").GuideIpcResult<CaptureGuidePointerMarkerResult>
>;
},
onMarkerCaptured: (callback: (payload: GuideMarkerCapturedPayload) => void) => {
const listener = (_event: Electron.IpcRendererEvent, payload: GuideMarkerCapturedPayload) => {
callback(payload);
};
ipcRenderer.on("guide:marker-captured", listener);
return () => ipcRenderer.removeListener("guide:marker-captured", listener);
},
finalizeEvents: (input: FinalizeGuideEventsInput) => {
return ipcRenderer.invoke("guide:finalize-events", input);
},
+2 -2
View File
@@ -1,12 +1,12 @@
{
"name": "openscreen",
"version": "1.4.0",
"version": "1.4.6",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "openscreen",
"version": "1.4.0",
"version": "1.4.6",
"dependencies": {
"@fix-webm-duration/fix": "^1.0.1",
"@pixi/filter-drop-shadow": "^5.2.0",
+3 -1
View File
@@ -1,7 +1,7 @@
{
"name": "openscreen",
"private": true,
"version": "1.4.0",
"version": "1.4.6",
"type": "module",
"packageManager": "npm@10.9.4",
"engines": {
@@ -25,6 +25,8 @@
"build:native:win": "node scripts/build-windows-wgc-helper.mjs",
"build:ocr:win": "node scripts/build-windows-ocr-service.mjs",
"build:win": "npm run build:native:win && npm run build:ocr:win && tsc && vite build && electron-builder --win --config electron-builder.json5 --config.npmRebuild=false",
"build:win:private-trust": "npm run build:native:win && npm run build:ocr:win && tsc && vite build && electron-builder --win --config electron-builder.private-trust.cjs --config.npmRebuild=false",
"sign:win:private-trust": "node scripts/sign-windows-private-trust.mjs",
"build:linux": "tsc && vite build && electron-builder --linux AppImage deb pacman --config electron-builder.json5 --config.npmRebuild=false",
"test": "vitest --run",
"test:watch": "vitest",
+10
View File
@@ -126,6 +126,11 @@ if (!fs.existsSync(cursorSamplerOutputPath)) {
throw new Error(`WGC helper build completed but ${cursorSamplerOutputPath} was not found.`);
}
const guideHotkeyListenerOutputPath = path.join(BUILD_DIR, "guide-hotkey-listener.exe");
if (!fs.existsSync(guideHotkeyListenerOutputPath)) {
throw new Error(`WGC helper build completed but ${guideHotkeyListenerOutputPath} was not found.`);
}
fs.mkdirSync(BIN_DIR, { recursive: true });
const distributablePath = path.join(BIN_DIR, "wgc-capture.exe");
fs.copyFileSync(outputPath, distributablePath);
@@ -133,7 +138,12 @@ fs.copyFileSync(outputPath, distributablePath);
const cursorSamplerDistributablePath = path.join(BIN_DIR, "cursor-sampler.exe");
fs.copyFileSync(cursorSamplerOutputPath, cursorSamplerDistributablePath);
const guideHotkeyListenerDistributablePath = path.join(BIN_DIR, "guide-hotkey-listener.exe");
fs.copyFileSync(guideHotkeyListenerOutputPath, guideHotkeyListenerDistributablePath);
console.log(`Built ${outputPath}`);
console.log(`Copied ${distributablePath}`);
console.log(`Built ${cursorSamplerOutputPath}`);
console.log(`Copied ${cursorSamplerDistributablePath}`);
console.log(`Built ${guideHotkeyListenerOutputPath}`);
console.log(`Copied ${guideHotkeyListenerDistributablePath}`);
+173
View File
@@ -0,0 +1,173 @@
import { spawn } from "node:child_process";
import fs from "node:fs";
import path from "node:path";
import process from "node:process";
const rootDir = process.cwd();
const packageJson = JSON.parse(fs.readFileSync(path.join(rootDir, "package.json"), "utf8"));
/**
 * Loads `KEY=VALUE` lines from `.env.signing.local` in the project root into
 * `process.env`.
 *
 * - Does nothing when the file does not exist.
 * - Blank lines, `#` comments and lines that are not `NAME=value` are skipped.
 * - A variable that already has a non-empty value in `process.env` is kept.
 * - A value wrapped in one matching pair of single or double quotes has the
 *   pair removed; unmatched quotes are part of the value and are preserved.
 */
function loadLocalSigningEnv() {
	const envPath = path.join(rootDir, ".env.signing.local");
	if (!fs.existsSync(envPath)) {
		return;
	}

	// Drop a UTF-8 BOM so the first line still matches the NAME=value pattern.
	const content = fs.readFileSync(envPath, "utf8").replace(/^\uFEFF/, "");
	for (const line of content.split(/\r?\n/)) {
		const trimmed = line.trim();
		if (!trimmed || trimmed.startsWith("#")) {
			continue;
		}

		const match = trimmed.match(/^([A-Za-z_][A-Za-z0-9_]*)=(.*)$/);
		if (!match || process.env[match[1]]) {
			continue;
		}

		// Only strip quotes when the same character opens and closes the value;
		// stripping each end independently would corrupt a secret that merely
		// starts or ends with a quote character.
		const rawValue = match[2];
		const quoted = rawValue.match(/^(['"])(.*)\1$/);
		process.env[match[1]] = quoted ? quoted[2] : rawValue;
	}
}
/** Returns the multi-line command-line help text for this script. */
function usage() {
	const helpLines = [];
	helpLines.push("Usage:");
	helpLines.push(" node scripts/sign-windows-private-trust.mjs [--file <path>]");
	helpLines.push("");
	helpLines.push("Defaults to release/<version>/Openscreen Setup <version>.exe");
	return helpLines.join("\n");
}
/**
 * Parses the script's command-line arguments.
 *
 * Supported arguments:
 * - `--help` / `-h`: print the usage text and exit with code 0.
 * - `--file <path>`: the file to sign.
 *
 * @param {string[]} argv Arguments after the script name.
 * @returns {{ file: string | null }} `file` is null when `--file` was not given.
 * @throws {Error} On an unknown argument, or when `--file` has no value.
 */
function parseArgs(argv) {
	const args = { file: null };
	for (let i = 0; i < argv.length; i += 1) {
		const arg = argv[i];
		if (arg === "--help" || arg === "-h") {
			console.log(usage());
			process.exit(0);
		}
		if (arg === "--file") {
			const value = argv[i + 1];
			// Without this check a trailing `--file` leaves `file` undefined and
			// the caller silently signs the default installer instead.
			if (value === undefined || value === "") {
				throw new Error(`Missing value for --file\n${usage()}`);
			}
			args.file = value;
			i += 1;
			continue;
		}
		throw new Error(`Unknown argument: ${arg}\n${usage()}`);
	}
	return args;
}
/**
 * Returns the trimmed value of environment variable `name`.
 *
 * @param {string} name Environment variable name.
 * @returns {string} The trimmed, non-empty value.
 * @throws {Error} When the variable is unset or blank.
 */
function requireEnv(name) {
	const trimmed = (process.env[name] ?? "").trim();
	if (trimmed.length === 0) {
		throw new Error(`Missing required environment variable: ${name}`);
	}
	return trimmed;
}
/**
 * Reports whether the environment carries at least one Azure credential set:
 * a client secret, a client certificate path, or a username plus password.
 * Blank (whitespace-only) values count as unset.
 *
 * @returns {boolean}
 */
function hasAnyAuthMode() {
	const isSet = (name) => Boolean(process.env[name]?.trim());

	if (isSet("AZURE_CLIENT_SECRET") || isSet("AZURE_CLIENT_CERTIFICATE_PATH")) {
		return true;
	}
	// Username/password only counts when both halves are present.
	return isSet("AZURE_USERNAME") && isSet("AZURE_PASSWORD");
}
/**
 * Quotes `value` as a PowerShell single-quoted string literal.
 *
 * PowerShell accepts the typographic quotes U+2018, U+2019, U+201A and U+201B
 * as single-quote delimiters in addition to the ASCII apostrophe, so each of
 * them is escaped by doubling. Escaping only `'` would let a path such as
 * "Huan’s build" terminate the literal early.
 *
 * @param {unknown} value Value to quote; converted with `String(value)`.
 * @returns {string} The quoted literal, including the surrounding apostrophes.
 */
function psQuote(value) {
	return `'${String(value).replace(/['\u2018\u2019\u201A\u201B]/g, "$&$&")}'`;
}
/**
 * Runs `command` in PowerShell, trying `pwsh.exe` first and falling back to
 * `powershell.exe` when a candidate cannot be spawned. The child inherits this
 * process's stdio.
 *
 * @param {string} command PowerShell command text passed via `-Command`.
 * @returns {Promise<void>} Resolves when the child exits with code 0. Rejects
 *   when it exits with another code, is terminated by a signal, or when no
 *   candidate could be spawned (with the last spawn error).
 */
function runPowerShell(command) {
	return new Promise((resolve, reject) => {
		const candidates = ["pwsh.exe", "powershell.exe"];

		const tryCandidate = (index, lastError) => {
			if (index >= candidates.length) {
				reject(lastError ?? new Error("Unable to find PowerShell"));
				return;
			}

			const executable = candidates[index];
			const child = spawn(
				executable,
				["-NoProfile", "-NonInteractive", "-Command", command],
				{
					stdio: "inherit",
					windowsHide: true,
				},
			);

			// Node documents that "exit" may or may not fire after "error".
			// Handle each child exactly once so a late "exit" from a failed
			// candidate cannot reject the promise while the fallback is running.
			let handled = false;

			child.on("error", (error) => {
				if (handled) {
					return;
				}
				handled = true;
				tryCandidate(index + 1, error);
			});

			child.on("exit", (code, signal) => {
				if (handled) {
					return;
				}
				handled = true;
				if (code === 0) {
					resolve();
					return;
				}
				// `code` is null when the child was killed by a signal.
				const reason =
					code === null ? `was terminated by signal ${signal}` : `exited with code ${code}`;
				reject(new Error(`${executable} ${reason}`));
			});
		};

		tryCandidate(0);
	});
}
/**
 * Signs a Windows installer through the TrustedSigning PowerShell module.
 *
 * Steps, in order: resolve the file to sign (`--file`, or the default
 * `release/<version>/Openscreen Setup <version>.exe` built from package.json's
 * version), check that it exists, validate the required environment variables,
 * then run three PowerShell commands: install the module, sign, and print the
 * resulting Authenticode signature.
 *
 * Throws when the file is missing or a required environment variable is unset;
 * rejections from runPowerShell propagate to the caller.
 */
async function main() {
const args = parseArgs(process.argv.slice(2));
const defaultInstaller = path.join(
rootDir,
"release",
packageJson.version,
`Openscreen Setup ${packageJson.version}.exe`,
);
// A relative --file is resolved against the project root.
const fileToSign = path.resolve(rootDir, args.file ?? defaultInstaller);
if (!fs.existsSync(fileToSign)) {
throw new Error(`Installer not found: ${fileToSign}`);
}
// These two are only validated here, not passed on the command line;
// presumably the signing module reads them from the environment — TODO confirm.
requireEnv("AZURE_TENANT_ID");
requireEnv("AZURE_CLIENT_ID");
if (!hasAnyAuthMode()) {
throw new Error(
"Missing Azure auth mode. Set AZURE_CLIENT_SECRET, or AZURE_CLIENT_CERTIFICATE_PATH, or AZURE_USERNAME/AZURE_PASSWORD.",
);
}
const endpoint = requireEnv("AZURE_TRUSTED_SIGNING_ENDPOINT");
const accountName = requireEnv("AZURE_TRUSTED_SIGNING_ACCOUNT_NAME");
const profileName = requireEnv("AZURE_TRUSTED_SIGNING_CERTIFICATE_PROFILE_NAME");
// Optional override for the RFC 3161 timestamp server.
const timestampUrl =
process.env.AZURE_TRUSTED_SIGNING_TIMESTAMP_RFC3161?.trim() ||
"http://timestamp.acs.microsoft.com";
// Installs the NuGet provider and the TrustedSigning module for the current user.
const installCommand = [
"Install-PackageProvider -Name NuGet -MinimumVersion 2.8.5.201 -Force -Scope CurrentUser",
"Install-Module -Name TrustedSigning -MinimumVersion 0.5.0 -Force -Repository PSGallery -Scope CurrentUser",
].join("; ");
// Every user-controlled value goes through psQuote before entering the command text.
const signCommand = [
"Invoke-TrustedSigning",
`-Endpoint ${psQuote(endpoint)}`,
`-CertificateProfileName ${psQuote(profileName)}`,
`-CodeSigningAccountName ${psQuote(accountName)}`,
`-TimestampRfc3161 ${psQuote(timestampUrl)}`,
"-TimestampDigest SHA256",
"-FileDigest SHA256",
`-Files ${psQuote(fileToSign)}`,
].join(" ");
// NOTE(review): this only prints the signature details; it does not fail when
// Status is something other than Valid.
const verifyCommand = [
"$signature = Get-AuthenticodeSignature -FilePath",
psQuote(fileToSign),
"; $signature | Format-List Status,StatusMessage,SignerCertificate,TimeStamperCertificate",
].join(" ");
console.log(`Signing ${fileToSign}`);
await runPowerShell(installCommand);
await runPowerShell(signCommand);
await runPowerShell(verifyCommand);
}
// Script entry: load the optional local env file first so main() sees its
// values, then run main() and turn any failure into a one-line message on
// stderr with exit code 1.
loadLocalSigningEnv();
try {
await main();
} catch (error) {
console.error(error instanceof Error ? error.message : String(error));
process.exit(1);
}
+18
View File
@@ -19,6 +19,7 @@ import {
MdVolumeUp,
} from "react-icons/md";
import { RxDragHandleDots2 } from "react-icons/rx";
import { toast } from "sonner";
import { useI18n, useScopedT } from "@/contexts/I18nContext";
import { getAvailableLocales, getLocaleName } from "@/i18n/loader";
import { nativeBridgeClient } from "@/native";
@@ -305,6 +306,23 @@ export function LaunchWindow() {
setHudMouseEventsEnabled(isLanguageMenuOpen);
}, [isLanguageMenuOpen, setHudMouseEventsEnabled]);
useEffect(() => {
const unsubscribe = window.electronAPI?.guide.onMarkerCaptured?.((payload) => {
const position =
typeof payload.normalizedX === "number" && typeof payload.normalizedY === "number"
? `x ${Math.round(payload.normalizedX * 100)}%, y ${Math.round(payload.normalizedY * 100)}%`
: undefined;
toast.success("Guide event captured", {
id: `guide-marker-${payload.eventId}`,
description: position,
duration: 1400,
});
});
return () => {
unsubscribe?.();
};
}, []);
const [selectedSource, setSelectedSource] = useState("Screen");
const [hasSelectedSource, setHasSelectedSource] = useState(false);
const [, setRecordPointerDownCount] = useState(0);
+41 -7
View File
@@ -11,6 +11,16 @@ interface DesktopSource {
thumbnail: string | null;
display_id: string;
appIcon: string | null;
displayId?: number;
displayIndex?: number;
screenIndex?: number;
displayLabel?: string;
bounds?: {
x: number;
y: number;
width: number;
height: number;
};
}
export function SourceSelector() {
@@ -39,6 +49,11 @@ export function SourceSelector() {
thumbnail: source.thumbnail,
display_id: source.display_id,
appIcon: source.appIcon,
displayId: source.displayId,
displayIndex: source.displayIndex,
screenIndex: source.screenIndex,
displayLabel: source.displayLabel,
bounds: source.bounds,
})),
);
} catch (error) {
@@ -50,7 +65,13 @@ export function SourceSelector() {
fetchSources();
}, []);
const screenSources = sources.filter((s) => s.id.startsWith("screen:"));
const screenSources = sources
.filter((s) => s.id.startsWith("screen:"))
.sort(
(left, right) =>
(left.displayIndex ?? left.screenIndex ?? Number.MAX_SAFE_INTEGER) -
(right.displayIndex ?? right.screenIndex ?? Number.MAX_SAFE_INTEGER),
);
const windowSources = sources.filter((s) => s.id.startsWith("window:"));
const handleSourceSelect = (source: DesktopSource) => setSelectedSource(source);
@@ -81,11 +102,17 @@ export function SourceSelector() {
onClick={() => handleSourceSelect(source)}
>
<div className="relative mb-1.5 overflow-hidden rounded-lg border border-white/[0.06] bg-black/30">
<img
src={source.thumbnail || ""}
alt={source.name}
className="w-full aspect-video object-cover"
/>
{source.thumbnail ? (
<img
src={source.thumbnail}
alt={source.name}
className="w-full aspect-video object-cover"
/>
) : (
<div className="flex aspect-video w-full items-center justify-center bg-zinc-950 text-center text-[11px] font-medium text-zinc-400">
{source.displayLabel ?? source.name}
</div>
)}
{isSelected && (
<div className="absolute right-1.5 top-1.5">
<div className={styles.checkBadge}>
@@ -98,7 +125,14 @@ export function SourceSelector() {
{source.appIcon && (
<img src={source.appIcon} alt="" className={`${styles.icon} flex-shrink-0`} />
)}
<div className={`${styles.name} truncate`}>{source.name}</div>
<div className="min-w-0">
<div className={`${styles.name} truncate`}>{source.name}</div>
{source.displayLabel && (
<div className="truncate text-[9px] leading-3 text-zinc-500">
{source.displayLabel}
</div>
)}
</div>
</div>
</div>
);
@@ -7,6 +7,7 @@ import type {
GuideAiProvider,
GuideAiSettings,
GuideLanguage,
GuideOcrProfile,
GuideSession,
} from "@/guide/contracts";
import { captureGuideSnapshots } from "@/guide/snapshot/extractGuideSnapshots";
@@ -42,13 +43,19 @@ const COPY = {
captureStep: "Capture step",
captureLabel: "Manual capture",
settings: "Settings",
guideSettings: "Guide settings",
apiKey: "API key env",
apiKeyPlaceholder: "DEEPSEEK_API_KEY",
baseUrl: "Base URL",
model: "Model",
ocrProfile: "OCR profile",
ocrLanguage: "OCR languages",
ocrFast: "Fast Latin",
ocrVietnamese: "Vietnamese Enhanced",
ocrHybrid: "Hybrid Vi + Latin",
saveSettings: "Save",
clearKey: "Reset env",
keySaved: "DeepSeek settings saved.",
settingsSaved: "Guide settings saved.",
keyMissing: "Set a DeepSeek API key environment variable before generating with DeepSeek.",
keyConfigured: "Env ready",
keyNotConfigured: "Env value missing",
@@ -78,13 +85,19 @@ const COPY = {
captureStep: "Chụp bước",
captureLabel: "Chụp thủ công",
settings: "Cài đặt",
guideSettings: "Guide settings",
apiKey: "API key env",
apiKeyPlaceholder: "DEEPSEEK_API_KEY",
baseUrl: "Base URL",
model: "Model",
ocrProfile: "OCR profile",
ocrLanguage: "OCR languages",
ocrFast: "Fast Latin",
ocrVietnamese: "Vietnamese Enhanced",
ocrHybrid: "Hybrid Vi + Latin",
saveSettings: "Lưu",
clearKey: "Reset env",
keySaved: "Đã lưu cài đặt DeepSeek.",
settingsSaved: "Đã lưu cài đặt guide.",
keyMissing: "Hãy set biến môi trường DeepSeek API key trước khi tạo draft bằng DeepSeek.",
keyConfigured: "Env ready",
keyNotConfigured: "Chưa thấy giá trị env",
@@ -108,6 +121,8 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
const [deepSeekApiKeyEnvName, setDeepSeekApiKeyEnvName] = useState("DEEPSEEK_API_KEY");
const [deepSeekBaseUrl, setDeepSeekBaseUrl] = useState("https://api.deepseek.com");
const [deepSeekModel, setDeepSeekModel] = useState("deepseek-chat");
const [ocrProfile, setOcrProfile] = useState<GuideOcrProfile>("vietnamese");
const [ocrLanguage, setOcrLanguage] = useState("vi,en");
const [message, setMessage] = useState<string | null>(null);
const isBusy = busyAction !== null;
@@ -138,6 +153,8 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
setDeepSeekBaseUrl(result.data.deepseek.baseUrl);
setDeepSeekModel(result.data.deepseek.model);
setDeepSeekApiKeyEnvName(result.data.deepseek.apiKeyEnvName);
setOcrProfile(result.data.ocr.profile);
setOcrLanguage(result.data.ocr.language);
}, []);
useEffect(() => {
@@ -269,6 +286,8 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
deepseekApiKeyEnvName: deepSeekApiKeyEnvName,
baseUrl: deepSeekBaseUrl,
model: deepSeekModel,
ocrProfile,
ocrLanguage,
});
if (!result.success) {
throw new Error(result.error);
@@ -277,7 +296,9 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
setDeepSeekApiKeyEnvName(result.data.deepseek.apiKeyEnvName);
setDeepSeekBaseUrl(result.data.deepseek.baseUrl);
setDeepSeekModel(result.data.deepseek.model);
toast.success(copy.keySaved);
setOcrProfile(result.data.ocr.profile);
setOcrLanguage(result.data.ocr.language);
toast.success(copy.settingsSaved);
} catch (error) {
const text = error instanceof Error ? error.message : String(error);
setMessage(text);
@@ -285,7 +306,14 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
} finally {
setSettingsBusy(false);
}
}, [copy.keySaved, deepSeekApiKeyEnvName, deepSeekBaseUrl, deepSeekModel]);
}, [
copy.settingsSaved,
deepSeekApiKeyEnvName,
deepSeekBaseUrl,
deepSeekModel,
ocrLanguage,
ocrProfile,
]);
const handleClearDeepSeekKey = useCallback(async () => {
if (!window.electronAPI?.guide?.saveAiSettings) {
@@ -298,13 +326,17 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
clearDeepseekApiKeyEnvName: true,
baseUrl: deepSeekBaseUrl,
model: deepSeekModel,
ocrProfile,
ocrLanguage,
});
if (!result.success) {
throw new Error(result.error);
}
setAiSettings(result.data);
setDeepSeekApiKeyEnvName(result.data.deepseek.apiKeyEnvName);
toast.success(copy.keySaved);
setOcrProfile(result.data.ocr.profile);
setOcrLanguage(result.data.ocr.language);
toast.success(copy.settingsSaved);
} catch (error) {
const text = error instanceof Error ? error.message : String(error);
setMessage(text);
@@ -312,7 +344,7 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
} finally {
setSettingsBusy(false);
}
}, [copy.keySaved, deepSeekBaseUrl, deepSeekModel]);
}, [copy.settingsSaved, deepSeekBaseUrl, deepSeekModel, ocrLanguage, ocrProfile]);
const handleGenerateGuide = useCallback(() => {
void runAction("generate", async () => {
@@ -455,7 +487,7 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
<div className="flex items-center justify-between gap-2">
<div className="min-w-0">
<div className="truncate text-[11px] font-semibold text-slate-100">
{copy.deepseek} {copy.settings}
{copy.guideSettings}
</div>
<div className="truncate text-[10px] text-slate-500">
{aiSettings?.deepseek.hasApiKey
@@ -470,6 +502,33 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
</span>
</div>
<div className="grid grid-cols-2 gap-1.5">
<label className="block min-w-0 text-[10px] font-medium text-slate-400">
{copy.ocrProfile}
<select
value={ocrProfile}
onChange={(event) => setOcrProfile(event.target.value as GuideOcrProfile)}
disabled={settingsBusy}
className="mt-1 h-8 w-full rounded-md border border-white/[0.08] bg-black/20 px-2 text-[11px] text-slate-100 outline-none"
>
<option value="vietnamese">{copy.ocrVietnamese}</option>
<option value="hybrid">{copy.ocrHybrid}</option>
<option value="fast">{copy.ocrFast}</option>
</select>
</label>
<label className="block min-w-0 text-[10px] font-medium text-slate-400">
{copy.ocrLanguage}
<input
type="text"
value={ocrLanguage}
onChange={(event) => setOcrLanguage(event.target.value)}
placeholder="vi,en"
disabled={settingsBusy}
className="mt-1 h-8 w-full rounded-md border border-white/[0.08] bg-black/20 px-2 text-[11px] text-slate-100 outline-none placeholder:text-slate-600"
/>
</label>
</div>
<label className="block text-[10px] font-medium text-slate-400">
{copy.apiKey}
<input
+36
View File
@@ -9,6 +9,7 @@ export type GuideTargetRole = "button" | "menu" | "tab" | "field" | "link" | "un
export type GuideLanguage = "vi" | "en";
export type GuideAiProvider = "deepseek" | "local";
export type GuideSecretStorage = "environment" | "none";
export type GuideOcrProfile = "fast" | "vietnamese" | "hybrid";
export type GuideSessionStatus =
| "recording"
@@ -79,10 +80,28 @@ export interface GuideStepCandidate {
action: GuideAction;
targetText?: string;
targetRole?: GuideTargetRole;
position?: {
normalizedX: number;
normalizedY: number;
xPercent: number;
yPercent: number;
description: string;
};
nearbyText: string[];
confidence: number;
}
/**
 * Notification payload describing a guide marker that was just captured.
 * Coordinates are optional; consumers must handle markers without a position.
 */
export interface GuideMarkerCapturedPayload {
recordingId: string;
// Identifier of the guide event created for this marker.
eventId: string;
// Marker time in milliseconds — presumably relative to recording start; verify against the emitter.
timeMs: number;
// What initiated the capture.
trigger: "button" | "global-control" | "global-shortcut";
// Pointer position as a fraction of the captured area — assumed 0..1; TODO confirm.
normalizedX?: number;
normalizedY?: number;
// Unnormalized pointer coordinates — units and origin are not visible here; TODO confirm.
rawX?: number;
rawY?: number;
}
export interface GeneratedGuideStep {
id: string;
order: number;
@@ -115,11 +134,21 @@ export interface GuideSession {
updatedAt: string;
}
/** Result of a request to capture a guide marker at the current pointer position. */
export interface CaptureGuidePointerMarkerResult {
// True when a marker was recorded; false means the caller may need a fallback.
captured: boolean;
// Presumably the updated session when `captured` is true — TODO confirm.
session?: GuideSession;
// Presumably the event that was created when `captured` is true — TODO confirm.
event?: GuideEvent;
}
export interface AddGuideMarkerInput {
recordingId: GuideRecordingIdInput;
timeMs: number;
kind: "hotkey" | "manual";
label?: string;
x?: number;
y?: number;
normalizedX?: number;
normalizedY?: number;
}
export interface FinalizeGuideEventsInput {
@@ -150,6 +179,11 @@ export interface GenerateGuideDraftInput {
}
export interface GuideAiSettings {
ocr: {
profile: GuideOcrProfile;
language: string;
updatedAt?: string;
};
deepseek: {
hasApiKey: boolean;
apiKeyEnvName: string;
@@ -166,6 +200,8 @@ export interface SaveGuideAiSettingsInput {
clearDeepseekApiKeyEnvName?: boolean;
baseUrl?: string;
model?: string;
ocrProfile?: GuideOcrProfile;
ocrLanguage?: string;
}
export interface SaveGuideInput {
+18
View File
@@ -83,4 +83,22 @@ describe("guide exporters", () => {
expect(html).toContain("click-marker");
expect(html).toContain("left: 25.00%; top: 75.00%;");
});
it("draws click markers for hotkey events with coordinates", () => {
const hotkeySession: GuideSession = {
...session,
events: [
{
...session.events[0],
kind: "hotkey",
source: "guide-hotkey",
},
],
};
const html = exportGuideToHtml(hotkeySession);
expect(html).toContain("click-marker");
expect(html).toContain("left: 25.00%; top: 75.00%;");
});
});
+1 -1
View File
@@ -97,7 +97,7 @@ function resolveStepClickPoint(
: undefined;
const eventId = candidate?.eventId;
const event = eventId ? session.events.find((item) => item.id === eventId) : undefined;
if (!event || event.kind !== "click") {
if (!event || (event.kind !== "click" && event.kind !== "hotkey")) {
return null;
}
if (isNormalizedNumber(event.normalizedX) && isNormalizedNumber(event.normalizedY)) {
+9
View File
@@ -36,6 +36,13 @@ const candidates: GuideStepCandidate[] = [
action: "click",
targetText: "Save",
targetRole: "button",
position: {
normalizedX: 0.5,
normalizedY: 0.5,
xPercent: 50,
yPercent: 50,
description: "center",
},
nearbyText: ["Save"],
confidence: 0.9,
},
@@ -46,7 +53,9 @@ describe("guide draft helpers", () => {
const prompt = buildGuideDraftPrompt({ session, candidates, language: "en" });
expect(prompt).toContain("Return JSON only");
expect(prompt).toContain('"sourceCandidateId": "candidate-1"');
expect(prompt).toContain('"targetText": "Save"');
expect(prompt).toContain('"xPercent": 50');
expect(prompt).toContain('"id":"guide-step-1"');
});
+11 -1
View File
@@ -17,10 +17,12 @@ export function buildGuideDraftPrompt(input: GuidePromptInput): string {
const candidatesJson = JSON.stringify(
input.candidates.map((candidate, index) => ({
order: index + 1,
sourceCandidateId: candidate.id,
timeMs: Math.round(candidate.timeMs),
action: candidate.action,
targetText: candidate.targetText,
targetRole: candidate.targetRole,
position: candidate.position,
nearbyText: candidate.nearbyText,
confidence: candidate.confidence,
})),
@@ -36,8 +38,10 @@ export function buildGuideDraftPrompt(input: GuidePromptInput): string {
"Rules:",
"- Use short, explicit step instructions.",
"- Prefer visible target text from OCR when it is available.",
"- Return sourceCandidateId exactly from the chosen candidate.",
"- Never use generic marker text such as Ctrl+F12 marker or Ctrl marker as a UI target.",
"- Do not invent buttons or screens that are not in the candidates.",
"- If a target is unclear, describe the action by screen position or timestamp.",
"- If a target is unclear, describe the action by the candidate position and include the x/y percentages.",
"",
"Candidates:",
candidatesJson,
@@ -92,12 +96,18 @@ function buildInstruction(candidate: GuideStepCandidate, language: GuideLanguage
if (target) {
return `${candidate.action === "click" ? "Nhấn" : "Thực hiện thao tác"} vào "${target}".`;
}
if (candidate.position) {
return `Nhấn tại vùng ${candidate.position.description} (x ${candidate.position.xPercent}%, y ${candidate.position.yPercent}%).`;
}
return `Thực hiện thao tác tại mốc ${formatTimestamp(candidate.timeMs)}.`;
}
if (target) {
return `${candidate.action === "click" ? "Click" : "Use"} "${target}".`;
}
if (candidate.position) {
return `Click the ${candidate.position.description} area (x ${candidate.position.xPercent}%, y ${candidate.position.yPercent}%).`;
}
return `Perform the action at ${formatTimestamp(candidate.timeMs)}.`;
}
+26
View File
@@ -90,6 +90,32 @@ describe("buildGuideStepCandidates", () => {
});
});
it("treats hotkey markers with coordinates like clicks", () => {
const session = createSession();
session.events[0] = {
...session.events[0],
kind: "hotkey",
source: "guide-hotkey",
normalizedX: 0.5,
normalizedY: 0.5,
label: "Ctrl+F12 marker",
};
const candidates = buildGuideStepCandidates(session);
expect(candidates[0]).toMatchObject({
action: "click",
targetText: "Save",
targetRole: "button",
position: {
normalizedX: 0.5,
normalizedY: 0.5,
xPercent: 50,
yPercent: 50,
},
});
});
it("prefers a nearby line phrase over a single OCR word", () => {
const session = createSession();
session.events[0] = {
+44 -4
View File
@@ -46,8 +46,11 @@ export function buildGuideStepCandidates(
0,
maxNearbyText,
);
const label = normalizeText(event.label);
const targetText = label ?? normalizeText(targetRegion?.text);
const label = normalizeEventLabelForTarget(event);
const point = getEventPoint(event);
const targetText = point
? (normalizeText(targetRegion?.text) ?? label)
: (label ?? normalizeText(targetRegion?.text));
return {
id: `candidate-${event.id}`,
@@ -57,6 +60,7 @@ export function buildGuideStepCandidates(
action: inferAction(event),
targetText,
targetRole: inferTargetRole(targetText),
position: point ? describeEventPosition(point) : undefined,
nearbyText,
confidence: calculateCandidateConfidence(event, targetRegion, rankedRegions[0]?.score),
};
@@ -233,7 +237,7 @@ function pointInsideExpandedBox(
}
function inferAction(event: GuideEvent): GuideAction {
if (event.kind === "click") {
if (event.kind === "click" || (event.kind === "hotkey" && getEventPoint(event))) {
return "click";
}
return "manual";
@@ -275,7 +279,7 @@ function calculateCandidateConfidence(
0.45 + clamp01(targetRegion.confidence) * 0.25 + clamp01(score ?? 0) * 0.3,
);
}
if (event.label) {
if (normalizeEventLabelForTarget(event)) {
return 0.75;
}
if (getEventPoint(event)) {
@@ -307,6 +311,38 @@ function normalizeText(value: string | undefined): string | undefined {
return text ? text : undefined;
}
/**
 * Returns the event's normalized label when it can serve as a UI target name.
 *
 * Generic placeholder labels ("Ctrl marker", "Ctrl+F12 marker",
 * "Control marker", "Manual marker", case-insensitive) and missing labels
 * yield `undefined`.
 */
function normalizeEventLabelForTarget(event: GuideEvent): string | undefined {
	const candidate = normalizeText(event.label);
	if (!candidate) {
		return undefined;
	}

	const placeholderPatterns = [
		/^(?:ctrl(?:\s*\+\s*f12)?|control)\s+marker$/i,
		/^manual\s+marker$/i,
	];
	const isPlaceholder = placeholderPatterns.some((pattern) => pattern.test(candidate));
	return isPlaceholder ? undefined : candidate;
}
/**
 * Builds the position summary for a candidate from a normalized point.
 *
 * Both coordinates are clamped with `clamp01` first. The result carries the
 * rounded normalized coordinates, whole-number percentages, and a coarse
 * screen-region description.
 */
function describeEventPosition(point: { x: number; y: number }): GuideStepCandidate["position"] {
	const clampedX = clamp01(point.x);
	const clampedY = clamp01(point.y);
	const toPercent = (fraction: number): number => Math.round(fraction * 100);

	// Key order is kept stable because this object is serialized into prompts.
	return {
		normalizedX: roundPosition(clampedX),
		normalizedY: roundPosition(clampedY),
		xPercent: toPercent(clampedX),
		yPercent: toPercent(clampedY),
		description: describeScreenRegion(clampedX, clampedY),
	};
}
/**
 * Names the cell of a 3x3 screen grid that contains the point (x, y).
 *
 * Values below 0.33 map to "top"/"left", values above 0.66 to
 * "bottom"/"right", everything else to "middle"/"center". The central cell is
 * reported as just "center"; other cells as "<vertical> <horizontal>".
 */
function describeScreenRegion(x: number, y: number): string {
	let row = "middle";
	if (y < 0.33) {
		row = "top";
	} else if (y > 0.66) {
		row = "bottom";
	}

	let column = "center";
	if (x < 0.33) {
		column = "left";
	} else if (x > 0.66) {
		column = "right";
	}

	if (row === "middle" && column === "center") {
		return "center";
	}
	return `${row} ${column}`;
}
function isUsefulOcrText(text: string): boolean {
if (!/[A-Za-z0-9À-ỹ]/.test(text)) {
return false;
@@ -346,6 +382,10 @@ function roundConfidence(value: number): number {
return Math.round(clamp01(value) * 100) / 100;
}
/** Clamps the value with clamp01 and rounds it to three decimal places. */
function roundPosition(value: number): number {
	const thousandths = Math.round(clamp01(value) * 1000);
	return thousandths / 1000;
}
function clamp01(value: number): number {
if (!Number.isFinite(value)) {
return 0;
+25 -9
View File
@@ -209,18 +209,27 @@ export function useScreenRecorder(): UseScreenRecorderReturn {
return;
}
void window.electronAPI.guide
.addMarker({
void (async () => {
if (window.electronAPI?.guide.capturePointerMarker) {
const captureResult = await window.electronAPI.guide.capturePointerMarker();
if (captureResult.success && captureResult.data.captured) {
return;
}
if (!captureResult.success) {
console.warn("Failed to capture guide pointer marker:", captureResult.error);
}
}
const result = await window.electronAPI.guide.addMarker({
recordingId: activeRecordingId,
kind: "manual",
timeMs: getRecordingDurationMs(),
label: "Manual marker",
})
.then((result) => {
if (!result.success) {
console.warn("Failed to add guide marker:", result.error);
}
});
if (!result.success) {
console.warn("Failed to add guide marker:", result.error);
}
})();
}, [getRecordingDurationMs, recording]);
const selectMimeType = () => {
@@ -912,7 +921,10 @@ export function useScreenRecorder(): UseScreenRecorderReturn {
}
const activeRecordingId = Date.now();
const displayId = Number(selectedSource.display_id);
const displayId =
typeof selectedSource.displayId === "number"
? selectedSource.displayId
: Number(selectedSource.display_id);
const sourceType = selectedSource.id.startsWith("window:") ? "window" : "display";
const windowHandle = parseWindowHandleFromSourceId(selectedSource.id);
if (webcamEnabled) {
@@ -937,6 +949,7 @@ export function useScreenRecorder(): UseScreenRecorderReturn {
type: sourceType,
sourceId: selectedSource.id,
...(Number.isFinite(displayId) ? { displayId } : {}),
...(selectedSource.bounds ? { bounds: selectedSource.bounds } : {}),
...(windowHandle ? { windowHandle } : {}),
},
video: {
@@ -1030,7 +1043,9 @@ export function useScreenRecorder(): UseScreenRecorderReturn {
const activeRecordingId = Date.now();
const sourceType = selectedSource.id.startsWith("window:") ? "window" : "display";
const displayId =
Number(selectedSource.display_id) || parseMacDisplayIdFromSourceId(selectedSource.id);
typeof selectedSource.displayId === "number"
? selectedSource.displayId
: Number(selectedSource.display_id) || parseMacDisplayIdFromSourceId(selectedSource.id);
const windowId = parseMacWindowIdFromSourceId(selectedSource.id);
let nativeWebcamRecorder: RecorderHandle | null = null;
if (webcamEnabled) {
@@ -1074,6 +1089,7 @@ export function useScreenRecorder(): UseScreenRecorderReturn {
type: sourceType,
sourceId: selectedSource.id,
...(displayId ? { displayId } : {}),
...(selectedSource.bounds ? { bounds: selectedSource.bounds } : {}),
...(windowId ? { windowId } : {}),
},
video: {
+1 -1
View File
@@ -31,7 +31,7 @@
"guide": {
"enableGuideMode": "Enable guide mode",
"disableGuideMode": "Disable guide mode",
"addMarker": "Add guide marker"
"addMarker": "Capture guide marker (Ctrl or Ctrl+F12)"
},
"sourceSelector": {
"loading": "Loading sources...",
+1 -1
View File
@@ -47,6 +47,6 @@
"guide": {
"enableGuideMode": "Bật chế độ tạo hướng dẫn",
"disableGuideMode": "Tắt chế độ tạo hướng dẫn",
"addMarker": "Thêm mốc hướng dẫn"
"addMarker": "Chụp mốc hướng dẫn (Ctrl hoặc Ctrl+F12)"
}
}
+6
View File
@@ -6,6 +6,12 @@ export type NativeWindowsRecordingRequest = {
type: NativeWindowsSourceType;
sourceId: string;
displayId?: number;
bounds?: {
x: number;
y: number;
width: number;
height: number;
};
windowHandle?: string;
};
video: {
+301 -18
View File
@@ -5,6 +5,7 @@ import importlib.util
import os
import sys
import tempfile
from dataclasses import dataclass
from pathlib import Path
from threading import Lock
from typing import Any
@@ -17,6 +18,65 @@ app = FastAPI(title="OpenScreen PaddleOCR service")
_engines: dict[str, Any] = {}
_engine_lock = Lock()
# Language codes that _mobile_recognition_model maps to the shared
# "latin_PP-OCRv5_mobile_rec" recognition model. "en" is listed here as well,
# but _mobile_recognition_model checks for "en" before consulting this set.
_LATIN_RECOGNITION_LANGS = {
"af",
"az",
"bs",
"ca",
"cs",
"cy",
"da",
"de",
"en",
"es",
"et",
"eu",
"fi",
"fr",
"ga",
"gl",
"hr",
"hu",
"id",
"is",
"it",
"ku",
"la",
"latin",
"lb",
"lt",
"lv",
"mi",
"ms",
"mt",
"nl",
"no",
"oc",
"pi",
"pl",
"pt",
"qu",
"rm",
"ro",
"rs_latin",
"rslatin",
"sk",
"sl",
"sq",
"sv",
"sw",
"tl",
"tr",
"uz",
"vi",
}
@dataclass(frozen=True)
class PreparedImage:
    """Image file to run OCR on, as returned by ``_prepare_image_for_profile``."""

    # Path of the image file handed to the OCR engine.
    path: str
    # Upscale factor applied relative to the input image; 1.0 means unscaled.
    # ``_scale_blocks`` divides pixel boxes by this value.
    scale: float = 1.0
    # True when ``path`` is a temporary file that is removed after recognition.
    should_delete: bool = False
class OcrRequest(BaseModel):
@@ -24,6 +84,7 @@ class OcrRequest(BaseModel):
path: str | None = None
imagePath: str | None = None
language: str | None = None
profile: str | None = None
@app.get("/health")
@@ -33,7 +94,9 @@ def health() -> dict[str, Any]:
"paddleocrInstalled": importlib.util.find_spec("paddleocr") is not None,
"paddleInstalled": importlib.util.find_spec("paddle") is not None,
"engineReady": bool(_engines),
"defaultLanguage": os.getenv("PADDLEOCR_LANG", "latin"),
"defaultLanguage": os.getenv("PADDLEOCR_LANG") or "vi,en",
"defaultProfile": os.getenv("OPENSCREEN_OCR_PROFILE") or "vietnamese",
"loadedEngines": sorted(_engines.keys()),
}
@@ -41,8 +104,12 @@ def health() -> dict[str, Any]:
async def ocr(request: OcrRequest) -> dict[str, Any]:
image_path, should_delete = _resolve_image_path(request)
try:
engine = _get_engine(request.language)
blocks = await run_in_threadpool(_recognize_blocks, engine, image_path)
blocks = await run_in_threadpool(
_recognize_profile_blocks,
image_path,
request.language,
request.profile,
)
return {"blocks": blocks}
finally:
if should_delete:
@@ -73,8 +140,7 @@ def _resolve_image_path(request: OcrRequest) -> tuple[str, bool]:
return handle.name, True
def _get_engine(language: str | None) -> Any:
paddle_lang = _resolve_paddle_language(language)
def _get_engine(paddle_lang: str) -> Any:
cache_key = f"{paddle_lang}|{os.getenv('PADDLEOCR_DEVICE', 'cpu')}"
with _engine_lock:
if cache_key not in _engines:
@@ -105,13 +171,17 @@ def _create_engine(paddle_lang: str) -> Any:
"enable_mkldnn": os.getenv("PADDLEOCR_ENABLE_MKLDNN", "0") == "1",
"use_doc_orientation_classify": False,
"use_doc_unwarping": False,
"use_textline_orientation": False,
"use_textline_orientation": os.getenv("PADDLEOCR_USE_TEXTLINE_ORIENTATION", "0") == "1",
}
if os.getenv("PADDLEOCR_USE_MOBILE", "1") != "0":
modern_kwargs.update(
{
"text_detection_model_name": "PP-OCRv5_mobile_det",
"text_recognition_model_name": _mobile_recognition_model(paddle_lang),
"text_detection_model_name": os.getenv(
"PADDLEOCR_DET_MODEL",
"PP-OCRv5_mobile_det",
),
"text_recognition_model_name": os.getenv("PADDLEOCR_REC_MODEL")
or _mobile_recognition_model(paddle_lang),
}
)
@@ -150,23 +220,236 @@ def _patch_paddlex_frozen_ocr_extra_gate() -> None:
deps._openscreen_ocr_extra_patch = True
def _resolve_paddle_language(language: str | None) -> str:
explicit = os.getenv("PADDLEOCR_LANG")
def _recognize_profile_blocks(
    image_path: str,
    language: str | None,
    profile: str | None,
) -> list[dict[str, Any]]:
    """Run OCR on one image for every language the resolved profile selects.

    The image is prepared once for the resolved profile, recognized with one
    engine per resolved language, and the per-language results are scaled by
    the preparation factor and merged into a single list.

    Args:
        image_path: Path of the image to recognize.
        language: Language string from the request, if any.
        profile: Profile name from the request, if any.

    Returns:
        The merged OCR blocks.
    """
    resolved_profile = _resolve_ocr_profile(profile)
    paddle_languages = _resolve_paddle_languages(language, resolved_profile)
    prepared = _prepare_image_for_profile(image_path, resolved_profile)
    collected: list[dict[str, Any]] = []
    try:
        for paddle_lang in paddle_languages:
            raw_blocks = _recognize_blocks(_get_engine(paddle_lang), prepared.path)
            collected += _scale_blocks(raw_blocks, prepared.scale)
        return _merge_blocks(collected)
    finally:
        # The prepared file is only removed when preparation created it.
        if prepared.should_delete:
            Path(prepared.path).unlink(missing_ok=True)
def _resolve_ocr_profile(profile: str | None) -> str:
    """Pick the OCR profile: environment override, then request, then default.

    Candidates are trimmed and compared case-insensitively.

    Args:
        profile: Profile name supplied with the request, if any.

    Returns:
        One of ``"fast"``, ``"vietnamese"`` or ``"hybrid"``; ``"vietnamese"``
        when neither candidate names a supported profile.
    """
    supported = ("fast", "vietnamese", "hybrid")
    # A valid OPENSCREEN_OCR_PROFILE still wins over the request. An
    # unrecognized value is skipped so that it cannot mask a valid
    # per-request profile and silently force the default.
    for candidate in (os.getenv("OPENSCREEN_OCR_PROFILE"), profile):
        normalized = (candidate or "").strip().lower()
        if normalized in supported:
            return normalized
    return "vietnamese"
def _resolve_paddle_languages(language: str | None, profile: str) -> list[str]:
    """Return the PaddleOCR languages to run for a request.

    A non-empty ``PADDLEOCR_LANG`` environment value wins and is used verbatim
    (trimmed, lower-cased) as the only language. Otherwise the request
    language, defaulting to ``"vi,en"``, is interpreted per profile:

    * ``"hybrid"``: ``"vi"`` (only when requested) followed by ``"latin"``.
    * ``"fast"``: the primary language without preferring Vietnamese.
    * anything else: the primary language, preferring Vietnamese.
    """
    forced = (os.getenv("PADDLEOCR_LANG") or "").strip().lower()
    if forced:
        return [forced]

    language_value = (language or "vi,en").lower()
    if profile == "hybrid":
        candidates = ["latin"]
        if "vi" in _split_language_tags(language_value):
            candidates.insert(0, "vi")
        return _dedupe_languages(candidates)
    return [
        _resolve_primary_paddle_language(
            language_value,
            prefer_vietnamese=profile != "fast",
        )
    ]
def _split_language_tags(language: str) -> set[str]:
    """Split a comma-separated language string into a set of lowercase tags.

    Surrounding whitespace is removed and empty entries are ignored.
    """
    tags: set[str] = set()
    for raw_tag in language.split(","):
        tag = raw_tag.strip()
        if tag:
            tags.add(tag.lower())
    return tags
def _dedupe_languages(languages: list[str]) -> list[str]:
    """Return *languages* without repeats, keeping first-occurrence order."""
    # dict keys are unique and keep insertion order.
    return list(dict.fromkeys(languages))
def _resolve_primary_paddle_language(language_value: str, *, prefer_vietnamese: bool) -> str:
    """Choose the single PaddleOCR language for a comma-separated request.

    Args:
        language_value: Comma-separated language tags, e.g. ``"vi,en"``.
        prefer_vietnamese: When true, a requested ``"vi"`` tag selects ``"vi"``
            instead of falling through to ``"latin"``.

    Returns:
        ``"vi"`` when preferred and requested; ``"latin"`` when any of
        ``"latin"``, ``"vi"`` or ``"en"`` is requested or no tag is given;
        otherwise the first requested tag.
    """
    # Keep the tags in request order. Picking the fallback by iterating a set
    # would depend on string-hash order, which can differ between processes.
    ordered_tags = [
        part.strip().lower() for part in language_value.split(",") if part.strip()
    ]
    if prefer_vietnamese and "vi" in ordered_tags:
        return "vi"
    if any(tag in ("latin", "vi", "en") for tag in ordered_tags):
        return "latin"
    return ordered_tags[0] if ordered_tags else "latin"
def _prepare_image_for_profile(image_path: str, profile: str) -> PreparedImage:
    """Return the image to OCR, upscaled and sharpened unless the profile is fast.

    The input is returned untouched (scale 1.0, nothing to delete) when the
    profile is ``"fast"``, Pillow cannot be imported, the image cannot be
    opened, the resolved scale is not above 1, or the enhanced copy cannot be
    written. Otherwise the result points at a temporary PNG flagged for
    deletion and records the applied scale.
    """
    unchanged = PreparedImage(image_path)
    if profile == "fast":
        return unchanged

    try:
        from PIL import Image, ImageEnhance, ImageOps
    except Exception:
        return unchanged

    try:
        with Image.open(image_path) as opened:
            rgb_image = opened.convert("RGB")
    except Exception:
        return unchanged

    scale = _resolve_enhancement_scale(rgb_image.width, rgb_image.height)
    if scale <= 1:
        return unchanged

    # Use Image.Resampling when the installed Pillow exposes it, else Image.
    filter_namespace = getattr(Image, "Resampling", Image)
    target_size = (round(rgb_image.width * scale), round(rgb_image.height * scale))
    processed = rgb_image.resize(target_size, filter_namespace.LANCZOS)
    processed = ImageOps.autocontrast(processed)
    processed = ImageEnhance.Contrast(processed).enhance(1.25)
    processed = ImageEnhance.Sharpness(processed).enhance(1.35)

    temp_file = tempfile.NamedTemporaryFile(
        prefix="openscreen-ocr-enhanced-",
        suffix=".png",
        delete=False,
    )
    temp_path = temp_file.name
    try:
        # Close the handle before saving so Pillow writes to the path itself.
        temp_file.close()
        processed.save(temp_path, format="PNG")
    except Exception:
        Path(temp_path).unlink(missing_ok=True)
        return unchanged
    return PreparedImage(temp_path, scale=scale, should_delete=True)
def _resolve_enhancement_scale(width: int, height: int) -> float:
    """Compute the upscale factor for an image of the given pixel size.

    ``OPENSCREEN_OCR_ENHANCE_SCALE`` (default 2, limited to 1..3) is the
    requested factor and ``OPENSCREEN_OCR_ENHANCE_MAX_SIDE`` (default 2400)
    caps the longest side after scaling. Unparsable values fall back to the
    defaults. The result is never below 1.0.
    """
    longest_side = max(width, height)
    if longest_side <= 0:
        return 1.0

    try:
        wanted = float(os.getenv("OPENSCREEN_OCR_ENHANCE_SCALE", "2"))
    except ValueError:
        wanted = 2.0
    bounded = max(1.0, min(3.0, wanted))

    try:
        side_limit = int(os.getenv("OPENSCREEN_OCR_ENHANCE_MAX_SIDE", "2400"))
    except ValueError:
        side_limit = 2400

    return max(1.0, min(bounded, side_limit / longest_side))
def _scale_blocks(blocks: list[dict[str, Any]], scale: float) -> list[dict[str, Any]]:
    """Divide pixel boxes by *scale*.

    When *scale* is not above 1 the input list is returned as is. Otherwise a
    new list is built: blocks whose ``box`` is not a dict, or whose box
    ``_box_uses_pixels`` rejects, are passed through unchanged, and every
    other block is copied with its box divided by *scale*.
    """
    if scale <= 1:
        return blocks

    def rescale(block: dict[str, Any]) -> dict[str, Any]:
        box = block.get("box")
        if not isinstance(box, dict) or not _box_uses_pixels(box):
            return block
        shrunk = {
            key: float(box[key]) / scale for key in ("x", "y", "width", "height")
        }
        return {**block, "box": shrunk}

    return [rescale(block) for block in blocks]
def _box_uses_pixels(box: dict[str, Any]) -> bool:
    """Report whether any coordinate or extent of *box* exceeds 1.

    Returns False when a field is missing or cannot be converted to float.
    """
    try:
        x, y, width, height = (
            float(box[key]) for key in ("x", "y", "width", "height")
        )
    except (KeyError, TypeError, ValueError):
        return False
    measures = (x, y, width, height, x + width, y + height)
    return any(measure > 1 for measure in measures)
def _merge_blocks(blocks: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Collapse overlapping blocks and order the result by box position.

    Blocks are visited from highest to lowest ``_block_quality``. A block
    whose ``box`` is not a dict is dropped. A block whose box overlaps an
    already kept one (IoU of at least 0.62) is discarded unless it scores
    strictly higher, in which case it replaces that entry. The kept blocks
    are returned sorted by ``_box_sort_key``.
    """
    kept: list[dict[str, Any]] = []
    for candidate in sorted(blocks, key=_block_quality, reverse=True):
        candidate_box = candidate.get("box")
        if not isinstance(candidate_box, dict):
            continue
        for position, survivor in enumerate(kept):
            if _box_iou(candidate_box, survivor.get("box")) >= 0.62:
                # Only the first overlapping entry is considered.
                if _block_quality(candidate) > _block_quality(survivor):
                    kept[position] = candidate
                break
        else:
            kept.append(candidate)
    kept.sort(key=lambda block: _box_sort_key(block.get("box")))
    return kept
def _block_quality(block: dict[str, Any]) -> float:
    """Score a block for ranking.

    Starts from the block's ``confidence`` as converted by ``_score_to_float``,
    adds 0.08 when the text contains Vietnamese diacritics, and adds a length
    bonus of 0.002 per character (capped at 0.04) for texts of two or more
    characters.
    """
    text = str(block.get("text") or "")
    quality = _score_to_float(block.get("confidence"))
    if _has_vietnamese_diacritics(text):
        quality += 0.08
    text_length = len(text)
    if text_length >= 2:
        quality += min(0.04, text_length * 0.002)
    return quality
def _has_vietnamese_diacritics(text: str) -> bool:
    """Report whether *text* contains a Vietnamese accented letter.

    Args:
        text: Text to inspect.

    Returns:
        True when at least one character, or its lowercase form, appears in
        the table of Vietnamese letters below; False otherwise, including for
        empty text.
    """
    vietnamese_letters = (
        "ăâđêôơưĂÂĐÊÔƠƯáàảãạắằẳẵặấầẩẫậéèẻẽẹếềểễệíìỉĩịóòỏõọốồổỗộớờởỡợúùủũụứừửữựýỳỷỹỵ"
    )
    # The table lists toned vowels in lowercase only, so the lowercase form is
    # checked as well; otherwise all-caps text such as "TIẾNG VIỆT" is missed.
    return any(
        character in vietnamese_letters or character.lower() in vietnamese_letters
        for character in text
    )
def _box_iou(left: Any, right: Any) -> float:
    """Return the intersection-over-union of two ``x/y/width/height`` boxes.

    Returns 0.0 when either argument is not a dict, when a field is missing or
    not convertible to float, when the boxes do not overlap, or when the union
    area is not positive.
    """
    if not (isinstance(left, dict) and isinstance(right, dict)):
        return 0.0

    keys = ("x", "y", "width", "height")
    try:
        lx, ly, lw, lh = (float(left[key]) for key in keys)
        rx, ry, rw, rh = (float(right[key]) for key in keys)
    except (KeyError, TypeError, ValueError):
        return 0.0

    overlap_width = max(0.0, min(lx + lw, rx + rw) - max(lx, rx))
    overlap_height = max(0.0, min(ly + lh, ry + rh) - max(ly, ry))
    overlap_area = overlap_width * overlap_height
    if overlap_area <= 0:
        return 0.0

    union_area = lw * lh + rw * rh - overlap_area
    if union_area > 0:
        return overlap_area / union_area
    return 0.0
def _box_sort_key(box: Any) -> tuple[float, float]:
    """Return ``(y, x)`` of *box* as floats for ordering blocks.

    Falls back to ``(0.0, 0.0)`` when *box* is not a dict or when ``y``/``x``
    is missing or not convertible to float.
    """
    fallback = (0.0, 0.0)
    if not isinstance(box, dict):
        return fallback
    try:
        top, left = float(box["y"]), float(box["x"])
    except (KeyError, TypeError, ValueError):
        return fallback
    return (top, left)
def _mobile_recognition_model(paddle_lang: str) -> str:
if paddle_lang == "en":
return "en_PP-OCRv5_mobile_rec"
if paddle_lang == "latin":
if paddle_lang in _LATIN_RECOGNITION_LANGS:
return "latin_PP-OCRv5_mobile_rec"
return "PP-OCRv5_mobile_rec"