Release OpenScreen 1.4.4

This commit is contained in:
huanld
2026-05-28 12:25:23 +07:00
parent 198dc022b0
commit 0b78ff6f7d
14 changed files with 671 additions and 48 deletions
+8 -4
View File
@@ -6,7 +6,7 @@ OpenScreen calls OCR through a local HTTP service. The default endpoint is:
http://127.0.0.1:8866/ocr
```
The app sends either `imageBase64` or `path` and expects OCR blocks:
The app sends either `imageBase64` or `path`, plus optional `language` and `profile`, and expects OCR blocks:
```json
{
@@ -38,7 +38,7 @@ If `paddle` is still missing after installing `paddleocr`, install the CPU Paddl
```powershell
.\.venv-ocr\Scripts\Activate.ps1
$env:PADDLEOCR_DEVICE="cpu"
$env:PADDLEOCR_LANG="latin"
$env:OPENSCREEN_OCR_PROFILE="vietnamese"
npm run ocr:paddle
```
@@ -58,7 +58,8 @@ Expected healthy environment:
"paddleocrInstalled": true,
"paddleInstalled": true,
"engineReady": false,
"defaultLanguage": "latin"
"defaultLanguage": "vi,en",
"defaultProfile": "vietnamese"
}
```
@@ -67,7 +68,10 @@ Expected healthy environment:
## Configuration
- `PADDLEOCR_DEVICE`: `cpu`, `gpu:0`, or another PaddleOCR device string.
- `PADDLEOCR_LANG`: defaults to `latin`; this is preferred for Vietnamese UI text because it uses a Latin-script recognition model.
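- `OPENSCREEN_OCR_PROFILE`: `fast`, `vietnamese`, or `hybrid`. The default `vietnamese` profile upscales and sharpens focused UI screenshots before OCR. When set on the OCR service, it takes precedence over the `profile` sent with each request, so leave it unset if the app's OCR profile setting should apply.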
- `OPENSCREEN_GUIDE_OCR_LANGUAGE`: defaults to `vi,en`.
- `PADDLEOCR_LANG`: optional hard override. Leave unset for the app profile/language settings to work.
- `PADDLEOCR_VERSION`: defaults to `PP-OCRv5`.
- `PADDLEOCR_USE_MOBILE`: defaults to `1`; set to `0` to use the default/server models.
- `PADDLEOCR_REC_MODEL`: optional recognizer model override. The bundled profile uses `latin_PP-OCRv5_mobile_rec`, which supports Vietnamese Latin-script text.
- `OPENSCREEN_GUIDE_OCR_URL`: OpenScreen OCR endpoint override; defaults to `http://127.0.0.1:8866`.
@@ -0,0 +1,66 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { afterEach, beforeEach, describe, expect, it } from "vitest";
import { DeepSeekSettingsStore } from "./deepseekSettingsStore";
// Temporary directories created during the tests; drained and removed in afterEach.
const tempDirs: string[] = [];
// Snapshot of the OCR environment overrides taken at module load, so each test can
// clear them and afterEach can put the developer's values back.
const originalOcrProfile = process.env.OPENSCREEN_GUIDE_OCR_PROFILE;
const originalOcrLanguage = process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE;
// Every test starts without OCR environment overrides, so the store's built-in
// defaults are what gets observed.
beforeEach(() => {
delete process.env.OPENSCREEN_GUIDE_OCR_PROFILE;
delete process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE;
});
// Restore the original environment and delete every temp directory created by the test.
afterEach(async () => {
restoreEnv("OPENSCREEN_GUIDE_OCR_PROFILE", originalOcrProfile);
restoreEnv("OPENSCREEN_GUIDE_OCR_LANGUAGE", originalOcrLanguage);
// splice(0) empties the shared list while handing back its contents for removal.
await Promise.all(tempDirs.splice(0).map((dir) => fs.rm(dir, { recursive: true, force: true })));
});
/**
 * Puts an environment variable back to a previously captured state.
 *
 * @param name - Name of the environment variable.
 * @param value - Captured value; `undefined` means the variable was not set and
 *   must be removed rather than assigned.
 */
function restoreEnv(name: string, value: string | undefined): void {
  if (value !== undefined) {
    process.env[name] = value;
  } else {
    // Assigning undefined would store the string "undefined", so delete instead.
    delete process.env[name];
  }
}
/**
 * Creates a settings store backed by a file inside a fresh temporary directory.
 * The directory is recorded in `tempDirs` so the afterEach hook can remove it.
 */
async function createStore(): Promise<DeepSeekSettingsStore> {
  const prefix = path.join(os.tmpdir(), "openscreen-guide-settings-");
  const dir = await fs.mkdtemp(prefix);
  tempDirs.push(dir);
  const settingsFile = path.join(dir, "guide-ai-settings.json");
  return new DeepSeekSettingsStore(settingsFile);
}
// Covers the OCR portion of the settings store: built-in defaults and persistence.
describe("DeepSeekSettingsStore OCR settings", () => {
// With no settings file and no environment overrides, the store reports its defaults.
it("defaults to the Vietnamese enhanced OCR profile", async () => {
const store = await createStore();
await expect(store.getOcrConfig()).resolves.toEqual({
profile: "vietnamese",
language: "vi,en",
});
});
// Saving OCR fields together with DeepSeek fields must be reflected both in the
// status returned by save() and in a subsequent getOcrConfig() call.
it("persists OCR profile changes alongside DeepSeek settings", async () => {
const store = await createStore();
const status = await store.save({
deepseekApiKeyEnvName: "DEEPSEEK_API_KEY",
baseUrl: "https://api.deepseek.com",
model: "deepseek-chat",
ocrProfile: "hybrid",
ocrLanguage: "vi,en",
});
// toMatchObject tolerates the extra updatedAt timestamp written by save().
expect(status.ocr).toMatchObject({
profile: "hybrid",
language: "vi,en",
});
await expect(store.getOcrConfig()).resolves.toEqual({
profile: "hybrid",
language: "vi,en",
});
});
});
+68 -2
View File
@@ -1,6 +1,10 @@
import fs from "node:fs/promises";
import path from "node:path";
import type { GuideAiSettings, SaveGuideAiSettingsInput } from "../../../src/guide/contracts";
import type {
GuideAiSettings,
GuideOcrProfile,
SaveGuideAiSettingsInput,
} from "../../../src/guide/contracts";
export interface DeepSeekGuideConfig {
apiKey?: string;
@@ -12,8 +16,22 @@ export interface DeepSeekGuideConfigProvider {
getDeepSeekConfig(): Promise<DeepSeekGuideConfig>;
}
/** Effective OCR configuration resolved by the settings store. */
export interface GuideOcrConfig {
/** OCR processing profile: "fast", "vietnamese", or "hybrid". */
profile: GuideOcrProfile;
/** Comma-separated language tags, e.g. "vi,en". */
language: string;
}
/** Source of the OCR configuration; implemented by DeepSeekSettingsStore. */
export interface GuideOcrConfigProvider {
/** Resolves the OCR profile and language to use. */
getOcrConfig(): Promise<GuideOcrConfig>;
}
interface PersistedGuideAiSettings {
schemaVersion: 1;
ocr?: {
profile?: GuideOcrProfile;
language?: string;
updatedAt?: string;
};
deepseek?: {
apiKeyEnvName?: string;
baseUrl?: string;
@@ -25,8 +43,10 @@ interface PersistedGuideAiSettings {
const DEFAULT_DEEPSEEK_API_KEY_ENV_NAME = "DEEPSEEK_API_KEY";
const DEFAULT_DEEPSEEK_BASE_URL = "https://api.deepseek.com";
const DEFAULT_DEEPSEEK_MODEL = "deepseek-chat";
const DEFAULT_OCR_PROFILE: GuideOcrProfile = "vietnamese";
const DEFAULT_OCR_LANGUAGE = "vi,en";
export class DeepSeekSettingsStore implements DeepSeekGuideConfigProvider {
export class DeepSeekSettingsStore implements DeepSeekGuideConfigProvider, GuideOcrConfigProvider {
constructor(private readonly filePath: string) {}
async getStatus(): Promise<GuideAiSettings> {
@@ -35,6 +55,13 @@ export class DeepSeekSettingsStore implements DeepSeekGuideConfigProvider {
const activeApiKey = process.env[apiKeyEnvName];
return {
ocr: {
profile: normalizeOcrProfile(raw?.ocr?.profile ?? process.env.OPENSCREEN_GUIDE_OCR_PROFILE),
language: normalizeOcrLanguage(
raw?.ocr?.language ?? process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE,
),
updatedAt: raw?.ocr?.updatedAt,
},
deepseek: {
hasApiKey: Boolean(activeApiKey),
apiKeyEnvName,
@@ -49,7 +76,14 @@ export class DeepSeekSettingsStore implements DeepSeekGuideConfigProvider {
async save(input: SaveGuideAiSettingsInput): Promise<GuideAiSettings> {
const current = (await this.readSettings()) ?? { schemaVersion: 1 };
const currentOcr = current.ocr ?? {};
const currentDeepSeek = current.deepseek ?? {};
const nextOcr = {
...currentOcr,
profile: normalizeOcrProfile(input.ocrProfile ?? currentOcr.profile),
language: normalizeOcrLanguage(input.ocrLanguage ?? currentOcr.language),
updatedAt: new Date().toISOString(),
};
const nextDeepSeek = {
...currentDeepSeek,
baseUrl: normalizeBaseUrl(input.baseUrl ?? currentDeepSeek.baseUrl),
@@ -65,6 +99,7 @@ export class DeepSeekSettingsStore implements DeepSeekGuideConfigProvider {
await this.writeSettings({
schemaVersion: 1,
ocr: nextOcr,
deepseek: nextDeepSeek,
});
return await this.getStatus();
@@ -80,6 +115,16 @@ export class DeepSeekSettingsStore implements DeepSeekGuideConfigProvider {
};
}
/**
 * Resolves the OCR configuration to use for recognition.
 *
 * Persisted settings win over the OPENSCREEN_GUIDE_OCR_PROFILE and
 * OPENSCREEN_GUIDE_OCR_LANGUAGE environment variables; anything missing or
 * invalid is replaced by the normalizers' defaults.
 */
async getOcrConfig(): Promise<GuideOcrConfig> {
  const persistedOcr = (await this.readSettings())?.ocr;
  const profile = normalizeOcrProfile(
    persistedOcr?.profile ?? process.env.OPENSCREEN_GUIDE_OCR_PROFILE,
  );
  const language = normalizeOcrLanguage(
    persistedOcr?.language ?? process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE,
  );
  return { profile, language };
}
private async readSettings(): Promise<PersistedGuideAiSettings | null> {
try {
const content = await fs.readFile(this.filePath, "utf-8");
@@ -120,6 +165,11 @@ function normalizePersistedSettings(input: unknown): PersistedGuideAiSettings |
}
return {
schemaVersion: 1,
ocr: {
profile: normalizeOcrProfile(raw.ocr?.profile),
language: normalizeOcrLanguage(raw.ocr?.language),
updatedAt: raw.ocr?.updatedAt,
},
deepseek: {
apiKeyEnvName: normalizeEnvName(raw.deepseek?.apiKeyEnvName),
baseUrl: raw.deepseek?.baseUrl,
@@ -155,3 +205,19 @@ function normalizeBaseUrl(value: string | undefined): string {
function normalizeModel(value: string | undefined): string {
return value?.trim() || DEFAULT_DEEPSEEK_MODEL;
}
/**
 * Coerces an arbitrary profile string into a supported OCR profile.
 *
 * Surrounding whitespace and letter case are ignored, so values such as
 * " Hybrid " coming from an environment variable are accepted. This matches
 * how OCR language tags are normalized in this file.
 *
 * @param value - Candidate profile from persisted settings, user input, or the environment.
 * @returns The matching profile, or the default profile when the value is missing or unknown.
 */
function normalizeOcrProfile(value: string | undefined): GuideOcrProfile {
  const candidate = value?.trim().toLowerCase();
  if (candidate === "fast" || candidate === "vietnamese" || candidate === "hybrid") {
    return candidate;
  }
  return DEFAULT_OCR_PROFILE;
}
/**
 * Canonicalizes a comma-separated OCR language list.
 *
 * Each tag is trimmed and lower-cased, empty tags are dropped, and the rest
 * are re-joined with commas in their original order.
 *
 * @param value - Raw language list, e.g. " VI , en ".
 * @returns The cleaned list, or the default language list when nothing usable remains.
 */
function normalizeOcrLanguage(value: string | undefined): string {
  if (value == null) {
    return DEFAULT_OCR_LANGUAGE;
  }
  const tags: string[] = [];
  for (const rawTag of value.split(",")) {
    const tag = rawTag.trim().toLowerCase();
    if (tag) {
      tags.push(tag);
    }
  }
  return tags.length > 0 ? tags.join(",") : DEFAULT_OCR_LANGUAGE;
}
+8 -2
View File
@@ -34,7 +34,10 @@ import {
DeepSeekGuideClientError,
type GuideDraftClient,
} from "./ai/deepseekGuideClient";
import type { DeepSeekGuideConfigProvider } from "./ai/deepseekSettingsStore";
import type {
DeepSeekGuideConfigProvider,
GuideOcrConfigProvider,
} from "./ai/deepseekSettingsStore";
import { type GuidePaths, normalizeGuideRecordingId, resolveGuidePaths } from "./guidePaths";
import { createFocusedOcrSnapshot, remapFocusedOcrBlocks } from "./ocr/focusedOcrSnapshot";
import { DefaultGuideOcrClient, type GuideOcrClient } from "./ocr/paddleOcrClient";
@@ -70,6 +73,7 @@ export interface GuideStoreDependencies {
ocrClient?: GuideOcrClient;
draftClient?: GuideDraftClient;
deepSeekConfigProvider?: DeepSeekGuideConfigProvider;
ocrConfigProvider?: GuideOcrConfigProvider;
focusOcrSnapshots?: boolean;
}
@@ -255,7 +259,9 @@ export class GuideStore {
throw new GuideStoreError("guide-invalid-input", "No guide snapshots are available for OCR.");
}
const ocrClient = this.dependencies.ocrClient ?? new DefaultGuideOcrClient();
const ocrClient =
this.dependencies.ocrClient ??
DefaultGuideOcrClient.fromConfig(await this.dependencies.ocrConfigProvider?.getOcrConfig());
const shouldFocusOcrSnapshots =
this.dependencies.focusOcrSnapshots ?? this.dependencies.ocrClient === undefined;
const eventsById = new Map(session.events.map((event) => [event.id, event]));
+3 -1
View File
@@ -156,8 +156,10 @@ function startOcrServiceProcess(
OPENSCREEN_OCR_PORT: DEFAULT_OCR_PORT,
PADDLEOCR_DEVICE: process.env.PADDLEOCR_DEVICE ?? "cpu",
PADDLEOCR_ENABLE_MKLDNN: process.env.PADDLEOCR_ENABLE_MKLDNN ?? "0",
PADDLEOCR_LANG: process.env.PADDLEOCR_LANG ?? "latin",
PADDLEOCR_LANG: process.env.PADDLEOCR_LANG ?? "",
PADDLEOCR_USE_MOBILE: process.env.PADDLEOCR_USE_MOBILE ?? "1",
OPENSCREEN_OCR_PROFILE:
process.env.OPENSCREEN_OCR_PROFILE ?? process.env.OPENSCREEN_GUIDE_OCR_PROFILE ?? "",
PADDLE_PDX_ENABLE_MKLDNN_BYDEFAULT: process.env.PADDLE_PDX_ENABLE_MKLDNN_BYDEFAULT ?? "False",
PADDLE_PDX_CACHE_HOME: process.env.PADDLE_PDX_CACHE_HOME ?? runtimePaths.paddlexCachePath,
PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK:
+38 -1
View File
@@ -1,8 +1,12 @@
import { describe, expect, it } from "vitest";
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { afterEach, describe, expect, it, vi } from "vitest";
import type { GuideSnapshot, OcrBlock } from "../../../src/guide/contracts";
import {
DefaultGuideOcrClient,
normalizeOcrResponse,
PaddleOcrHttpClient,
parseWindowsOcrPayload,
} from "./paddleOcrClient";
@@ -16,6 +20,10 @@ const snapshot: GuideSnapshot = {
height: 800,
};
afterEach(() => {
vi.unstubAllGlobals();
});
describe("normalizeOcrResponse", () => {
it("normalizes pixel boxes into guide OCR blocks", () => {
const blocks = normalizeOcrResponse(
@@ -67,6 +75,35 @@ describe("normalizeOcrResponse", () => {
});
});
// Verifies the request payload the HTTP client sends to the OCR service.
describe("PaddleOcrHttpClient", () => {
  it("sends the selected OCR profile to the local service", async () => {
    const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "openscreen-ocr-client-"));
    try {
      const imagePath = path.join(tempDir, "step.png");
      // Only the first four PNG signature bytes; the client just needs a readable file.
      await fs.writeFile(imagePath, Buffer.from([137, 80, 78, 71]));
      const requests: unknown[] = [];
      // Replace global fetch so no network call happens and the JSON body can be inspected.
      vi.stubGlobal(
        "fetch",
        vi.fn(async (_url: string, init?: RequestInit) => {
          requests.push(JSON.parse(String(init?.body ?? "{}")));
          return new Response(JSON.stringify({ blocks: [] }), {
            status: 200,
            headers: { "content-type": "application/json" },
          });
        }),
      );
      const client = new PaddleOcrHttpClient("https://ocr.example.test", "vi,en", "hybrid");
      await client.recognize({ ...snapshot, path: imagePath });
      expect(requests[0]).toMatchObject({
        language: "vi,en",
        profile: "hybrid",
        path: imagePath,
      });
    } finally {
      // Remove the fixture directory even when an assertion above fails.
      await fs.rm(tempDir, { recursive: true, force: true });
    }
  });
});
describe("DefaultGuideOcrClient", () => {
it("falls back when the HTTP OCR service is unavailable", async () => {
const fallbackBlock: OcrBlock = {
+45 -3
View File
@@ -1,7 +1,7 @@
import { execFile } from "node:child_process";
import fs from "node:fs/promises";
import { promisify } from "node:util";
import type { GuideSnapshot, OcrBlock } from "../../../src/guide/contracts";
import type { GuideOcrProfile, GuideSnapshot, OcrBlock } from "../../../src/guide/contracts";
import { ensureBundledOcrServiceRunning } from "./bundledOcrService";
const execFileAsync = promisify(execFile);
@@ -10,6 +10,11 @@ export interface GuideOcrClient {
recognize(snapshot: GuideSnapshot): Promise<OcrBlock[]>;
}
/** Settings used by DefaultGuideOcrClient.fromConfig to build its OCR clients. */
export interface GuideOcrClientConfig {
/** OCR processing profile sent to the HTTP OCR service with each request. */
profile: GuideOcrProfile;
/** Comma-separated language tags, e.g. "vi,en". */
language: string;
}
interface PaddleOcrResponseBlock {
text?: unknown;
confidence?: unknown;
@@ -21,7 +26,8 @@ interface PaddleOcrResponseBlock {
export class PaddleOcrHttpClient implements GuideOcrClient {
constructor(
private readonly baseUrl = process.env.OPENSCREEN_GUIDE_OCR_URL ?? "http://127.0.0.1:8866",
private readonly language = process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE ?? "vi,en",
private readonly language = normalizeOcrLanguage(process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE),
private readonly profile = normalizeOcrProfile(process.env.OPENSCREEN_GUIDE_OCR_PROFILE),
) {}
async recognize(snapshot: GuideSnapshot): Promise<OcrBlock[]> {
@@ -36,6 +42,7 @@ export class PaddleOcrHttpClient implements GuideOcrClient {
imageBase64,
path: snapshot.path,
language: this.language,
profile: this.profile,
}),
});
} catch (error) {
@@ -54,7 +61,9 @@ export class PaddleOcrHttpClient implements GuideOcrClient {
}
export class WindowsOcrClient implements GuideOcrClient {
constructor(private readonly language = process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE ?? "vi,en") {}
constructor(
private readonly language = normalizeOcrLanguage(process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE),
) {}
async recognize(snapshot: GuideSnapshot): Promise<OcrBlock[]> {
if (process.platform !== "win32") {
@@ -96,6 +105,14 @@ export class WindowsOcrClient implements GuideOcrClient {
}
export class DefaultGuideOcrClient implements GuideOcrClient {
/**
 * Builds a client whose HTTP and Windows OCR backends share one normalized configuration.
 *
 * @param config - Optional profile/language; missing fields fall back to the
 *   environment and then to the built-in defaults.
 */
static fromConfig(config?: Partial<GuideOcrClientConfig>): DefaultGuideOcrClient {
  const { language, profile } = normalizeOcrClientConfig(config);
  // Passing undefined keeps PaddleOcrHttpClient's default base URL.
  const httpClient = new PaddleOcrHttpClient(undefined, language, profile);
  const windowsClient = new WindowsOcrClient(language);
  return new DefaultGuideOcrClient(httpClient, windowsClient);
}
constructor(
private readonly httpClient = new PaddleOcrHttpClient(),
private readonly windowsClient = new WindowsOcrClient(),
@@ -119,6 +136,31 @@ export class DefaultGuideOcrClient implements GuideOcrClient {
}
}
/**
 * Fills in and validates an OCR client configuration.
 *
 * Explicit config fields win; otherwise the OPENSCREEN_GUIDE_OCR_PROFILE and
 * OPENSCREEN_GUIDE_OCR_LANGUAGE environment variables are used, and the
 * normalizers supply defaults for anything missing or invalid.
 */
function normalizeOcrClientConfig(
  config: Partial<GuideOcrClientConfig> | undefined,
): GuideOcrClientConfig {
  const rawProfile = config?.profile ?? process.env.OPENSCREEN_GUIDE_OCR_PROFILE;
  const rawLanguage = config?.language ?? process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE;
  const profile = normalizeOcrProfile(rawProfile);
  const language = normalizeOcrLanguage(rawLanguage);
  return { profile, language };
}
/**
 * Coerces an arbitrary profile string into a supported OCR profile.
 *
 * Surrounding whitespace and letter case are ignored, so values such as
 * " Hybrid " coming from an environment variable are accepted. This matches
 * how OCR language tags are normalized in this file.
 *
 * @param value - Candidate profile from configuration or the environment.
 * @returns The matching profile, or "vietnamese" when the value is missing or unknown.
 */
function normalizeOcrProfile(value: string | undefined): GuideOcrProfile {
  const candidate = value?.trim().toLowerCase();
  if (candidate === "fast" || candidate === "vietnamese" || candidate === "hybrid") {
    return candidate;
  }
  return "vietnamese";
}
/**
 * Canonicalizes a comma-separated OCR language list.
 *
 * Tags are trimmed, lower-cased and re-joined in their original order; empty
 * tags are discarded.
 *
 * @param value - Raw language list, e.g. " VI , en ".
 * @returns The cleaned list, or "vi,en" when nothing usable remains.
 */
function normalizeOcrLanguage(value: string | undefined): string {
  const tags = (value ?? "")
    .split(",")
    .map((rawTag) => rawTag.trim().toLowerCase())
    .filter((tag) => tag.length > 0);
  if (tags.length === 0) {
    return "vi,en";
  }
  return tags.join(",");
}
export function parseWindowsOcrPayload(stdout: string): unknown {
const normalized = stdout.replace(/^\uFEFF/, "").trim();
try {
+39 -1
View File
@@ -1732,7 +1732,7 @@ export function registerIpcHandlers(
const sources = await desktopCapturer.getSources(opts);
lastEnumeratedSources = new Map(sources.map((source) => [source.id, source]));
let screenSourceIndex = 0;
return sources.map((source) => {
const processedSources = sources.map((source) => {
const isScreenSource = source.id.startsWith("screen:");
const sourceIndex = isScreenSource
? (parseDesktopCapturerScreenIndex(source.id) ?? screenSourceIndex)
@@ -1760,6 +1760,43 @@ export function registerIpcHandlers(
bounds,
};
});
const screenDisplays = screen.getAllDisplays();
const mappedDisplayIds = new Set(
processedSources
.filter((source) => source.id.startsWith("screen:") && typeof source.displayId === "number")
.map((source) => source.displayId),
);
const fallbackScreenSources = screenDisplays
.map((display, displayIndex) => ({ display, displayIndex }))
.filter(({ display }) => !mappedDisplayIds.has(display.id))
.map(({ display, displayIndex }) => {
const bounds = toSourceBounds(display.bounds);
return {
id: `screen:${displayIndex}:fallback:${display.id}`,
name: `Screen ${displayIndex + 1}`,
display_id: String(display.id),
thumbnail: null,
appIcon: null,
displayId: display.id,
displayIndex,
screenIndex: displayIndex,
displayLabel: `Display ${displayIndex + 1} - ${bounds.width}x${bounds.height} @ ${bounds.x},${bounds.y}`,
bounds,
};
});
if (fallbackScreenSources.length > 0) {
console.warn("[desktop-capturer] added fallback display sources", {
capturerScreens: processedSources.filter((source) => source.id.startsWith("screen:"))
.length,
electronDisplays: screenDisplays.length,
fallbackScreens: fallbackScreenSources.map((source) => ({
id: source.id,
displayId: source.displayId,
bounds: source.bounds,
})),
});
}
return [...processedSources, ...fallbackScreenSources];
});
ipcMain.handle("select-source", async (_, source: SelectedSource) => {
@@ -2637,6 +2674,7 @@ export function registerIpcHandlers(
);
const guideStore = new GuideStore(RECORDINGS_DIR, {
deepSeekConfigProvider: guideAiSettingsStore,
ocrConfigProvider: guideAiSettingsStore,
});
registerGuideMarkerHotkey(guideStore);
registerGuideIpcHandlers(ipcMain, guideStore, guideAiSettingsStore, {
+2 -2
View File
@@ -1,12 +1,12 @@
{
"name": "openscreen",
"version": "1.4.2",
"version": "1.4.4",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "openscreen",
"version": "1.4.2",
"version": "1.4.4",
"dependencies": {
"@fix-webm-duration/fix": "^1.0.1",
"@pixi/filter-drop-shadow": "^5.2.0",
+1 -1
View File
@@ -1,7 +1,7 @@
{
"name": "openscreen",
"private": true,
"version": "1.4.2",
"version": "1.4.4",
"type": "module",
"packageManager": "npm@10.9.4",
"engines": {
+14 -2
View File
@@ -65,7 +65,13 @@ export function SourceSelector() {
fetchSources();
}, []);
const screenSources = sources.filter((s) => s.id.startsWith("screen:"));
const screenSources = sources
.filter((s) => s.id.startsWith("screen:"))
.sort(
(left, right) =>
(left.displayIndex ?? left.screenIndex ?? Number.MAX_SAFE_INTEGER) -
(right.displayIndex ?? right.screenIndex ?? Number.MAX_SAFE_INTEGER),
);
const windowSources = sources.filter((s) => s.id.startsWith("window:"));
const handleSourceSelect = (source: DesktopSource) => setSelectedSource(source);
@@ -96,11 +102,17 @@ export function SourceSelector() {
onClick={() => handleSourceSelect(source)}
>
<div className="relative mb-1.5 overflow-hidden rounded-lg border border-white/[0.06] bg-black/30">
{source.thumbnail ? (
<img
src={source.thumbnail || ""}
src={source.thumbnail}
alt={source.name}
className="w-full aspect-video object-cover"
/>
) : (
<div className="flex aspect-video w-full items-center justify-center bg-zinc-950 text-center text-[11px] font-medium text-zinc-400">
{source.displayLabel ?? source.name}
</div>
)}
{isSelected && (
<div className="absolute right-1.5 top-1.5">
<div className={styles.checkBadge}>
@@ -7,6 +7,7 @@ import type {
GuideAiProvider,
GuideAiSettings,
GuideLanguage,
GuideOcrProfile,
GuideSession,
} from "@/guide/contracts";
import { captureGuideSnapshots } from "@/guide/snapshot/extractGuideSnapshots";
@@ -42,13 +43,19 @@ const COPY = {
captureStep: "Capture step",
captureLabel: "Manual capture",
settings: "Settings",
guideSettings: "Guide settings",
apiKey: "API key env",
apiKeyPlaceholder: "DEEPSEEK_API_KEY",
baseUrl: "Base URL",
model: "Model",
ocrProfile: "OCR profile",
ocrLanguage: "OCR languages",
ocrFast: "Fast Latin",
ocrVietnamese: "Vietnamese Enhanced",
ocrHybrid: "Hybrid Vi + Latin",
saveSettings: "Save",
clearKey: "Reset env",
keySaved: "DeepSeek settings saved.",
settingsSaved: "Guide settings saved.",
keyMissing: "Set a DeepSeek API key environment variable before generating with DeepSeek.",
keyConfigured: "Env ready",
keyNotConfigured: "Env value missing",
@@ -78,13 +85,19 @@ const COPY = {
captureStep: "Chụp bước",
captureLabel: "Chụp thủ công",
settings: "Cài đặt",
guideSettings: "Guide settings",
apiKey: "API key env",
apiKeyPlaceholder: "DEEPSEEK_API_KEY",
baseUrl: "Base URL",
model: "Model",
ocrProfile: "OCR profile",
ocrLanguage: "OCR languages",
ocrFast: "Fast Latin",
ocrVietnamese: "Vietnamese Enhanced",
ocrHybrid: "Hybrid Vi + Latin",
saveSettings: "Lưu",
clearKey: "Reset env",
keySaved: "Đã lưu cài đặt DeepSeek.",
// Vietnamese toast shown after guide settings are saved (diacritics restored).
settingsSaved: "Đã lưu cài đặt guide.",
keyMissing: "Hãy set biến môi trường DeepSeek API key trước khi tạo draft bằng DeepSeek.",
keyConfigured: "Env ready",
keyNotConfigured: "Chưa thấy giá trị env",
@@ -108,6 +121,8 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
const [deepSeekApiKeyEnvName, setDeepSeekApiKeyEnvName] = useState("DEEPSEEK_API_KEY");
const [deepSeekBaseUrl, setDeepSeekBaseUrl] = useState("https://api.deepseek.com");
const [deepSeekModel, setDeepSeekModel] = useState("deepseek-chat");
const [ocrProfile, setOcrProfile] = useState<GuideOcrProfile>("vietnamese");
const [ocrLanguage, setOcrLanguage] = useState("vi,en");
const [message, setMessage] = useState<string | null>(null);
const isBusy = busyAction !== null;
@@ -138,6 +153,8 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
setDeepSeekBaseUrl(result.data.deepseek.baseUrl);
setDeepSeekModel(result.data.deepseek.model);
setDeepSeekApiKeyEnvName(result.data.deepseek.apiKeyEnvName);
setOcrProfile(result.data.ocr.profile);
setOcrLanguage(result.data.ocr.language);
}, []);
useEffect(() => {
@@ -269,6 +286,8 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
deepseekApiKeyEnvName: deepSeekApiKeyEnvName,
baseUrl: deepSeekBaseUrl,
model: deepSeekModel,
ocrProfile,
ocrLanguage,
});
if (!result.success) {
throw new Error(result.error);
@@ -277,7 +296,9 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
setDeepSeekApiKeyEnvName(result.data.deepseek.apiKeyEnvName);
setDeepSeekBaseUrl(result.data.deepseek.baseUrl);
setDeepSeekModel(result.data.deepseek.model);
toast.success(copy.keySaved);
setOcrProfile(result.data.ocr.profile);
setOcrLanguage(result.data.ocr.language);
toast.success(copy.settingsSaved);
} catch (error) {
const text = error instanceof Error ? error.message : String(error);
setMessage(text);
@@ -285,7 +306,14 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
} finally {
setSettingsBusy(false);
}
}, [copy.keySaved, deepSeekApiKeyEnvName, deepSeekBaseUrl, deepSeekModel]);
}, [
copy.settingsSaved,
deepSeekApiKeyEnvName,
deepSeekBaseUrl,
deepSeekModel,
ocrLanguage,
ocrProfile,
]);
const handleClearDeepSeekKey = useCallback(async () => {
if (!window.electronAPI?.guide?.saveAiSettings) {
@@ -298,13 +326,17 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
clearDeepseekApiKeyEnvName: true,
baseUrl: deepSeekBaseUrl,
model: deepSeekModel,
ocrProfile,
ocrLanguage,
});
if (!result.success) {
throw new Error(result.error);
}
setAiSettings(result.data);
setDeepSeekApiKeyEnvName(result.data.deepseek.apiKeyEnvName);
toast.success(copy.keySaved);
setOcrProfile(result.data.ocr.profile);
setOcrLanguage(result.data.ocr.language);
toast.success(copy.settingsSaved);
} catch (error) {
const text = error instanceof Error ? error.message : String(error);
setMessage(text);
@@ -312,7 +344,7 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
} finally {
setSettingsBusy(false);
}
}, [copy.keySaved, deepSeekBaseUrl, deepSeekModel]);
}, [copy.settingsSaved, deepSeekBaseUrl, deepSeekModel, ocrLanguage, ocrProfile]);
const handleGenerateGuide = useCallback(() => {
void runAction("generate", async () => {
@@ -455,7 +487,7 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
<div className="flex items-center justify-between gap-2">
<div className="min-w-0">
<div className="truncate text-[11px] font-semibold text-slate-100">
{copy.deepseek} {copy.settings}
{copy.guideSettings}
</div>
<div className="truncate text-[10px] text-slate-500">
{aiSettings?.deepseek.hasApiKey
@@ -470,6 +502,33 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
</span>
</div>
<div className="grid grid-cols-2 gap-1.5">
<label className="block min-w-0 text-[10px] font-medium text-slate-400">
{copy.ocrProfile}
<select
value={ocrProfile}
onChange={(event) => setOcrProfile(event.target.value as GuideOcrProfile)}
disabled={settingsBusy}
className="mt-1 h-8 w-full rounded-md border border-white/[0.08] bg-black/20 px-2 text-[11px] text-slate-100 outline-none"
>
<option value="vietnamese">{copy.ocrVietnamese}</option>
<option value="hybrid">{copy.ocrHybrid}</option>
<option value="fast">{copy.ocrFast}</option>
</select>
</label>
<label className="block min-w-0 text-[10px] font-medium text-slate-400">
{copy.ocrLanguage}
<input
type="text"
value={ocrLanguage}
onChange={(event) => setOcrLanguage(event.target.value)}
placeholder="vi,en"
disabled={settingsBusy}
className="mt-1 h-8 w-full rounded-md border border-white/[0.08] bg-black/20 px-2 text-[11px] text-slate-100 outline-none placeholder:text-slate-600"
/>
</label>
</div>
<label className="block text-[10px] font-medium text-slate-400">
{copy.apiKey}
<input
+8
View File
@@ -9,6 +9,7 @@ export type GuideTargetRole = "button" | "menu" | "tab" | "field" | "link" | "un
export type GuideLanguage = "vi" | "en";
export type GuideAiProvider = "deepseek" | "local";
export type GuideSecretStorage = "environment" | "none";
export type GuideOcrProfile = "fast" | "vietnamese" | "hybrid";
export type GuideSessionStatus =
| "recording"
@@ -178,6 +179,11 @@ export interface GenerateGuideDraftInput {
}
export interface GuideAiSettings {
ocr: {
profile: GuideOcrProfile;
language: string;
updatedAt?: string;
};
deepseek: {
hasApiKey: boolean;
apiKeyEnvName: string;
@@ -194,6 +200,8 @@ export interface SaveGuideAiSettingsInput {
clearDeepseekApiKeyEnvName?: boolean;
baseUrl?: string;
model?: string;
ocrProfile?: GuideOcrProfile;
ocrLanguage?: string;
}
export interface SaveGuideInput {
+301 -18
View File
@@ -5,6 +5,7 @@ import importlib.util
import os
import sys
import tempfile
from dataclasses import dataclass
from pathlib import Path
from threading import Lock
from typing import Any
@@ -17,6 +18,65 @@ app = FastAPI(title="OpenScreen PaddleOCR service")
_engines: dict[str, Any] = {}
_engine_lock = Lock()
# Language tags grouped under the Latin-script recognizer.
# NOTE(review): the consumer is not visible in this hunk — presumably
# _mobile_recognition_model uses it to pick the latin recognition model; confirm.
_LATIN_RECOGNITION_LANGS = {
"af",
"az",
"bs",
"ca",
"cs",
"cy",
"da",
"de",
"en",
"es",
"et",
"eu",
"fi",
"fr",
"ga",
"gl",
"hr",
"hu",
"id",
"is",
"it",
"ku",
"la",
"latin",
"lb",
"lt",
"lv",
"mi",
"ms",
"mt",
"nl",
"no",
"oc",
"pi",
"pl",
"pt",
"qu",
"rm",
"ro",
"rs_latin",
"rslatin",
"sk",
"sl",
"sq",
"sv",
"sw",
"tl",
"tr",
"uz",
"vi",
}
# Immutable description of the image file handed to the OCR engine after
# optional profile-specific enhancement.
@dataclass(frozen=True)
class PreparedImage:
# Filesystem path of the image to run OCR on.
path: str
# Upscale factor applied to the image; pixel boxes are divided by it afterwards.
scale: float = 1.0
# True when `path` is a temporary file that must be removed after recognition.
should_delete: bool = False
class OcrRequest(BaseModel):
@@ -24,6 +84,7 @@ class OcrRequest(BaseModel):
path: str | None = None
imagePath: str | None = None
language: str | None = None
profile: str | None = None
@app.get("/health")
@@ -33,7 +94,9 @@ def health() -> dict[str, Any]:
"paddleocrInstalled": importlib.util.find_spec("paddleocr") is not None,
"paddleInstalled": importlib.util.find_spec("paddle") is not None,
"engineReady": bool(_engines),
"defaultLanguage": os.getenv("PADDLEOCR_LANG", "latin"),
"defaultLanguage": os.getenv("PADDLEOCR_LANG") or "vi,en",
"defaultProfile": os.getenv("OPENSCREEN_OCR_PROFILE") or "vietnamese",
"loadedEngines": sorted(_engines.keys()),
}
@@ -41,8 +104,12 @@ def health() -> dict[str, Any]:
async def ocr(request: OcrRequest) -> dict[str, Any]:
image_path, should_delete = _resolve_image_path(request)
try:
engine = _get_engine(request.language)
blocks = await run_in_threadpool(_recognize_blocks, engine, image_path)
blocks = await run_in_threadpool(
_recognize_profile_blocks,
image_path,
request.language,
request.profile,
)
return {"blocks": blocks}
finally:
if should_delete:
@@ -73,8 +140,7 @@ def _resolve_image_path(request: OcrRequest) -> tuple[str, bool]:
return handle.name, True
def _get_engine(language: str | None) -> Any:
paddle_lang = _resolve_paddle_language(language)
def _get_engine(paddle_lang: str) -> Any:
cache_key = f"{paddle_lang}|{os.getenv('PADDLEOCR_DEVICE', 'cpu')}"
with _engine_lock:
if cache_key not in _engines:
@@ -105,13 +171,17 @@ def _create_engine(paddle_lang: str) -> Any:
"enable_mkldnn": os.getenv("PADDLEOCR_ENABLE_MKLDNN", "0") == "1",
"use_doc_orientation_classify": False,
"use_doc_unwarping": False,
"use_textline_orientation": False,
"use_textline_orientation": os.getenv("PADDLEOCR_USE_TEXTLINE_ORIENTATION", "0") == "1",
}
if os.getenv("PADDLEOCR_USE_MOBILE", "1") != "0":
modern_kwargs.update(
{
"text_detection_model_name": "PP-OCRv5_mobile_det",
"text_recognition_model_name": _mobile_recognition_model(paddle_lang),
"text_detection_model_name": os.getenv(
"PADDLEOCR_DET_MODEL",
"PP-OCRv5_mobile_det",
),
"text_recognition_model_name": os.getenv("PADDLEOCR_REC_MODEL")
or _mobile_recognition_model(paddle_lang),
}
)
@@ -150,23 +220,236 @@ def _patch_paddlex_frozen_ocr_extra_gate() -> None:
deps._openscreen_ocr_extra_patch = True
def _resolve_paddle_language(language: str | None) -> str:
explicit = os.getenv("PADDLEOCR_LANG")
def _recognize_profile_blocks(
    image_path: str,
    language: str | None,
    profile: str | None,
) -> list[dict[str, Any]]:
    """Run OCR on one image according to the requested profile and language.

    The profile and language are resolved first, the image is prepared for that
    profile, and one recognition pass runs per resolved PaddleOCR language.
    Boxes from every pass are scaled back by the preparation scale and merged.
    Any temporary image created during preparation is removed afterwards.
    """
    ocr_profile = _resolve_ocr_profile(profile)
    paddle_languages = _resolve_paddle_languages(language, ocr_profile)
    prepared = _prepare_image_for_profile(image_path, ocr_profile)
    try:
        collected: list[dict[str, Any]] = []
        for paddle_lang in paddle_languages:
            recognized = _recognize_blocks(_get_engine(paddle_lang), prepared.path)
            collected += _scale_blocks(recognized, prepared.scale)
        return _merge_blocks(collected)
    finally:
        # Clean up the enhanced copy even when recognition raises.
        if prepared.should_delete:
            Path(prepared.path).unlink(missing_ok=True)
def _resolve_ocr_profile(profile: str | None) -> str:
explicit = (os.getenv("OPENSCREEN_OCR_PROFILE") or "").strip().lower()
value = explicit or (profile or "").strip().lower()
if value in {"fast", "vietnamese", "hybrid"}:
return value
return "vietnamese"
def _resolve_paddle_languages(language: str | None, profile: str) -> list[str]:
explicit = (os.getenv("PADDLEOCR_LANG") or "").strip().lower()
if explicit:
return explicit
return [explicit]
language_value = (language or "vi,en").lower()
if "vi" in language_value or "latin" in language_value:
has_vietnamese = "vi" in _split_language_tags(language_value)
if profile == "fast":
return [_resolve_primary_paddle_language(language_value, prefer_vietnamese=False)]
if profile == "hybrid":
languages = ["vi"] if has_vietnamese else []
languages.append("latin")
return _dedupe_languages(languages)
return [_resolve_primary_paddle_language(language_value, prefer_vietnamese=True)]
def _split_language_tags(language: str) -> set[str]:
return {part.strip().lower() for part in language.split(",") if part.strip()}
def _dedupe_languages(languages: list[str]) -> list[str]:
seen: set[str] = set()
result: list[str] = []
for language in languages:
if language not in seen:
seen.add(language)
result.append(language)
return result
def _resolve_primary_paddle_language(language_value: str, *, prefer_vietnamese: bool) -> str:
tags = _split_language_tags(language_value)
if prefer_vietnamese and "vi" in tags:
return "vi"
if "latin" in tags or "vi" in tags or "en" in tags:
return "latin"
if "en" in language_value:
return "en"
return language_value.split(",")[0].strip() or "latin"
for tag in tags:
return tag
return "latin"
def _prepare_image_for_profile(image_path: str, profile: str) -> PreparedImage:
    """Produce the image that should be fed to OCR for the given profile.

    For the ``fast`` profile the original file is used as-is. For any other
    profile the image is upscaled, auto-contrasted, contrast-boosted and
    sharpened with Pillow and written to a temporary PNG.

    Enhancement is best effort: if Pillow is unavailable, the image cannot be
    opened, no upscaling is warranted, or any enhancement/saving step fails,
    the original ``image_path`` is returned unchanged.

    Returns:
        A ``PreparedImage`` for either the original path, or the temporary
        enhanced file with its ``scale`` and ``should_delete=True`` so the
        caller can map coordinates back and remove the file.
    """
    if profile == "fast":
        return PreparedImage(image_path)

    try:
        # Imported lazily so the service still works without Pillow installed.
        from PIL import Image, ImageEnhance, ImageOps
    except Exception:
        return PreparedImage(image_path)

    try:
        with Image.open(image_path) as source:
            image = source.convert("RGB")
    except Exception:
        return PreparedImage(image_path)

    scale = _resolve_enhancement_scale(image.width, image.height)
    if scale <= 1:
        return PreparedImage(image_path)

    temp_path = None
    try:
        # Pillow >= 9.1 exposes filters on Image.Resampling; older releases
        # keep them directly on the Image module.
        resampling = getattr(getattr(Image, "Resampling", Image), "LANCZOS")
        target_size = (round(image.width * scale), round(image.height * scale))
        enhanced = image.resize(target_size, resampling)
        enhanced = ImageOps.autocontrast(enhanced)
        enhanced = ImageEnhance.Contrast(enhanced).enhance(1.25)
        enhanced = ImageEnhance.Sharpness(enhanced).enhance(1.35)

        # Only the unique name is needed; close the handle before Pillow
        # writes to the path.
        with tempfile.NamedTemporaryFile(
            prefix="openscreen-ocr-enhanced-", suffix=".png", delete=False
        ) as handle:
            temp_path = handle.name
        enhanced.save(temp_path, format="PNG")
        return PreparedImage(temp_path, scale=scale, should_delete=True)
    except Exception:
        # Any failure while enhancing, creating the temp file or saving falls
        # back to the untouched original; remove a half-written temp file.
        if temp_path is not None:
            Path(temp_path).unlink(missing_ok=True)
        return PreparedImage(image_path)
def _resolve_enhancement_scale(width: int, height: int) -> float:
try:
requested_scale = float(os.getenv("OPENSCREEN_OCR_ENHANCE_SCALE", "2"))
except ValueError:
requested_scale = 2.0
scale = max(1.0, min(3.0, requested_scale))
try:
max_side = int(os.getenv("OPENSCREEN_OCR_ENHANCE_MAX_SIDE", "2400"))
except ValueError:
max_side = 2400
largest_side = max(width, height)
if largest_side <= 0:
return 1.0
return max(1.0, min(scale, max_side / largest_side))
def _scale_blocks(blocks: list[dict[str, Any]], scale: float) -> list[dict[str, Any]]:
if scale <= 1:
return blocks
scaled_blocks: list[dict[str, Any]] = []
for block in blocks:
box = block.get("box")
if not isinstance(box, dict) or not _box_uses_pixels(box):
scaled_blocks.append(block)
continue
scaled_box = {
"x": float(box["x"]) / scale,
"y": float(box["y"]) / scale,
"width": float(box["width"]) / scale,
"height": float(box["height"]) / scale,
}
scaled_blocks.append({**block, "box": scaled_box})
return scaled_blocks
def _box_uses_pixels(box: dict[str, Any]) -> bool:
try:
x = float(box["x"])
y = float(box["y"])
width = float(box["width"])
height = float(box["height"])
except (KeyError, TypeError, ValueError):
return False
return x > 1 or y > 1 or width > 1 or height > 1 or x + width > 1 or y + height > 1
def _merge_blocks(blocks: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Collapse overlapping OCR blocks and return them in reading order.

    Blocks are visited from highest to lowest quality. A block whose box
    overlaps an already kept block with IoU of at least 0.62 is treated as a
    duplicate and only replaces the kept one if its quality is strictly
    higher. Blocks without a dict ``box`` are dropped. The survivors are
    sorted by their box sort key (top-to-bottom, then left-to-right).
    """
    kept: list[dict[str, Any]] = []
    ranked = sorted(blocks, key=_block_quality, reverse=True)

    for candidate in ranked:
        candidate_box = candidate.get("box")
        if not isinstance(candidate_box, dict):
            continue

        # Find the first kept block that this candidate duplicates, if any.
        duplicate_at = None
        for position, survivor in enumerate(kept):
            if _box_iou(candidate_box, survivor.get("box")) >= 0.62:
                duplicate_at = position
                break

        if duplicate_at is None:
            kept.append(candidate)
        elif _block_quality(candidate) > _block_quality(kept[duplicate_at]):
            kept[duplicate_at] = candidate

    kept.sort(key=lambda block: _box_sort_key(block.get("box")))
    return kept
def _block_quality(block: dict[str, Any]) -> float:
    """Score a block for ranking when several recognitions overlap.

    Starts from the block's ``confidence`` (converted by ``_score_to_float``),
    adds 0.08 when the text contains Vietnamese diacritics, and adds a small
    length bonus of 0.002 per character, capped at 0.04, for texts of two or
    more characters.
    """
    text = str(block.get("text") or "")
    quality = _score_to_float(block.get("confidence"))
    if _has_vietnamese_diacritics(text):
        quality = quality + 0.08
    text_length = len(text)
    if text_length > 1:
        quality = quality + min(0.04, text_length * 0.002)
    return quality
def _has_vietnamese_diacritics(text: str) -> bool:
return any(
character
in "ăâđêôơưĂÂĐÊÔƠƯáàảãạắằẳẵặấầẩẫậéèẻẽẹếềểễệíìỉĩịóòỏõọốồổỗộớờởỡợúùủũụứừửữựýỳỷỹỵ"
for character in text
)
def _box_iou(left: Any, right: Any) -> float:
if not isinstance(left, dict) or not isinstance(right, dict):
return 0.0
try:
left_x = float(left["x"])
left_y = float(left["y"])
left_width = float(left["width"])
left_height = float(left["height"])
right_x = float(right["x"])
right_y = float(right["y"])
right_width = float(right["width"])
right_height = float(right["height"])
except (KeyError, TypeError, ValueError):
return 0.0
intersection_left = max(left_x, right_x)
intersection_top = max(left_y, right_y)
intersection_right = min(left_x + left_width, right_x + right_width)
intersection_bottom = min(left_y + left_height, right_y + right_height)
intersection_width = max(0.0, intersection_right - intersection_left)
intersection_height = max(0.0, intersection_bottom - intersection_top)
intersection_area = intersection_width * intersection_height
if intersection_area <= 0:
return 0.0
union_area = left_width * left_height + right_width * right_height - intersection_area
return intersection_area / union_area if union_area > 0 else 0.0
def _box_sort_key(box: Any) -> tuple[float, float]:
if not isinstance(box, dict):
return (0.0, 0.0)
try:
return (float(box["y"]), float(box["x"]))
except (KeyError, TypeError, ValueError):
return (0.0, 0.0)
def _mobile_recognition_model(paddle_lang: str) -> str:
if paddle_lang == "en":
return "en_PP-OCRv5_mobile_rec"
if paddle_lang == "latin":
if paddle_lang in _LATIN_RECOGNITION_LANGS:
return "latin_PP-OCRv5_mobile_rec"
return "PP-OCRv5_mobile_rec"