Release OpenScreen 1.4.4
This commit is contained in:
@@ -6,7 +6,7 @@ OpenScreen calls OCR through a local HTTP service. The default endpoint is:
|
|||||||
http://127.0.0.1:8866/ocr
|
http://127.0.0.1:8866/ocr
|
||||||
```
|
```
|
||||||
|
|
||||||
The app sends either `imageBase64` or `path` and expects OCR blocks:
|
The app sends either `imageBase64` or `path`, plus optional `language` and `profile`, and expects OCR blocks:
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
@@ -38,7 +38,7 @@ If `paddle` is still missing after installing `paddleocr`, install the CPU Paddl
|
|||||||
```powershell
|
```powershell
|
||||||
.\.venv-ocr\Scripts\Activate.ps1
|
.\.venv-ocr\Scripts\Activate.ps1
|
||||||
$env:PADDLEOCR_DEVICE="cpu"
|
$env:PADDLEOCR_DEVICE="cpu"
|
||||||
$env:PADDLEOCR_LANG="latin"
|
$env:OPENSCREEN_OCR_PROFILE="vietnamese"
|
||||||
npm run ocr:paddle
|
npm run ocr:paddle
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -58,7 +58,8 @@ Expected healthy environment:
|
|||||||
"paddleocrInstalled": true,
|
"paddleocrInstalled": true,
|
||||||
"paddleInstalled": true,
|
"paddleInstalled": true,
|
||||||
"engineReady": false,
|
"engineReady": false,
|
||||||
"defaultLanguage": "latin"
|
"defaultLanguage": "vi,en",
|
||||||
|
"defaultProfile": "vietnamese"
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -67,7 +68,10 @@ Expected healthy environment:
|
|||||||
## Configuration
|
## Configuration
|
||||||
|
|
||||||
- `PADDLEOCR_DEVICE`: `cpu`, `gpu:0`, or another PaddleOCR device string.
|
- `PADDLEOCR_DEVICE`: `cpu`, `gpu:0`, or another PaddleOCR device string.
|
||||||
- `PADDLEOCR_LANG`: defaults to `latin`; this is preferred for Vietnamese UI text because it uses a Latin-script recognition model.
|
- `OPENSCREEN_OCR_PROFILE`: `fast`, `vietnamese`, or `hybrid`. The default `vietnamese` profile upscales and sharpens focused UI screenshots before OCR.
|
||||||
|
- `OPENSCREEN_GUIDE_OCR_LANGUAGE`: defaults to `vi,en`.
|
||||||
|
- `PADDLEOCR_LANG`: optional hard override. Leave it unset so that the app's profile and language settings take effect.
|
||||||
- `PADDLEOCR_VERSION`: defaults to `PP-OCRv5`.
|
- `PADDLEOCR_VERSION`: defaults to `PP-OCRv5`.
|
||||||
- `PADDLEOCR_USE_MOBILE`: defaults to `1`; set to `0` to use the default/server models.
|
- `PADDLEOCR_USE_MOBILE`: defaults to `1`; set to `0` to use the default/server models.
|
||||||
|
- `PADDLEOCR_REC_MODEL`: optional recognizer model override. The bundled profile uses `latin_PP-OCRv5_mobile_rec`, which supports Vietnamese Latin-script text.
|
||||||
- `OPENSCREEN_GUIDE_OCR_URL`: OpenScreen OCR endpoint override; defaults to `http://127.0.0.1:8866`.
|
- `OPENSCREEN_GUIDE_OCR_URL`: OpenScreen OCR endpoint override; defaults to `http://127.0.0.1:8866`.
|
||||||
|
|||||||
@@ -0,0 +1,66 @@
|
|||||||
|
import fs from "node:fs/promises";
|
||||||
|
import os from "node:os";
|
||||||
|
import path from "node:path";
|
||||||
|
import { afterEach, beforeEach, describe, expect, it } from "vitest";
|
||||||
|
import { DeepSeekSettingsStore } from "./deepseekSettingsStore";
|
||||||
|
|
||||||
|
// Temp directories handed out by createStore(); drained and removed after every test.
const tempDirs: string[] = [];
// OCR-related environment values captured at module load so each test can put them back.
const originalOcrProfile = process.env.OPENSCREEN_GUIDE_OCR_PROFILE;
const originalOcrLanguage = process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE;

// Every test starts without OCR environment overrides.
beforeEach(() => {
	delete process.env.OPENSCREEN_GUIDE_OCR_PROFILE;
	delete process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE;
});

// Put the captured environment back, then delete whatever temp directories the test created.
afterEach(async () => {
	restoreEnv("OPENSCREEN_GUIDE_OCR_PROFILE", originalOcrProfile);
	restoreEnv("OPENSCREEN_GUIDE_OCR_LANGUAGE", originalOcrLanguage);

	const pendingDirs = tempDirs.splice(0);
	const removals = pendingDirs.map((tempDir) => fs.rm(tempDir, { recursive: true, force: true }));
	await Promise.all(removals);
});
|
||||||
|
|
||||||
|
/**
 * Puts an environment variable back to a previously captured state.
 *
 * @param name - Name of the environment variable to restore.
 * @param value - Captured value; `undefined` means the variable was unset, so it is removed.
 */
function restoreEnv(name: string, value: string | undefined): void {
	if (value !== undefined) {
		process.env[name] = value;
	} else {
		delete process.env[name];
	}
}
|
||||||
|
|
||||||
|
/**
 * Creates a settings store pointed at `guide-ai-settings.json` inside a fresh temp directory.
 *
 * The directory is recorded in `tempDirs` so the test hooks can delete it afterwards.
 */
async function createStore(): Promise<DeepSeekSettingsStore> {
	const prefix = path.join(os.tmpdir(), "openscreen-guide-settings-");
	const workDir = await fs.mkdtemp(prefix);
	tempDirs.push(workDir);

	const settingsFile = path.join(workDir, "guide-ai-settings.json");
	return new DeepSeekSettingsStore(settingsFile);
}
|
||||||
|
|
||||||
|
// Covers the OCR part of the guide settings: built-in defaults and save/read round-trips.
describe("DeepSeekSettingsStore OCR settings", () => {
	it("defaults to the Vietnamese enhanced OCR profile", async () => {
		const store = await createStore();

		// No settings file and no env overrides: the built-in defaults must come back.
		const config = await store.getOcrConfig();
		expect(config).toEqual({ profile: "vietnamese", language: "vi,en" });
	});

	it("persists OCR profile changes alongside DeepSeek settings", async () => {
		const store = await createStore();
		const expectedOcr = { profile: "hybrid", language: "vi,en" };

		const status = await store.save({
			deepseekApiKeyEnvName: "DEEPSEEK_API_KEY",
			baseUrl: "https://api.deepseek.com",
			model: "deepseek-chat",
			ocrProfile: "hybrid",
			ocrLanguage: "vi,en",
		});

		// The returned status may carry extra fields, so only the saved ones are matched.
		expect(status.ocr).toMatchObject(expectedOcr);

		const reloaded = await store.getOcrConfig();
		expect(reloaded).toEqual(expectedOcr);
	});
});
|
||||||
@@ -1,6 +1,10 @@
|
|||||||
import fs from "node:fs/promises";
|
import fs from "node:fs/promises";
|
||||||
import path from "node:path";
|
import path from "node:path";
|
||||||
import type { GuideAiSettings, SaveGuideAiSettingsInput } from "../../../src/guide/contracts";
|
import type {
|
||||||
|
GuideAiSettings,
|
||||||
|
GuideOcrProfile,
|
||||||
|
SaveGuideAiSettingsInput,
|
||||||
|
} from "../../../src/guide/contracts";
|
||||||
|
|
||||||
export interface DeepSeekGuideConfig {
|
export interface DeepSeekGuideConfig {
|
||||||
apiKey?: string;
|
apiKey?: string;
|
||||||
@@ -12,8 +16,22 @@ export interface DeepSeekGuideConfigProvider {
|
|||||||
getDeepSeekConfig(): Promise<DeepSeekGuideConfig>;
|
getDeepSeekConfig(): Promise<DeepSeekGuideConfig>;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Effective OCR settings handed to the guide OCR pipeline. */
export interface GuideOcrConfig {
	/** OCR profile to run: `fast`, `vietnamese`, or `hybrid`. */
	profile: GuideOcrProfile;
	/** Comma-separated OCR language codes, e.g. `vi,en`. */
	language: string;
}
|
||||||
|
|
||||||
|
/** Source of OCR settings, read asynchronously each time it is asked. */
export interface GuideOcrConfigProvider {
	/** Resolves the OCR profile and language list to use. */
	getOcrConfig(): Promise<GuideOcrConfig>;
}
|
||||||
|
|
||||||
interface PersistedGuideAiSettings {
|
interface PersistedGuideAiSettings {
|
||||||
schemaVersion: 1;
|
schemaVersion: 1;
|
||||||
|
ocr?: {
|
||||||
|
profile?: GuideOcrProfile;
|
||||||
|
language?: string;
|
||||||
|
updatedAt?: string;
|
||||||
|
};
|
||||||
deepseek?: {
|
deepseek?: {
|
||||||
apiKeyEnvName?: string;
|
apiKeyEnvName?: string;
|
||||||
baseUrl?: string;
|
baseUrl?: string;
|
||||||
@@ -25,8 +43,10 @@ interface PersistedGuideAiSettings {
|
|||||||
const DEFAULT_DEEPSEEK_API_KEY_ENV_NAME = "DEEPSEEK_API_KEY";
|
const DEFAULT_DEEPSEEK_API_KEY_ENV_NAME = "DEEPSEEK_API_KEY";
|
||||||
const DEFAULT_DEEPSEEK_BASE_URL = "https://api.deepseek.com";
|
const DEFAULT_DEEPSEEK_BASE_URL = "https://api.deepseek.com";
|
||||||
const DEFAULT_DEEPSEEK_MODEL = "deepseek-chat";
|
const DEFAULT_DEEPSEEK_MODEL = "deepseek-chat";
|
||||||
|
const DEFAULT_OCR_PROFILE: GuideOcrProfile = "vietnamese";
|
||||||
|
const DEFAULT_OCR_LANGUAGE = "vi,en";
|
||||||
|
|
||||||
export class DeepSeekSettingsStore implements DeepSeekGuideConfigProvider {
|
export class DeepSeekSettingsStore implements DeepSeekGuideConfigProvider, GuideOcrConfigProvider {
|
||||||
constructor(private readonly filePath: string) {}
|
constructor(private readonly filePath: string) {}
|
||||||
|
|
||||||
async getStatus(): Promise<GuideAiSettings> {
|
async getStatus(): Promise<GuideAiSettings> {
|
||||||
@@ -35,6 +55,13 @@ export class DeepSeekSettingsStore implements DeepSeekGuideConfigProvider {
|
|||||||
const activeApiKey = process.env[apiKeyEnvName];
|
const activeApiKey = process.env[apiKeyEnvName];
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
ocr: {
|
||||||
|
profile: normalizeOcrProfile(raw?.ocr?.profile ?? process.env.OPENSCREEN_GUIDE_OCR_PROFILE),
|
||||||
|
language: normalizeOcrLanguage(
|
||||||
|
raw?.ocr?.language ?? process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE,
|
||||||
|
),
|
||||||
|
updatedAt: raw?.ocr?.updatedAt,
|
||||||
|
},
|
||||||
deepseek: {
|
deepseek: {
|
||||||
hasApiKey: Boolean(activeApiKey),
|
hasApiKey: Boolean(activeApiKey),
|
||||||
apiKeyEnvName,
|
apiKeyEnvName,
|
||||||
@@ -49,7 +76,14 @@ export class DeepSeekSettingsStore implements DeepSeekGuideConfigProvider {
|
|||||||
|
|
||||||
async save(input: SaveGuideAiSettingsInput): Promise<GuideAiSettings> {
|
async save(input: SaveGuideAiSettingsInput): Promise<GuideAiSettings> {
|
||||||
const current = (await this.readSettings()) ?? { schemaVersion: 1 };
|
const current = (await this.readSettings()) ?? { schemaVersion: 1 };
|
||||||
|
const currentOcr = current.ocr ?? {};
|
||||||
const currentDeepSeek = current.deepseek ?? {};
|
const currentDeepSeek = current.deepseek ?? {};
|
||||||
|
const nextOcr = {
|
||||||
|
...currentOcr,
|
||||||
|
profile: normalizeOcrProfile(input.ocrProfile ?? currentOcr.profile),
|
||||||
|
language: normalizeOcrLanguage(input.ocrLanguage ?? currentOcr.language),
|
||||||
|
updatedAt: new Date().toISOString(),
|
||||||
|
};
|
||||||
const nextDeepSeek = {
|
const nextDeepSeek = {
|
||||||
...currentDeepSeek,
|
...currentDeepSeek,
|
||||||
baseUrl: normalizeBaseUrl(input.baseUrl ?? currentDeepSeek.baseUrl),
|
baseUrl: normalizeBaseUrl(input.baseUrl ?? currentDeepSeek.baseUrl),
|
||||||
@@ -65,6 +99,7 @@ export class DeepSeekSettingsStore implements DeepSeekGuideConfigProvider {
|
|||||||
|
|
||||||
await this.writeSettings({
|
await this.writeSettings({
|
||||||
schemaVersion: 1,
|
schemaVersion: 1,
|
||||||
|
ocr: nextOcr,
|
||||||
deepseek: nextDeepSeek,
|
deepseek: nextDeepSeek,
|
||||||
});
|
});
|
||||||
return await this.getStatus();
|
return await this.getStatus();
|
||||||
@@ -80,6 +115,16 @@ export class DeepSeekSettingsStore implements DeepSeekGuideConfigProvider {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Resolves the OCR settings to use.
 *
 * Persisted values are preferred. When a field is absent, the matching
 * `OPENSCREEN_GUIDE_OCR_*` environment variable is used, and the normalizers
 * supply the built-in default for anything missing or unrecognized.
 *
 * NOTE(review): normalizePersistedSettings appears to fill `ocr.profile` and
 * `ocr.language` with defaults whenever a settings file is read, so the
 * environment fallback may only apply when no settings file exists — confirm.
 */
async getOcrConfig(): Promise<GuideOcrConfig> {
	const persistedOcr = (await this.readSettings())?.ocr;
	const profile = normalizeOcrProfile(
		persistedOcr?.profile ?? process.env.OPENSCREEN_GUIDE_OCR_PROFILE,
	);
	const language = normalizeOcrLanguage(
		persistedOcr?.language ?? process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE,
	);
	return { profile, language };
}
|
||||||
|
|
||||||
private async readSettings(): Promise<PersistedGuideAiSettings | null> {
|
private async readSettings(): Promise<PersistedGuideAiSettings | null> {
|
||||||
try {
|
try {
|
||||||
const content = await fs.readFile(this.filePath, "utf-8");
|
const content = await fs.readFile(this.filePath, "utf-8");
|
||||||
@@ -120,6 +165,11 @@ function normalizePersistedSettings(input: unknown): PersistedGuideAiSettings |
|
|||||||
}
|
}
|
||||||
return {
|
return {
|
||||||
schemaVersion: 1,
|
schemaVersion: 1,
|
||||||
|
ocr: {
|
||||||
|
profile: normalizeOcrProfile(raw.ocr?.profile),
|
||||||
|
language: normalizeOcrLanguage(raw.ocr?.language),
|
||||||
|
updatedAt: raw.ocr?.updatedAt,
|
||||||
|
},
|
||||||
deepseek: {
|
deepseek: {
|
||||||
apiKeyEnvName: normalizeEnvName(raw.deepseek?.apiKeyEnvName),
|
apiKeyEnvName: normalizeEnvName(raw.deepseek?.apiKeyEnvName),
|
||||||
baseUrl: raw.deepseek?.baseUrl,
|
baseUrl: raw.deepseek?.baseUrl,
|
||||||
@@ -155,3 +205,19 @@ function normalizeBaseUrl(value: string | undefined): string {
|
|||||||
function normalizeModel(value: string | undefined): string {
|
function normalizeModel(value: string | undefined): string {
|
||||||
return value?.trim() || DEFAULT_DEEPSEEK_MODEL;
|
return value?.trim() || DEFAULT_DEEPSEEK_MODEL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Coerces an arbitrary profile value into a supported OCR profile.
 *
 * Matching ignores surrounding whitespace and letter case, mirroring how OCR
 * languages are normalized, so an environment value such as `" Hybrid "` is
 * honored instead of silently falling back to the default.
 *
 * @param value - Candidate profile from persisted settings, user input, or the environment.
 * @returns The matching profile, or `DEFAULT_OCR_PROFILE` when the value is missing or unknown.
 */
function normalizeOcrProfile(value: string | undefined): GuideOcrProfile {
	// Persisted settings originate from parsed JSON, so the runtime value may not be a string.
	const candidate = typeof value === "string" ? value.trim().toLowerCase() : undefined;
	if (candidate === "fast" || candidate === "vietnamese" || candidate === "hybrid") {
		return candidate;
	}
	return DEFAULT_OCR_PROFILE;
}
|
||||||
|
|
||||||
|
/**
 * Normalizes a comma-separated OCR language list.
 *
 * Each code is trimmed and lowercased, empty entries are dropped, and repeated
 * codes are collapsed while keeping first-seen order.
 *
 * @param value - Raw language list from persisted settings, user input, or the environment.
 * @returns The cleaned list joined with commas, or `DEFAULT_OCR_LANGUAGE` when nothing usable remains.
 */
function normalizeOcrLanguage(value: string | undefined): string {
	// Persisted settings originate from parsed JSON, so guard against non-string values
	// instead of letting `.split` throw.
	if (typeof value !== "string") {
		return DEFAULT_OCR_LANGUAGE;
	}
	const codes = value
		.split(",")
		.map((part) => part.trim().toLowerCase())
		.filter(Boolean);
	// A Set preserves insertion order, so "vi,VI,en" becomes "vi,en".
	return [...new Set(codes)].join(",") || DEFAULT_OCR_LANGUAGE;
}
|
||||||
|
|||||||
@@ -34,7 +34,10 @@ import {
|
|||||||
DeepSeekGuideClientError,
|
DeepSeekGuideClientError,
|
||||||
type GuideDraftClient,
|
type GuideDraftClient,
|
||||||
} from "./ai/deepseekGuideClient";
|
} from "./ai/deepseekGuideClient";
|
||||||
import type { DeepSeekGuideConfigProvider } from "./ai/deepseekSettingsStore";
|
import type {
|
||||||
|
DeepSeekGuideConfigProvider,
|
||||||
|
GuideOcrConfigProvider,
|
||||||
|
} from "./ai/deepseekSettingsStore";
|
||||||
import { type GuidePaths, normalizeGuideRecordingId, resolveGuidePaths } from "./guidePaths";
|
import { type GuidePaths, normalizeGuideRecordingId, resolveGuidePaths } from "./guidePaths";
|
||||||
import { createFocusedOcrSnapshot, remapFocusedOcrBlocks } from "./ocr/focusedOcrSnapshot";
|
import { createFocusedOcrSnapshot, remapFocusedOcrBlocks } from "./ocr/focusedOcrSnapshot";
|
||||||
import { DefaultGuideOcrClient, type GuideOcrClient } from "./ocr/paddleOcrClient";
|
import { DefaultGuideOcrClient, type GuideOcrClient } from "./ocr/paddleOcrClient";
|
||||||
@@ -70,6 +73,7 @@ export interface GuideStoreDependencies {
|
|||||||
ocrClient?: GuideOcrClient;
|
ocrClient?: GuideOcrClient;
|
||||||
draftClient?: GuideDraftClient;
|
draftClient?: GuideDraftClient;
|
||||||
deepSeekConfigProvider?: DeepSeekGuideConfigProvider;
|
deepSeekConfigProvider?: DeepSeekGuideConfigProvider;
|
||||||
|
ocrConfigProvider?: GuideOcrConfigProvider;
|
||||||
focusOcrSnapshots?: boolean;
|
focusOcrSnapshots?: boolean;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -255,7 +259,9 @@ export class GuideStore {
|
|||||||
throw new GuideStoreError("guide-invalid-input", "No guide snapshots are available for OCR.");
|
throw new GuideStoreError("guide-invalid-input", "No guide snapshots are available for OCR.");
|
||||||
}
|
}
|
||||||
|
|
||||||
const ocrClient = this.dependencies.ocrClient ?? new DefaultGuideOcrClient();
|
const ocrClient =
|
||||||
|
this.dependencies.ocrClient ??
|
||||||
|
DefaultGuideOcrClient.fromConfig(await this.dependencies.ocrConfigProvider?.getOcrConfig());
|
||||||
const shouldFocusOcrSnapshots =
|
const shouldFocusOcrSnapshots =
|
||||||
this.dependencies.focusOcrSnapshots ?? this.dependencies.ocrClient === undefined;
|
this.dependencies.focusOcrSnapshots ?? this.dependencies.ocrClient === undefined;
|
||||||
const eventsById = new Map(session.events.map((event) => [event.id, event]));
|
const eventsById = new Map(session.events.map((event) => [event.id, event]));
|
||||||
|
|||||||
@@ -156,8 +156,10 @@ function startOcrServiceProcess(
|
|||||||
OPENSCREEN_OCR_PORT: DEFAULT_OCR_PORT,
|
OPENSCREEN_OCR_PORT: DEFAULT_OCR_PORT,
|
||||||
PADDLEOCR_DEVICE: process.env.PADDLEOCR_DEVICE ?? "cpu",
|
PADDLEOCR_DEVICE: process.env.PADDLEOCR_DEVICE ?? "cpu",
|
||||||
PADDLEOCR_ENABLE_MKLDNN: process.env.PADDLEOCR_ENABLE_MKLDNN ?? "0",
|
PADDLEOCR_ENABLE_MKLDNN: process.env.PADDLEOCR_ENABLE_MKLDNN ?? "0",
|
||||||
PADDLEOCR_LANG: process.env.PADDLEOCR_LANG ?? "latin",
|
PADDLEOCR_LANG: process.env.PADDLEOCR_LANG ?? "",
|
||||||
PADDLEOCR_USE_MOBILE: process.env.PADDLEOCR_USE_MOBILE ?? "1",
|
PADDLEOCR_USE_MOBILE: process.env.PADDLEOCR_USE_MOBILE ?? "1",
|
||||||
|
OPENSCREEN_OCR_PROFILE:
|
||||||
|
process.env.OPENSCREEN_OCR_PROFILE ?? process.env.OPENSCREEN_GUIDE_OCR_PROFILE ?? "",
|
||||||
PADDLE_PDX_ENABLE_MKLDNN_BYDEFAULT: process.env.PADDLE_PDX_ENABLE_MKLDNN_BYDEFAULT ?? "False",
|
PADDLE_PDX_ENABLE_MKLDNN_BYDEFAULT: process.env.PADDLE_PDX_ENABLE_MKLDNN_BYDEFAULT ?? "False",
|
||||||
PADDLE_PDX_CACHE_HOME: process.env.PADDLE_PDX_CACHE_HOME ?? runtimePaths.paddlexCachePath,
|
PADDLE_PDX_CACHE_HOME: process.env.PADDLE_PDX_CACHE_HOME ?? runtimePaths.paddlexCachePath,
|
||||||
PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK:
|
PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK:
|
||||||
|
|||||||
@@ -1,8 +1,12 @@
|
|||||||
import { describe, expect, it } from "vitest";
|
import fs from "node:fs/promises";
|
||||||
|
import os from "node:os";
|
||||||
|
import path from "node:path";
|
||||||
|
import { afterEach, describe, expect, it, vi } from "vitest";
|
||||||
import type { GuideSnapshot, OcrBlock } from "../../../src/guide/contracts";
|
import type { GuideSnapshot, OcrBlock } from "../../../src/guide/contracts";
|
||||||
import {
|
import {
|
||||||
DefaultGuideOcrClient,
|
DefaultGuideOcrClient,
|
||||||
normalizeOcrResponse,
|
normalizeOcrResponse,
|
||||||
|
PaddleOcrHttpClient,
|
||||||
parseWindowsOcrPayload,
|
parseWindowsOcrPayload,
|
||||||
} from "./paddleOcrClient";
|
} from "./paddleOcrClient";
|
||||||
|
|
||||||
@@ -16,6 +20,10 @@ const snapshot: GuideSnapshot = {
|
|||||||
height: 800,
|
height: 800,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Undo any vi.stubGlobal() replacement (such as `fetch`) once each test finishes.
afterEach(() => {
	vi.unstubAllGlobals();
});
|
||||||
|
|
||||||
describe("normalizeOcrResponse", () => {
|
describe("normalizeOcrResponse", () => {
|
||||||
it("normalizes pixel boxes into guide OCR blocks", () => {
|
it("normalizes pixel boxes into guide OCR blocks", () => {
|
||||||
const blocks = normalizeOcrResponse(
|
const blocks = normalizeOcrResponse(
|
||||||
@@ -67,6 +75,35 @@ describe("normalizeOcrResponse", () => {
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Verifies the JSON request body PaddleOcrHttpClient sends to the OCR HTTP endpoint.
describe("PaddleOcrHttpClient", () => {
	it("sends the selected OCR profile to the local service", async () => {
		const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "openscreen-ocr-client-"));
		// try/finally so the temp directory is removed even when recognize() or the
		// assertion throws; previously a failing run left it behind.
		try {
			const imagePath = path.join(tempDir, "step.png");
			// Placeholder image file holding only the first four PNG signature bytes.
			await fs.writeFile(imagePath, Buffer.from([137, 80, 78, 71]));

			// Capture every request body handed to the stubbed global fetch.
			const requests: unknown[] = [];
			vi.stubGlobal(
				"fetch",
				vi.fn(async (_url: string, init?: RequestInit) => {
					requests.push(JSON.parse(String(init?.body ?? "{}")));
					return new Response(JSON.stringify({ blocks: [] }), {
						status: 200,
						headers: { "content-type": "application/json" },
					});
				}),
			);

			const client = new PaddleOcrHttpClient("https://ocr.example.test", "vi,en", "hybrid");
			await client.recognize({ ...snapshot, path: imagePath });

			expect(requests[0]).toMatchObject({
				language: "vi,en",
				profile: "hybrid",
				path: imagePath,
			});
		} finally {
			await fs.rm(tempDir, { recursive: true, force: true });
		}
	});
});
|
||||||
|
|
||||||
describe("DefaultGuideOcrClient", () => {
|
describe("DefaultGuideOcrClient", () => {
|
||||||
it("falls back when the HTTP OCR service is unavailable", async () => {
|
it("falls back when the HTTP OCR service is unavailable", async () => {
|
||||||
const fallbackBlock: OcrBlock = {
|
const fallbackBlock: OcrBlock = {
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import { execFile } from "node:child_process";
|
import { execFile } from "node:child_process";
|
||||||
import fs from "node:fs/promises";
|
import fs from "node:fs/promises";
|
||||||
import { promisify } from "node:util";
|
import { promisify } from "node:util";
|
||||||
import type { GuideSnapshot, OcrBlock } from "../../../src/guide/contracts";
|
import type { GuideOcrProfile, GuideSnapshot, OcrBlock } from "../../../src/guide/contracts";
|
||||||
import { ensureBundledOcrServiceRunning } from "./bundledOcrService";
|
import { ensureBundledOcrServiceRunning } from "./bundledOcrService";
|
||||||
|
|
||||||
const execFileAsync = promisify(execFile);
|
const execFileAsync = promisify(execFile);
|
||||||
@@ -10,6 +10,11 @@ export interface GuideOcrClient {
|
|||||||
recognize(snapshot: GuideSnapshot): Promise<OcrBlock[]>;
|
recognize(snapshot: GuideSnapshot): Promise<OcrBlock[]>;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Settings used to construct the guide OCR clients. */
export interface GuideOcrClientConfig {
	/** OCR profile sent with each HTTP OCR request: `fast`, `vietnamese`, or `hybrid`. */
	profile: GuideOcrProfile;
	/** Comma-separated OCR language codes, e.g. `vi,en`. */
	language: string;
}
|
||||||
|
|
||||||
interface PaddleOcrResponseBlock {
|
interface PaddleOcrResponseBlock {
|
||||||
text?: unknown;
|
text?: unknown;
|
||||||
confidence?: unknown;
|
confidence?: unknown;
|
||||||
@@ -21,7 +26,8 @@ interface PaddleOcrResponseBlock {
|
|||||||
export class PaddleOcrHttpClient implements GuideOcrClient {
|
export class PaddleOcrHttpClient implements GuideOcrClient {
|
||||||
constructor(
|
constructor(
|
||||||
private readonly baseUrl = process.env.OPENSCREEN_GUIDE_OCR_URL ?? "http://127.0.0.1:8866",
|
private readonly baseUrl = process.env.OPENSCREEN_GUIDE_OCR_URL ?? "http://127.0.0.1:8866",
|
||||||
private readonly language = process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE ?? "vi,en",
|
private readonly language = normalizeOcrLanguage(process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE),
|
||||||
|
private readonly profile = normalizeOcrProfile(process.env.OPENSCREEN_GUIDE_OCR_PROFILE),
|
||||||
) {}
|
) {}
|
||||||
|
|
||||||
async recognize(snapshot: GuideSnapshot): Promise<OcrBlock[]> {
|
async recognize(snapshot: GuideSnapshot): Promise<OcrBlock[]> {
|
||||||
@@ -36,6 +42,7 @@ export class PaddleOcrHttpClient implements GuideOcrClient {
|
|||||||
imageBase64,
|
imageBase64,
|
||||||
path: snapshot.path,
|
path: snapshot.path,
|
||||||
language: this.language,
|
language: this.language,
|
||||||
|
profile: this.profile,
|
||||||
}),
|
}),
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
@@ -54,7 +61,9 @@ export class PaddleOcrHttpClient implements GuideOcrClient {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export class WindowsOcrClient implements GuideOcrClient {
|
export class WindowsOcrClient implements GuideOcrClient {
|
||||||
constructor(private readonly language = process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE ?? "vi,en") {}
|
constructor(
|
||||||
|
private readonly language = normalizeOcrLanguage(process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE),
|
||||||
|
) {}
|
||||||
|
|
||||||
async recognize(snapshot: GuideSnapshot): Promise<OcrBlock[]> {
|
async recognize(snapshot: GuideSnapshot): Promise<OcrBlock[]> {
|
||||||
if (process.platform !== "win32") {
|
if (process.platform !== "win32") {
|
||||||
@@ -96,6 +105,14 @@ export class WindowsOcrClient implements GuideOcrClient {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export class DefaultGuideOcrClient implements GuideOcrClient {
|
export class DefaultGuideOcrClient implements GuideOcrClient {
|
||||||
|
/**
 * Builds a client whose HTTP and Windows OCR backends share one normalized configuration.
 *
 * @param config - Optional partial settings; missing fields are filled in by
 *   `normalizeOcrClientConfig`.
 * @returns A `DefaultGuideOcrClient` wired with freshly constructed backends.
 */
static fromConfig(config?: Partial<GuideOcrClientConfig>): DefaultGuideOcrClient {
	const { language, profile } = normalizeOcrClientConfig(config);
	// Passing `undefined` keeps PaddleOcrHttpClient's default base URL parameter.
	const httpClient = new PaddleOcrHttpClient(undefined, language, profile);
	const windowsClient = new WindowsOcrClient(language);
	return new DefaultGuideOcrClient(httpClient, windowsClient);
}
|
||||||
|
|
||||||
constructor(
|
constructor(
|
||||||
private readonly httpClient = new PaddleOcrHttpClient(),
|
private readonly httpClient = new PaddleOcrHttpClient(),
|
||||||
private readonly windowsClient = new WindowsOcrClient(),
|
private readonly windowsClient = new WindowsOcrClient(),
|
||||||
@@ -119,6 +136,31 @@ export class DefaultGuideOcrClient implements GuideOcrClient {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Fills in and sanitizes OCR client settings.
 *
 * Explicit config fields win; absent fields fall back to the
 * `OPENSCREEN_GUIDE_OCR_PROFILE` / `OPENSCREEN_GUIDE_OCR_LANGUAGE` environment
 * variables, and both values then pass through the normalizers.
 *
 * @param config - Partial settings, or `undefined` to rely on the environment and defaults.
 * @returns A complete, normalized configuration.
 */
function normalizeOcrClientConfig(
	config: Partial<GuideOcrClientConfig> | undefined,
): GuideOcrClientConfig {
	const rawProfile = config?.profile ?? process.env.OPENSCREEN_GUIDE_OCR_PROFILE;
	const rawLanguage = config?.language ?? process.env.OPENSCREEN_GUIDE_OCR_LANGUAGE;
	return {
		profile: normalizeOcrProfile(rawProfile),
		language: normalizeOcrLanguage(rawLanguage),
	};
}
|
||||||
|
|
||||||
|
/**
 * Coerces an arbitrary profile value into a supported OCR profile.
 *
 * Matching ignores surrounding whitespace and letter case, mirroring how OCR
 * languages are normalized, so an environment value such as `" Hybrid "` is
 * honored instead of silently falling back to the default.
 *
 * @param value - Candidate profile from explicit config or the environment.
 * @returns The matching profile, or `"vietnamese"` when the value is missing or unknown.
 */
function normalizeOcrProfile(value: string | undefined): GuideOcrProfile {
	const candidate = value?.trim().toLowerCase();
	if (candidate === "fast" || candidate === "vietnamese" || candidate === "hybrid") {
		return candidate;
	}
	return "vietnamese";
}
|
||||||
|
|
||||||
|
/**
 * Normalizes a comma-separated OCR language list.
 *
 * Each code is trimmed and lowercased, empty entries are dropped, and repeated
 * codes are collapsed while keeping first-seen order, so the OCR backends never
 * receive the same language twice.
 *
 * @param value - Raw language list from explicit config or the environment.
 * @returns The cleaned list joined with commas, or `"vi,en"` when nothing usable remains.
 */
function normalizeOcrLanguage(value: string | undefined): string {
	const codes = (value ?? "")
		.split(",")
		.map((part) => part.trim().toLowerCase())
		.filter(Boolean);
	// A Set preserves insertion order, so "vi,VI,en" becomes "vi,en".
	return [...new Set(codes)].join(",") || "vi,en";
}
|
||||||
|
|
||||||
export function parseWindowsOcrPayload(stdout: string): unknown {
|
export function parseWindowsOcrPayload(stdout: string): unknown {
|
||||||
const normalized = stdout.replace(/^\uFEFF/, "").trim();
|
const normalized = stdout.replace(/^\uFEFF/, "").trim();
|
||||||
try {
|
try {
|
||||||
|
|||||||
@@ -1732,7 +1732,7 @@ export function registerIpcHandlers(
|
|||||||
const sources = await desktopCapturer.getSources(opts);
|
const sources = await desktopCapturer.getSources(opts);
|
||||||
lastEnumeratedSources = new Map(sources.map((source) => [source.id, source]));
|
lastEnumeratedSources = new Map(sources.map((source) => [source.id, source]));
|
||||||
let screenSourceIndex = 0;
|
let screenSourceIndex = 0;
|
||||||
return sources.map((source) => {
|
const processedSources = sources.map((source) => {
|
||||||
const isScreenSource = source.id.startsWith("screen:");
|
const isScreenSource = source.id.startsWith("screen:");
|
||||||
const sourceIndex = isScreenSource
|
const sourceIndex = isScreenSource
|
||||||
? (parseDesktopCapturerScreenIndex(source.id) ?? screenSourceIndex)
|
? (parseDesktopCapturerScreenIndex(source.id) ?? screenSourceIndex)
|
||||||
@@ -1760,6 +1760,43 @@ export function registerIpcHandlers(
|
|||||||
bounds,
|
bounds,
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
|
const screenDisplays = screen.getAllDisplays();
|
||||||
|
const mappedDisplayIds = new Set(
|
||||||
|
processedSources
|
||||||
|
.filter((source) => source.id.startsWith("screen:") && typeof source.displayId === "number")
|
||||||
|
.map((source) => source.displayId),
|
||||||
|
);
|
||||||
|
const fallbackScreenSources = screenDisplays
|
||||||
|
.map((display, displayIndex) => ({ display, displayIndex }))
|
||||||
|
.filter(({ display }) => !mappedDisplayIds.has(display.id))
|
||||||
|
.map(({ display, displayIndex }) => {
|
||||||
|
const bounds = toSourceBounds(display.bounds);
|
||||||
|
return {
|
||||||
|
id: `screen:${displayIndex}:fallback:${display.id}`,
|
||||||
|
name: `Screen ${displayIndex + 1}`,
|
||||||
|
display_id: String(display.id),
|
||||||
|
thumbnail: null,
|
||||||
|
appIcon: null,
|
||||||
|
displayId: display.id,
|
||||||
|
displayIndex,
|
||||||
|
screenIndex: displayIndex,
|
||||||
|
displayLabel: `Display ${displayIndex + 1} - ${bounds.width}x${bounds.height} @ ${bounds.x},${bounds.y}`,
|
||||||
|
bounds,
|
||||||
|
};
|
||||||
|
});
|
||||||
|
if (fallbackScreenSources.length > 0) {
|
||||||
|
console.warn("[desktop-capturer] added fallback display sources", {
|
||||||
|
capturerScreens: processedSources.filter((source) => source.id.startsWith("screen:"))
|
||||||
|
.length,
|
||||||
|
electronDisplays: screenDisplays.length,
|
||||||
|
fallbackScreens: fallbackScreenSources.map((source) => ({
|
||||||
|
id: source.id,
|
||||||
|
displayId: source.displayId,
|
||||||
|
bounds: source.bounds,
|
||||||
|
})),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
return [...processedSources, ...fallbackScreenSources];
|
||||||
});
|
});
|
||||||
|
|
||||||
ipcMain.handle("select-source", async (_, source: SelectedSource) => {
|
ipcMain.handle("select-source", async (_, source: SelectedSource) => {
|
||||||
@@ -2637,6 +2674,7 @@ export function registerIpcHandlers(
|
|||||||
);
|
);
|
||||||
const guideStore = new GuideStore(RECORDINGS_DIR, {
|
const guideStore = new GuideStore(RECORDINGS_DIR, {
|
||||||
deepSeekConfigProvider: guideAiSettingsStore,
|
deepSeekConfigProvider: guideAiSettingsStore,
|
||||||
|
ocrConfigProvider: guideAiSettingsStore,
|
||||||
});
|
});
|
||||||
registerGuideMarkerHotkey(guideStore);
|
registerGuideMarkerHotkey(guideStore);
|
||||||
registerGuideIpcHandlers(ipcMain, guideStore, guideAiSettingsStore, {
|
registerGuideIpcHandlers(ipcMain, guideStore, guideAiSettingsStore, {
|
||||||
|
|||||||
Generated
+2
-2
@@ -1,12 +1,12 @@
|
|||||||
{
|
{
|
||||||
"name": "openscreen",
|
"name": "openscreen",
|
||||||
"version": "1.4.2",
|
"version": "1.4.4",
|
||||||
"lockfileVersion": 3,
|
"lockfileVersion": 3,
|
||||||
"requires": true,
|
"requires": true,
|
||||||
"packages": {
|
"packages": {
|
||||||
"": {
|
"": {
|
||||||
"name": "openscreen",
|
"name": "openscreen",
|
||||||
"version": "1.4.2",
|
"version": "1.4.4",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@fix-webm-duration/fix": "^1.0.1",
|
"@fix-webm-duration/fix": "^1.0.1",
|
||||||
"@pixi/filter-drop-shadow": "^5.2.0",
|
"@pixi/filter-drop-shadow": "^5.2.0",
|
||||||
|
|||||||
+1
-1
@@ -1,7 +1,7 @@
|
|||||||
{
|
{
|
||||||
"name": "openscreen",
|
"name": "openscreen",
|
||||||
"private": true,
|
"private": true,
|
||||||
"version": "1.4.2",
|
"version": "1.4.4",
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"packageManager": "npm@10.9.4",
|
"packageManager": "npm@10.9.4",
|
||||||
"engines": {
|
"engines": {
|
||||||
|
|||||||
@@ -65,7 +65,13 @@ export function SourceSelector() {
|
|||||||
fetchSources();
|
fetchSources();
|
||||||
}, []);
|
}, []);
|
||||||
|
|
||||||
const screenSources = sources.filter((s) => s.id.startsWith("screen:"));
|
const screenSources = sources
|
||||||
|
.filter((s) => s.id.startsWith("screen:"))
|
||||||
|
.sort(
|
||||||
|
(left, right) =>
|
||||||
|
(left.displayIndex ?? left.screenIndex ?? Number.MAX_SAFE_INTEGER) -
|
||||||
|
(right.displayIndex ?? right.screenIndex ?? Number.MAX_SAFE_INTEGER),
|
||||||
|
);
|
||||||
const windowSources = sources.filter((s) => s.id.startsWith("window:"));
|
const windowSources = sources.filter((s) => s.id.startsWith("window:"));
|
||||||
|
|
||||||
const handleSourceSelect = (source: DesktopSource) => setSelectedSource(source);
|
const handleSourceSelect = (source: DesktopSource) => setSelectedSource(source);
|
||||||
@@ -96,11 +102,17 @@ export function SourceSelector() {
|
|||||||
onClick={() => handleSourceSelect(source)}
|
onClick={() => handleSourceSelect(source)}
|
||||||
>
|
>
|
||||||
<div className="relative mb-1.5 overflow-hidden rounded-lg border border-white/[0.06] bg-black/30">
|
<div className="relative mb-1.5 overflow-hidden rounded-lg border border-white/[0.06] bg-black/30">
|
||||||
|
{source.thumbnail ? (
|
||||||
<img
|
<img
|
||||||
src={source.thumbnail || ""}
|
src={source.thumbnail}
|
||||||
alt={source.name}
|
alt={source.name}
|
||||||
className="w-full aspect-video object-cover"
|
className="w-full aspect-video object-cover"
|
||||||
/>
|
/>
|
||||||
|
) : (
|
||||||
|
<div className="flex aspect-video w-full items-center justify-center bg-zinc-950 text-center text-[11px] font-medium text-zinc-400">
|
||||||
|
{source.displayLabel ?? source.name}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
{isSelected && (
|
{isSelected && (
|
||||||
<div className="absolute right-1.5 top-1.5">
|
<div className="absolute right-1.5 top-1.5">
|
||||||
<div className={styles.checkBadge}>
|
<div className={styles.checkBadge}>
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ import type {
|
|||||||
GuideAiProvider,
|
GuideAiProvider,
|
||||||
GuideAiSettings,
|
GuideAiSettings,
|
||||||
GuideLanguage,
|
GuideLanguage,
|
||||||
|
GuideOcrProfile,
|
||||||
GuideSession,
|
GuideSession,
|
||||||
} from "@/guide/contracts";
|
} from "@/guide/contracts";
|
||||||
import { captureGuideSnapshots } from "@/guide/snapshot/extractGuideSnapshots";
|
import { captureGuideSnapshots } from "@/guide/snapshot/extractGuideSnapshots";
|
||||||
@@ -42,13 +43,19 @@ const COPY = {
|
|||||||
captureStep: "Capture step",
|
captureStep: "Capture step",
|
||||||
captureLabel: "Manual capture",
|
captureLabel: "Manual capture",
|
||||||
settings: "Settings",
|
settings: "Settings",
|
||||||
|
guideSettings: "Guide settings",
|
||||||
apiKey: "API key env",
|
apiKey: "API key env",
|
||||||
apiKeyPlaceholder: "DEEPSEEK_API_KEY",
|
apiKeyPlaceholder: "DEEPSEEK_API_KEY",
|
||||||
baseUrl: "Base URL",
|
baseUrl: "Base URL",
|
||||||
model: "Model",
|
model: "Model",
|
||||||
|
ocrProfile: "OCR profile",
|
||||||
|
ocrLanguage: "OCR languages",
|
||||||
|
ocrFast: "Fast Latin",
|
||||||
|
ocrVietnamese: "Vietnamese Enhanced",
|
||||||
|
ocrHybrid: "Hybrid Vi + Latin",
|
||||||
saveSettings: "Save",
|
saveSettings: "Save",
|
||||||
clearKey: "Reset env",
|
clearKey: "Reset env",
|
||||||
keySaved: "DeepSeek settings saved.",
|
settingsSaved: "Guide settings saved.",
|
||||||
keyMissing: "Set a DeepSeek API key environment variable before generating with DeepSeek.",
|
keyMissing: "Set a DeepSeek API key environment variable before generating with DeepSeek.",
|
||||||
keyConfigured: "Env ready",
|
keyConfigured: "Env ready",
|
||||||
keyNotConfigured: "Env value missing",
|
keyNotConfigured: "Env value missing",
|
||||||
@@ -78,13 +85,19 @@ const COPY = {
|
|||||||
captureStep: "Chụp bước",
|
captureStep: "Chụp bước",
|
||||||
captureLabel: "Chụp thủ công",
|
captureLabel: "Chụp thủ công",
|
||||||
settings: "Cài đặt",
|
settings: "Cài đặt",
|
||||||
|
guideSettings: "Guide settings",
|
||||||
apiKey: "API key env",
|
apiKey: "API key env",
|
||||||
apiKeyPlaceholder: "DEEPSEEK_API_KEY",
|
apiKeyPlaceholder: "DEEPSEEK_API_KEY",
|
||||||
baseUrl: "Base URL",
|
baseUrl: "Base URL",
|
||||||
model: "Model",
|
model: "Model",
|
||||||
|
ocrProfile: "OCR profile",
|
||||||
|
ocrLanguage: "OCR languages",
|
||||||
|
ocrFast: "Fast Latin",
|
||||||
|
ocrVietnamese: "Vietnamese Enhanced",
|
||||||
|
ocrHybrid: "Hybrid Vi + Latin",
|
||||||
saveSettings: "Lưu",
|
saveSettings: "Lưu",
|
||||||
clearKey: "Reset env",
|
clearKey: "Reset env",
|
||||||
keySaved: "Đã lưu cài đặt DeepSeek.",
|
settingsSaved: "Da luu cai dat guide.",
|
||||||
keyMissing: "Hãy set biến môi trường DeepSeek API key trước khi tạo draft bằng DeepSeek.",
|
keyMissing: "Hãy set biến môi trường DeepSeek API key trước khi tạo draft bằng DeepSeek.",
|
||||||
keyConfigured: "Env ready",
|
keyConfigured: "Env ready",
|
||||||
keyNotConfigured: "Chưa thấy giá trị env",
|
keyNotConfigured: "Chưa thấy giá trị env",
|
||||||
@@ -108,6 +121,8 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
|
|||||||
const [deepSeekApiKeyEnvName, setDeepSeekApiKeyEnvName] = useState("DEEPSEEK_API_KEY");
|
const [deepSeekApiKeyEnvName, setDeepSeekApiKeyEnvName] = useState("DEEPSEEK_API_KEY");
|
||||||
const [deepSeekBaseUrl, setDeepSeekBaseUrl] = useState("https://api.deepseek.com");
|
const [deepSeekBaseUrl, setDeepSeekBaseUrl] = useState("https://api.deepseek.com");
|
||||||
const [deepSeekModel, setDeepSeekModel] = useState("deepseek-chat");
|
const [deepSeekModel, setDeepSeekModel] = useState("deepseek-chat");
|
||||||
|
const [ocrProfile, setOcrProfile] = useState<GuideOcrProfile>("vietnamese");
|
||||||
|
const [ocrLanguage, setOcrLanguage] = useState("vi,en");
|
||||||
const [message, setMessage] = useState<string | null>(null);
|
const [message, setMessage] = useState<string | null>(null);
|
||||||
|
|
||||||
const isBusy = busyAction !== null;
|
const isBusy = busyAction !== null;
|
||||||
@@ -138,6 +153,8 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
|
|||||||
setDeepSeekBaseUrl(result.data.deepseek.baseUrl);
|
setDeepSeekBaseUrl(result.data.deepseek.baseUrl);
|
||||||
setDeepSeekModel(result.data.deepseek.model);
|
setDeepSeekModel(result.data.deepseek.model);
|
||||||
setDeepSeekApiKeyEnvName(result.data.deepseek.apiKeyEnvName);
|
setDeepSeekApiKeyEnvName(result.data.deepseek.apiKeyEnvName);
|
||||||
|
setOcrProfile(result.data.ocr.profile);
|
||||||
|
setOcrLanguage(result.data.ocr.language);
|
||||||
}, []);
|
}, []);
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
@@ -269,6 +286,8 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
|
|||||||
deepseekApiKeyEnvName: deepSeekApiKeyEnvName,
|
deepseekApiKeyEnvName: deepSeekApiKeyEnvName,
|
||||||
baseUrl: deepSeekBaseUrl,
|
baseUrl: deepSeekBaseUrl,
|
||||||
model: deepSeekModel,
|
model: deepSeekModel,
|
||||||
|
ocrProfile,
|
||||||
|
ocrLanguage,
|
||||||
});
|
});
|
||||||
if (!result.success) {
|
if (!result.success) {
|
||||||
throw new Error(result.error);
|
throw new Error(result.error);
|
||||||
@@ -277,7 +296,9 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
|
|||||||
setDeepSeekApiKeyEnvName(result.data.deepseek.apiKeyEnvName);
|
setDeepSeekApiKeyEnvName(result.data.deepseek.apiKeyEnvName);
|
||||||
setDeepSeekBaseUrl(result.data.deepseek.baseUrl);
|
setDeepSeekBaseUrl(result.data.deepseek.baseUrl);
|
||||||
setDeepSeekModel(result.data.deepseek.model);
|
setDeepSeekModel(result.data.deepseek.model);
|
||||||
toast.success(copy.keySaved);
|
setOcrProfile(result.data.ocr.profile);
|
||||||
|
setOcrLanguage(result.data.ocr.language);
|
||||||
|
toast.success(copy.settingsSaved);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
const text = error instanceof Error ? error.message : String(error);
|
const text = error instanceof Error ? error.message : String(error);
|
||||||
setMessage(text);
|
setMessage(text);
|
||||||
@@ -285,7 +306,14 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
|
|||||||
} finally {
|
} finally {
|
||||||
setSettingsBusy(false);
|
setSettingsBusy(false);
|
||||||
}
|
}
|
||||||
}, [copy.keySaved, deepSeekApiKeyEnvName, deepSeekBaseUrl, deepSeekModel]);
|
}, [
|
||||||
|
copy.settingsSaved,
|
||||||
|
deepSeekApiKeyEnvName,
|
||||||
|
deepSeekBaseUrl,
|
||||||
|
deepSeekModel,
|
||||||
|
ocrLanguage,
|
||||||
|
ocrProfile,
|
||||||
|
]);
|
||||||
|
|
||||||
const handleClearDeepSeekKey = useCallback(async () => {
|
const handleClearDeepSeekKey = useCallback(async () => {
|
||||||
if (!window.electronAPI?.guide?.saveAiSettings) {
|
if (!window.electronAPI?.guide?.saveAiSettings) {
|
||||||
@@ -298,13 +326,17 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
|
|||||||
clearDeepseekApiKeyEnvName: true,
|
clearDeepseekApiKeyEnvName: true,
|
||||||
baseUrl: deepSeekBaseUrl,
|
baseUrl: deepSeekBaseUrl,
|
||||||
model: deepSeekModel,
|
model: deepSeekModel,
|
||||||
|
ocrProfile,
|
||||||
|
ocrLanguage,
|
||||||
});
|
});
|
||||||
if (!result.success) {
|
if (!result.success) {
|
||||||
throw new Error(result.error);
|
throw new Error(result.error);
|
||||||
}
|
}
|
||||||
setAiSettings(result.data);
|
setAiSettings(result.data);
|
||||||
setDeepSeekApiKeyEnvName(result.data.deepseek.apiKeyEnvName);
|
setDeepSeekApiKeyEnvName(result.data.deepseek.apiKeyEnvName);
|
||||||
toast.success(copy.keySaved);
|
setOcrProfile(result.data.ocr.profile);
|
||||||
|
setOcrLanguage(result.data.ocr.language);
|
||||||
|
toast.success(copy.settingsSaved);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
const text = error instanceof Error ? error.message : String(error);
|
const text = error instanceof Error ? error.message : String(error);
|
||||||
setMessage(text);
|
setMessage(text);
|
||||||
@@ -312,7 +344,7 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
|
|||||||
} finally {
|
} finally {
|
||||||
setSettingsBusy(false);
|
setSettingsBusy(false);
|
||||||
}
|
}
|
||||||
}, [copy.keySaved, deepSeekBaseUrl, deepSeekModel]);
|
}, [copy.settingsSaved, deepSeekBaseUrl, deepSeekModel, ocrLanguage, ocrProfile]);
|
||||||
|
|
||||||
const handleGenerateGuide = useCallback(() => {
|
const handleGenerateGuide = useCallback(() => {
|
||||||
void runAction("generate", async () => {
|
void runAction("generate", async () => {
|
||||||
@@ -455,7 +487,7 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
|
|||||||
<div className="flex items-center justify-between gap-2">
|
<div className="flex items-center justify-between gap-2">
|
||||||
<div className="min-w-0">
|
<div className="min-w-0">
|
||||||
<div className="truncate text-[11px] font-semibold text-slate-100">
|
<div className="truncate text-[11px] font-semibold text-slate-100">
|
||||||
{copy.deepseek} {copy.settings}
|
{copy.guideSettings}
|
||||||
</div>
|
</div>
|
||||||
<div className="truncate text-[10px] text-slate-500">
|
<div className="truncate text-[10px] text-slate-500">
|
||||||
{aiSettings?.deepseek.hasApiKey
|
{aiSettings?.deepseek.hasApiKey
|
||||||
@@ -470,6 +502,33 @@ export function GuidePanel({ recordingId, videoPath, videoSourcePath }: GuidePan
|
|||||||
</span>
|
</span>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<div className="grid grid-cols-2 gap-1.5">
|
||||||
|
<label className="block min-w-0 text-[10px] font-medium text-slate-400">
|
||||||
|
{copy.ocrProfile}
|
||||||
|
<select
|
||||||
|
value={ocrProfile}
|
||||||
|
onChange={(event) => setOcrProfile(event.target.value as GuideOcrProfile)}
|
||||||
|
disabled={settingsBusy}
|
||||||
|
className="mt-1 h-8 w-full rounded-md border border-white/[0.08] bg-black/20 px-2 text-[11px] text-slate-100 outline-none"
|
||||||
|
>
|
||||||
|
<option value="vietnamese">{copy.ocrVietnamese}</option>
|
||||||
|
<option value="hybrid">{copy.ocrHybrid}</option>
|
||||||
|
<option value="fast">{copy.ocrFast}</option>
|
||||||
|
</select>
|
||||||
|
</label>
|
||||||
|
<label className="block min-w-0 text-[10px] font-medium text-slate-400">
|
||||||
|
{copy.ocrLanguage}
|
||||||
|
<input
|
||||||
|
type="text"
|
||||||
|
value={ocrLanguage}
|
||||||
|
onChange={(event) => setOcrLanguage(event.target.value)}
|
||||||
|
placeholder="vi,en"
|
||||||
|
disabled={settingsBusy}
|
||||||
|
className="mt-1 h-8 w-full rounded-md border border-white/[0.08] bg-black/20 px-2 text-[11px] text-slate-100 outline-none placeholder:text-slate-600"
|
||||||
|
/>
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
|
|
||||||
<label className="block text-[10px] font-medium text-slate-400">
|
<label className="block text-[10px] font-medium text-slate-400">
|
||||||
{copy.apiKey}
|
{copy.apiKey}
|
||||||
<input
|
<input
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ export type GuideTargetRole = "button" | "menu" | "tab" | "field" | "link" | "un
|
|||||||
export type GuideLanguage = "vi" | "en";
|
export type GuideLanguage = "vi" | "en";
|
||||||
export type GuideAiProvider = "deepseek" | "local";
|
export type GuideAiProvider = "deepseek" | "local";
|
||||||
export type GuideSecretStorage = "environment" | "none";
|
export type GuideSecretStorage = "environment" | "none";
|
||||||
|
export type GuideOcrProfile = "fast" | "vietnamese" | "hybrid";
|
||||||
|
|
||||||
export type GuideSessionStatus =
|
export type GuideSessionStatus =
|
||||||
| "recording"
|
| "recording"
|
||||||
@@ -178,6 +179,11 @@ export interface GenerateGuideDraftInput {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export interface GuideAiSettings {
|
export interface GuideAiSettings {
|
||||||
|
ocr: {
|
||||||
|
profile: GuideOcrProfile;
|
||||||
|
language: string;
|
||||||
|
updatedAt?: string;
|
||||||
|
};
|
||||||
deepseek: {
|
deepseek: {
|
||||||
hasApiKey: boolean;
|
hasApiKey: boolean;
|
||||||
apiKeyEnvName: string;
|
apiKeyEnvName: string;
|
||||||
@@ -194,6 +200,8 @@ export interface SaveGuideAiSettingsInput {
|
|||||||
clearDeepseekApiKeyEnvName?: boolean;
|
clearDeepseekApiKeyEnvName?: boolean;
|
||||||
baseUrl?: string;
|
baseUrl?: string;
|
||||||
model?: string;
|
model?: string;
|
||||||
|
ocrProfile?: GuideOcrProfile;
|
||||||
|
ocrLanguage?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface SaveGuideInput {
|
export interface SaveGuideInput {
|
||||||
|
|||||||
+301
-18
@@ -5,6 +5,7 @@ import importlib.util
|
|||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import tempfile
|
import tempfile
|
||||||
|
from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from threading import Lock
|
from threading import Lock
|
||||||
from typing import Any
|
from typing import Any
|
||||||
@@ -17,6 +18,65 @@ app = FastAPI(title="OpenScreen PaddleOCR service")
|
|||||||
|
|
||||||
_engines: dict[str, Any] = {}
|
_engines: dict[str, Any] = {}
|
||||||
_engine_lock = Lock()
|
_engine_lock = Lock()
|
||||||
|
_LATIN_RECOGNITION_LANGS = {
|
||||||
|
"af",
|
||||||
|
"az",
|
||||||
|
"bs",
|
||||||
|
"ca",
|
||||||
|
"cs",
|
||||||
|
"cy",
|
||||||
|
"da",
|
||||||
|
"de",
|
||||||
|
"en",
|
||||||
|
"es",
|
||||||
|
"et",
|
||||||
|
"eu",
|
||||||
|
"fi",
|
||||||
|
"fr",
|
||||||
|
"ga",
|
||||||
|
"gl",
|
||||||
|
"hr",
|
||||||
|
"hu",
|
||||||
|
"id",
|
||||||
|
"is",
|
||||||
|
"it",
|
||||||
|
"ku",
|
||||||
|
"la",
|
||||||
|
"latin",
|
||||||
|
"lb",
|
||||||
|
"lt",
|
||||||
|
"lv",
|
||||||
|
"mi",
|
||||||
|
"ms",
|
||||||
|
"mt",
|
||||||
|
"nl",
|
||||||
|
"no",
|
||||||
|
"oc",
|
||||||
|
"pi",
|
||||||
|
"pl",
|
||||||
|
"pt",
|
||||||
|
"qu",
|
||||||
|
"rm",
|
||||||
|
"ro",
|
||||||
|
"rs_latin",
|
||||||
|
"rslatin",
|
||||||
|
"sk",
|
||||||
|
"sl",
|
||||||
|
"sq",
|
||||||
|
"sv",
|
||||||
|
"sw",
|
||||||
|
"tl",
|
||||||
|
"tr",
|
||||||
|
"uz",
|
||||||
|
"vi",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class PreparedImage:
    """Immutable description of the image file that is handed to the OCR engine."""

    # Filesystem path of the image to recognize.
    path: str
    # Enlargement factor applied to the image; 1.0 means it was not resized.
    scale: float = 1.0
    # True when `path` is a temporary file that must be removed after OCR.
    should_delete: bool = False
|
||||||
|
|
||||||
|
|
||||||
class OcrRequest(BaseModel):
|
class OcrRequest(BaseModel):
|
||||||
@@ -24,6 +84,7 @@ class OcrRequest(BaseModel):
|
|||||||
path: str | None = None
|
path: str | None = None
|
||||||
imagePath: str | None = None
|
imagePath: str | None = None
|
||||||
language: str | None = None
|
language: str | None = None
|
||||||
|
profile: str | None = None
|
||||||
|
|
||||||
|
|
||||||
@app.get("/health")
|
@app.get("/health")
|
||||||
@@ -33,7 +94,9 @@ def health() -> dict[str, Any]:
|
|||||||
"paddleocrInstalled": importlib.util.find_spec("paddleocr") is not None,
|
"paddleocrInstalled": importlib.util.find_spec("paddleocr") is not None,
|
||||||
"paddleInstalled": importlib.util.find_spec("paddle") is not None,
|
"paddleInstalled": importlib.util.find_spec("paddle") is not None,
|
||||||
"engineReady": bool(_engines),
|
"engineReady": bool(_engines),
|
||||||
"defaultLanguage": os.getenv("PADDLEOCR_LANG", "latin"),
|
"defaultLanguage": os.getenv("PADDLEOCR_LANG") or "vi,en",
|
||||||
|
"defaultProfile": os.getenv("OPENSCREEN_OCR_PROFILE") or "vietnamese",
|
||||||
|
"loadedEngines": sorted(_engines.keys()),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -41,8 +104,12 @@ def health() -> dict[str, Any]:
|
|||||||
async def ocr(request: OcrRequest) -> dict[str, Any]:
|
async def ocr(request: OcrRequest) -> dict[str, Any]:
|
||||||
image_path, should_delete = _resolve_image_path(request)
|
image_path, should_delete = _resolve_image_path(request)
|
||||||
try:
|
try:
|
||||||
engine = _get_engine(request.language)
|
blocks = await run_in_threadpool(
|
||||||
blocks = await run_in_threadpool(_recognize_blocks, engine, image_path)
|
_recognize_profile_blocks,
|
||||||
|
image_path,
|
||||||
|
request.language,
|
||||||
|
request.profile,
|
||||||
|
)
|
||||||
return {"blocks": blocks}
|
return {"blocks": blocks}
|
||||||
finally:
|
finally:
|
||||||
if should_delete:
|
if should_delete:
|
||||||
@@ -73,8 +140,7 @@ def _resolve_image_path(request: OcrRequest) -> tuple[str, bool]:
|
|||||||
return handle.name, True
|
return handle.name, True
|
||||||
|
|
||||||
|
|
||||||
def _get_engine(language: str | None) -> Any:
|
def _get_engine(paddle_lang: str) -> Any:
|
||||||
paddle_lang = _resolve_paddle_language(language)
|
|
||||||
cache_key = f"{paddle_lang}|{os.getenv('PADDLEOCR_DEVICE', 'cpu')}"
|
cache_key = f"{paddle_lang}|{os.getenv('PADDLEOCR_DEVICE', 'cpu')}"
|
||||||
with _engine_lock:
|
with _engine_lock:
|
||||||
if cache_key not in _engines:
|
if cache_key not in _engines:
|
||||||
@@ -105,13 +171,17 @@ def _create_engine(paddle_lang: str) -> Any:
|
|||||||
"enable_mkldnn": os.getenv("PADDLEOCR_ENABLE_MKLDNN", "0") == "1",
|
"enable_mkldnn": os.getenv("PADDLEOCR_ENABLE_MKLDNN", "0") == "1",
|
||||||
"use_doc_orientation_classify": False,
|
"use_doc_orientation_classify": False,
|
||||||
"use_doc_unwarping": False,
|
"use_doc_unwarping": False,
|
||||||
"use_textline_orientation": False,
|
"use_textline_orientation": os.getenv("PADDLEOCR_USE_TEXTLINE_ORIENTATION", "0") == "1",
|
||||||
}
|
}
|
||||||
if os.getenv("PADDLEOCR_USE_MOBILE", "1") != "0":
|
if os.getenv("PADDLEOCR_USE_MOBILE", "1") != "0":
|
||||||
modern_kwargs.update(
|
modern_kwargs.update(
|
||||||
{
|
{
|
||||||
"text_detection_model_name": "PP-OCRv5_mobile_det",
|
"text_detection_model_name": os.getenv(
|
||||||
"text_recognition_model_name": _mobile_recognition_model(paddle_lang),
|
"PADDLEOCR_DET_MODEL",
|
||||||
|
"PP-OCRv5_mobile_det",
|
||||||
|
),
|
||||||
|
"text_recognition_model_name": os.getenv("PADDLEOCR_REC_MODEL")
|
||||||
|
or _mobile_recognition_model(paddle_lang),
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -150,23 +220,236 @@ def _patch_paddlex_frozen_ocr_extra_gate() -> None:
|
|||||||
deps._openscreen_ocr_extra_patch = True
|
deps._openscreen_ocr_extra_patch = True
|
||||||
|
|
||||||
|
|
||||||
def _resolve_paddle_language(language: str | None) -> str:
|
def _recognize_profile_blocks(
    image_path: str,
    language: str | None,
    profile: str | None,
) -> list[dict[str, Any]]:
    """Run OCR on one image according to the requested profile.

    The image is optionally pre-processed for the profile, recognized once per
    resolved engine language, and the detected boxes are rescaled by the
    pre-processing factor before all results are merged.

    Args:
        image_path: Path of the image to recognize.
        language: Requested language list (e.g. "vi,en"), or None.
        profile: Requested OCR profile name, or None.

    Returns:
        The merged list of OCR block dictionaries.
    """
    active_profile = _resolve_ocr_profile(profile)
    engine_languages = _resolve_paddle_languages(language, active_profile)
    prepared_image = _prepare_image_for_profile(image_path, active_profile)
    try:
        collected: list[dict[str, Any]] = []
        for engine_language in engine_languages:
            raw_blocks = _recognize_blocks(_get_engine(engine_language), prepared_image.path)
            collected.extend(_scale_blocks(raw_blocks, prepared_image.scale))
        return _merge_blocks(collected)
    finally:
        # Remove the temporary enhanced copy even when recognition fails.
        if prepared_image.should_delete:
            Path(prepared_image.path).unlink(missing_ok=True)
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_ocr_profile(profile: str | None) -> str:
|
||||||
|
explicit = (os.getenv("OPENSCREEN_OCR_PROFILE") or "").strip().lower()
|
||||||
|
value = explicit or (profile or "").strip().lower()
|
||||||
|
if value in {"fast", "vietnamese", "hybrid"}:
|
||||||
|
return value
|
||||||
|
return "vietnamese"
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_paddle_languages(language: str | None, profile: str) -> list[str]:
    """Choose the engine language(s) to run for a request.

    A non-empty PADDLEOCR_LANG environment variable is a hard override and is
    returned as the only language. Otherwise the result depends on the profile:
    "hybrid" runs "vi" (only when requested) followed by "latin"; "fast" uses a
    single primary language without preferring Vietnamese; any other profile
    uses a single primary language and prefers Vietnamese.

    Args:
        language: Comma-separated language request, or None for "vi,en".
        profile: Already-resolved OCR profile name.

    Returns:
        Ordered list of engine language codes.
    """
    forced = (os.getenv("PADDLEOCR_LANG") or "").strip().lower()
    if forced:
        return [forced]

    requested = (language or "vi,en").lower()

    if profile == "hybrid":
        wants_vietnamese = "vi" in _split_language_tags(requested)
        passes = (["vi"] if wants_vietnamese else []) + ["latin"]
        return _dedupe_languages(passes)

    # Only the "fast" profile skips the Vietnamese preference.
    return [
        _resolve_primary_paddle_language(
            requested,
            prefer_vietnamese=(profile != "fast"),
        )
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def _split_language_tags(language: str) -> set[str]:
|
||||||
|
return {part.strip().lower() for part in language.split(",") if part.strip()}
|
||||||
|
|
||||||
|
|
||||||
|
def _dedupe_languages(languages: list[str]) -> list[str]:
|
||||||
|
seen: set[str] = set()
|
||||||
|
result: list[str] = []
|
||||||
|
for language in languages:
|
||||||
|
if language not in seen:
|
||||||
|
seen.add(language)
|
||||||
|
result.append(language)
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_primary_paddle_language(language_value: str, *, prefer_vietnamese: bool) -> str:
|
||||||
|
tags = _split_language_tags(language_value)
|
||||||
|
if prefer_vietnamese and "vi" in tags:
|
||||||
|
return "vi"
|
||||||
|
if "latin" in tags or "vi" in tags or "en" in tags:
|
||||||
return "latin"
|
return "latin"
|
||||||
if "en" in language_value:
|
for tag in tags:
|
||||||
return "en"
|
return tag
|
||||||
return language_value.split(",")[0].strip() or "latin"
|
return "latin"
|
||||||
|
|
||||||
|
|
||||||
|
def _prepare_image_for_profile(image_path: str, profile: str) -> PreparedImage:
    """Produce the image that should be fed to OCR for the given profile.

    The "fast" profile uses the original file untouched. Other profiles try to
    write an upscaled, contrast-boosted and sharpened PNG copy to a temporary
    file. The original file is used instead when Pillow cannot be imported,
    the image cannot be opened, no upscaling is warranted, or the enhanced
    copy cannot be saved.

    Args:
        image_path: Path of the source image.
        profile: Already-resolved OCR profile name.

    Returns:
        A PreparedImage pointing at either the original file or a temporary
        enhanced copy (with ``should_delete=True`` and the applied scale).
    """
    unmodified = PreparedImage(image_path)
    if profile == "fast":
        return unmodified

    # Pillow is optional: enhancement is best-effort.
    try:
        from PIL import Image, ImageEnhance, ImageOps
    except Exception:
        return unmodified

    try:
        with Image.open(image_path) as opened:
            rgb_image = opened.convert("RGB")
    except Exception:
        return unmodified

    factor = _resolve_enhancement_scale(rgb_image.width, rgb_image.height)
    if factor <= 1:
        return unmodified

    # Use Image.Resampling.LANCZOS when that namespace exists, otherwise the
    # LANCZOS attribute on the Image module itself.
    lanczos = getattr(getattr(Image, "Resampling", Image), "LANCZOS")
    target_size = (round(rgb_image.width * factor), round(rgb_image.height * factor))
    processed = rgb_image.resize(target_size, lanczos)
    processed = ImageOps.autocontrast(processed)
    processed = ImageEnhance.Contrast(processed).enhance(1.25)
    processed = ImageEnhance.Sharpness(processed).enhance(1.35)

    temp_file = tempfile.NamedTemporaryFile(
        prefix="openscreen-ocr-enhanced-",
        suffix=".png",
        delete=False,
    )
    try:
        # Close the handle first so the file can be rewritten by name.
        temp_file.close()
        processed.save(temp_file.name, format="PNG")
        return PreparedImage(temp_file.name, scale=factor, should_delete=True)
    except Exception:
        # Do not leave a half-written temporary file behind.
        Path(temp_file.name).unlink(missing_ok=True)
        return unmodified
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_enhancement_scale(width: int, height: int) -> float:
|
||||||
|
try:
|
||||||
|
requested_scale = float(os.getenv("OPENSCREEN_OCR_ENHANCE_SCALE", "2"))
|
||||||
|
except ValueError:
|
||||||
|
requested_scale = 2.0
|
||||||
|
scale = max(1.0, min(3.0, requested_scale))
|
||||||
|
try:
|
||||||
|
max_side = int(os.getenv("OPENSCREEN_OCR_ENHANCE_MAX_SIDE", "2400"))
|
||||||
|
except ValueError:
|
||||||
|
max_side = 2400
|
||||||
|
largest_side = max(width, height)
|
||||||
|
if largest_side <= 0:
|
||||||
|
return 1.0
|
||||||
|
return max(1.0, min(scale, max_side / largest_side))
|
||||||
|
|
||||||
|
|
||||||
|
def _scale_blocks(blocks: list[dict[str, Any]], scale: float) -> list[dict[str, Any]]:
    """Map OCR boxes from the enlarged image back to the original image.

    When ``scale`` is at most 1 the input list is returned as-is. Otherwise
    each block whose box is judged to be in pixel units gets a copy with
    x/y/width/height divided by ``scale``. Blocks without a dict box, or with
    a box that is not in pixel units, are passed through unchanged.

    Args:
        blocks: OCR block dictionaries, each optionally carrying a "box".
        scale: Enlargement factor that was applied before recognition.

    Returns:
        A list of blocks with pixel boxes scaled down.
    """
    if scale <= 1:
        return blocks

    def _shrink(block: dict[str, Any]) -> dict[str, Any]:
        box = block.get("box")
        if not isinstance(box, dict) or not _box_uses_pixels(box):
            return block
        shrunk_box = {
            key: float(box[key]) / scale for key in ("x", "y", "width", "height")
        }
        return {**block, "box": shrunk_box}

    return [_shrink(block) for block in blocks]
|
||||||
|
|
||||||
|
|
||||||
|
def _box_uses_pixels(box: dict[str, Any]) -> bool:
|
||||||
|
try:
|
||||||
|
x = float(box["x"])
|
||||||
|
y = float(box["y"])
|
||||||
|
width = float(box["width"])
|
||||||
|
height = float(box["height"])
|
||||||
|
except (KeyError, TypeError, ValueError):
|
||||||
|
return False
|
||||||
|
return x > 1 or y > 1 or width > 1 or height > 1 or x + width > 1 or y + height > 1
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_blocks(blocks: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Collapse overlapping OCR blocks and return them in reading order.

    Blocks are visited from highest to lowest quality. A block whose box has
    an IoU of at least 0.62 with an already-kept block is treated as a
    duplicate of that block; otherwise it is kept. Blocks without a dict box
    are dropped. The survivors are sorted by (y, x) of their boxes.

    Args:
        blocks: OCR block dictionaries, possibly from several engines.

    Returns:
        De-duplicated blocks ordered top-to-bottom, then left-to-right.
    """
    kept: list[dict[str, Any]] = []
    ranked = sorted(blocks, key=_block_quality, reverse=True)
    for candidate in ranked:
        candidate_box = candidate.get("box")
        if not isinstance(candidate_box, dict):
            continue

        # Find the first kept block that overlaps the candidate strongly.
        duplicate_at = None
        for position, survivor in enumerate(kept):
            if _box_iou(candidate_box, survivor.get("box")) >= 0.62:
                duplicate_at = position
                break

        if duplicate_at is None:
            kept.append(candidate)
        elif _block_quality(candidate) > _block_quality(kept[duplicate_at]):
            # NOTE(review): candidates arrive in descending quality order, so
            # this replacement looks unreachable in practice; confirm.
            kept[duplicate_at] = candidate

    return sorted(kept, key=lambda item: _box_sort_key(item.get("box")))
|
||||||
|
|
||||||
|
|
||||||
|
def _block_quality(block: dict[str, Any]) -> float:
    """Score a block for duplicate resolution; higher is better.

    Starts from the block's confidence (converted by _score_to_float), adds
    0.08 when the text contains Vietnamese diacritics, and adds a length bonus
    of 0.002 per character, capped at 0.04, for texts of two or more
    characters.
    """
    content = str(block.get("text") or "")
    quality = _score_to_float(block.get("confidence"))
    if _has_vietnamese_diacritics(content):
        quality += 0.08
    character_count = len(content)
    if character_count >= 2:
        quality += min(0.04, character_count * 0.002)
    return quality
|
||||||
|
|
||||||
|
|
||||||
|
def _has_vietnamese_diacritics(text: str) -> bool:
|
||||||
|
return any(
|
||||||
|
character
|
||||||
|
in "ăâđêôơưĂÂĐÊÔƠƯáàảãạắằẳẵặấầẩẫậéèẻẽẹếềểễệíìỉĩịóòỏõọốồổỗộớờởỡợúùủũụứừửữựýỳỷỹỵ"
|
||||||
|
for character in text
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _box_iou(left: Any, right: Any) -> float:
|
||||||
|
if not isinstance(left, dict) or not isinstance(right, dict):
|
||||||
|
return 0.0
|
||||||
|
try:
|
||||||
|
left_x = float(left["x"])
|
||||||
|
left_y = float(left["y"])
|
||||||
|
left_width = float(left["width"])
|
||||||
|
left_height = float(left["height"])
|
||||||
|
right_x = float(right["x"])
|
||||||
|
right_y = float(right["y"])
|
||||||
|
right_width = float(right["width"])
|
||||||
|
right_height = float(right["height"])
|
||||||
|
except (KeyError, TypeError, ValueError):
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
intersection_left = max(left_x, right_x)
|
||||||
|
intersection_top = max(left_y, right_y)
|
||||||
|
intersection_right = min(left_x + left_width, right_x + right_width)
|
||||||
|
intersection_bottom = min(left_y + left_height, right_y + right_height)
|
||||||
|
intersection_width = max(0.0, intersection_right - intersection_left)
|
||||||
|
intersection_height = max(0.0, intersection_bottom - intersection_top)
|
||||||
|
intersection_area = intersection_width * intersection_height
|
||||||
|
if intersection_area <= 0:
|
||||||
|
return 0.0
|
||||||
|
union_area = left_width * left_height + right_width * right_height - intersection_area
|
||||||
|
return intersection_area / union_area if union_area > 0 else 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def _box_sort_key(box: Any) -> tuple[float, float]:
|
||||||
|
if not isinstance(box, dict):
|
||||||
|
return (0.0, 0.0)
|
||||||
|
try:
|
||||||
|
return (float(box["y"]), float(box["x"]))
|
||||||
|
except (KeyError, TypeError, ValueError):
|
||||||
|
return (0.0, 0.0)
|
||||||
|
|
||||||
|
|
||||||
def _mobile_recognition_model(paddle_lang: str) -> str:
    """Name of the mobile recognition model for an engine language.

    Languages listed in _LATIN_RECOGNITION_LANGS use the Latin-script model;
    every other language uses the general mobile model.
    """
    uses_latin_model = paddle_lang in _LATIN_RECOGNITION_LANGS
    return "latin_PP-OCRv5_mobile_rec" if uses_latin_model else "PP-OCRv5_mobile_rec"
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user