Implement OCR engine architecture with base, factory, and specific engines

2026-05-08 07:08:48 +02:00
parent d117be5eec
commit 061ebf9978
7 changed files with 460 additions and 0 deletions
--- a/app/ocr/base.py
+++ b/app/ocr/base.py
@@ -0,0 +1,54 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Protocol
+
+import cv2
+import numpy as np
+
+
+@dataclass
+class OcrLine:
+    """A single line of text reported by an OCR engine."""
+
+    # Text content of the line (the only required field).
+    text: str
+    # Confidence reported for this line, or None when none is available.
+    # NOTE(review): the value range is not fixed here -- confirm per engine.
+    confidence: float | None = None
+    # Nested list of floats locating the line, or None when not available.
+    # NOTE(review): presumably a list of [x, y] corner points -- confirm against the engines.
+    bbox: list[list[float]] | None = None
+
+
+@dataclass
+class OcrResult:
+    """Outcome of one OCR call.
+
+    Every field has a default, so ``OcrResult()`` is a valid, empty result.
+    """
+
+    # Recognized text; empty string by default.
+    text: str = ""
+    # Confidence for the result as a whole, or None when none is available.
+    confidence: float | None = None
+    # Per-line results; default_factory gives each instance its own list.
+    lines: list[OcrLine] = field(default_factory=list)
+    # Error description, or None.
+    # NOTE(review): presumably set when the engine fails -- confirm against the engines.
+    error: str | None = None
+    # Elapsed time; milliseconds going by the field name.
+    elapsed_ms: float = 0.0
+    # Name of the engine associated with this result; "none" by default.
+    engine: str = "none"
+
+
+class OcrEngine(Protocol):
+    """Structural interface for OCR engines.
+
+    Any object exposing a ``name`` attribute and a compatible ``read_label``
+    method satisfies this protocol; no inheritance is required.
+    """
+
+    # Identifier of the engine.
+    # NOTE(review): presumably the value reported in OcrResult.engine -- confirm.
+    name: str
+
+    def read_label(self, frame_bgr: np.ndarray, bbox: tuple[int, int, int, int]) -> OcrResult:
+        """Return an ``OcrResult`` for the given frame and bounding box.
+
+        NOTE(review): ``bbox`` is presumably ``(x1, y1, x2, y2)`` in pixel
+        coordinates of ``frame_bgr``, matching ``crop_bbox`` -- confirm.
+        """
+        ...
+
+
+def crop_bbox(frame_bgr: np.ndarray, bbox: tuple[int, int, int, int], margin: int = 0) -> np.ndarray | None:
+    """Cut the region ``bbox`` out of ``frame_bgr``, optionally padded by ``margin``.
+
+    Args:
+        frame_bgr: Image array whose first two axes are rows and columns.
+        bbox: ``(x1, y1, x2, y2)`` corner coordinates; ``x2``/``y2`` are exclusive.
+        margin: Number of pixels added on every side before clamping.
+
+    Returns:
+        The cropped region as a slice of ``frame_bgr`` (a view, not a copy),
+        or ``None`` when the clamped box has no area.
+    """
+    left, top, right, bottom = bbox
+    frame_h, frame_w = frame_bgr.shape[:2]
+
+    # Grow the box by the margin, then clamp it to the frame bounds.
+    left = max(0, left - margin)
+    top = max(0, top - margin)
+    right = min(frame_w, right + margin)
+    bottom = min(frame_h, bottom + margin)
+
+    # A box that is inverted, zero-sized, or fully off-frame collapses here.
+    is_empty = right <= left or bottom <= top
+    return None if is_empty else frame_bgr[top:bottom, left:right]
+
+
+def prepare_ocr_image(image_bgr: np.ndarray, config: dict) -> np.ndarray:
+    """Apply the optional preprocessing steps selected by ``config``.
+
+    Args:
+        image_bgr: Input image; converted with ``COLOR_BGR2GRAY`` when
+            thresholding is enabled.
+        config: Mapping read for two keys:
+            ``"scale"`` (default ``1.0``) -- resize factor applied to both
+            axes with cubic interpolation; ``1.0`` skips the resize.
+            ``"threshold"`` (default ``False``) -- when truthy, the image is
+            converted to grayscale, blurred with a 3x3 Gaussian kernel and
+            binarized with an Otsu threshold.
+
+    Returns:
+        The (possibly resized) input image when thresholding is off,
+        otherwise the thresholded image returned by ``cv2.threshold``.
+    """
+    factor = float(config.get("scale", 1.0))
+    resized = image_bgr
+    if factor != 1.0:
+        resized = cv2.resize(image_bgr, None, fx=factor, fy=factor, interpolation=cv2.INTER_CUBIC)
+
+    if config.get("threshold", False):
+        grayscale = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
+        # Light blur before binarizing.
+        smoothed = cv2.GaussianBlur(grayscale, (3, 3), 0)
+        # cv2.threshold returns (threshold_value, image); only the image is needed.
+        _, binary = cv2.threshold(smoothed, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+        return binary
+
+    return resized