Enhance OCR configuration and integrate fuzzy matching for label parsing

2026-05-08 07:09:08 +02:00
parent 061ebf9978
commit 2b582dc732
4 changed files with 194 additions and 77 deletions
--- a/app/detection.py
+++ b/app/detection.py
@@ -4,10 +4,10 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any

-import cv2
 import numpy as np

 from app.label_parser import ParsedLabel, parse_label_text
+from app.ocr import create_ocr_engine


@dataclass
@@ -18,6 +18,9 @@ class DetectionResult:
    raw_text: str = ""
    parsed: ParsedLabel | None = None
    error: str | None = None
+    ocr_engine: str | None = None
+    ocr_confidence: float | None = None
+    ocr_elapsed_ms: float | None = None
    all_boxes: list[dict[str, Any]] = field(default_factory=list)

    def to_metadata(self) -> dict[str, Any]:
@@ -28,6 +31,9 @@ class DetectionResult:
            "raw_text": self.raw_text,
            "parsed": self.parsed.to_dict() if self.parsed else None,
            "error": self.error,
+            "ocr_engine": self.ocr_engine,
+            "ocr_confidence": self.ocr_confidence,
+            "ocr_elapsed_ms": self.ocr_elapsed_ms,
            "all_boxes": self.all_boxes,
        }

@@ -72,6 +78,9 @@ class YoloLabelDetector:
        boxes = []
        names = getattr(self.model, "names", {})
        for result in results:
+            if result.boxes is None:
+                continue
+            
            for box in result.boxes:
                x1, y1, x2, y2 = [int(v) for v in box.xyxy[0].tolist()]
                confidence = float(box.conf[0])
@@ -106,78 +115,30 @@ class YoloLabelDetector:
        return result


-class TesseractOcr:
-    def __init__(self, config: dict[str, Any]) -> None:
-        self.config = config
-        self.load_error: str | None = None
-        self.pytesseract = None
-        self._load()
-
-    def _load(self) -> None:
-        if not self.config["ocr"].get("enabled", True):
-            return
-        try:
-            import pytesseract
-
-            command = self.config["ocr"].get("tesseract_cmd")
-            if command:
-                pytesseract.pytesseract.tesseract_cmd = command
-            self.pytesseract = pytesseract
-        except Exception as exc:
-            self.load_error = f"Nie mozna zaladowac pytesseract: {exc}"
-
-    def read_label(self, frame_bgr: np.ndarray, bbox: tuple[int, int, int, int]) -> tuple[str, str | None]:
-        if not self.config["ocr"].get("enabled", True):
-            return "", None
-        if self.pytesseract is None:
-            return "", self.load_error or "OCR nie jest zaladowany"
-
-        x1, y1, x2, y2 = bbox
-        h, w = frame_bgr.shape[:2]
-        x1, y1 = max(0, x1), max(0, y1)
-        x2, y2 = min(w, x2), min(h, y2)
-        if x2 <= x1 or y2 <= y1:
-            return "", "Nieprawidlowy bbox OCR"
-
-        roi = frame_bgr[y1:y2, x1:x2]
-        scale = float(self.config["ocr"].get("scale", 1.0))
-        if scale != 1.0:
-            roi = cv2.resize(roi, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
-
-        gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
-        if self.config["ocr"].get("threshold", True):
-            gray = cv2.GaussianBlur(gray, (3, 3), 0)
-            gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
-
-        try:
-            text = self.pytesseract.image_to_string(
-                gray,
-                lang=self.config["ocr"].get("language", "eng"),
-                config="--psm 6",
-            )
-        except Exception as exc:
-            return "", f"Blad OCR: {exc}"
-        return text, None
-
-
 class DetectionPipeline:
    def __init__(self, config: dict[str, Any], app_config: Any) -> None:
        self.config = config
        self.detector = YoloLabelDetector(config, app_config)
-        self.ocr = TesseractOcr(config)
+        self.ocr = create_ocr_engine(config)

    def process(self, frame_bgr: np.ndarray) -> DetectionResult:
        result = self.detector.detect(frame_bgr)
        if result.xyxy is None:
            return result

-        text, ocr_error = self.ocr.read_label(frame_bgr, result.xyxy)
-        result.raw_text = text
+        ocr_result = self.ocr.read_label(frame_bgr, result.xyxy)
+        result.raw_text = ocr_result.text
+        result.ocr_engine = ocr_result.engine
+        result.ocr_confidence = ocr_result.confidence
+        result.ocr_elapsed_ms = ocr_result.elapsed_ms
+        label_cfg = self.config["label_data"]
        result.parsed = parse_label_text(
-            text,
-            self.config["label_data"].get("colors", []),
-            self.config["label_data"].get("models", []),
+            ocr_result.text,
+            label_cfg.get("colors", []),
+            label_cfg.get("models", []),
+            model_min_score=float(label_cfg.get("model_min_score", 0.72)),
+            color_min_score=float(label_cfg.get("color_min_score", 0.72)),
        )
-        if ocr_error:
-            result.error = ocr_error
+        if ocr_result.error:
+            result.error = ocr_result.error
        return result