Implement OCR engine architecture with base, factory, and specific engines

2026-05-08 07:08:48 +02:00
parent d117be5eec
commit 061ebf9978
7 changed files with 460 additions and 0 deletions
--- a/app/ocr/cli.py
+++ b/app/ocr/cli.py
@@ -0,0 +1,106 @@
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+from typing import Any
+
+import cv2
+
+from app.config import AppConfig
+from app.label_parser import parse_label_text
+from app.ocr import create_ocr_engine
+
+
+def iter_images(path: Path) -> list[Path]:
+    """Collect the image files designated by *path*.
+
+    A path that is a regular file is returned as a one-element list without
+    any suffix check. Any other path is treated as a directory: its direct
+    children that are regular files with a recognised image suffix (compared
+    case-insensitively) are returned in sorted order. Subdirectories are not
+    descended into.
+    """
+    if not path.is_file():
+        image_suffixes = {".jpg", ".jpeg", ".png", ".bmp", ".webp", ".tif", ".tiff"}
+        matches = [
+            entry
+            for entry in path.iterdir()
+            if entry.is_file() and entry.suffix.lower() in image_suffixes
+        ]
+        matches.sort()
+        return matches
+
+    return [path]
+
+
+def result_to_dict(path: Path, result: Any, config: dict[str, Any]) -> dict[str, Any]:
+    """Flatten one OCR result into a plain dictionary for reporting.
+
+    The recognised text is run through ``parse_label_text`` using the colour
+    and model lists and the minimum-score thresholds found under
+    ``config["label_data"]`` (missing keys fall back to empty lists and 0.72).
+
+    Args:
+        path: Image file the result belongs to; stored as a string under "file".
+        result: OCR result exposing ``text``, ``engine``, ``elapsed_ms``,
+            ``confidence``, ``error`` and ``lines`` (each line exposing
+            ``text``, ``confidence`` and ``bbox``).
+        config: Application configuration dictionary.
+
+    Returns:
+        Dictionary with the keys "file", "engine", "elapsed_ms" (rounded to
+        two decimals), "confidence", "error", "text", "lines" and "parsed".
+    """
+    label_settings = config.get("label_data", {})
+    parsed_label = parse_label_text(
+        result.text,
+        label_settings.get("colors", []),
+        label_settings.get("models", []),
+        model_min_score=float(label_settings.get("model_min_score", 0.72)),
+        color_min_score=float(label_settings.get("color_min_score", 0.72)),
+    )
+
+    summary: dict[str, Any] = {
+        "file": str(path),
+        "engine": result.engine,
+        "elapsed_ms": round(result.elapsed_ms, 2),
+        "confidence": result.confidence,
+        "error": result.error,
+        "text": result.text,
+    }
+
+    # Per-line details are appended after the scalar fields so the key order
+    # of the emitted dictionary stays file -> ... -> text -> lines -> parsed.
+    line_entries = []
+    for ocr_line in result.lines:
+        line_entries.append(
+            {
+                "text": ocr_line.text,
+                "confidence": ocr_line.confidence,
+                "bbox": ocr_line.bbox,
+            }
+        )
+    summary["lines"] = line_entries
+    summary["parsed"] = parsed_label.to_dict()
+    return summary
+
+
+def _apply_ocr_overrides(config: dict[str, Any], args: argparse.Namespace) -> None:
+    """Copy OCR-related command-line overrides into ``config["ocr"]`` in place."""
+    overrides: dict[str, Any] = {}
+    if args.engine:
+        overrides["engine"] = args.engine
+        # Choosing "none" switches OCR off; any real engine switches it on.
+        overrides["enabled"] = args.engine != "none"
+    if args.no_threshold:
+        overrides["threshold"] = False
+    if args.scale is not None:
+        overrides["scale"] = args.scale
+
+    if overrides:
+        # A config file without an "ocr" section must not fail with KeyError;
+        # the section is created only when there is something to store in it.
+        config.setdefault("ocr", {}).update(overrides)
+
+
+def main() -> int:
+    """Run the configured OCR engine over one image or a directory of images.
+
+    Command-line arguments select the input path, the application config file
+    and optional overrides for the OCR engine, thresholding and scale. Every
+    image is read with OpenCV, passed to the engine together with a rectangle
+    built from the full image size, and the result is printed either as JSON
+    (``--json``) or as a plain-text report.
+
+    Returns:
+        0 after all images have been processed; unreadable images are
+        reported in the output and do not change the return value.
+
+    Raises:
+        SystemExit: Raised by argparse (status 2) for invalid arguments or
+            when the input path does not exist.
+    """
+    parser = argparse.ArgumentParser(description="Test OCR backend on cropped label images.")
+    parser.add_argument("path", help="Image file or directory with crop images")
+    parser.add_argument("--config", default="app_config.json", help="Application config JSON path")
+    parser.add_argument(
+        "--engine",
+        choices=["none", "tesseract", "paddle"],
+        help="Override ocr.engine from config",
+    )
+    parser.add_argument("--no-threshold", action="store_true", help="Disable threshold preprocessing")
+    parser.add_argument("--scale", type=float, help="Override OCR scale")
+    parser.add_argument("--json", action="store_true", help="Print JSON output")
+    args = parser.parse_args()
+
+    target = Path(args.path)
+    if not target.exists():
+        # Fail with a usage error instead of a FileNotFoundError traceback
+        # from Path.iterdir() further down.
+        parser.error(f"path does not exist: {target}")
+
+    app_config = AppConfig(Path(args.config))
+    config = app_config.data
+    _apply_ocr_overrides(config, args)
+
+    engine = create_ocr_engine(config)
+    outputs: list[dict[str, Any]] = []
+    for image_path in iter_images(target):
+        image = cv2.imread(str(image_path), cv2.IMREAD_COLOR)
+        if image is None:
+            # cv2.imread returns None instead of raising when decoding fails.
+            outputs.append({"file": str(image_path), "error": "Nie mozna odczytac obrazu"})
+            continue
+
+        h, w = image.shape[:2]
+        # NOTE(review): the tuple presumably describes a region covering the
+        # whole image; confirm the expected layout against read_label().
+        result = engine.read_label(image, (0, 0, w, h))
+        outputs.append(result_to_dict(image_path, result, config))
+
+    if args.json:
+        print(json.dumps(outputs, indent=2, ensure_ascii=False))
+        return 0
+
+    for output in outputs:
+        # Entries for unreadable images only carry "file" and "error", hence
+        # the .get() lookups for every other field.
+        print(f"file: {output['file']}")
+        print(f"engine: {output.get('engine')}")
+        print(f"elapsed_ms: {output.get('elapsed_ms')}")
+        print(f"confidence: {output.get('confidence')}")
+        if output.get("error"):
+            print(f"error: {output['error']}")
+        print("text:")
+        print(output.get("text") or "")
+        print(f"parsed: {output.get('parsed')}")
+        print()
+    return 0
+
+
+if __name__ == "__main__":
+    # Hand main()'s return value to the interpreter as the process exit status.
+    exit_status = main()
+    raise SystemExit(exit_status)