Enhance OCR configuration and integrate fuzzy matching for label parsing

This commit is contained in:
2026-05-08 07:09:08 +02:00
parent 061ebf9978
commit 2b582dc732
4 changed files with 194 additions and 77 deletions

View File

@@ -3,8 +3,12 @@ from __future__ import annotations
import re
from dataclasses import dataclass, asdict
from app.fuzzy_match import best_fuzzy_match
# Matches an order number of the form DDDD/DDDD/N: two four-digit groups
# followed by a suffix from 1 to 99 written without a leading zero. The whole
# value is exposed through the named group "order"; the \b anchors stop the
# pattern from matching inside a longer run of word characters.
ORDER_RE = re.compile(r"\b(?P<order>\d{4}/\d{4}/(?:[1-9]|[1-9]\d))\b")
# Default threshold handed to best_fuzzy_match when looking up product models.
# NOTE(review): presumably a minimum similarity score in [0, 1] -- confirm
# against app.fuzzy_match.
DEFAULT_MODEL_MIN_SCORE = 0.72
# Default threshold handed to best_fuzzy_match when looking up color codes.
# NOTE(review): same scale as the model threshold, presumably -- confirm.
DEFAULT_COLOR_MIN_SCORE = 0.72
@dataclass
@@ -13,8 +17,10 @@ class ParsedLabel:
color_code: str | None
product_model: str | None
raw_text: str
color_score: float | None = None
product_model_score: float | None = None
def to_dict(self) -> dict[str, str | None]:
def to_dict(self) -> dict[str, str | float | None]:
return asdict(self)
@@ -22,23 +28,24 @@ def normalize_ocr_text(text: str) -> str:
return " ".join(text.replace("\n", " ").replace("\r", " ").split())
def parse_label_text(
    text: str,
    known_colors: list[str],
    known_models: list[str],
    model_min_score: float = DEFAULT_MODEL_MIN_SCORE,
    color_min_score: float = DEFAULT_COLOR_MIN_SCORE,
) -> ParsedLabel:
    """Extract order number, color code and product model from OCR text.

    The text is first collapsed with ``normalize_ocr_text``. The order number
    is taken from the first ``ORDER_RE`` match in the normalized text; the
    color code and product model are looked up with ``best_fuzzy_match``
    against the supplied candidate lists.

    Args:
        text: Raw OCR output for one label.
        known_colors: Candidate color codes to match against.
        known_models: Candidate product models to match against.
        model_min_score: Threshold passed to ``best_fuzzy_match`` for models.
        color_min_score: Threshold passed to ``best_fuzzy_match`` for colors.

    Returns:
        A ``ParsedLabel`` holding the normalized text in ``raw_text``. A field
        is ``None`` when its lookup produced no match; for color and model the
        corresponding ``*_score`` field is ``None`` as well.
    """
    normalized = normalize_ocr_text(text)
    order_match = ORDER_RE.search(normalized)

    # best_fuzzy_match is expected to return a falsy value when nothing
    # qualifies, otherwise an object exposing ``.value`` and ``.score``.
    color_match = best_fuzzy_match(normalized, known_colors, color_min_score)
    model_match = best_fuzzy_match(normalized, known_models, model_min_score)

    return ParsedLabel(
        order_number=order_match.group("order") if order_match else None,
        color_code=color_match.value if color_match else None,
        product_model=model_match.value if model_match else None,
        raw_text=normalized,
        color_score=color_match.score if color_match else None,
        product_model_score=model_match.score if model_match else None,
    )