"""Extract an order number, colour code and product model from label OCR text."""

from __future__ import annotations

import re
from dataclasses import dataclass, asdict

from app.fuzzy_match import best_fuzzy_match

# Order number layout: four digits, "/", four digits, "/", then a number in
# 1-99 written without a leading zero. The whole value is captured in the
# named group "order", which parse_label_text() reads back by name.
ORDER_RE = re.compile(r"\b(?P<order>\d{4}/\d{4}/(?:[1-9]|[1-9]\d))\b")

# Default thresholds handed to best_fuzzy_match() for each lookup.
DEFAULT_MODEL_MIN_SCORE = 0.72
DEFAULT_COLOR_MIN_SCORE = 0.72


@dataclass
class ParsedLabel:
    """Fields recovered from one label by :func:`parse_label_text`.

    Attributes:
        order_number: Text matched by ``ORDER_RE``, or ``None`` if absent.
        color_code: ``value`` of the colour fuzzy match, or ``None``.
        product_model: ``value`` of the model fuzzy match, or ``None``.
        raw_text: The whitespace-normalized text that was parsed (not the
            untouched OCR output).
        color_score: ``score`` of the colour fuzzy match, or ``None``.
        product_model_score: ``score`` of the model fuzzy match, or ``None``.
    """

    order_number: str | None
    color_code: str | None
    product_model: str | None
    raw_text: str
    color_score: float | None = None
    product_model_score: float | None = None

    def to_dict(self) -> dict[str, str | float | None]:
        """Return the fields as a plain dict keyed by attribute name."""
        return asdict(self)


def normalize_ocr_text(text: str) -> str:
    """Collapse every run of whitespace in *text* into a single space.

    Leading and trailing whitespace is removed. Line breaks (``\\n``,
    ``\\r``) are whitespace, so multi-line OCR output becomes one line.
    """
    # Argument-less split() already treats "\n" and "\r" as separators, so
    # no explicit replacement of line breaks is needed beforehand.
    return " ".join(text.split())


def parse_label_text(
    text: str,
    known_colors: list[str],
    known_models: list[str],
    model_min_score: float = DEFAULT_MODEL_MIN_SCORE,
    color_min_score: float = DEFAULT_COLOR_MIN_SCORE,
) -> ParsedLabel:
    """Parse OCR text from a label into a :class:`ParsedLabel`.

    The text is whitespace-normalized first; the order number is then located
    with ``ORDER_RE`` and the colour and model are looked up with
    ``best_fuzzy_match`` against the supplied candidate lists.

    Args:
        text: OCR output for one label.
        known_colors: Candidate colour codes passed to ``best_fuzzy_match``.
        known_models: Candidate product models passed to ``best_fuzzy_match``.
        model_min_score: Threshold passed to the model lookup.
        color_min_score: Threshold passed to the colour lookup.

    Returns:
        A ``ParsedLabel`` whose ``raw_text`` is the normalized text. Any field
        whose lookup produced no match is ``None``.
    """
    normalized = normalize_ocr_text(text)

    order_match = ORDER_RE.search(normalized)
    # NOTE(review): best_fuzzy_match is defined in app.fuzzy_match; it is
    # presumed to return a falsy value when nothing reaches the threshold and
    # otherwise an object exposing `.value` and `.score` -- confirm there.
    color_match = best_fuzzy_match(normalized, known_colors, color_min_score)
    model_match = best_fuzzy_match(normalized, known_models, model_min_score)

    return ParsedLabel(
        order_number=order_match.group("order") if order_match else None,
        color_code=color_match.value if color_match else None,
        product_model=model_match.value if model_match else None,
        raw_text=normalized,
        color_score=color_match.score if color_match else None,
        product_model_score=model_match.score if model_match else None,
    )