# duck-stain-yolo/app/label_parser.py

from __future__ import annotations

import re
from dataclasses import dataclass, asdict

from app.fuzzy_match import best_fuzzy_match


# Matches an order number shaped like "1234/5678/7": two four-digit groups
# followed by a number from 1 to 99 (no leading zero), separated by slashes
# and delimited by word boundaries on both sides. The full match is exposed
# through the named group "order".
ORDER_RE = re.compile(r"\b(?P<order>\d{4}/\d{4}/(?:[1-9]|[1-9]\d))\b")
# Default minimum scores handed to best_fuzzy_match by parse_label_text when
# the caller does not override model_min_score / color_min_score.
# NOTE(review): the score scale is defined by app.fuzzy_match — presumably
# 0.0-1.0 with higher meaning a closer match; confirm there.
DEFAULT_MODEL_MIN_SCORE = 0.72
DEFAULT_COLOR_MIN_SCORE = 0.72


@dataclass
class ParsedLabel:
    """Structured result of parsing the OCR text of one label.

    Every extracted field is ``None`` when the corresponding piece of
    information could not be found; ``raw_text`` is always present.
    """

    # Order number as captured by ORDER_RE, or None when no match was found.
    order_number: str | None
    # Matched colour value, or None when no colour match was found.
    color_code: str | None
    # Matched product-model value, or None when no model match was found.
    product_model: str | None
    # The text the fields were extracted from.
    raw_text: str
    # Score reported for the colour match; None when there is no match.
    color_score: float | None = None
    # Score reported for the product-model match; None when there is no match.
    product_model_score: float | None = None

    def to_dict(self) -> dict[str, str | float | None]:
        """Return the fields as a plain dictionary keyed by field name."""
        as_mapping = asdict(self)
        return as_mapping


def normalize_ocr_text(text: str) -> str:
    """Collapse all whitespace in *text* into single spaces.

    Line breaks, carriage returns, tabs and runs of spaces each become one
    space; leading and trailing whitespace is removed.
    """
    # str.split() with no separator splits on any whitespace run (including
    # "\n" and "\r") and drops empty pieces, so no pre-replacement is needed.
    tokens = text.split()
    return " ".join(tokens)


def parse_label_text(
    text: str,
    known_colors: list[str],
    known_models: list[str],
    model_min_score: float = DEFAULT_MODEL_MIN_SCORE,
    color_min_score: float = DEFAULT_COLOR_MIN_SCORE,
) -> ParsedLabel:
    """Extract the order number, colour and product model from label text.

    The text is whitespace-normalised first; the order number is then located
    with ``ORDER_RE`` and the colour / model are looked up with
    ``best_fuzzy_match`` against the supplied candidate lists.

    Args:
        text: Raw OCR output for one label.
        known_colors: Candidate colour values to match against.
        known_models: Candidate product-model values to match against.
        model_min_score: Minimum score passed to the model lookup.
        color_min_score: Minimum score passed to the colour lookup.

    Returns:
        A ``ParsedLabel`` whose ``raw_text`` is the normalised text. Fields
        for which nothing was found are ``None``.
    """
    cleaned = normalize_ocr_text(text)

    order_hit = ORDER_RE.search(cleaned)
    # NOTE(review): best_fuzzy_match presumably returns a falsy value (None)
    # when no candidate reaches the minimum score — confirm in app.fuzzy_match.
    color_hit = best_fuzzy_match(cleaned, known_colors, color_min_score)
    model_hit = best_fuzzy_match(cleaned, known_models, model_min_score)

    order_number = None
    if order_hit:
        order_number = order_hit.group("order")

    color_code = None
    color_score = None
    if color_hit:
        color_code = color_hit.value
        color_score = color_hit.score

    product_model = None
    product_model_score = None
    if model_hit:
        product_model = model_hit.value
        product_model_score = model_hit.score

    return ParsedLabel(
        order_number=order_number,
        color_code=color_code,
        product_model=product_model,
        raw_text=cleaned,
        color_score=color_score,
        product_model_score=product_model_score,
    )