Enhance OCR configuration and integrate fuzzy matching for label parsing

2026-05-08 07:09:08 +02:00
parent 061ebf9978
commit 2b582dc732
4 changed files with 194 additions and 77 deletions
--- a/app/fuzzy_match.py
+++ b/app/fuzzy_match.py
@@ -0,0 +1,131 @@
+from __future__ import annotations
+
+import re
+import unicodedata
+from dataclasses import dataclass
+from difflib import SequenceMatcher
+
+
+@dataclass(frozen=True)
+class FuzzyMatch:
+    """Immutable result of matching one candidate against a piece of text."""
+
+    # The candidate exactly as the caller supplied it (not normalized).
+    value: str
+    # Match quality; 1.0 is what an exact (normalized) containment yields.
+    score: float
+    # The normalized slice of the text that produced ``score``.
+    matched_text: str
+    # Length ratio (shorter / longer) between candidate and ``matched_text``.
+    coverage: float
+
+
+def compact_text(text: str) -> str:
+    """Reduce ``text`` to a single run of upper-case ASCII letters and digits.
+
+    The text is NFKD-decomposed, anything that cannot be encoded as ASCII is
+    dropped, the remainder is upper-cased, and every character outside
+    ``A-Z`` / ``0-9`` is removed.
+    """
+    decomposed = unicodedata.normalize("NFKD", text)
+    # Encoding with "ignore" discards combining marks and other non-ASCII.
+    ascii_upper = decomposed.encode("ascii", "ignore").decode("ascii").upper()
+    return re.sub(r"[^A-Z0-9]+", "", ascii_upper)
+
+
+def token_text(text: str) -> list[str]:
+    """Split ``text`` into upper-case ASCII alphanumeric tokens.
+
+    Applies NFKD decomposition, drops everything that is not ASCII,
+    upper-cases the rest, and returns each maximal run of ``A-Z`` / ``0-9``
+    in order of appearance.
+    """
+    decomposed = unicodedata.normalize("NFKD", text)
+    # Encoding with "ignore" discards combining marks and other non-ASCII.
+    ascii_upper = decomposed.encode("ascii", "ignore").decode("ascii").upper()
+    return re.findall(r"[A-Z0-9]+", ascii_upper)
+
+
+def similarity(left: str, right: str) -> float:
+    """Return the ``SequenceMatcher`` ratio of two strings.
+
+    Yields ``0.0`` when either string is empty.
+    """
+    if left and right:
+        return SequenceMatcher(None, left, right).ratio()
+    return 0.0
+
+
+def best_fuzzy_match(text: str, candidates: list[str], min_score: float = 0.72) -> FuzzyMatch | None:
+    """Pick the candidate that best matches ``text``, if any is good enough.
+
+    Each candidate is normalized with ``compact_text``; candidates that
+    normalize to an empty string are skipped. The remaining ones are scored
+    with ``best_candidate_score`` and ranked with ``_is_better_match``.
+
+    Returns the winning ``FuzzyMatch``, or ``None`` when there is no usable
+    candidate or the winner's score is below ``min_score``.
+    """
+    winner: FuzzyMatch | None = None
+    for option in candidates:
+        option_key = compact_text(option)
+        if option_key:
+            score, matched_text, coverage = best_candidate_score(text, option_key)
+            contender = FuzzyMatch(value=option, score=score, matched_text=matched_text, coverage=coverage)
+            if winner is None or _is_better_match(contender, winner):
+                winner = contender
+
+    rejected = winner is None or winner.score < min_score
+    return None if rejected else winner
+
+
+def best_candidate_score(text: str, candidate_compact: str) -> tuple[float, str, float]:
+    """Score how well an already-normalized candidate matches ``text``.
+
+    Args:
+        text: Raw text to search; it is normalized here via ``compact_text``.
+        candidate_compact: The candidate in ``compact_text`` form.
+
+    Returns:
+        ``(score, matched_text, coverage)``. An exact containment of the
+        candidate in the normalized text yields ``(1.0, candidate, 1.0)``.
+        Otherwise the best-scoring window from ``candidate_windows`` is
+        reported, or ``(0.0, "", 0.0)`` when nothing scores above zero.
+        ``score`` never exceeds ``1.0``.
+    """
+    # An empty candidate is "contained" in every string; treat it as no
+    # match instead of a perfect one.
+    if not candidate_compact:
+        return 0.0, "", 0.0
+
+    full_compact = compact_text(text)
+    if candidate_compact in full_compact:
+        return 1.0, candidate_compact, 1.0
+
+    windows = candidate_windows(text, len(candidate_compact))
+    # Defensive fallback: compare against the whole text if no window exists.
+    if not windows and full_compact:
+        windows = [full_compact]
+
+    candidate_len = len(candidate_compact)
+    best_score = 0.0
+    best_window = ""
+    best_coverage = 0.0
+    for window in windows:
+        # Length ratio penalizes windows much shorter or longer than the candidate.
+        coverage = min(len(window), candidate_len) / max(len(window), candidate_len)
+        raw_score = similarity(candidate_compact, window) * coverage * digit_match_weight(candidate_compact, window)
+        # The digit bonus can lift the product above 1.0; cap it so a fuzzy
+        # hit can never outrank the 1.0 reserved for exact containment.
+        score = min(1.0, raw_score)
+        if score > best_score:
+            best_score = score
+            best_window = window
+            best_coverage = coverage
+    return best_score, best_window, best_coverage
+
+
+def digit_match_weight(candidate: str, matched_text: str) -> float:
+    """Return a score multiplier based on how the digits of both strings compare.
+
+    * ``1.0``  - at least one of the strings contains no digits.
+    * ``1.05`` - both contain digits and the digit sequences are identical.
+    * ``0.7``  - both contain digits but the sequences differ.
+    """
+    expected_digits = re.findall(r"\d", candidate)
+    observed_digits = re.findall(r"\d", matched_text)
+    if expected_digits and observed_digits:
+        return 1.05 if expected_digits == observed_digits else 0.7
+    return 1.0
+
+
+def candidate_windows(text: str, candidate_length: int) -> list[str]:
+    """Collect the distinct slices of ``text`` worth comparing to a candidate.
+
+    Three sources feed the result:
+
+    1. every single token from ``token_text`` (no length filter);
+    2. concatenations of 2 to 8 consecutive tokens whose length passes
+       ``_length_is_plausible``;
+    3. every substring of ``compact_text(text)`` whose length lies between
+       65% and 135% of ``candidate_length`` (at least one character).
+
+    Returns the windows de-duplicated and sorted.
+    """
+    words = token_text(text)
+    pool: set[str] = set(words)
+
+    for span in range(2, min(8, len(words)) + 1):
+        for start in range(len(words) - span + 1):
+            glued = "".join(words[start : start + span])
+            if _length_is_plausible(glued, candidate_length):
+                pool.add(glued)
+
+    squashed = compact_text(text)
+    if squashed:
+        shortest = max(1, int(candidate_length * 0.65))
+        longest = max(shortest, int(candidate_length * 1.35))
+        # Widths beyond the text length cannot produce a substring.
+        widest_usable = min(longest, len(squashed))
+        for width in range(shortest, widest_usable + 1):
+            for start in range(len(squashed) - width + 1):
+                pool.add(squashed[start : start + width])
+
+    return sorted(pool)
+
+
+def _length_is_plausible(value: str, candidate_length: int) -> bool:
+    """Tell whether ``value`` is close enough in length to the candidate.
+
+    An empty ``value`` is never plausible. Otherwise its length must lie
+    between 65% and 160% of ``candidate_length`` (both bounds truncated to
+    integers, inclusive).
+    """
+    if not value:
+        return False
+    lower_bound = int(candidate_length * 0.65)
+    upper_bound = int(candidate_length * 1.6)
+    return lower_bound <= len(value) <= upper_bound
+
+
+def _is_better_match(match: FuzzyMatch, best: FuzzyMatch) -> bool:
+    """Decide whether ``match`` should replace ``best``.
+
+    Score is compared first with a tolerance of 0.03, then coverage with a
+    tolerance of 0.05. If both are within tolerance, the match whose
+    normalized candidate value is strictly longer wins.
+    """
+    ranked_metrics = (
+        (match.score, best.score, 0.03),
+        (match.coverage, best.coverage, 0.05),
+    )
+    for challenger, incumbent, tolerance in ranked_metrics:
+        if challenger > incumbent + tolerance:
+            return True
+        if challenger < incumbent - tolerance:
+            return False
+    # Effectively tied: prefer the longer (more specific) candidate.
+    return len(compact_text(match.value)) > len(compact_text(best.value))