Enhance OCR configuration and integrate fuzzy matching for label parsing
This commit is contained in:
131
app/fuzzy_match.py
Normal file
131
app/fuzzy_match.py
Normal file
@@ -0,0 +1,131 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import unicodedata
|
||||
from dataclasses import dataclass
|
||||
from difflib import SequenceMatcher
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class FuzzyMatch:
    """Immutable record describing how well one candidate matched a text."""

    # The candidate string this record was built for.
    value: str
    # Score assigned to the candidate for the matched fragment.
    score: float
    # The fragment of the (normalized) text that was compared to the candidate.
    matched_text: str
    # Length-agreement figure between the fragment and the candidate.
    coverage: float
|
||||
|
||||
|
||||
def compact_text(text: str) -> str:
    """Return *text* reduced to a single run of upper-case ASCII letters and digits.

    The text is NFKD-decomposed, characters that cannot be encoded as ASCII
    are dropped, the result is upper-cased, and every remaining character
    outside ``A-Z`` / ``0-9`` is removed.
    """
    folded = (
        unicodedata.normalize("NFKD", text)
        .encode("ascii", "ignore")
        .decode("ascii")
        .upper()
    )
    return re.sub(r"[^A-Z0-9]+", "", folded)
|
||||
|
||||
|
||||
def token_text(text: str) -> list[str]:
    """Split *text* into upper-case ASCII alphanumeric tokens.

    The text is NFKD-decomposed, characters that cannot be encoded as ASCII
    are dropped, the result is upper-cased, and each maximal run of
    ``A-Z`` / ``0-9`` becomes one token, in order of appearance.
    """
    folded = (
        unicodedata.normalize("NFKD", text)
        .encode("ascii", "ignore")
        .decode("ascii")
        .upper()
    )
    return re.findall(r"[A-Z0-9]+", folded)
|
||||
|
||||
|
||||
def similarity(left: str, right: str) -> float:
    """Return the ``SequenceMatcher`` ratio of two strings.

    If either string is empty the result is ``0.0`` without running the
    matcher.
    """
    if left and right:
        return SequenceMatcher(None, left, right).ratio()
    return 0.0
|
||||
|
||||
|
||||
def best_fuzzy_match(
    text: str,
    candidates: list[str],
    min_score: float = 0.72,
) -> FuzzyMatch | None:
    """Return the best-matching candidate for *text*, or ``None``.

    Each candidate is normalized with ``compact_text`` and scored against
    *text* with ``best_candidate_score``. Candidates that normalize to an
    empty string are ignored.

    Args:
        text: The text to search in.
        candidates: Candidate values to look for.
        min_score: Minimum score a candidate must reach to be eligible.

    Returns:
        A ``FuzzyMatch`` for the preferred eligible candidate, or ``None``
        when no candidate reaches *min_score*.
    """
    best: FuzzyMatch | None = None
    for candidate in candidates:
        candidate_compact = compact_text(candidate)
        if not candidate_compact:
            continue

        score, matched_text, coverage = best_candidate_score(text, candidate_compact)
        # Apply the threshold before ranking. `_is_better_match` tolerates
        # small score gaps, so a candidate slightly below the threshold could
        # otherwise displace a qualifying one and turn the result into None.
        if score < min_score:
            continue

        match = FuzzyMatch(
            value=candidate,
            score=score,
            matched_text=matched_text,
            coverage=coverage,
        )
        if best is None or _is_better_match(match, best):
            best = match

    return best
|
||||
|
||||
|
||||
def best_candidate_score(text: str, candidate_compact: str) -> tuple[float, str, float]:
    """Score how well an already-normalized candidate matches *text*.

    Args:
        text: Raw text to search in; it is normalized here.
        candidate_compact: The candidate, already passed through
            ``compact_text``.

    Returns:
        A ``(score, matched_text, coverage)`` tuple. When the candidate
        occurs verbatim in the compacted text the result is
        ``(1.0, candidate_compact, 1.0)``. Otherwise every window from
        ``candidate_windows`` is scored as
        ``similarity * coverage * digit_match_weight`` and the best window is
        reported; ``(0.0, "", 0.0)`` means nothing scored above zero.
    """
    # An empty string is "contained" in every string, which would otherwise
    # be reported as a perfect match.
    if not candidate_compact:
        return 0.0, "", 0.0

    full_compact = compact_text(text)
    if candidate_compact in full_compact:
        return 1.0, candidate_compact, 1.0

    candidate_length = len(candidate_compact)
    windows = candidate_windows(text, candidate_length)
    if not windows and full_compact:
        windows = [full_compact]

    best_score = 0.0
    best_window = ""
    best_coverage = 0.0
    for window in windows:
        window_length = len(window)
        # Ratio of the shorter length to the longer one: 1.0 for equal lengths.
        coverage = min(window_length, candidate_length) / max(window_length, candidate_length)
        score = (
            similarity(candidate_compact, window)
            * coverage
            * digit_match_weight(candidate_compact, window)
        )
        if score > best_score:
            best_score = score
            best_window = window
            best_coverage = coverage

    # The digit bonus can lift a near-perfect but inexact window above 1.0,
    # the score reserved for exact containment. Cap only the reported value
    # so that window selection above is unaffected.
    return min(best_score, 1.0), best_window, best_coverage
|
||||
|
||||
|
||||
def digit_match_weight(candidate: str, matched_text: str) -> float:
    """Return a score multiplier based on the digits in both strings.

    * ``1.0``  - at least one of the strings contains no digit.
    * ``1.05`` - both contain digits and the digit sequences are identical.
    * ``0.7``  - both contain digits but the sequences differ.
    """
    wanted = re.findall(r"\d", candidate)
    seen = re.findall(r"\d", matched_text)
    if wanted and seen:
        return 1.05 if wanted == seen else 0.7
    return 1.0
|
||||
|
||||
|
||||
def candidate_windows(text: str, candidate_length: int) -> list[str]:
    """Collect the fragments of *text* to compare against a candidate.

    Three kinds of fragments are gathered, de-duplicated, and returned in
    sorted order:

    * every token of ``token_text(text)``, regardless of its length;
    * concatenations of 2 to 8 consecutive tokens whose length passes
      ``_length_is_plausible``;
    * every substring of ``compact_text(text)`` whose length lies between
      65% and 135% of *candidate_length* (both bounds truncated, lower bound
      at least 1).
    """
    tokens = token_text(text)
    found: set[str] = set(tokens)

    # Runs of adjacent tokens, glued together without separators.
    longest_run = min(8, len(tokens))
    for run in range(2, longest_run + 1):
        for start in range(len(tokens) - run + 1):
            glued = "".join(tokens[start : start + run])
            if _length_is_plausible(glued, candidate_length):
                found.add(glued)

    # Character-level sliding windows over the fully compacted text.
    compacted = compact_text(text)
    if compacted:
        total = len(compacted)
        shortest = max(1, int(candidate_length * 0.65))
        longest = max(shortest, int(candidate_length * 1.35))
        # Widths larger than the text itself cannot produce any window.
        for width in range(shortest, min(longest, total) + 1):
            found.update(
                compacted[start : start + width]
                for start in range(total - width + 1)
            )

    return sorted(found)
|
||||
|
||||
|
||||
def _length_is_plausible(value: str, candidate_length: int) -> bool:
|
||||
if not value:
|
||||
return False
|
||||
return int(candidate_length * 0.65) <= len(value) <= int(candidate_length * 1.6)
|
||||
|
||||
|
||||
def _is_better_match(match: FuzzyMatch, best: FuzzyMatch) -> bool:
    """Decide whether *match* should replace *best*.

    The score is compared first with a tolerance of 0.03, then the coverage
    with a tolerance of 0.05. A difference larger than the tolerance settles
    the question at that stage. If both are within tolerance, the match whose
    compacted value is strictly longer wins.
    """
    stages = (
        (match.score, best.score, 0.03),
        (match.coverage, best.coverage, 0.05),
    )
    for challenger, incumbent, tolerance in stages:
        if challenger > incumbent + tolerance:
            return True
        if challenger < incumbent - tolerance:
            return False

    # Tie on both measures: prefer the longer candidate value.
    return len(compact_text(match.value)) > len(compact_text(best.value))
|
||||
Reference in New Issue
Block a user