from __future__ import annotations

import re
import unicodedata
from dataclasses import dataclass
from difflib import SequenceMatcher

# Fuzzy lookup of candidate strings inside free text. Both sides are reduced
# to upper-case ASCII letters/digits before comparison, and each candidate is
# scored against substrings ("windows") of the text.

_NON_ALNUM_RE = re.compile(r"[^A-Z0-9]+")
_ALNUM_RUN_RE = re.compile(r"[A-Z0-9]+")
_DIGIT_RE = re.compile(r"\d")


@dataclass(frozen=True)
class FuzzyMatch:
    """Result of matching one candidate against a text."""

    # The candidate exactly as supplied by the caller.
    value: str
    # Similarity ratio weighted by coverage and by the digit weight.
    score: float
    # The compacted window of the text that produced ``score``.
    matched_text: str
    # Shorter length divided by longer length of window vs. compacted candidate.
    coverage: float


def _ascii_upper(text: str) -> str:
    """Return ``text`` NFKD-normalized, with non-ASCII dropped, upper-cased."""
    decomposed = unicodedata.normalize("NFKD", text)
    return decomposed.encode("ascii", "ignore").decode("ascii").upper()


def compact_text(text: str) -> str:
    """Return ``text`` reduced to a single run of upper-case ASCII letters/digits."""
    return _NON_ALNUM_RE.sub("", _ascii_upper(text))


def token_text(text: str) -> list[str]:
    """Return the upper-case ASCII alphanumeric runs found in ``text``, in order."""
    return _ALNUM_RUN_RE.findall(_ascii_upper(text))


def similarity(left: str, right: str) -> float:
    """Return the ``SequenceMatcher`` ratio of two strings, or 0.0 if either is empty."""
    if not left or not right:
        return 0.0
    return SequenceMatcher(None, left, right).ratio()


def best_fuzzy_match(
    text: str, candidates: list[str], min_score: float = 0.72
) -> FuzzyMatch | None:
    """Return the best-matching candidate for ``text``, or ``None``.

    Args:
        text: Free text to search in.
        candidates: Candidate strings; those that compact to an empty string
            are ignored.
        min_score: Minimum score a candidate needs to be eligible.

    Returns:
        The preferred ``FuzzyMatch`` among candidates scoring at least
        ``min_score``, or ``None`` when no candidate reaches it.
    """
    best: FuzzyMatch | None = None
    for candidate in candidates:
        candidate_compact = compact_text(candidate)
        if not candidate_compact:
            continue
        score, matched_text, coverage = best_candidate_score(text, candidate_compact)
        # Drop ineligible candidates before comparing. _is_better_match treats
        # scores within 0.03 as tied, so a sub-threshold candidate could
        # otherwise displace a qualifying one and turn the result into None.
        if score < min_score:
            continue
        match = FuzzyMatch(
            value=candidate,
            score=score,
            matched_text=matched_text,
            coverage=coverage,
        )
        if best is None or _is_better_match(match, best):
            best = match
    return best


def best_candidate_score(text: str, candidate_compact: str) -> tuple[float, str, float]:
    """Score an already-compacted candidate against ``text``.

    Args:
        text: Free text to search in (compacted internally).
        candidate_compact: Candidate in ``compact_text`` form.

    Returns:
        ``(score, matched_window, coverage)``. A candidate contained verbatim
        in the compacted text yields ``(1.0, candidate_compact, 1.0)``. An
        empty candidate, or a text with no usable window, yields
        ``(0.0, "", 0.0)``.
    """
    # "" is a substring of every string; without this guard an empty candidate
    # would be reported as a perfect match.
    if not candidate_compact:
        return 0.0, "", 0.0

    full_compact = compact_text(text)
    if candidate_compact in full_compact:
        return 1.0, candidate_compact, 1.0

    windows = candidate_windows(text, len(candidate_compact))
    if not windows and full_compact:
        windows = [full_compact]

    candidate_len = len(candidate_compact)
    best_score = 0.0
    best_window = ""
    best_coverage = 0.0
    for window in windows:
        window_len = len(window)
        coverage = min(window_len, candidate_len) / max(window_len, candidate_len)
        # The digit weight can be 1.05, so the product is not capped at 1.0.
        score = (
            similarity(candidate_compact, window)
            * coverage
            * digit_match_weight(candidate_compact, window)
        )
        # Strict comparison: on equal scores the earliest window in sorted
        # order is kept.
        if score > best_score:
            best_score = score
            best_window = window
            best_coverage = coverage
    return best_score, best_window, best_coverage


def digit_match_weight(candidate: str, matched_text: str) -> float:
    """Return a score multiplier based on the digits in both strings.

    Returns 1.0 when either string has no digits, 1.05 when both contain the
    same digit sequence, and 0.7 when their digit sequences differ.
    """
    candidate_digits = _DIGIT_RE.findall(candidate)
    matched_digits = _DIGIT_RE.findall(matched_text)
    if not candidate_digits or not matched_digits:
        return 1.0
    if candidate_digits == matched_digits:
        return 1.05
    return 0.7


def candidate_windows(text: str, candidate_length: int) -> list[str]:
    """Return the sorted, de-duplicated windows of ``text`` to compare against.

    The windows are: every single token; every run of 2 to 8 consecutive
    tokens whose joined length passes ``_length_is_plausible``; and every
    substring of the compacted text whose length lies between 65% and 135%
    of ``candidate_length`` (at least 1 character).
    """
    tokens = token_text(text)
    windows: set[str] = set(tokens)

    max_ngram = min(8, len(tokens))
    for size in range(2, max_ngram + 1):
        for start in range(len(tokens) - size + 1):
            joined = "".join(tokens[start : start + size])
            if _length_is_plausible(joined, candidate_length):
                windows.add(joined)

    full_compact = compact_text(text)
    min_len = max(1, int(candidate_length * 0.65))
    max_len = max(min_len, int(candidate_length * 1.35))
    # Windows cannot be longer than the text itself; an empty text yields none.
    longest = min(max_len, len(full_compact))
    for length in range(min_len, longest + 1):
        for start in range(len(full_compact) - length + 1):
            windows.add(full_compact[start : start + length])

    return sorted(windows)


def _length_is_plausible(value: str, candidate_length: int) -> bool:
    """Return whether ``value`` is non-empty and 65%-160% of ``candidate_length`` long."""
    if not value:
        return False
    return int(candidate_length * 0.65) <= len(value) <= int(candidate_length * 1.6)


def _is_better_match(match: FuzzyMatch, best: FuzzyMatch) -> bool:
    """Return whether ``match`` should replace ``best``.

    Score decides when it differs by more than 0.03, then coverage when it
    differs by more than 0.05; otherwise the candidate with the longer
    compacted value wins, and ``best`` is kept when the lengths are equal.
    """
    if match.score > best.score + 0.03:
        return True
    if match.score < best.score - 0.03:
        return False
    if match.coverage > best.coverage + 0.05:
        return True
    if match.coverage < best.coverage - 0.05:
        return False
    return len(compact_text(match.value)) > len(compact_text(best.value))