Files
duck-prod-manager/backend/mayo/parser.py
2026-04-20 23:21:16 +02:00

143 lines
4.8 KiB
Python

from bs4 import BeautifulSoup
from bs4.filter import SoupStrainer
from typing import List
from unidecode import unidecode
import re
from .models import MayoSearchResult, MayoGuitarDetails
class MayoParser:
    """Stateless HTML parsers producing ``MayoSearchResult`` / ``MayoGuitarDetails``.

    Every method is a ``classmethod``/``staticmethod``; the class only serves
    as a namespace. Listing pages are expected to contain a ``<table
    class="std2">``; specification pages a ``<div id="tresc">``.
    """

    # Header of a specification page: order id, completion date and the
    # client text that follows the date (same pattern as before, precompiled).
    _HEADER_RE = re.compile(
        r"zam\.:(?P<order_id>\S+)\s*z datą realizacji:\s*"
        r"(?P<date>\d{4}-\d{2}-\d{2})(?P<client>.*?)$"
    )
    # The word that separates individual entries inside one specification value.
    _NOTE_SPLIT_RE = re.compile(r"notatka", flags=re.IGNORECASE)

    @staticmethod
    def clean_text(text: str) -> str:
        """Collapse every run of whitespace in *text* into a single space.

        ``str.split()`` without arguments already treats the non-breaking
        space (``\\xa0``) as whitespace and drops leading/trailing whitespace,
        so no extra ``replace``/``strip`` pass is required.

        Returns ``""`` for empty or ``None`` input.
        """
        if not text:
            return ""
        return " ".join(text.split())

    @staticmethod
    def _join_url(base_url: str, href: str) -> str:
        """Return the URL for *href* found on a page served under *base_url*.

        An ``http://`` / ``https://`` href is already absolute and is returned
        unchanged; anything else is appended to *base_url* with exactly one
        slash between the two parts.
        """
        if href.startswith(("http://", "https://")):
            return href
        return f"{base_url.rstrip('/')}/{href}"

    @staticmethod
    def _std2_rows(html: str) -> list:
        """Return the ``<tr>`` tags of the first ``table.std2`` in *html*.

        Rows are taken from the table's ``<tbody>`` when one exists, otherwise
        from the table itself. Returns an empty list when the table is absent.
        """
        soup = BeautifulSoup(html, "html.parser")
        table = soup.find("table", class_="std2")
        if table is None:
            return []
        tbody = table.find("tbody")
        container = table if tbody is None else tbody
        return container.find_all("tr")

    @classmethod
    def parse_search_results(cls, html: str, base_url: str) -> "List[MayoSearchResult]":
        """Parse a search-result listing into ``MayoSearchResult`` objects.

        A row is used only if it has at least three ``<td>`` cells and the
        first cell contains a link. The link text becomes ``order_id``, the
        second cell ``prod_list`` and the third cell ``client``.

        Args:
            html: Markup of the listing page.
            base_url: Prefix used to turn relative links into full URLs.

        Returns:
            One result per usable row; empty when the table is missing.
        """
        results = []
        for row in cls._std2_rows(html):
            cells = row.find_all("td")
            if len(cells) < 3:
                continue
            link_tag = cells[0].find("a")
            if link_tag is None:
                continue

            href = link_tag.get("href")
            # A link without a usable href still yields a result, with an empty URL.
            url = cls._join_url(base_url, href) if isinstance(href, str) and href else ""

            results.append(MayoSearchResult(
                order_id=cls.clean_text(link_tag.get_text()),
                client=cls.clean_text(cells[2].get_text()),
                prod_list=cls.clean_text(cells[1].get_text()),
                url=url,
            ))
        return results

    @classmethod
    def parse_guitar_links(cls, html: str, base_url: str) -> List[str]:
        """Collect, per table row, the first link whose href has ``id_zestawu=``.

        Only hrefs starting with ``http`` or ``index.php`` are considered.
        Absolute hrefs are returned as they are; relative ones are prefixed
        with *base_url*.

        Args:
            html: Markup of the listing page.
            base_url: Prefix used to turn relative links into full URLs.

        Returns:
            The collected URLs in document order; empty when the table is missing.
        """
        links = []
        for row in cls._std2_rows(html):
            for anchor in row.find_all("a", href=True):
                href = anchor.get("href")
                if not isinstance(href, str):
                    continue
                if not href.startswith(("http", "index.php")):
                    continue
                if "id_zestawu=" in href:
                    links.append(cls._join_url(base_url, href))
                    break  # at most one link is taken from each row
        return links

    @classmethod
    def parse_specification(cls, html: str) -> "MayoGuitarDetails":
        """Parse a specification page into a ``MayoGuitarDetails``.

        Only ``<div id="tresc">`` is parsed. Inside its first ``<center>`` the
        tables whose ``class`` is exactly ``std`` are used by position:

        * table 0 - header text holding order number, completion date, client;
        * table 1 - the ``<input name="s_nr_kat">`` whose value is the model;
        * table 2 - the specification rows: a single ``colspan="4"`` cell with
          a short upper-case text opens a section, two-cell rows are
          ``key -> entries`` pairs stored under the current section.

        Keys and section names are lower-cased and transliterated with
        ``unidecode``; keys additionally have their spaces removed. Each value
        is split on the word "notatka" (case-insensitive) into a list of
        entries.

        Returns:
            The parsed details. When the ``<center>`` tag or the three tables
            are missing, every text field is ``""`` and ``spec`` is empty.
        """
        spec = {}
        result = {
            "order_number": "",
            "completion_date": "",
            "client": "",
            "model": "",
            "spec": spec,
        }

        only_content = SoupStrainer("div", id="tresc")
        soup = BeautifulSoup(html, "html.parser", parse_only=only_content)
        center_tag = soup.find("center")
        if center_tag is None:
            return MayoGuitarDetails(**result)

        table_tags = center_tag.find_all(
            lambda tag: tag.name == "table" and tag.get("class") == ["std"]
        )
        if len(table_tags) < 3:
            return MayoGuitarDetails(**result)

        # Order id, completion date, client.
        header = table_tags[0].get_text(strip=True)
        match = cls._HEADER_RE.search(header)
        if match:
            result["order_number"] = match.group("order_id")
            result["completion_date"] = match.group("date")
            result["client"] = match.group("client").rstrip("- ")

        # Model.
        model_input = table_tags[1].find("input", attrs={"name": "s_nr_kat"})
        if model_input is not None:
            model = model_input.get("value")
            if isinstance(model, str):
                result["model"] = model

        # Specification sections.
        current_section = None
        for row in table_tags[2].find_all("tr", recursive=False):
            cells = row.find_all("td", recursive=False)

            if len(cells) == 1 and cells[0].get("colspan") == "4":
                heading = cells[0].get_text(strip=True)
                if heading.isupper() and len(heading) < 40:
                    current_section = unidecode(heading.lower())
                    # A repeated heading extends the existing section instead
                    # of discarding the entries collected so far.
                    spec.setdefault(current_section, {})
                continue

            # Key/value rows before the first section heading are ignored.
            if len(cells) != 2 or current_section is None:
                continue

            key = unidecode(cells[0].get_text(strip=True).lower().replace(" ", ""))
            value = unidecode(cells[1].get_text(strip=True))
            if not key or not value:
                continue

            spec[current_section][key] = [
                part.strip().lstrip("- ")
                for part in cls._NOTE_SPLIT_RE.split(value)
                if part.strip()
            ]

        return MayoGuitarDetails(**result)