import re
from typing import List

from bs4 import BeautifulSoup
from bs4.filter import SoupStrainer
from unidecode import unidecode

from .models import MayoSearchResult, MayoGuitarDetails


class MayoParser:
    """Stateless HTML parsers for order-search, guitar-list and specification pages.

    Every public method takes raw HTML and returns plain data; when the
    expected markup is missing the methods return empty results rather than
    raising.
    """

    # Header text of the first "std" table on a specification page. The named
    # groups are required: parse_specification reads them via groupdict().
    _HEADER_RE = re.compile(
        r"zam\.:(?P<order_id>\S+)\s*z datą realizacji:\s*"
        r"(?P<date>\d{4}-\d{2}-\d{2})(?P<client>.*?)$"
    )

    # Separator between the entries packed into a single spec value cell.
    _NOTE_SPLIT_RE = re.compile(r"notatka", flags=re.IGNORECASE)

    @staticmethod
    def clean_text(text: str) -> str:
        """Collapse every run of whitespace in *text* to a single space.

        ``str.split()`` with no arguments already treats the non-breaking
        space (``\\xa0``) as whitespace and drops leading/trailing whitespace,
        so no separate replace/strip step is needed.

        Returns:
            The normalised text, or ``""`` for empty/``None`` input.
        """
        if not text:
            return ""
        return " ".join(text.split())

    @staticmethod
    def _join_url(base_url: str, href: str) -> str:
        """Return *href* resolved against *base_url*.

        Absolute ``http://`` / ``https://`` links are returned unchanged;
        anything else is appended to *base_url* with exactly one ``/``
        between the two parts.
        """
        if href.startswith(("http://", "https://")):
            return href
        return f"{base_url.rstrip('/')}/{href.lstrip('/')}"

    @staticmethod
    def _find_std2_rows(html: str) -> list:
        """Return the ``<tr>`` rows of the first ``<table class="std2">``.

        Rows are taken from the table's ``<tbody>`` when it has one,
        otherwise from the table itself. Returns an empty list when the
        table is absent.
        """
        soup = BeautifulSoup(html, "html.parser")
        table = soup.find("table", class_="std2")
        if not table:
            return []
        tbody = table.find("tbody")
        container = tbody if tbody else table
        return container.find_all("tr")

    @classmethod
    def parse_search_results(cls, html: str, base_url: str) -> List[MayoSearchResult]:
        """Parse the search-result table into ``MayoSearchResult`` objects.

        Column 0 must contain a link (its text is the order id, its href the
        detail URL), column 1 holds the product list and column 2 the client.
        Rows with fewer than three cells or without a link are skipped.

        Args:
            html: HTML of the search-result page.
            base_url: Prefix used to resolve relative links.

        Returns:
            One result per usable row; an empty list when the table is missing.
        """
        results: List[MayoSearchResult] = []
        for row in cls._find_std2_rows(html):
            cells = row.find_all("td")
            if len(cells) < 3:
                continue

            link_tag = cells[0].find("a")
            if not link_tag:
                continue

            href = link_tag.get("href")
            if isinstance(href, str) and href:
                full_url = cls._join_url(base_url, href)
            else:
                full_url = ""

            results.append(
                MayoSearchResult(
                    order_id=cls.clean_text(link_tag.get_text()),
                    client=cls.clean_text(cells[2].get_text()),
                    prod_list=cls.clean_text(cells[1].get_text()),
                    url=full_url,
                )
            )
        return results

    @classmethod
    def parse_guitar_links(cls, html: str, base_url: str) -> List[str]:
        """Collect one guitar URL per table row.

        A link qualifies when its href starts with ``http`` or ``index.php``
        and contains ``id_zestawu=``. Only the first qualifying link of each
        row is kept.

        Args:
            html: HTML of the page listing the guitars.
            base_url: Prefix used to resolve relative links.

        Returns:
            The resolved URLs in row order; an empty list when the table is
            missing.
        """
        links: List[str] = []
        for row in cls._find_std2_rows(html):
            for anchor in row.find_all("a", href=True):
                href = anchor.get("href")
                if not isinstance(href, str):
                    continue
                if not href.startswith(("http", "index.php")):
                    continue
                if "id_zestawu=" in href:
                    # Absolute hrefs must not be prefixed with base_url.
                    links.append(cls._join_url(base_url, href))
                    break
        return links

    @classmethod
    def parse_specification(cls, html: str) -> MayoGuitarDetails:
        """Parse a specification page into ``MayoGuitarDetails``.

        Only ``<div id="tresc">`` is parsed. Inside its ``<center>`` element
        the first three tables whose class is exactly ``std`` are read:

        1. header text -> order number, completion date, client;
        2. the ``s_nr_kat`` input -> model;
        3. specification rows -> ``spec[section][key] = [entries...]``.

        Returns:
            The parsed details. Fields that cannot be found keep their
            defaults (empty strings / empty dict); if the ``<center>``
            element or the three tables are missing, all fields are defaults.
        """
        order_number = ""
        completion_date = ""
        client = ""
        model = ""
        spec: dict = {}

        def build() -> MayoGuitarDetails:
            return MayoGuitarDetails(
                order_number=order_number,
                completion_date=completion_date,
                client=client,
                model=model,
                spec=spec,
            )

        only_content = SoupStrainer("div", id="tresc")
        soup = BeautifulSoup(html, "html.parser", parse_only=only_content)

        center_tag = soup.find("center")
        if center_tag is None:
            return build()

        # Exact class match: tables such as class="std2" must not be picked up.
        table_tags = center_tag.find_all(
            lambda tag: tag.name == "table" and tag.get("class") == ["std"]
        )
        if len(table_tags) < 3:
            return build()

        # Order id, completion date and client from the header table.
        header = table_tags[0].get_text(strip=True)
        match = cls._HEADER_RE.search(header)
        if match:
            order_number = match.group("order_id")
            completion_date = match.group("date")
            # NOTE(review): only trailing "-"/" " are trimmed here, as in the
            # original code; confirm whether leading separators can occur.
            client = match.group("client").rstrip("- ")

        # Model from the catalogue-number input.
        model_input = table_tags[1].find("input", attrs={"name": "s_nr_kat"})
        if model_input is not None:
            model_value = model_input.get("value")
            if isinstance(model_value, str):
                model = model_value

        # Specification: section-heading rows followed by key/value rows.
        current_section = None
        for row in table_tags[2].find_all("tr", recursive=False):
            cells = row.find_all("td", recursive=False)
            if not cells:
                continue

            if len(cells) == 1 and cells[0].get("colspan") == "4":
                # A short, all-uppercase full-width cell starts a new section.
                heading = cells[0].get_text(strip=True)
                if heading.isupper() and len(heading) < 40:
                    current_section = unidecode(heading.lower())
                    spec[current_section] = {}
                continue

            if len(cells) != 2:
                continue

            key = unidecode(cells[0].get_text(strip=True).lower().replace(" ", ""))
            value = unidecode(cells[1].get_text(strip=True))
            # Rows seen before any section heading have nowhere to go.
            if not (key and value) or current_section is None:
                continue

            parts = cls._NOTE_SPLIT_RE.split(value)
            spec[current_section][key] = [
                part.strip().lstrip("- ") for part in parts if part.strip()
            ]

        return build()