# duck-prod-manager/backend/mayo/parser.py

from bs4 import BeautifulSoup
from bs4.filter import SoupStrainer
from typing import List
from unidecode import unidecode
import re
from .models import MayoSearchResult, MayoGuitarDetails

class MayoParser:
    """Stateless helpers that extract structured data from Mayo HTML pages.

    Every public method takes raw HTML text and returns model objects or
    plain lists. No network or file access happens here.
    """

    # Header text of the specification page: order id, completion date and
    # whatever follows the date (stored as the client).
    _HEADER_RE = re.compile(
        r"zam\.:(?P<order_id>\S+)\s*z datą realizacji:\s*"
        r"(?P<date>\d{4}-\d{2}-\d{2})(?P<client>.*?)$"
    )
    # Specification values are split into entries on the word "notatka",
    # matched case-insensitively.
    _NOTE_SPLIT_RE = re.compile(r"notatka", flags=re.IGNORECASE)

    @staticmethod
    def clean_text(text: str) -> str:
        """Collapse every whitespace run in *text* into a single space.

        Leading and trailing whitespace is dropped. Non-breaking spaces
        (``\\xa0``) count as whitespace for ``str.split()``, so they are
        normalised together with regular spaces, tabs and newlines.

        Returns:
            The normalised text, or ``""`` when *text* is empty or ``None``.
        """
        if not text:
            return ""
        return " ".join(text.split())

    @staticmethod
    def _build_url(base_url: str, href: str) -> str:
        """Return an absolute URL for *href* found on a page under *base_url*.

        Absolute ``http://`` / ``https://`` links are returned untouched;
        anything else is appended to *base_url* with exactly one slash
        between the two parts.
        """
        if href.startswith(("http://", "https://")):
            return href
        return f"{base_url.rstrip('/')}/{href.lstrip('/')}"

    @staticmethod
    def _result_rows(soup) -> list:
        """Return the ``<tr>`` rows of the first ``table.std2`` in *soup*.

        Rows are taken from the table's ``<tbody>`` when one is present and
        from the table itself otherwise. An empty list is returned when the
        table is missing.
        """
        table = soup.find("table", class_="std2")
        if table is None:
            return []
        tbody = table.find("tbody")
        container = tbody if tbody is not None else table
        return container.find_all("tr")

    @classmethod
    def parse_search_results(cls, html: str, base_url: str) -> List[MayoSearchResult]:
        """Parse the search-results table into ``MayoSearchResult`` objects.

        A row is used only when it has at least three cells and a link in
        the first one. The link text becomes the order id, the second cell
        the production list and the third cell the client.

        Args:
            html: Raw HTML of the search-results page.
            base_url: URL prefix used to resolve relative links.

        Returns:
            One result per usable row, in document order. ``url`` is ``""``
            when the link has no usable ``href``.
        """
        soup = BeautifulSoup(html, "html.parser")
        results = []

        for row in cls._result_rows(soup):
            tds = row.find_all("td")
            if len(tds) < 3:
                continue

            link_tag = tds[0].find("a")
            if not link_tag:
                continue

            href = link_tag.get("href")
            has_href = isinstance(href, str) and bool(href)

            results.append(MayoSearchResult(
                order_id=cls.clean_text(link_tag.get_text()),
                client=cls.clean_text(tds[2].get_text()),
                prod_list=cls.clean_text(tds[1].get_text()),
                url=cls._build_url(base_url, href) if has_href else "",
            ))

        return results

    @classmethod
    def parse_guitar_links(cls, html: str, base_url: str) -> List[str]:
        """Collect at most one ``id_zestawu=`` link from every table row.

        Within a row, the first anchor whose ``href`` starts with ``http``
        or ``index.php`` and contains ``id_zestawu=`` is taken; the rest of
        the row is ignored.

        Args:
            html: Raw HTML of the page holding the ``std2`` table.
            base_url: URL prefix used to resolve relative links.

        Returns:
            Absolute URLs in document order.
        """
        soup = BeautifulSoup(html, "html.parser")
        links = []

        for row in cls._result_rows(soup):
            for anchor in row.find_all("a", href=True):
                href = anchor.get("href")
                if not isinstance(href, str):
                    continue
                if not href.startswith(("http", "index.php")):
                    continue
                if "id_zestawu=" in href:
                    # Absolute links must not be prefixed with base_url.
                    links.append(cls._build_url(base_url, href))
                    break

        return links

    @classmethod
    def _parse_spec_table(cls, table) -> dict:
        """Turn the specification table into ``{section: {key: [entries]}}``.

        A row with a single ``colspan="4"`` cell whose text is upper-case
        and shorter than 40 characters opens a new section. Rows with
        exactly two cells are key/value pairs of the current section; rows
        seen before the first section are dropped. Section names and keys
        are lower-cased and transliterated with ``unidecode``; keys also
        lose their spaces.
        """
        rows = table.find_all("tr", recursive=False)
        if not rows:
            # Same markup wrapped in an explicit <tbody>.
            tbody = table.find("tbody", recursive=False)
            if tbody is not None:
                rows = tbody.find_all("tr", recursive=False)

        spec = {}
        current_section = None

        for row in rows:
            cells = row.find_all("td", recursive=False)

            if len(cells) == 1:
                if cells[0].get("colspan") == "4":
                    title = cells[0].get_text(strip=True)
                    if title.isupper() and len(title) < 40:
                        current_section = unidecode(title.lower())
                        spec[current_section] = {}
                continue

            if len(cells) != 2 or current_section is None:
                continue

            key = unidecode(cells[0].get_text(strip=True).lower().replace(" ", ""))
            value = unidecode(cells[1].get_text(strip=True))
            if not key or not value:
                continue

            spec[current_section][key] = [
                part.strip().lstrip("- ")
                for part in cls._NOTE_SPLIT_RE.split(value)
                if part.strip()
            ]

        return spec

    @classmethod
    def parse_specification(cls, html: str) -> MayoGuitarDetails:
        """Parse a specification page into a ``MayoGuitarDetails`` object.

        Only ``<div id="tresc">`` is parsed. Inside its ``<center>`` element
        the first three tables whose class is exactly ``std`` are read as:
        the header (order number, completion date, client), the model (value
        of the ``s_nr_kat`` input) and the specification rows.

        Args:
            html: Raw HTML of the specification page.

        Returns:
            The parsed details. Fields that cannot be found keep their
            empty defaults; if the expected structure is missing entirely,
            every field is empty.
        """
        result = {
            "order_number": "",
            "completion_date": "",
            "client": "",
            "model": "",
            "spec": {},
        }

        only_content = SoupStrainer("div", id="tresc")
        soup = BeautifulSoup(html, "html.parser", parse_only=only_content)
        center_tag = soup.find("center")
        if center_tag is None:
            return MayoGuitarDetails(**result)

        # Match tables whose class list is exactly ["std"], not merely
        # tables that include "std" among several classes.
        tables = center_tag.find_all(
            lambda tag: tag.name == "table" and tag.get("class") == ["std"]
        )
        if len(tables) < 3:
            return MayoGuitarDetails(**result)

        header_table, model_table, spec_table = tables[:3]

        # Order number, completion date, client.
        match = cls._HEADER_RE.search(header_table.get_text(strip=True))
        if match:
            result["order_number"] = match.group("order_id")
            result["completion_date"] = match.group("date")
            result["client"] = match.group("client").rstrip("- ")

        # Model.
        model_input = model_table.find("input", attrs={"name": "s_nr_kat"})
        if model_input is not None:
            model = model_input.get("value")
            if isinstance(model, str):
                result["model"] = model

        # Specification sections.
        result["spec"] = cls._parse_spec_table(spec_table)

        return MayoGuitarDetails(**result)