143 lines
4.8 KiB
Python
143 lines
4.8 KiB
Python
from bs4 import BeautifulSoup
|
|
from bs4.filter import SoupStrainer
|
|
from typing import List
|
|
from unidecode import unidecode
|
|
import re
|
|
from .models import MayoSearchResult, MayoGuitarDetails
|
|
|
|
class MayoParser:
    """Stateless HTML parsers for Mayo order pages.

    All entry points are class/static methods that take raw HTML and return
    plain data objects; nothing is fetched over the network here.
    """

    # Header line of a specification page: order id, completion date and
    # whatever text follows the date (stored as the client).
    _HEADER_RE = re.compile(
        r"zam\.:(?P<order_id>\S+)\s*z datą realizacji:\s*"
        r"(?P<date>\d{4}-\d{2}-\d{2})(?P<client>.*?)$"
    )

    # Spec values may hold several entries separated by the word "notatka".
    _NOTE_SPLIT_RE = re.compile(r"notatka", flags=re.IGNORECASE)

    @staticmethod
    def clean_text(text: str) -> str:
        """Return *text* with every whitespace run collapsed to one space.

        ``str.split()`` without arguments already treats non-breaking spaces
        (``\\xa0``) as whitespace and drops leading/trailing whitespace, so no
        further replacing or stripping is required.

        Args:
            text: Raw text, possibly empty or ``None``.

        Returns:
            The normalised text, or ``""`` for falsy input.
        """
        if not text:
            return ""
        return " ".join(text.split())

    @staticmethod
    def _join_url(base_url: str, href: str) -> str:
        """Resolve *href* against *base_url*.

        Absolute ``http(s)://`` links are returned untouched; anything else is
        appended to *base_url* with exactly one slash contributed by the base
        (a trailing slash on *base_url* no longer produces ``//``).
        """
        if href.startswith(("http://", "https://")):
            return href
        return f"{base_url.rstrip('/')}/{href}"

    @staticmethod
    def _std2_rows(html: str) -> List:
        """Return the ``<tr>`` tags of the first ``<table class="std2">``.

        Rows are taken from the table's ``<tbody>`` when it has one, otherwise
        from the table itself. An empty list is returned when the table is
        missing.
        """
        soup = BeautifulSoup(html, "html.parser")
        table = soup.find("table", class_="std2")
        if table is None:
            return []
        tbody = table.find("tbody")
        container = tbody if tbody is not None else table
        return container.find_all("tr")

    @classmethod
    def parse_search_results(cls, html: str, base_url: str) -> List[MayoSearchResult]:
        """Parse a search-results page into ``MayoSearchResult`` objects.

        Each usable row needs at least three ``<td>`` cells and a link in the
        first one: the link text becomes ``order_id``, its ``href`` (resolved
        against *base_url*) becomes ``url``, the second cell is ``prod_list``
        and the third is ``client``. Rows that do not match are skipped.

        Args:
            html: HTML of the results page.
            base_url: Base used to resolve relative links.

        Returns:
            One result per matching row; empty when the table is missing.
        """
        results = []
        for row in cls._std2_rows(html):
            cells = row.find_all("td")
            if len(cells) < 3:
                continue

            link_tag = cells[0].find("a")
            if link_tag is None:
                continue

            href = link_tag.get("href")
            # A missing or empty href yields an empty URL rather than a
            # dangling "<base>/".
            url = cls._join_url(base_url, href) if isinstance(href, str) and href else ""

            results.append(MayoSearchResult(
                order_id=cls.clean_text(link_tag.get_text()),
                client=cls.clean_text(cells[2].get_text()),
                prod_list=cls.clean_text(cells[1].get_text()),
                url=url,
            ))
        return results

    @classmethod
    def parse_guitar_links(cls, html: str, base_url: str) -> List[str]:
        """Collect the guitar detail links from an order page.

        For every row of the ``std2`` table the first link whose ``href``
        starts with ``http`` or ``index.php`` and contains ``id_zestawu=`` is
        collected.

        Args:
            html: HTML of the order page.
            base_url: Base used to resolve relative links.

        Returns:
            At most one URL per table row, in document order.
        """
        links = []
        for row in cls._std2_rows(html):
            for anchor in row.find_all("a", href=True):
                href = anchor.get("href")
                if not isinstance(href, str):
                    continue
                if not href.startswith(("http", "index.php")):
                    continue
                if "id_zestawu=" in href:
                    # Absolute links must not be prefixed with base_url.
                    links.append(cls._join_url(base_url, href))
                    # Only the first matching link of a row is kept.
                    break
        return links

    @classmethod
    def parse_specification(cls, html: str) -> MayoGuitarDetails:
        """Parse a guitar specification page into ``MayoGuitarDetails``.

        Only ``<div id="tresc">`` is parsed. Inside its ``<center>`` element
        the first three ``<table class="std">`` tags are read as:

        1. header text holding order number, completion date and client,
        2. a form containing the ``s_nr_kat`` input whose value is the model,
        3. the specification rows, grouped under upper-case section headers.

        Args:
            html: HTML of the specification page.

        Returns:
            The parsed details. Every field keeps its empty default when the
            expected structure is not found.
        """
        result = {
            "order_number": "",
            "completion_date": "",
            "client": "",
            "model": "",
            "spec": {},
        }

        only_content = SoupStrainer("div", id="tresc")
        soup = BeautifulSoup(html, "html.parser", parse_only=only_content)
        center_tag = soup.find("center")
        if center_tag is None:
            return MayoGuitarDetails(**result)

        # Exact class match: tables carrying additional classes are ignored.
        table_tags = center_tag.find_all(
            lambda tag: tag.name == "table" and tag.get("class") == ["std"]
        )
        if len(table_tags) < 3:
            return MayoGuitarDetails(**result)
        header_table, model_table, spec_table = table_tags[:3]

        # Order number, completion date, client.
        match = cls._HEADER_RE.search(header_table.get_text(strip=True))
        if match:
            result["order_number"] = match["order_id"]
            result["completion_date"] = match["date"]
            # NOTE(review): only trailing "-"/" " characters are removed;
            # confirm the client text never starts with a separator.
            result["client"] = match["client"].rstrip("- ")

        # Model.
        model_input = model_table.find("input", attrs={"name": "s_nr_kat"})
        if model_input is not None:
            model = model_input.get("value")
            if isinstance(model, str):
                result["model"] = model

        # Specification rows. Use the table's own <tbody> when the markup has
        # one, otherwise the direct-children search below would find no rows.
        spec_body = spec_table.find("tbody", recursive=False)
        row_parent = spec_body if spec_body is not None else spec_table

        spec = result["spec"]
        current_section = None
        for row in row_parent.find_all("tr", recursive=False):
            cells = row.find_all("td", recursive=False)
            if not cells:
                continue

            # A single cell spanning four columns is a candidate section header.
            if len(cells) == 1 and cells[0].get("colspan") == "4":
                text = cells[0].get_text(strip=True)
                if text.isupper() and len(text) < 40:
                    current_section = unidecode(text.lower())
                    # setdefault: a repeated header must not discard the
                    # entries already collected for that section.
                    spec.setdefault(current_section, {})
                continue

            # Key/value rows are only meaningful inside a section.
            if len(cells) != 2 or current_section is None:
                continue

            key = unidecode(cells[0].get_text(strip=True).lower().replace(" ", ""))
            value = unidecode(cells[1].get_text(strip=True))
            if not key or not value:
                continue

            parts = cls._NOTE_SPLIT_RE.split(value)
            spec[current_section][key] = [
                part.strip().lstrip("- ") for part in parts if part.strip()
            ]

        return MayoGuitarDetails(**result)
|