dodalem backend dzialajacy
This commit is contained in:
142
backend/mayo/parser.py
Normal file
142
backend/mayo/parser.py
Normal file
@@ -0,0 +1,142 @@
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.filter import SoupStrainer
|
||||
from typing import List
|
||||
from unidecode import unidecode
|
||||
import re
|
||||
from .models import MayoSearchResult, MayoGuitarDetails
|
||||
|
||||
class MayoParser:
    """Stateless HTML parsers for the Mayo order pages.

    All entry points are static/class methods that take raw HTML and
    return plain model objects; no instance state is used.
    """

    # Header line of a specification page: order id, completion date and
    # whatever text follows the date (captured as the client).
    _HEADER_RE = re.compile(
        r"zam\.:(?P<order_id>\S+)\s*z datą realizacji:\s*(?P<date>\d{4}-\d{2}-\d{2})(?P<client>.*?)$"
    )
    # A spec value is split into separate entries on this word
    # (matched case-insensitively).
    _NOTE_SPLIT_RE = re.compile(r"notatka", flags=re.IGNORECASE)

    @staticmethod
    def clean_text(text: str) -> str:
        """Return *text* with every whitespace run collapsed to one space.

        Falsy input (``None``, ``""``) yields ``""``.
        """
        if not text:
            return ""
        # str.split() without a separator already treats "\xa0" as
        # whitespace and drops leading/trailing blanks, so no further
        # replace()/strip() is required.
        return " ".join(text.split())

    @staticmethod
    def _absolute_url(base_url: str, href: str) -> str:
        """Turn *href* into a full URL relative to *base_url*.

        Absolute ``http://`` / ``https://`` links are returned untouched;
        anything else is appended to *base_url* with exactly one slash
        between them, regardless of a trailing slash on *base_url*.
        """
        if href.startswith(("http://", "https://")):
            return href
        return f"{base_url.rstrip('/')}/{href}"

    @staticmethod
    def _table_rows(soup: BeautifulSoup) -> list:
        """Return the ``<tr>`` elements of the first ``table.std2`` in *soup*.

        Rows are taken from the table's ``<tbody>`` when one exists,
        otherwise from the table itself. An empty list is returned when
        the table is missing.
        """
        table = soup.find("table", class_="std2")
        if not table:
            return []
        tbody = table.find("tbody")
        container = tbody if tbody else table
        return container.find_all("tr")

    @classmethod
    def parse_search_results(cls, html: str, base_url: str) -> List[MayoSearchResult]:
        """Extract one ``MayoSearchResult`` per usable row of the results table.

        A row is used when it has at least three ``<td>`` cells and the
        first cell contains a link. The link text becomes ``order_id``,
        the second cell ``prod_list`` and the third cell ``client``.

        Args:
            html: Markup of the search results page.
            base_url: Prefix used to resolve relative links.

        Returns:
            The parsed results in document order; empty when the
            ``table.std2`` element is absent.
        """
        soup = BeautifulSoup(html, "html.parser")
        results = []

        for row in cls._table_rows(soup):
            tds = row.find_all("td")
            if len(tds) < 3:
                continue

            link_tag = tds[0].find("a")
            if not link_tag:
                continue

            href = link_tag.get("href")
            # A missing or empty href yields an empty url rather than a
            # dangling "<base_url>/".
            full_url = (
                cls._absolute_url(base_url, href)
                if isinstance(href, str) and href
                else ""
            )

            results.append(MayoSearchResult(
                order_id=cls.clean_text(link_tag.get_text()),
                client=cls.clean_text(tds[2].get_text()),
                prod_list=cls.clean_text(tds[1].get_text()),
                url=full_url,
            ))

        return results

    @classmethod
    def parse_guitar_links(cls, html: str, base_url: str) -> List[str]:
        """Collect at most one ``id_zestawu=`` link per row of ``table.std2``.

        Only hrefs that start with ``http`` or ``index.php`` are
        considered. Absolute links are returned as they are; relative
        ones are resolved against *base_url*.

        Args:
            html: Markup of the page listing the guitars of an order.
            base_url: Prefix used to resolve relative links.

        Returns:
            Full URLs in document order; empty when the table is absent.
        """
        soup = BeautifulSoup(html, "html.parser")
        links = []

        for row in cls._table_rows(soup):
            for a in row.find_all("a", href=True):
                href = a.get("href")
                if not isinstance(href, str):
                    continue
                if not href.startswith(("http", "index.php")):
                    continue
                if "id_zestawu=" in href:
                    # Previously absolute links were also prefixed with
                    # base_url, producing "<base>/http://...".
                    links.append(cls._absolute_url(base_url, href))
                    # Only the first matching link of a row is kept.
                    break

        return links

    @classmethod
    def _parse_spec_table(cls, table) -> dict:
        """Convert the specification table into ``{section: {key: [entries]}}``.

        A row with a single ``colspan="4"`` cell whose text is upper-case
        and shorter than 40 characters opens a new section. Rows with
        exactly two cells are key/value pairs stored under the current
        section; pairs appearing before any section header are dropped.
        """
        spec = {}
        current_section = None

        for row in table.find_all("tr", recursive=False):
            cells = row.find_all("td", recursive=False)
            if not cells:
                continue

            if len(cells) == 1 and cells[0].get("colspan") == "4":
                text = cells[0].get_text(strip=True)
                if text.isupper() and len(text) < 40:
                    current_section = unidecode(text.lower())
                    spec[current_section] = {}
                    continue

            if len(cells) != 2:
                continue

            key = unidecode(cells[0].get_text(strip=True).lower().replace(" ", ""))
            value = unidecode(cells[1].get_text(strip=True))
            if not key or not value or current_section is None:
                continue

            parts = cls._NOTE_SPLIT_RE.split(value)
            spec[current_section][key] = [
                p.strip().lstrip("- ") for p in parts if p.strip()
            ]

        return spec

    @classmethod
    def parse_specification(cls, html: str) -> MayoGuitarDetails:
        """Parse a guitar specification page into ``MayoGuitarDetails``.

        Only ``<div id="tresc">`` is parsed. Inside its ``<center>``
        element the first three tables whose class is exactly ``std``
        are read, in order, as: header (order number, completion date,
        client), model (value of the ``s_nr_kat`` input) and the
        specification itself.

        Args:
            html: Markup of the specification page.

        Returns:
            The parsed details. When the ``<center>`` element or any of
            the three tables is missing, every text field is ``""`` and
            ``spec`` is empty.
        """
        result = {
            "order_number": "",
            "completion_date": "",
            "client": "",
            "model": "",
            "spec": {},
        }

        only_content = SoupStrainer("div", id="tresc")
        soup = BeautifulSoup(html, "html.parser", parse_only=only_content)
        center_tag = soup.find("center")
        if center_tag is None:
            return MayoGuitarDetails(**result)

        # Exact class match: tables carrying extra classes are ignored.
        table_tags = center_tag.find_all(
            lambda tag: tag.name == "table" and tag.get("class") == ["std"]
        )
        if len(table_tags) < 3:
            return MayoGuitarDetails(**result)

        # order_id, date, client
        header = table_tags[0].get_text(strip=True)
        match = cls._HEADER_RE.search(header)
        if match:
            data = match.groupdict()
            result["order_number"] = data["order_id"]
            result["completion_date"] = data["date"]
            # NOTE(review): only trailing "-"/" " are removed here; confirm
            # against a real page whether a leading " - " can occur too.
            result["client"] = data["client"].rstrip("- ")

        # Model
        model_input = table_tags[1].find("input", attrs={"name": "s_nr_kat"})
        if model_input is not None:
            model = model_input.get("value")
            if isinstance(model, str):
                result["model"] = model

        # spec
        result["spec"] = cls._parse_spec_table(table_tags[2])

        return MayoGuitarDetails(**result)
|
||||
Reference in New Issue
Block a user