skrypt do scrapowania tresci z lokalnej strony mayo

2026-04-19 10:46:50 +02:00
parent 38904470b2
commit 4707a49e06
2 changed files with 303 additions and 0 deletions
--- a/backend/mayo/mayo1.py
+++ b/backend/mayo/mayo1.py
@@ -0,0 +1,127 @@
+import json
+import logging
+import time
+
+from bs4 import BeautifulSoup
+
+def clean(text):
+    """Return *text* with every whitespace run collapsed to a single space.
+
+    Leading and trailing whitespace is dropped as a side effect of splitting
+    on whitespace and re-joining the pieces.
+    """
+    words = text.split()
+    return " ".join(words)
+
+_log = logging.getLogger(__name__)
+
+
+def _order_meta(tresc):
+    """Return the meta entries read from the first <table> inside *tresc*."""
+    meta = {}
+    first_table = tresc.find("table")
+    if first_table is None:
+        return meta
+
+    _log.debug("first table text: %s", first_table.get_text())
+
+    # The values are taken from the 2nd and 4th <b> tags. Each index is checked
+    # on its own so a table with only two or three <b> tags does not raise
+    # IndexError.
+    b_tags = first_table.find_all("b")
+    if len(b_tags) > 1:
+        # NOTE(review): the key spelling is kept exactly as emitted so far;
+        # confirm with the consumers of the JSON before renaming it.
+        meta["nr_zamownia"] = clean(b_tags[1].get_text())
+    if len(b_tags) > 3:
+        meta["realizacja"] = clean(b_tags[3].get_text())
+
+    # The bold <span> is only reported for diagnostics; it is not part of the result.
+    client = first_table.find("span", attrs={"style": "font-weight:bold;"})
+    if client is not None:
+        _log.debug("client: %s", client.get_text())
+
+    return meta
+
+
+def _form_meta(tresc):
+    """Return the Model / Odbiorca / Grupa values of the search form in *tresc*."""
+    meta = {}
+    form = tresc.find("form")
+    if form is None:
+        return meta
+
+    # The fields are searched in the whole table that encloses the <form>,
+    # not only inside the form element itself.
+    table = form.find_parent("table")
+    if table is None:
+        return meta
+
+    for meta_key, field_name in (("Model", "s_nr_kat"), ("Odbiorca", "s_odbiorca")):
+        field = table.find("input", {"name": field_name})
+        if field is not None:
+            meta[meta_key] = clean(field.get("value", ""))
+
+    grupa_select = table.find("select", {"name": "s_grupa"})
+    if grupa_select is not None:
+        selected = grupa_select.find("option", selected=True)
+        if selected is not None:
+            meta["Grupa"] = clean(selected.get_text())
+
+    return meta
+
+
+def _sections(tresc):
+    """Return ``{section name: {key: value}}`` built from the <tr> rows of *tresc*."""
+    sections = {}
+    current = None
+
+    for tr in tresc.find_all("tr"):
+        tds = tr.find_all("td")
+        if not tds:
+            continue
+
+        if len(tds) == 1:
+            # A single-cell row with short, all-uppercase text opens a new section.
+            text = clean(tds[0].get_text())
+            if text.isupper() and len(text) < 40:
+                current = text
+                sections[current] = {}
+            continue
+
+        # Rows seen before the first section header are ignored.
+        if current is None:
+            continue
+
+        key_tag = tds[0].find("b")
+        if key_tag is None:
+            continue
+
+        # clean() already removes non-breaking spaces, because str.split()
+        # treats U+00A0 as whitespace; no extra replace/strip is needed.
+        key = clean(key_tag.get_text())
+        if not key:
+            continue
+
+        # Collect every text fragment of the value cell except "Notatka" ones.
+        parts = [s for s in tds[1].stripped_strings if "Notatka" not in s]
+        sections[current][key] = clean(" ".join(parts))
+
+    return sections
+
+
+def parse_html(path, encoding="ISO-8859-2"):
+    """Parse a saved HTML page into a ``{"meta": ..., "sections": ...}`` dict.
+
+    Only the content of ``<div id="tresc">`` is inspected.
+
+    Args:
+        path: Path of the HTML file to read.
+        encoding: Text encoding used to open the file.
+
+    Returns:
+        A dict with two keys. ``"meta"`` maps the optional keys
+        ``"nr_zamownia"``, ``"realizacja"``, ``"Model"``, ``"Odbiorca"`` and
+        ``"Grupa"`` to cleaned strings; a key is absent when its element is
+        not found. ``"sections"`` maps each section header to a dict of
+        ``{key: value}`` strings. Both dicts are empty when the page has no
+        ``div#tresc``.
+
+    Raises:
+        OSError: If the file cannot be opened.
+        UnicodeDecodeError: If the file content does not match *encoding*.
+    """
+    result = {"meta": {}, "sections": {}}
+
+    with open(path, encoding=encoding) as f:
+        soup = BeautifulSoup(f, "html.parser")
+
+    tresc = soup.find("div", id="tresc")
+    if tresc is None:
+        # Without the content container there is nothing to extract; return
+        # the empty skeleton instead of failing with AttributeError below.
+        _log.warning("no <div id='tresc'> found in %s", path)
+        return result
+
+    result["meta"].update(_order_meta(tresc))
+    result["meta"].update(_form_meta(tresc))
+    result["sections"] = _sections(tresc)
+    return result
+
+def main(source_path="g.htm", output_path="output.json"):
+    """Parse *source_path*, print the parsing time and dump the result as JSON.
+
+    Args:
+        source_path: HTML file handed to ``parse_html``.
+        output_path: File that receives the UTF-8 encoded JSON result.
+    """
+    start = time.perf_counter()
+    data = parse_html(source_path)
+    end = time.perf_counter()
+
+    print(f"Czas wykonania: {end - start:.6f} sekund")
+
+    # ensure_ascii=False keeps non-ASCII characters readable in the output file.
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(data, f, indent=2, ensure_ascii=False)
+
+
+# Run the scrape only when executed as a script, so importing this module
+# has no side effects.
+if __name__ == "__main__":
+    main()