skrypt do scrapowania tresci z lokalnej strony mayo
This commit is contained in:
127
backend/mayo/mayo1.py
Normal file
127
backend/mayo/mayo1.py
Normal file
@@ -0,0 +1,127 @@
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
def clean(text):
    """Return *text* with every run of whitespace collapsed to one space.

    Leading and trailing whitespace is dropped as well, because
    ``str.split()`` without arguments discards empty edge fields.
    """
    words = text.split()
    return " ".join(words)
|
||||
|
||||
def parse_html(path):
    """Parse a saved HTML page into a ``{"meta": ..., "sections": ...}`` dict.

    The file at *path* is decoded as ISO-8859-2 and parsed with
    ``html.parser``.  All data is taken from the ``<div id="tresc">``
    element:

    * ``meta`` receives values from the first table inside that div and
      from the form fields ``s_nr_kat``, ``s_odbiorca`` and ``s_grupa``;
    * ``sections`` maps each section title (a short, all-uppercase,
      single-cell table row) to a dict of ``key -> value`` pairs read from
      the rows that follow it.

    Any element that is missing from the page is skipped; if the
    ``tresc`` div itself is absent the empty skeleton is returned.

    Args:
        path: Path of the HTML file to read.

    Returns:
        A dict with the two keys ``"meta"`` and ``"sections"``.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(path, encoding="ISO-8859-2") as f:
        soup = BeautifulSoup(f, "html.parser")

    result = {"meta": {}, "sections": {}}

    tresc = soup.find("div", id="tresc")
    if tresc is None:
        # Without the content div there is nothing to extract; return the
        # empty skeleton, like every other missing element is handled.
        return result

    first_table = tresc.find("table")
    if first_table is not None:
        _parse_order_header(first_table, result["meta"])

    form = tresc.find("form")
    if form is not None:
        _parse_form_fields(form, result["meta"])

    result["sections"] = _parse_sections(tresc)
    return result


# A single-cell row counts as a section title only when its text is
# all-uppercase and shorter than this many characters.
_MAX_SECTION_TITLE_LEN = 40


def _parse_order_header(first_table, meta):
    """Copy the values held in the header table's ``<b>`` tags into *meta*."""
    b_tags = first_table.find_all("b")
    # Diagnostic output kept from the original script.
    print(first_table.get_text())

    # The values are read from fixed <b> positions.  Each index is checked
    # on its own so a table with only two or three <b> tags no longer
    # raises IndexError.
    # NOTE(review): "nr_zamownia" looks like a misspelling of
    # "nr_zamowienia"; the key is left as-is because it is part of the
    # emitted data.
    if len(b_tags) >= 2:
        meta["nr_zamownia"] = clean(b_tags[1].get_text())
    if len(b_tags) >= 4:
        meta["realizacja"] = clean(b_tags[3].get_text())

    # The bold span is only printed; it is not stored in the result.
    client = first_table.find("span", attrs={"style": "font-weight:bold;"})
    if client is not None:
        print(client.get_text())


def _parse_form_fields(form, meta):
    """Copy the Model / Odbiorca / Grupa form values into *meta*."""
    # Search the whole table that encloses the form (all of its rows).
    table = form.find_parent("table")
    if table is None:
        return

    for meta_key, field_name in (("Model", "s_nr_kat"),
                                 ("Odbiorca", "s_odbiorca")):
        field = table.find("input", {"name": field_name})
        if field is not None:
            meta[meta_key] = clean(field.get("value", ""))

    grupa_select = table.find("select", {"name": "s_grupa"})
    if grupa_select is not None:
        selected = grupa_select.find("option", selected=True)
        if selected is not None:
            meta["Grupa"] = clean(selected.get_text())


def _parse_sections(tresc):
    """Return ``{section title: {key: value}}`` built from the table rows."""
    sections = {}
    current_section = None

    for tr in tresc.find_all("tr"):
        tds = tr.find_all("td")
        if not tds:
            continue

        # A single-cell row is either a section title (e.g. "SZYJKA") or
        # something to ignore.
        if len(tds) == 1:
            title = clean(tds[0].get_text())
            if title.isupper() and len(title) < _MAX_SECTION_TITLE_LEN:
                current_section = title
                sections[current_section] = {}
            continue

        # Rows that appear before the first section title are ignored.
        if not current_section:
            continue

        key_tag = tds[0].find("b")
        if key_tag is None:
            continue

        # clean() already removes non-breaking spaces (str.split treats
        # U+00A0 as whitespace), so no extra replace/strip is needed.
        key = clean(key_tag.get_text())
        if not key:
            continue

        # Join every text fragment of the value cell, skipping fragments
        # that contain "Notatka".
        fragments = (s for s in tds[1].stripped_strings if "Notatka" not in s)
        sections[current_section][key] = clean(" ".join(fragments))

    return sections
|
||||
|
||||
import json
import time

# NOTE: these statements are at module level, so they run both when the
# file is executed as a script and when it is imported.

# Time a single parse of the sample page.
start = time.perf_counter()
data = parse_html("g.htm")
end = time.perf_counter()
print(f"Czas wykonania: {end - start:.6f} sekund")

# Serialise first, then write in one call; non-ASCII characters are
# written as-is (ensure_ascii=False) into a UTF-8 file.
payload = json.dumps(data, indent=2, ensure_ascii=False)
with open("output.json", "w", encoding="utf-8") as f:
    f.write(payload)
|
||||
Reference in New Issue
Block a user