"""Extract order metadata and upper-case-titled sections from an HTML page.

Run as a script, the module parses ``g.htm`` from the current directory,
prints the elapsed parsing time and writes the result to ``output.json``.
"""

import json
import time

from bs4 import BeautifulSoup

# A single-cell row is treated as a section title only when its text is
# shorter than this many characters.
_MAX_SECTION_TITLE_LEN = 40

# (result key, index among the first table's <b> tags) for header values.
# NOTE(review): presumably the order number and its realisation field --
# confirm against a sample page.
_HEADER_BOLD_FIELDS = (("nr_zamownia", 1), ("realizacja", 3))

# (result key, <input name=...>) pairs read from the table around the form.
_FORM_INPUT_FIELDS = (("Model", "s_nr_kat"), ("Odbiorca", "s_odbiorca"))


def clean(text: str) -> str:
    """Return *text* with every run of whitespace collapsed to one space.

    Leading and trailing whitespace is removed.  ``str.split()`` without
    arguments also splits on non-breaking spaces (``\\xa0``).
    """
    return " ".join(text.split())


def _parse_meta(container) -> dict:
    """Collect header values and form fields found inside *container*."""
    meta = {}

    first_table = container.find("table")
    if first_table is not None:
        bold_tags = first_table.find_all("b")
        # NOTE(review): looks like leftover debug output; kept so the
        # script's stdout stays unchanged.
        print(first_table.get_text())
        # Check every index on its own: the original guarded index 3 with
        # ``len(...) >= 2`` and raised IndexError on tables with 2-3 tags.
        for key, index in _HEADER_BOLD_FIELDS:
            if len(bold_tags) > index:
                meta[key] = clean(bold_tags[index].get_text())

        client = first_table.find("span", attrs={"style": "font-weight:bold;"})
        if client is not None:
            # Printed only; the value is not stored in the result.
            print(client.get_text())

    form = container.find("form")
    table = form.find_parent("table") if form is not None else None
    if table is None:
        return meta

    # Search the whole table that encloses the form (all of its rows).
    for key, input_name in _FORM_INPUT_FIELDS:
        field = table.find("input", {"name": input_name})
        if field is not None:
            meta[key] = clean(field.get("value", ""))

    group_select = table.find("select", {"name": "s_grupa"})
    if group_select is not None:
        selected = group_select.find("option", selected=True)
        if selected is not None:
            meta["Grupa"] = clean(selected.get_text())

    return meta


def _parse_sections(container) -> dict:
    """Group the key/value rows of *container* under their section titles.

    A row with exactly one cell whose text is upper-case and shorter than
    ``_MAX_SECTION_TITLE_LEN`` characters opens a section; a repeated title
    starts that section over with an empty mapping.  Rows with two or more
    cells add an entry to the open section: the key is the text of the
    first ``<b>`` in the first cell, the value is the text of the second
    cell without any string containing ``"Notatka"``.  Rows seen before the
    first title are ignored.
    """
    sections = {}
    current = None

    for row in container.find_all("tr"):
        cells = row.find_all("td")
        if not cells:
            continue

        if len(cells) == 1:
            title = clean(cells[0].get_text())
            if title.isupper() and len(title) < _MAX_SECTION_TITLE_LEN:
                current = title
                sections[current] = {}
            continue

        if current is None:
            continue

        label = cells[0].find("b")
        if label is None:
            continue

        # clean() already drops non-breaking spaces, so no extra stripping
        # of "\xa0" is needed here.
        key = clean(label.get_text())
        if not key:
            continue

        parts = [s for s in cells[1].stripped_strings if "Notatka" not in s]
        sections[current][key] = clean(" ".join(parts))

    return sections


def parse_html(path, encoding: str = "ISO-8859-2") -> dict:
    """Parse the HTML file at *path* into a ``meta`` / ``sections`` mapping.

    Args:
        path: Location of the HTML file to read.
        encoding: Text encoding used to open the file.

    Returns:
        ``{"meta": {...}, "sections": {...}}``.  ``meta`` may hold the keys
        ``nr_zamownia``, ``realizacja``, ``Model``, ``Odbiorca`` and
        ``Grupa``, each present only when the matching element exists.
        ``sections`` maps a section title to a dict of its key/value rows.
        Both are empty when the page has no ``<div id="tresc">``.
    """
    with open(path, encoding=encoding) as f:
        soup = BeautifulSoup(f, "html.parser")

    result = {"meta": {}, "sections": {}}

    content = soup.find("div", id="tresc")
    if content is None:
        # Without the content container there is nothing to extract;
        # return the empty skeleton instead of failing on None.
        return result

    result["meta"] = _parse_meta(content)
    result["sections"] = _parse_sections(content)
    return result


def main() -> None:
    """Parse ``g.htm``, report the elapsed time and write ``output.json``."""
    start = time.perf_counter()
    data = parse_html("g.htm")
    end = time.perf_counter()

    print(f"Czas wykonania: {end - start:.6f} sekund")

    with open("output.json", "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)


if __name__ == "__main__":
    main()