# duck-prod-manager/backend/mayo/mayo1.py

from bs4 import BeautifulSoup

def clean(text):
    """Return *text* with each whitespace run collapsed to one space.

    Leading and trailing whitespace is dropped as a consequence of
    splitting on whitespace and re-joining the resulting words.
    """
    words = text.split()
    return " ".join(words)

def _parse_order_meta(tresc, meta):
    """Fill *meta* from the <b> tags of the first <table> inside *tresc*."""
    first_table = tresc.find("table")
    if first_table is None:
        return

    b_tags = first_table.find_all("b")
    # Debug output kept from the original implementation so stdout is unchanged.
    # NOTE(review): consider replacing with logging.
    print(first_table.get_text())

    # Guard each index on its own: the original checked for two tags but
    # then read the fourth one, raising IndexError on shorter headers.
    if len(b_tags) >= 2:
        meta["nr_zamownia"] = clean(b_tags[1].get_text())
    if len(b_tags) >= 4:
        meta["realizacja"] = clean(b_tags[3].get_text())

    # The bold <span> is only printed, never stored (same as the original).
    client = first_table.find("span", attrs={"style": "font-weight:bold;"})
    if client is not None:
        print(client.get_text())


def _parse_form_meta(tresc, meta):
    """Fill *meta* from the form fields of the table enclosing the first <form>."""
    form = tresc.find("form")
    if form is None:
        return

    # Search the whole enclosing table (all of its rows), not just the form.
    table = form.find_parent("table")
    if table is None:
        return

    # Plain <input> fields: result key -> value of the input's name attribute.
    for meta_key, input_name in (("Model", "s_nr_kat"), ("Odbiorca", "s_odbiorca")):
        field = table.find("input", {"name": input_name})
        if field is not None:
            meta[meta_key] = clean(field.get("value", ""))

    # The group is the text of the <option> carrying a `selected` attribute.
    group_select = table.find("select", {"name": "s_grupa"})
    if group_select is not None:
        selected = group_select.find("option", selected=True)
        if selected is not None:
            meta["Grupa"] = clean(selected.get_text())


def _parse_sections(tresc):
    """Return ``{section: {key: value}}`` collected from the <tr> rows in *tresc*."""
    sections = {}
    current = None

    for tr in tresc.find_all("tr"):
        tds = tr.find_all("td")
        if not tds:
            continue

        # A single-cell row is a section header when its text is short and
        # upper-case; any other single-cell row carries no key/value pair.
        if len(tds) == 1:
            text = clean(tds[0].get_text())
            if text.isupper() and len(text) < 40:
                current = text
                sections[current] = {}
            continue

        # Rows seen before the first header belong to no section.
        if current is None:
            continue

        key_tag = tds[0].find("b")
        if key_tag is None:
            continue

        # clean() splits on all Unicode whitespace, NBSP included, so no
        # further stripping of the key is needed.
        key = clean(key_tag.get_text())
        if not key:
            continue

        # Join every text fragment of the value cell, skipping "Notatka" ones.
        fragments = [s for s in tds[1].stripped_strings if "Notatka" not in s]
        sections[current][key] = clean(" ".join(fragments))

    return sections


def parse_html(path, encoding="ISO-8859-2"):
    """Parse the HTML file at *path* into a ``{"meta", "sections"}`` dict.

    Only the content of ``<div id="tresc">`` is inspected.

    Args:
        path: Path of the HTML file to read.
        encoding: Text encoding used to open the file. Defaults to
            ``"ISO-8859-2"``, the value the original code hard-coded.

    Returns:
        A dict with two keys:

        * ``"meta"`` - may contain ``"nr_zamownia"``, ``"realizacja"``,
          ``"Model"``, ``"Odbiorca"`` and ``"Grupa"``; each is present only
          when the corresponding element is found.
        * ``"sections"`` - maps each section header to a dict of
          ``key -> value`` strings taken from the rows that follow it.

        If the document has no ``div#tresc``, both dicts are empty.
    """
    with open(path, encoding=encoding) as f:
        soup = BeautifulSoup(f, "html.parser")

    result = {"meta": {}, "sections": {}}

    tresc = soup.find("div", id="tresc")
    if tresc is None:
        # Nothing to parse; the original crashed here with AttributeError.
        return result

    _parse_order_meta(tresc, result["meta"])
    _parse_form_meta(tresc, result["meta"])
    result["sections"] = _parse_sections(tresc)

    return result

import json
import time

# Run the sample conversion only when executed as a script, so that importing
# this module does not read "g.htm" or write "output.json".
if __name__ == "__main__":
    # Time a single parse of the input file.
    start = time.perf_counter()

    data = parse_html("g.htm")

    end = time.perf_counter()

    print(f"Czas wykonania: {end - start:.6f} sekund")

    # Write the parsed structure as indented JSON, keeping non-ASCII
    # characters unescaped.
    with open("output.json", "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)