WebscrapperPython/scrapper_Adresse_v3.py

import os
import re
import time
from urllib.parse import quote

import mysql.connector
import requests
from bs4 import BeautifulSoup

# MySQL connection details.
# Each value can be overridden through an environment variable so that
# credentials do not have to be edited in the source; the literals are only
# fallbacks that keep existing setups working unchanged.
# NOTE(review): the password fallback is a secret hard-coded in this file --
# rotate it and drop the literal once GELBESEITEN_DB_PASSWORD is provisioned.
DB_HOST = os.environ.get("GELBESEITEN_DB_HOST", "192.168.178.201")
DB_USER = os.environ.get("GELBESEITEN_DB_USER", "gelbeseiten")
DB_PASSWORD = os.environ.get("GELBESEITEN_DB_PASSWORD", "Gm4bBE62gXCSVVY2")
DB_NAME = os.environ.get("GELBESEITEN_DB_NAME", "domainchecker")

def connect_to_database():
    """Open a new MySQL connection from the module-level DB_* settings.

    Returns:
        The connection object produced by ``mysql.connector.connect``.
    """
    connection_settings = {
        "host": DB_HOST,
        "user": DB_USER,
        "password": DB_PASSWORD,
        "database": DB_NAME,
    }
    return mysql.connector.connect(**connection_settings)

def get_data_from_table(cursor, table_name, column_name):
    """Return all values of one column, ordered by descending ``id``.

    Args:
        cursor: Database cursor used to run the query.
        table_name: Table to read; interpolated into the statement as a
            backtick-quoted identifier.
        column_name: Column to select; interpolated the same way.

    Returns:
        A list holding the first field of every fetched row.
    """
    query = f"SELECT `{column_name}` FROM `{table_name}` order by id desc"
    cursor.execute(query)

    values = []
    for record in cursor.fetchall():
        values.append(record[0])
    return values

def insert_into_gelbeseiten(cursor, data):
    """Insert one scraped listing into the ``gelbeseiten`` table.

    Args:
        cursor: Database cursor used to run the statement.
        data: Sequence of ten values matching the column list below, in
            order: data-realid, firmenname, domain, mailadresse, adresse,
            plz, ort, telefonnummer, ansprechpartner, branche.
    """
    # Values are bound through placeholders, never formatted into the text.
    insert_statement = """
    INSERT INTO gelbeseiten
    (`data-realid`, `firmenname`, `domain`, `mailadresse`, `adresse`, `plz`, `ort`, `telefonnummer`, `ansprechpartner`,`branche` )
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    cursor.execute(insert_statement, data)

def check_if_realid_exists(cursor, realid):
    """Report whether a listing with this ``data-realid`` is already stored.

    Args:
        cursor: Database cursor used to run the lookup.
        realid: The ``data-realid`` value to look for.

    Returns:
        True if at least one matching row exists, otherwise False.
    """
    # LIMIT 1: only existence matters. Without it, a realid that is stored
    # more than once would leave unread rows behind after fetchone(), and an
    # unbuffered mysql.connector cursor then fails on the next execute()
    # with "Unread result found".
    cursor.execute(
        "SELECT 1 FROM gelbeseiten WHERE `data-realid` = %s LIMIT 1",
        (realid,),
    )
    return cursor.fetchone() is not None

def split_address(address):
    """Split a one-line postal address into street, postal code and city.

    The expected layout is ``"<street> <number>, <postal code> <city>"``,
    for example ``"Musterstraße 1, 12345 Dresden"``. A street part that
    itself contains commas is also accepted as long as the final
    comma-separated segment starts with a five-digit postal code.

    Args:
        address: Raw address text. Surrounding whitespace is stripped and
            newlines are replaced by spaces before splitting.

    Returns:
        A ``(street_and_number, plz, city)`` tuple of strings. If the
        address does not match the expected layout, the normalised address
        is returned as the street and both ``plz`` and ``city`` are
        ``"Unbekannt"``.
    """
    address = address.strip().replace("\n", " ")
    address_parts = re.split(r',\s*', address)

    # Extra commas inside the street part ("Gewerbepark, Halle 3, 01067
    # Dresden") must not discard the postal code and city: when the last
    # segment clearly is "<PLZ> <city>", split on the final comma only.
    if len(address_parts) > 2 and re.match(r'\d{5}\b', address_parts[-1]):
        street_part, _, plz_city_part = address.rpartition(",")
        address_parts = [street_part, plz_city_part]

    if len(address_parts) != 2:
        # Unexpected layout: keep the full text and use placeholder values.
        return address, "Unbekannt", "Unbekannt"

    street_and_number = address_parts[0].strip()
    plz_and_city = address_parts[1].strip()

    # First token is the postal code; everything after it is the city
    # (which may consist of several words, or be empty).
    plz_city_parts = re.split(r'\s+', plz_and_city)
    plz = plz_city_parts[0]
    city = " ".join(plz_city_parts[1:])

    return street_and_number, plz, city

def scrape_gelbeseiten(search_term, location, radius_km=50000, timeout=30):
    """Scrape the gelbeseiten.de result list for one search term and location.

    For every result article that carries a ``data-realid`` attribute, the
    name, address and phone number are read from the list entry and the
    listing's detail page is fetched for website, e-mail, contact person and
    industry. The function pauses one second after each processed entry.

    Args:
        search_term: Category/keyword placed in the URL path.
        location: Place name placed in the URL path.
        radius_km: Value sent verbatim as the ``umkreis`` query parameter.
            NOTE(review): the default of 50000 suggests the site expects
            metres rather than kilometres -- confirm.
        timeout: Seconds to wait for the result-list request before giving
            up, so an unresponsive server cannot stall the run forever.

    Returns:
        A list of dicts with the keys ``realid``, ``name``, ``street``,
        ``plz``, ``city``, ``phone``, ``website``, ``email``, ``contact``
        and ``branche``. An empty list is returned when the result page
        cannot be fetched (network error or non-200 status).
    """
    base_url = "https://www.gelbeseiten.de/suche"
    # Percent-encode both path segments so characters such as "/", "?" or
    # "#" inside a term cannot change the structure of the URL.
    term_segment = quote(str(search_term), safe="")
    location_segment = quote(str(location), safe="")
    search_url = f"{base_url}/{term_segment}/{location_segment}?umkreis={radius_km}"
    print(f"Scrape {search_term} in {location}...")

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }

    try:
        response = requests.get(search_url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # Treat a network failure like a bad status code: skip this search
        # instead of letting the exception abort the caller's whole run.
        print(f"Fehler beim Abrufen der Suchergebnisse für {search_term} in {location}: {e}")
        return []
    if response.status_code != 200:
        print(f"Fehler beim Abrufen der Suchergebnisse für {search_term} in {location}.")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    articles = soup.find_all("article", class_="mod mod-Treffer")

    results = []
    for article in articles:
        # Best effort per entry: a failure in one listing is reported and
        # the remaining listings are still processed.
        try:
            realid = article.get("data-realid")
            if not realid:
                continue

            name_tag = article.find("h2", class_="mod-Treffer__name")
            name = name_tag.text.strip() if name_tag else "Unbekannt"

            address_tag = article.find("div", class_="mod-AdresseKompakt__adress-text")
            address = address_tag.text.strip().replace("\n", " ") if address_tag else "Keine Adresse"

            street, plz, city = split_address(address)

            phone_tag = article.find("a", class_="mod-TelefonnummerKompakt__phoneNumber")
            phone = phone_tag.text.strip() if phone_tag else "Keine Telefonnummer"

            detail_url = f"https://www.gelbeseiten.de/gsbiz/{realid}"
            detail_data = scrape_detail_page(detail_url, headers)

            results.append({
                "realid": realid,
                "name": name,
                "street": street,
                "plz": plz,
                "city": city,
                "phone": phone,
                # Placeholders apply when the detail page yielded nothing.
                "website": detail_data.get("website", "Keine Webseite"),
                "email": detail_data.get("email", "Keine E-Mail"),
                "contact": detail_data.get("contact", "Unbekannt"),
                "branche": detail_data.get("branche", "Keine Branche")
            })

            print(f"Gescrapt: {name}")
            # Throttle between listings.
            time.sleep(1)
        except Exception as e:
            print(f"Fehler beim Verarbeiten eines Eintrags: {e}")

    return results

def scrape_detail_page(url, headers, timeout=30):
    """Scrape a listing's detail page for website, e-mail, contact and industry.

    Args:
        url: Address of the detail page.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the response before giving up.

    Returns:
        A dict with the keys ``website``, ``email``, ``contact`` and
        ``branche``; a field that is not found on the page holds its
        placeholder text. An empty dict is returned when the page cannot be
        fetched (network error or non-200 status).
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # Same outcome as a bad status code, so the caller can still store
        # the data it already has from the result list.
        print(f"Fehler beim Abrufen der Detailseite: {url} ({e})")
        return {}
    if response.status_code != 200:
        print(f"Fehler beim Abrufen der Detailseite: {url}")
        return {}

    soup = BeautifulSoup(response.text, "html.parser")

    # Website: first link inside the homepage contact item. An anchor
    # without (or with an empty) href counts as "no website".
    website_div = soup.find("div", class_="mod-Kontaktdaten__list-item contains-icon-big-homepage")
    website_link = website_div.find("a") if website_div else None
    website = (website_link.get("href") if website_link else None) or "Keine Webseite"

    # Captures the address after "mailto:" up to an optional "?query" part,
    # so parameters such as "?subject=..." never end up in the stored value.
    mailto_pattern = re.compile(r"mailto:([^?]+)")

    # E-mail from a direct <a> tag whose href contains a mailto link.
    email = "Keine E-Mail"
    email_tag = soup.find("a", href=lambda href: href and "mailto:" in href)
    if email_tag:
        email_match = mailto_pattern.search(email_tag["href"])
        if email_match:
            email = email_match.group(1)

    # Alternative source: the "email_versenden" div's data-link attribute;
    # when present it takes precedence over the plain link.
    email_div = soup.find("div", id="email_versenden")
    if email_div and email_div.get("data-link"):
        email_match = mailto_pattern.search(email_div["data-link"])
        if email_match:
            email = email_match.group(1)

    # Contact person.
    contact_tag = soup.find("div", class_="mod-Ansprechpartner__name")
    contact = contact_tag.text.strip() if contact_tag else "Unbekannt"

    # Industry: a dedicated span inside the header's industry container.
    branche_tag = soup.find("div", class_="mod-TeilnehmerKopf__branchen")
    branche_span = branche_tag.find("span", {"data-selenium": "teilnehmerkopf__branche"}) if branche_tag else None
    branche = branche_span.text.strip() if branche_span else "Keine Branche"

    return {
        "website": website,
        "email": email,
        "contact": contact,
        "branche": branche
    }

if __name__ == "__main__":
    conn = connect_to_database()
    cursor = conn.cursor()

    # Result keys in the order expected by the INSERT statement's columns.
    field_order = (
        "realid",
        "name",
        "website",
        "email",
        "street",
        "plz",
        "city",
        "phone",
        "contact",
        "branche",
    )

    try:
        # Load the cities and categories that drive the search combinations.
        staedte = get_data_from_table(cursor, "staedte", "staedte")
        rubriken = get_data_from_table(cursor, "rubriken", "rubriken")

        for stadt in staedte:
            for rubrik in rubriken:
                # Store only listings whose data-realid is not in the table yet.
                for result in scrape_gelbeseiten(rubrik, stadt):
                    if check_if_realid_exists(cursor, result["realid"]):
                        print(f"Die RealID {result['realid']} existiert bereits, überspringe das Einfügen.")
                        continue
                    data = tuple(result[key] for key in field_order)
                    insert_into_gelbeseiten(cursor, data)
                # Commit once per city/category combination.
                conn.commit()
    except Exception as e:
        print(f"Fehler: {e}")
    finally:
        cursor.close()
        conn.close()