From 7833819fba25e25fa966ccbc040b864ca18c5ea0 Mon Sep 17 00:00:00 2001
From: Christopher Meinhold
Date: Wed, 29 Jan 2025 08:55:08 +0100
Subject: [PATCH] Scraper with address splitting

---
 250129_Scrapper.py | 192 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 192 insertions(+)
 create mode 100644 250129_Scrapper.py

diff --git a/250129_Scrapper.py b/250129_Scrapper.py
new file mode 100644
index 0000000..8314992
--- /dev/null
+++ b/250129_Scrapper.py
@@ -0,0 +1,192 @@
+import requests
+from bs4 import BeautifulSoup
+import mysql.connector
+import time
+import re
+
+# MySQL connection details
+DB_HOST = "192.168.178.201"
+DB_USER = "gelbeseiten"
+DB_PASSWORD = "Gm4bBE62gXCSVVY2"
+DB_NAME = "domainchecker"
+
+def connect_to_database():
+    return mysql.connector.connect(
+        host=DB_HOST,
+        user=DB_USER,
+        password=DB_PASSWORD,
+        database=DB_NAME
+    )
+
+def get_next_city_category(cursor):
+    """Fetch the next city/category combination with status = 0."""
+    cursor.execute("SELECT id, stadt, rubrik FROM staedte_rubriken WHERE status = 0 LIMIT 1")
+    return cursor.fetchone()
+
+def update_status(cursor, conn, entry_id):
+    """Set status to 1 for a processed city/category combination."""
+    cursor.execute("UPDATE staedte_rubriken SET status = 1 WHERE id = %s", (entry_id,))
+    conn.commit()
+
+def insert_into_gelbeseiten(cursor, data):
+    sql = """
+        INSERT INTO gelbeseiten
+        (`data-realid`, `firmenname`, `domain`, `mailadresse`, `adresse`, `plz`, `ort`, `telefonnummer`, `ansprechpartner`, `branche`)
+        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
+    """
+    cursor.execute(sql, data)
+
+def check_if_realid_exists(cursor, realid):
+    cursor.execute("SELECT 1 FROM gelbeseiten WHERE `data-realid` = %s", (realid,))
+    exists = cursor.fetchone() is not None
+    if exists:
+        print(f"Real-ID already exists: {realid}")
+    return exists
+
+def clean_address(address):
+    """Normalise an address into the form: street, PLZ, city."""
+    address = address.strip().replace("\n", " ")
+    # Listings carry a trailing distance suffix such as "1,2 km" which is stripped here
+    pattern = r"^(.*?),\s+(\d{5})\s+([^0-9]+)\s+\d+,\d+\s+km$"
+
+    match = re.match(pattern, address)
+    if match:
+        street = match.group(1).strip()
+        plz = match.group(2)
+        city = match.group(3).replace("\t", "").replace("\n", "").strip()
+        return street, plz, city
+    else:
+        return address, "Unbekannt", "Unbekannt"
+
+def scrape_gelbeseiten(search_term, location, radius_m=50000):
+    # "umkreis" is the search radius in metres (50000 m = 50 km)
+    base_url = "https://www.gelbeseiten.de/suche"
+    search_url = f"{base_url}/{search_term}/{location}?umkreis={radius_m}"
+    print(f"Scraping {search_term} in {location}...")
+
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
+    }
+
+    response = requests.get(search_url, headers=headers, timeout=15)
+    if response.status_code != 200:
+        print(f"Failed to fetch search results for {search_term} in {location}.")
+        return []
+
+    soup = BeautifulSoup(response.text, "html.parser")
+    articles = soup.find_all("article", class_="mod mod-Treffer")
+
+    results = []
+    for article in articles:
+        try:
+            realid = article.get("data-realid")
+            if not realid:
+                continue
+
+            name_tag = article.find("h2", class_="mod-Treffer__name")
+            name = name_tag.text.strip() if name_tag else "Unbekannt"
+
+            address_tag = article.find("div", class_="mod-AdresseKompakt__adress-text")
+            raw_address = address_tag.text.strip().replace("\n", " ") if address_tag else "Keine Adresse"
+
+            street, plz, city = clean_address(raw_address)
+
+            phone_tag = article.find("a", class_="mod-TelefonnummerKompakt__phoneNumber")
+            phone = phone_tag.text.strip() if phone_tag else "Keine Telefonnummer"
+
+            detail_url = f"https://www.gelbeseiten.de/gsbiz/{realid}"
+            detail_data = scrape_detail_page(detail_url, headers)
+
+            results.append({
+                "realid": realid,
+                "name": name,
+                "street": street,
+                "plz": plz,
+                "city": city,
+                "phone": phone,
+                "website": detail_data.get("website", "Keine Webseite"),
+                "email": detail_data.get("email", "Keine E-Mail"),
+                "contact": detail_data.get("contact", "Unbekannt"),
+                "branche": detail_data.get("branche", "Keine Branche")
+            })
+
+            print(f"Scraped: {name}")
+            time.sleep(1)  # be polite: at most one detail request per second
+        except Exception as e:
+            print(f"Error while processing an entry: {e}")
+
+    return results
+
+def scrape_detail_page(url, headers):
+    response = requests.get(url, headers=headers, timeout=15)
+    if response.status_code != 200:
+        print(f"Failed to fetch detail page: {url}")
+        return {}
+
+    soup = BeautifulSoup(response.text, "html.parser")
+
+    website_div = soup.find("div", class_="mod-Kontaktdaten__list-item contains-icon-big-homepage")
+    website = website_div.find("a")["href"] if website_div and website_div.find("a") else "Keine Webseite"
+
+    email_tag = soup.find("a", href=lambda href: href and "mailto:" in href)
+    email = email_tag["href"].replace("mailto:", "") if email_tag else "Keine E-Mail"
+
+    # Fallback: some pages only expose the address via the "send e-mail" widget
+    email_div = soup.find("div", id="email_versenden")
+    if email_div and email_div.get("data-link"):
+        email_match = re.search(r"mailto:([^?]+)", email_div["data-link"])
+        if email_match:
+            email = email_match.group(1)
+
+    contact_tag = soup.find("div", class_="mod-Ansprechpartner__name")
+    contact = contact_tag.text.strip() if contact_tag else "Unbekannt"
+
+    branche_tag = soup.find("div", class_="mod-TeilnehmerKopf__branchen")
+    branche_span = branche_tag.find("span", {"data-selenium": "teilnehmerkopf__branche"}) if branche_tag else None
+    branche = branche_span.text.strip() if branche_span else "Keine Branche"
+
+    return {
+        "website": website,
+        "email": email,
+        "contact": contact,
+        "branche": branche
+    }
+
+if __name__ == "__main__":
+    conn = connect_to_database()
+    cursor = conn.cursor()
+
+    try:
+        while True:
+            city_category = get_next_city_category(cursor)
+            if not city_category:
+                print("All city/category combinations have been processed.")
+                break
+
+            entry_id, city, category = city_category
+            results = scrape_gelbeseiten(category, city)
+
+            for result in results:
+                if not check_if_realid_exists(cursor, result["realid"]):
+                    data = (
+                        result["realid"],
+                        result["name"],
+                        result["website"],
+                        result["email"],
+                        result["street"],
+                        result["plz"],
+                        result["city"],
+                        result["phone"],
+                        result["contact"],
+                        result["branche"]
+                    )
+                    insert_into_gelbeseiten(cursor, data)
+
+            conn.commit()
+            update_status(cursor, conn, entry_id)
+
+    except Exception as e:
+        print(f"Error: {e}")
+    finally:
+        cursor.close()
+        conn.close()
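
The script expects two pre-existing tables in the domainchecker database:
staedte_rubriken (the work queue) and gelbeseiten (the results). The column
names follow from the queries in the patch; the types in this sketch are
assumptions, not the actual schema:

    import mysql.connector

    # Column names come from the script's SQL; every type here is a guess.
    DDL = [
        """CREATE TABLE IF NOT EXISTS staedte_rubriken (
               id INT AUTO_INCREMENT PRIMARY KEY,
               stadt VARCHAR(255) NOT NULL,
               rubrik VARCHAR(255) NOT NULL,
               status TINYINT NOT NULL DEFAULT 0
           )""",
        """CREATE TABLE IF NOT EXISTS gelbeseiten (
               `data-realid` VARCHAR(64) PRIMARY KEY,
               firmenname VARCHAR(255),
               domain VARCHAR(255),
               mailadresse VARCHAR(255),
               adresse VARCHAR(255),
               plz VARCHAR(10),
               ort VARCHAR(255),
               telefonnummer VARCHAR(64),
               ansprechpartner VARCHAR(255),
               branche VARCHAR(255)
           )""",
    ]

    conn = mysql.connector.connect(host="192.168.178.201", user="gelbeseiten",
                                   password="Gm4bBE62gXCSVVY2",
                                   database="domainchecker")
    cursor = conn.cursor()
    for statement in DDL:
        cursor.execute(statement)
    conn.commit()
    cursor.close()
    conn.close()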
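
The regex in clean_address expects listing addresses of the shape
"Street, 12345 City 1,2 km", i.e. including the distance suffix that the
result pages append to each hit. A minimal, self-contained sketch of the
expected behaviour; the sample string is made up for illustration:

    import re

    # Same pattern as clean_address() in 250129_Scrapper.py
    pattern = r"^(.*?),\s+(\d{5})\s+([^0-9]+)\s+\d+,\d+\s+km$"

    sample = "Musterstrasse 12, 10115 Berlin 1,2 km"  # hypothetical listing text
    match = re.match(pattern, sample)
    print(match.groups() if match else "no match")
    # expected: ('Musterstrasse 12', '10115', 'Berlin')

Addresses that do not match this shape fall through to the
("Unbekannt", "Unbekannt") branch, so the raw string is kept as the street.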
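
To smoke-test the scraper for a single search without touching the MySQL
queue, scrape_gelbeseiten() can be called directly. Because the file name
starts with a digit, a plain import statement will not work; importlib loads
it by path. The search term and city below are placeholders:

    import importlib.util

    spec = importlib.util.spec_from_file_location("scrapper", "250129_Scrapper.py")
    scrapper = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(scrapper)  # the __main__ guard keeps the DB loop from running

    # "Friseur" / "Berlin" are placeholder values for Rubrik and Stadt
    for entry in scrapper.scrape_gelbeseiten("Friseur", "Berlin"):
        print(entry["name"], entry["plz"], entry["city"])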