import requests
from bs4 import BeautifulSoup
import mysql.connector
import time
import re

# MySQL connection details
DB_HOST = "192.168.178.201"
DB_USER = "gelbeseiten"
DB_PASSWORD = "Gm4bBE62gXCSVVY2"
DB_NAME = "domainchecker"


def connect_to_database():
    """Open a connection to the MySQL database."""
    return mysql.connector.connect(
        host=DB_HOST,
        user=DB_USER,
        password=DB_PASSWORD,
        database=DB_NAME
    )


def get_next_city_category(cursor):
    """Fetch the next city/category combination with status = 0."""
    cursor.execute("SELECT id, stadt, rubrik FROM staedte_rubriken WHERE status = 0 LIMIT 1")
    return cursor.fetchone()


def update_status(cursor, conn, entry_id):
    """Set status to 1 for a processed city/category combination."""
    cursor.execute("UPDATE staedte_rubriken SET status = 1 WHERE id = %s", (entry_id,))
    conn.commit()


def insert_into_gelbeseiten(cursor, data):
    """Insert one scraped listing into the gelbeseiten table."""
    sql = """
        INSERT INTO gelbeseiten
        (`data-realid`, `firmenname`, `domain`, `mailadresse`, `adresse`, `plz`, `ort`,
         `telefonnummer`, `ansprechpartner`, `branche`)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    cursor.execute(sql, data)


def check_if_realid_exists(cursor, realid):
    """Return True if a listing with this data-realid is already stored."""
    cursor.execute("SELECT 1 FROM gelbeseiten WHERE `data-realid` = %s", (realid,))
    exists = cursor.fetchone() is not None
    if exists:
        print(f"Real-Id already exists: {realid}")
    return exists


def clean_address(address):
    """Split an address into street, postal code (PLZ) and city."""
    address = address.strip().replace("\n", " ")
    # Matches e.g. "Musterstr. 1, 12345 Musterstadt 2,3 km"; the distance suffix is discarded.
    pattern = r"^(.*?),\s+(\d{5})\s+([^0-9]+)\s+\d+,\d+\s+km$"
    match = re.match(pattern, address)
    if match:
        street = match.group(1).strip()
        plz = match.group(2)
        city = match.group(3).replace("\t", "").replace("\n", "").strip()
        return street, plz, city
    else:
        return address, "Unbekannt", "Unbekannt"


def scrape_gelbeseiten(search_term, location, radius_km=50000):
    """Scrape the gelbeseiten.de result list for one search term and location."""
    base_url = "https://www.gelbeseiten.de/suche"
    # The default of 50000 is passed straight to the "umkreis" query parameter.
    search_url = f"{base_url}/{search_term}/{location}?umkreis={radius_km}"
    print(f"Scraping {search_term} in {location}...")
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }
    response = requests.get(search_url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to fetch search results for {search_term} in {location}.")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    articles = soup.find_all("article", class_="mod mod-Treffer")
    results = []

    for article in articles:
        try:
            realid = article.get("data-realid")
            if not realid:
                continue

            name_tag = article.find("h2", class_="mod-Treffer__name")
            name = name_tag.text.strip() if name_tag else "Unbekannt"

            address_tag = article.find("div", class_="mod-AdresseKompakt__adress-text")
            raw_address = address_tag.text.strip().replace("\n", " ") if address_tag else "Keine Adresse"
            street, plz, city = clean_address(raw_address)

            phone_tag = article.find("a", class_="mod-TelefonnummerKompakt__phoneNumber")
            phone = phone_tag.text.strip() if phone_tag else "Keine Telefonnummer"

            detail_url = f"https://www.gelbeseiten.de/gsbiz/{realid}"
            detail_data = scrape_detail_page(detail_url, headers)

            results.append({
                "realid": realid,
                "name": name,
                "street": street,
                "plz": plz,
                "city": city,
                "phone": phone,
                "website": detail_data.get("website", "Keine Webseite"),
                "email": detail_data.get("email", "Keine E-Mail"),
                "contact": detail_data.get("contact", "Unbekannt"),
                "branche": detail_data.get("branche", "Keine Branche")
            })
            print(f"Scraped: {name}")
            time.sleep(1)  # throttle requests between detail pages
        except Exception as e:
            print(f"Error while processing an entry: {e}")

    return results


def scrape_detail_page(url, headers):
    """Scrape website, e-mail, contact person and branch from a detail page."""
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to fetch detail page: {url}")
        return {}

    soup = BeautifulSoup(response.text, "html.parser")

    website_div = soup.find("div", class_="mod-Kontaktdaten__list-item contains-icon-big-homepage")
    website = website_div.find("a")["href"] if website_div and website_div.find("a") else "Keine Webseite"

    email_tag = soup.find("a", href=lambda href: href and "mailto:" in href)
    email = email_tag["href"].replace("mailto:", "") if email_tag else "Keine E-Mail"

    # Prefer the address embedded in the "email_versenden" element's data-link, if present.
    email_div = soup.find("div", id="email_versenden")
    if email_div and email_div.get("data-link"):
        email_match = re.search(r"mailto:([^?]+)", email_div["data-link"])
        if email_match:
            email = email_match.group(1)

    contact_tag = soup.find("div", class_="mod-Ansprechpartner__name")
    contact = contact_tag.text.strip() if contact_tag else "Unbekannt"

    branche_tag = soup.find("div", class_="mod-TeilnehmerKopf__branchen")
    branche_span = branche_tag.find("span", {"data-selenium": "teilnehmerkopf__branche"}) if branche_tag else None
    branche = branche_span.text.strip() if branche_span else "Keine Branche"

    return {
        "website": website,
        "email": email,
        "contact": contact,
        "branche": branche
    }


if __name__ == "__main__":
    conn = connect_to_database()
    cursor = conn.cursor()
    try:
        while True:
            city_category = get_next_city_category(cursor)
            if not city_category:
                print("All city/category combinations have been processed.")
                break

            entry_id, city, category = city_category
            results = scrape_gelbeseiten(category, city)

            for result in results:
                if not check_if_realid_exists(cursor, result["realid"]):
                    # Order matches the column list in insert_into_gelbeseiten:
                    # data-realid, firmenname, domain, mailadresse, adresse, plz,
                    # ort, telefonnummer, ansprechpartner, branche
                    data = (
                        result["realid"],
                        result["name"],
                        result["website"],
                        result["email"],
                        result["street"],
                        result["plz"],
                        result["city"],
                        result["phone"],
                        result["contact"],
                        result["branche"]
                    )
                    insert_into_gelbeseiten(cursor, data)
                    conn.commit()

            update_status(cursor, conn, entry_id)
    except Exception as e:
        print(f"Error: {e}")
    finally:
        cursor.close()
        conn.close()
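

# ---------------------------------------------------------------------------
# Schema sketch (assumption): the script expects both tables to exist already.
# The DDL below is only reconstructed from the SELECT/INSERT statements above;
# the column types, lengths and keys are guesses, not the actual schema.
#
#   CREATE TABLE IF NOT EXISTS staedte_rubriken (
#       id       INT AUTO_INCREMENT PRIMARY KEY,
#       stadt    VARCHAR(255) NOT NULL,
#       rubrik   VARCHAR(255) NOT NULL,
#       status   TINYINT NOT NULL DEFAULT 0
#   );
#
#   CREATE TABLE IF NOT EXISTS gelbeseiten (
#       `data-realid`     VARCHAR(64) PRIMARY KEY,
#       `firmenname`      VARCHAR(255),
#       `domain`          VARCHAR(255),
#       `mailadresse`     VARCHAR(255),
#       `adresse`         VARCHAR(255),
#       `plz`             VARCHAR(10),
#       `ort`             VARCHAR(255),
#       `telefonnummer`   VARCHAR(64),
#       `ansprechpartner` VARCHAR(255),
#       `branche`         VARCHAR(255)
#   );
# ---------------------------------------------------------------------------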