Scraper with address splitting
parent 12997dbba7
commit 7833819fba
250129_Scrapper.py  187 lines added  (new file)
@@ -0,0 +1,187 @@
import requests
from bs4 import BeautifulSoup
import mysql.connector
import time
import re

# MySQL connection details
DB_HOST = "192.168.178.201"
DB_USER = "gelbeseiten"
DB_PASSWORD = "Gm4bBE62gXCSVVY2"
DB_NAME = "domainchecker"
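
# The script expects two tables that are not part of this commit. A minimal sketch,
# inferred from the queries below; column names and the status flag come from the
# code, the types and keys are assumptions:
#
#   CREATE TABLE staedte_rubriken (
#       id     INT AUTO_INCREMENT PRIMARY KEY,
#       stadt  VARCHAR(255),
#       rubrik VARCHAR(255),
#       status TINYINT DEFAULT 0
#   );
#
#   CREATE TABLE gelbeseiten (
#       `data-realid`   VARCHAR(64),
#       firmenname      VARCHAR(255),
#       domain          VARCHAR(255),
#       mailadresse     VARCHAR(255),
#       adresse         VARCHAR(255),
#       plz             VARCHAR(10),
#       ort             VARCHAR(255),
#       telefonnummer   VARCHAR(64),
#       ansprechpartner VARCHAR(255),
#       branche         VARCHAR(255)
#   );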


def connect_to_database():
    return mysql.connector.connect(
        host=DB_HOST,
        user=DB_USER,
        password=DB_PASSWORD,
        database=DB_NAME
    )


def get_next_city_category(cursor):
    """Fetch the next city/category combination with status = 0."""
    cursor.execute("SELECT id, stadt, rubrik FROM staedte_rubriken WHERE status = 0 LIMIT 1")
    return cursor.fetchone()


def update_status(cursor, conn, entry_id):
    """Set status to 1 for a processed city/category combination."""
    cursor.execute("UPDATE staedte_rubriken SET status = 1 WHERE id = %s", (entry_id,))
    conn.commit()


def insert_into_gelbeseiten(cursor, data):
    sql = """
        INSERT INTO gelbeseiten
        (`data-realid`, `firmenname`, `domain`, `mailadresse`, `adresse`, `plz`, `ort`, `telefonnummer`, `ansprechpartner`, `branche`)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    cursor.execute(sql, data)
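
# insert_into_gelbeseiten expects the values in the same order as the column list
# above; the main loop below builds the tuple like this (the values here are
# illustrative only):
#
#   data = ("abc123", "Musterfirma GmbH", "www.example.de", "info@example.de",
#           "Musterstraße 1", "12345", "Berlin", "030 123456", "Max Mustermann",
#           "Handwerk")
#   insert_into_gelbeseiten(cursor, data)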


def check_if_realid_exists(cursor, realid):
    cursor.execute("SELECT 1 FROM gelbeseiten WHERE `data-realid` = %s", (realid,))
    exists = cursor.fetchone() is not None
    if exists:
        print(f"Real-ID existiert bereits: {realid}")
    return exists


def clean_address(address):
    """Split addresses into the form: street, postal code (PLZ), city."""
    address = address.strip().replace("\n", " ")
    # Expected raw form: "<street>, <5-digit PLZ> <city> <distance>,<distance> km"
    pattern = r"^(.*?),\s+(\d{5})\s+([^0-9]+)\s+\d+,\d+\s+km$"

    match = re.match(pattern, address)
    if match:
        street = match.group(1).strip()
        plz = match.group(2)
        city = match.group(3).replace("\t", "").replace("\n", "").strip()
        return street, plz, city
    else:
        return address, "Unbekannt", "Unbekannt"
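
# Worked example (the input address is illustrative; the format matches a result
# list entry that ends with a distance such as "2,3 km"):
#
#   clean_address("Musterstraße 1, 12345 Berlin 2,3 km")
#   -> ("Musterstraße 1", "12345", "Berlin")
#
#   clean_address("Hauptstraße 5 10115 Berlin")   # no comma, no "km" suffix
#   -> ("Hauptstraße 5 10115 Berlin", "Unbekannt", "Unbekannt")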


def scrape_gelbeseiten(search_term, location, radius_km=50000):
    base_url = "https://www.gelbeseiten.de/suche"
    # The radius value is passed straight through as "umkreis"; the default of
    # 50000 presumably means metres (i.e. 50 km), despite the parameter name.
    search_url = f"{base_url}/{search_term}/{location}?umkreis={radius_km}"
    print(f"Scrape {search_term} in {location}...")

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }

    response = requests.get(search_url, headers=headers)
    if response.status_code != 200:
        print(f"Fehler beim Abrufen der Suchergebnisse für {search_term} in {location}.")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    articles = soup.find_all("article", class_="mod mod-Treffer")

    results = []
    for article in articles:
        try:
            realid = article.get("data-realid")
            if not realid:
                continue

            name_tag = article.find("h2", class_="mod-Treffer__name")
            name = name_tag.text.strip() if name_tag else "Unbekannt"

            address_tag = article.find("div", class_="mod-AdresseKompakt__adress-text")
            raw_address = address_tag.text.strip().replace("\n", " ") if address_tag else "Keine Adresse"

            street, plz, city = clean_address(raw_address)

            phone_tag = article.find("a", class_="mod-TelefonnummerKompakt__phoneNumber")
            phone = phone_tag.text.strip() if phone_tag else "Keine Telefonnummer"

            detail_url = f"https://www.gelbeseiten.de/gsbiz/{realid}"
            detail_data = scrape_detail_page(detail_url, headers)

            results.append({
                "realid": realid,
                "name": name,
                "street": street,
                "plz": plz,
                "city": city,
                "phone": phone,
                "website": detail_data.get("website", "Keine Webseite"),
                "email": detail_data.get("email", "Keine E-Mail"),
                "contact": detail_data.get("contact", "Unbekannt"),
                "branche": detail_data.get("branche", "Keine Branche")
            })

            print(f"Gescrapt: {name}")
            time.sleep(1)  # throttle: one detail page per second
        except Exception as e:
            print(f"Fehler beim Verarbeiten eines Eintrags: {e}")

    return results
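
# The selectors above assume result markup roughly of this shape (a sketch built
# from the class names used in this file, not a verbatim copy of the live site):
#
#   <article class="mod mod-Treffer" data-realid="abc123">
#       <h2 class="mod-Treffer__name">Musterfirma GmbH</h2>
#       <div class="mod-AdresseKompakt__adress-text">Musterstraße 1, 12345 Berlin 2,3 km</div>
#       <a class="mod-TelefonnummerKompakt__phoneNumber">030 123456</a>
#   </article>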


def scrape_detail_page(url, headers):
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Fehler beim Abrufen der Detailseite: {url}")
        return {}

    soup = BeautifulSoup(response.text, "html.parser")

    website_div = soup.find("div", class_="mod-Kontaktdaten__list-item contains-icon-big-homepage")
    website = website_div.find("a")["href"] if website_div and website_div.find("a") else "Keine Webseite"

    email_tag = soup.find("a", href=lambda href: href and "mailto:" in href)
    email = email_tag["href"].replace("mailto:", "") if email_tag else "Keine E-Mail"

    # If present, prefer the address encoded in the data-link attribute of the
    # "email_versenden" element over the plain mailto link.
    email_div = soup.find("div", id="email_versenden")
    if email_div and email_div.get("data-link"):
        email_match = re.search(r"mailto:([^?]+)", email_div["data-link"])
        if email_match:
            email = email_match.group(1)

    contact_tag = soup.find("div", class_="mod-Ansprechpartner__name")
    contact = contact_tag.text.strip() if contact_tag else "Unbekannt"

    branche_tag = soup.find("div", class_="mod-TeilnehmerKopf__branchen")
    branche_span = branche_tag.find("span", {"data-selenium": "teilnehmerkopf__branche"}) if branche_tag else None
    branche = branche_span.text.strip() if branche_span else "Keine Branche"

    return {
        "website": website,
        "email": email,
        "contact": contact,
        "branche": branche
    }


if __name__ == "__main__":
    conn = connect_to_database()
    cursor = conn.cursor()

    try:
        while True:
            city_category = get_next_city_category(cursor)
            if not city_category:
                print("Alle Stadt-Rubrik-Kombinationen wurden verarbeitet.")
                break

            entry_id, city, category = city_category
            results = scrape_gelbeseiten(category, city)

            for result in results:
                if not check_if_realid_exists(cursor, result["realid"]):
                    data = (
                        result["realid"],
                        result["name"],
                        result["website"],
                        result["email"],
                        result["street"],
                        result["plz"],
                        result["city"],
                        result["phone"],
                        result["contact"],
                        result["branche"]
                    )
                    insert_into_gelbeseiten(cursor, data)

            conn.commit()
            update_status(cursor, conn, entry_id)

    except Exception as e:
        print(f"Fehler: {e}")
    finally:
        cursor.close()
        conn.close()