WebscrapperPython/250129_Scrapper.py

import requests
from bs4 import BeautifulSoup
import mysql.connector
import time
import re
# MySQL connection details
DB_HOST = "192.168.178.201"
DB_USER = "gelbeseiten"
DB_PASSWORD = "Gm4bBE62gXCSVVY2"
DB_NAME = "domainchecker"
def connect_to_database():
    return mysql.connector.connect(
        host=DB_HOST,
        user=DB_USER,
        password=DB_PASSWORD,
        database=DB_NAME
    )
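# Assumed table layout (not part of the original source): the queries below use a
# work-queue table `staedte_rubriken` and a result table `gelbeseiten`. A sketch of
# matching DDL, with column types guessed from how the values are used:
#
#   CREATE TABLE staedte_rubriken (
#       id     INT PRIMARY KEY AUTO_INCREMENT,
#       stadt  VARCHAR(255),
#       rubrik VARCHAR(255),
#       status TINYINT DEFAULT 0
#   );
#
#   CREATE TABLE gelbeseiten (
#       `data-realid`   VARCHAR(64) UNIQUE,
#       firmenname      VARCHAR(255),
#       domain          VARCHAR(255),
#       mailadresse     VARCHAR(255),
#       adresse         VARCHAR(255),
#       plz             VARCHAR(10),
#       ort             VARCHAR(255),
#       telefonnummer   VARCHAR(64),
#       ansprechpartner VARCHAR(255),
#       branche         VARCHAR(255)
#   );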
def get_next_city_category(cursor):
    """Fetches the next city/category combination with status = 0."""
    cursor.execute("SELECT id, stadt, rubrik FROM staedte_rubriken WHERE status = 0 LIMIT 1")
    return cursor.fetchone()
def update_status(cursor, conn, entry_id):
    """Sets status to 1 for a processed city/category combination."""
    cursor.execute("UPDATE staedte_rubriken SET status = 1 WHERE id = %s", (entry_id,))
    conn.commit()
def insert_into_gelbeseiten(cursor, data):
    """Inserts one scraped record; data must match the column order below."""
    sql = """
        INSERT INTO gelbeseiten
        (`data-realid`, `firmenname`, `domain`, `mailadresse`, `adresse`, `plz`, `ort`, `telefonnummer`, `ansprechpartner`, `branche`)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    cursor.execute(sql, data)
def check_if_realid_exists(cursor, realid):
    """Returns True if the data-realid is already stored in gelbeseiten."""
    cursor.execute("SELECT 1 FROM gelbeseiten WHERE `data-realid` = %s", (realid,))
    exists = cursor.fetchone() is not None
    if exists:
        print(f"Real-Id existiert bereits: {realid}")
    return exists
def clean_address(address):
    """Normalizes an address into the form: street, PLZ, city."""
    address = address.strip().replace("\n", " ")
    pattern = r"^(.*?),\s+(\d{5})\s+([^0-9]+)\s+\d+,\d+\s+km$"
    match = re.match(pattern, address)
    if match:
        street = match.group(1).strip()
        plz = match.group(2)
        city = match.group(3).replace("\t", "").replace("\n", "").strip()
        return street, plz, city
    else:
        return address, "Unbekannt", "Unbekannt"
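# Example (hypothetical input; the trailing "2,3 km" matches the distance part of
# the pattern above):
#   clean_address("Musterstraße 12, 10115 Berlin 2,3 km")
#   -> ("Musterstraße 12", "10115", "Berlin")
# Inputs that do not match the pattern are returned unchanged, with "Unbekannt" as
# PLZ and city.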
def scrape_gelbeseiten(search_term, location, radius_km=50000):
    """Scrapes the gelbeseiten.de result list for one search term and location."""
    base_url = "https://www.gelbeseiten.de/suche"
    # Note: the default of 50000 for the "umkreis" parameter suggests it is
    # interpreted in metres (i.e. 50 km), despite the parameter name radius_km.
    search_url = f"{base_url}/{search_term}/{location}?umkreis={radius_km}"
    print(f"Scrape {search_term} in {location}...")
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }
    response = requests.get(search_url, headers=headers, timeout=30)
    if response.status_code != 200:
        print(f"Fehler beim Abrufen der Suchergebnisse für {search_term} in {location}.")
        return []
    soup = BeautifulSoup(response.text, "html.parser")
    articles = soup.find_all("article", class_="mod mod-Treffer")
    results = []
    for article in articles:
        try:
            realid = article.get("data-realid")
            if not realid:
                continue
            name_tag = article.find("h2", class_="mod-Treffer__name")
            name = name_tag.text.strip() if name_tag else "Unbekannt"
            address_tag = article.find("div", class_="mod-AdresseKompakt__adress-text")
            raw_address = address_tag.text.strip().replace("\n", " ") if address_tag else "Keine Adresse"
            street, plz, city = clean_address(raw_address)
            phone_tag = article.find("a", class_="mod-TelefonnummerKompakt__phoneNumber")
            phone = phone_tag.text.strip() if phone_tag else "Keine Telefonnummer"
            detail_url = f"https://www.gelbeseiten.de/gsbiz/{realid}"
            detail_data = scrape_detail_page(detail_url, headers)
            results.append({
                "realid": realid,
                "name": name,
                "street": street,
                "plz": plz,
                "city": city,
                "phone": phone,
                "website": detail_data.get("website", "Keine Webseite"),
                "email": detail_data.get("email", "Keine E-Mail"),
                "contact": detail_data.get("contact", "Unbekannt"),
                "branche": detail_data.get("branche", "Keine Branche")
            })
            print(f"Gescrapt: {name}")
            time.sleep(1)  # short pause between detail page requests
        except Exception as e:
            print(f"Fehler beim Verarbeiten eines Eintrags: {e}")
    return results
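# Example call (hypothetical search term and city):
#   results = scrape_gelbeseiten("Friseur", "Berlin")
# Each entry is a dict with the keys realid, name, street, plz, city, phone,
# website, email, contact and branche, as built above.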
def scrape_detail_page(url, headers):
    """Scrapes website, email, contact person and branche from a detail page."""
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 200:
        print(f"Fehler beim Abrufen der Detailseite: {url}")
        return {}
    soup = BeautifulSoup(response.text, "html.parser")
    website_div = soup.find("div", class_="mod-Kontaktdaten__list-item contains-icon-big-homepage")
    website = website_div.find("a")["href"] if website_div and website_div.find("a") else "Keine Webseite"
    email_tag = soup.find("a", href=lambda href: href and "mailto:" in href)
    email = email_tag["href"].replace("mailto:", "") if email_tag else "Keine E-Mail"
    email_div = soup.find("div", id="email_versenden")
    if email_div and email_div.get("data-link"):
        email_match = re.search(r"mailto:([^?]+)", email_div["data-link"])
        if email_match:
            email = email_match.group(1)
    contact_tag = soup.find("div", class_="mod-Ansprechpartner__name")
    contact = contact_tag.text.strip() if contact_tag else "Unbekannt"
    branche_tag = soup.find("div", class_="mod-TeilnehmerKopf__branchen")
    branche_span = branche_tag.find("span", {"data-selenium": "teilnehmerkopf__branche"}) if branche_tag else None
    branche = branche_span.text.strip() if branche_span else "Keine Branche"
    return {
        "website": website,
        "email": email,
        "contact": contact,
        "branche": branche
    }
if __name__ == "__main__":
conn = connect_to_database()
cursor = conn.cursor()
try:
while True:
city_category = get_next_city_category(cursor)
if not city_category:
print("Alle Stadt-Rubrik-Kombinationen wurden verarbeitet.")
break
entry_id, city, category = city_category
results = scrape_gelbeseiten(category, city)
for result in results:
if not check_if_realid_exists(cursor, result["realid"]):
data = (
result["realid"],
result["name"],
result["website"],
result["email"],
result["street"],
result["plz"],
result["city"],
result["phone"],
result["contact"],
result["branche"]
)
insert_into_gelbeseiten(cursor, data)
conn.commit()
update_status(cursor, conn, entry_id)
except Exception as e:
print(f"Fehler: {e}")
finally:
cursor.close()
conn.close()