import re
import time

import requests
import mysql.connector
from bs4 import BeautifulSoup

# MySQL connection details
DB_HOST = "192.168.178.201"
DB_USER = "gelbeseiten"
DB_PASSWORD = "Gm4bBE62gXCSVVY2"
DB_NAME = "domainchecker"
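# Hardcoded credentials are fine for a local test setup, but a safer variant
# reads them from the environment (sketch; the variable name is made up):
#
#   import os
#   DB_PASSWORD = os.environ.get("DOMAINCHECKER_DB_PASSWORD", DB_PASSWORD)
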

def connect_to_database():
    """Open a connection to the MySQL database."""
    return mysql.connector.connect(
        host=DB_HOST,
        user=DB_USER,
        password=DB_PASSWORD,
        database=DB_NAME
    )

def get_data_from_table(cursor, table_name, column_name):
    """Return all values of one column of a table as a flat list."""
    cursor.execute(f"SELECT `{column_name}` FROM `{table_name}`")
    return [row[0] for row in cursor.fetchall()]

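# Illustrative example (table contents are hypothetical): if `staedte.staedte`
# holds one city per row, this returns something like:
#
#   get_data_from_table(cursor, "staedte", "staedte")  # -> ["dresden", "leipzig"]
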

def insert_into_gelbeseiten(cursor, data):
    """Insert one scraped record; `data` must match the column order in the SQL."""
    sql = """
        INSERT INTO gelbeseiten
        (`data-realid`, `firmenname`, `domain`, `mailadresse`, `adresse`, `plz`, `ort`, `telefonnummer`, `ansprechpartner`)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    cursor.execute(sql, data)

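# The positional tuple is easy to get wrong, so here is the expected order with
# made-up example values (note that website maps to `domain`, email to `mailadresse`):
#
#   insert_into_gelbeseiten(cursor, (
#       "abc123",              # data-realid
#       "Musterfirma GmbH",    # firmenname
#       "https://example.de",  # domain (website)
#       "info@example.de",     # mailadresse (email)
#       "Musterstraße 1",      # adresse (street)
#       "12345",               # plz
#       "Dresden",             # ort (city)
#       "0351 123456",         # telefonnummer
#       "Max Mustermann",      # ansprechpartner (contact)
#   ))
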

def check_if_realid_exists(cursor, realid):
    """Return True if a record with this data-realid already exists."""
    cursor.execute("SELECT 1 FROM gelbeseiten WHERE `data-realid` = %s", (realid,))
    return cursor.fetchone() is not None

def split_address(address):
    """Split an address into street + house number, postal code (PLZ), and city."""
    address = address.strip().replace("\n", " ")

    # Assumption: the address has the format "street house-number, PLZ city",
    # e.g. "Musterstraße 1, 12345 Dresden".
    address_parts = re.split(r',\s*', address)

    if len(address_parts) == 2:
        street_and_number = address_parts[0].strip()
        plz_and_city = address_parts[1].strip()

        # Split postal code and city
        plz_city_parts = re.split(r'\s+', plz_and_city)
        plz = plz_city_parts[0]
        city = " ".join(plz_city_parts[1:]) if len(plz_city_parts) > 1 else ""

        return street_and_number, plz, city
    else:
        # If the address does not match the expected format, return placeholders
        return address, "Unbekannt", "Unbekannt"

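# Expected behaviour on two hypothetical inputs:
#
#   split_address("Musterstraße 1, 12345 Dresden")  # -> ("Musterstraße 1", "12345", "Dresden")
#   split_address("Musterstraße 1")                 # -> ("Musterstraße 1", "Unbekannt", "Unbekannt")
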

def scrape_gelbeseiten(search_term, location, radius_m=50000):
    """Scrape the gelbeseiten.de result list for one search term and location."""
    base_url = "https://www.gelbeseiten.de/suche"
    # The `umkreis` value of 50000 corresponds to a 50 km radius, i.e. it appears
    # to be in meters; hence the parameter is named radius_m, not radius_km.
    search_url = f"{base_url}/{search_term}/{location}?umkreis={radius_m}"
    print(f"Scraping {search_term} in {location}...")

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }

    response = requests.get(search_url, headers=headers, timeout=30)
    if response.status_code != 200:
        print(f"Failed to fetch search results for {search_term} in {location}.")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    articles = soup.find_all("article", class_="mod mod-Treffer")

    results = []
    for article in articles:
        try:
            realid = article.get("data-realid")
            if not realid:
                continue

            name_tag = article.find("h2", class_="mod-Treffer__name")
            name = name_tag.text.strip() if name_tag else "Unbekannt"

            address_tag = article.find("div", class_="mod-AdresseKompakt__adress-text")
            address = address_tag.text.strip().replace("\n", " ") if address_tag else "Keine Adresse"

            street, plz, city = split_address(address)

            phone_tag = article.find("a", class_="mod-TelefonnummerKompakt__phoneNumber")
            phone = phone_tag.text.strip() if phone_tag else "Keine Telefonnummer"

            detail_url = f"https://www.gelbeseiten.de/gsbiz/{realid}"
            detail_data = scrape_detail_page(detail_url, headers)

            results.append({
                "realid": realid,
                "name": name,
                "street": street,
                "plz": plz,
                "city": city,
                "phone": phone,
                "website": detail_data.get("website", "Keine Webseite"),
                "email": detail_data.get("email", "Keine E-Mail"),
                "contact": detail_data.get("contact", "Unbekannt"),
            })

            print(f"Scraped: {name}")
            time.sleep(1)  # be polite: pause between detail-page requests
        except Exception as e:
            print(f"Error processing an entry: {e}")

    return results

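# Quick standalone check (the search term and city here are hypothetical):
#
#   for r in scrape_gelbeseiten("friseur", "dresden"):
#       print(r["name"], r["plz"], r["city"], r["website"])
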

def scrape_detail_page(url, headers):
    """Scrape a detail page for website, e-mail, contact person, and branche.

    Note: `branche` is returned but not stored by the main loop below.
    """
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 200:
        print(f"Failed to fetch detail page: {url}")
        return {}

    soup = BeautifulSoup(response.text, "html.parser")

    # Extract the website
    website_div = soup.find("div", class_="mod-Kontaktdaten__list-item contains-icon-big-homepage")
    website = website_div.find("a")["href"] if website_div and website_div.find("a") else "Keine Webseite"

    # Look for an e-mail address in a direct mailto: link
    email_tag = soup.find("a", href=lambda href: href and "mailto:" in href)
    email = email_tag["href"].replace("mailto:", "") if email_tag else "Keine E-Mail"

    # Fall back to the e-mail embedded in the "email_versenden" action button
    email_div = soup.find("div", id="email_versenden")
    if email_div and email_div.get("data-link"):
        email_match = re.search(r"mailto:([^?]+)", email_div["data-link"])
        if email_match:
            email = email_match.group(1)

    # Extract the contact person
    contact_tag = soup.find("div", class_="mod-Ansprechpartner__name")
    contact = contact_tag.text.strip() if contact_tag else "Unbekannt"

    # Extract the business category (Branche)
    branche_tag = soup.find("div", class_="mod-TeilnehmerKopf__branchen")
    branche_span = branche_tag.find("span", {"data-selenium": "teilnehmerkopf__branche"}) if branche_tag else None
    branche = branche_span.text.strip() if branche_span else "Keine Branche"

    return {
        "website": website,
        "email": email,
        "contact": contact,
        "branche": branche
    }

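# Illustrative call with a made-up realid (real ids come from the result list):
#
#   headers = {"User-Agent": "Mozilla/5.0"}
#   detail = scrape_detail_page("https://www.gelbeseiten.de/gsbiz/abc123", headers)
#   print(detail.get("email"), detail.get("branche"))
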

if __name__ == "__main__":
    conn = connect_to_database()
    cursor = conn.cursor()

    try:
        # Fetch the cities and categories to search for
        staedte = get_data_from_table(cursor, "staedte", "staedte")
        rubriken = get_data_from_table(cursor, "rubriken", "rubriken")

        for stadt in staedte:
            for rubrik in rubriken:
                results = scrape_gelbeseiten(rubrik, stadt)

                # Store results only if the data-realid does not exist yet
                for result in results:
                    if not check_if_realid_exists(cursor, result["realid"]):
                        data = (
                            result["realid"],
                            result["name"],
                            result["website"],
                            result["email"],
                            result["street"],
                            result["plz"],
                            result["city"],
                            result["phone"],
                            result["contact"]
                        )
                        insert_into_gelbeseiten(cursor, data)
                    else:
                        print(f"RealID {result['realid']} already exists, skipping insert.")

                # Commit after each city/category batch
                conn.commit()
    except Exception as e:
        print(f"Error: {e}")
    finally:
        cursor.close()
        conn.close()