Scraper with address splitting
parent 12997dbba7
commit 7833819fba
250129_Scrapper.py  187 lines added  (new file)
@@ -0,0 +1,187 @@
import requests
from bs4 import BeautifulSoup
import mysql.connector
import time
import re

# MySQL connection details
DB_HOST = "192.168.178.201"
DB_USER = "gelbeseiten"
DB_PASSWORD = "Gm4bBE62gXCSVVY2"
DB_NAME = "domainchecker"
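
# The script expects two tables that are not part of this commit. A minimal sketch,
# inferred from the queries below; column names and the status flag come from the
# code, the types and keys are assumptions:
#
#   CREATE TABLE staedte_rubriken (
#       id     INT AUTO_INCREMENT PRIMARY KEY,
#       stadt  VARCHAR(255),
#       rubrik VARCHAR(255),
#       status TINYINT DEFAULT 0
#   );
#
#   CREATE TABLE gelbeseiten (
#       `data-realid`   VARCHAR(64),
#       firmenname      VARCHAR(255),
#       domain          VARCHAR(255),
#       mailadresse     VARCHAR(255),
#       adresse         VARCHAR(255),
#       plz             VARCHAR(10),
#       ort             VARCHAR(255),
#       telefonnummer   VARCHAR(64),
#       ansprechpartner VARCHAR(255),
#       branche         VARCHAR(255)
#   );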


def connect_to_database():
    return mysql.connector.connect(
        host=DB_HOST,
        user=DB_USER,
        password=DB_PASSWORD,
        database=DB_NAME
    )


def get_next_city_category(cursor):
    """Fetch the next city/category combination with status = 0."""
    cursor.execute("SELECT id, stadt, rubrik FROM staedte_rubriken WHERE status = 0 LIMIT 1")
    return cursor.fetchone()


def update_status(cursor, conn, entry_id):
    """Set status to 1 for a processed city/category combination."""
    cursor.execute("UPDATE staedte_rubriken SET status = 1 WHERE id = %s", (entry_id,))
    conn.commit()


def insert_into_gelbeseiten(cursor, data):
    sql = """
        INSERT INTO gelbeseiten
        (`data-realid`, `firmenname`, `domain`, `mailadresse`, `adresse`, `plz`, `ort`, `telefonnummer`, `ansprechpartner`, `branche`)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    cursor.execute(sql, data)
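
# insert_into_gelbeseiten expects the values in the same order as the column list
# above; the main loop below builds the tuple like this (the values here are
# illustrative only):
#
#   data = ("abc123", "Musterfirma GmbH", "www.example.de", "info@example.de",
#           "Musterstraße 1", "12345", "Berlin", "030 123456", "Max Mustermann",
#           "Handwerk")
#   insert_into_gelbeseiten(cursor, data)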


def check_if_realid_exists(cursor, realid):
    cursor.execute("SELECT 1 FROM gelbeseiten WHERE `data-realid` = %s", (realid,))
    exists = cursor.fetchone() is not None
    if exists:
        print(f"Real-ID existiert bereits: {realid}")
    return exists


def clean_address(address):
    """Split addresses into the form: street, postal code (PLZ), city."""
    address = address.strip().replace("\n", " ")
    # Expected raw form: "<street>, <5-digit PLZ> <city> <distance>,<distance> km"
    pattern = r"^(.*?),\s+(\d{5})\s+([^0-9]+)\s+\d+,\d+\s+km$"

    match = re.match(pattern, address)
    if match:
        street = match.group(1).strip()
        plz = match.group(2)
        city = match.group(3).replace("\t", "").replace("\n", "").strip()
        return street, plz, city
    else:
        return address, "Unbekannt", "Unbekannt"
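
# Worked example (the input address is illustrative; the format matches a result
# list entry that ends with a distance such as "2,3 km"):
#
#   clean_address("Musterstraße 1, 12345 Berlin 2,3 km")
#   -> ("Musterstraße 1", "12345", "Berlin")
#
#   clean_address("Hauptstraße 5 10115 Berlin")   # no comma, no "km" suffix
#   -> ("Hauptstraße 5 10115 Berlin", "Unbekannt", "Unbekannt")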


def scrape_gelbeseiten(search_term, location, radius_km=50000):
    base_url = "https://www.gelbeseiten.de/suche"
    # The radius value is passed straight through as "umkreis"; the default of
    # 50000 presumably means metres (i.e. 50 km), despite the parameter name.
    search_url = f"{base_url}/{search_term}/{location}?umkreis={radius_km}"
    print(f"Scrape {search_term} in {location}...")

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }

    response = requests.get(search_url, headers=headers)
    if response.status_code != 200:
        print(f"Fehler beim Abrufen der Suchergebnisse für {search_term} in {location}.")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    articles = soup.find_all("article", class_="mod mod-Treffer")

    results = []
    for article in articles:
        try:
            realid = article.get("data-realid")
            if not realid:
                continue

            name_tag = article.find("h2", class_="mod-Treffer__name")
            name = name_tag.text.strip() if name_tag else "Unbekannt"

            address_tag = article.find("div", class_="mod-AdresseKompakt__adress-text")
            raw_address = address_tag.text.strip().replace("\n", " ") if address_tag else "Keine Adresse"

            street, plz, city = clean_address(raw_address)

            phone_tag = article.find("a", class_="mod-TelefonnummerKompakt__phoneNumber")
            phone = phone_tag.text.strip() if phone_tag else "Keine Telefonnummer"

            detail_url = f"https://www.gelbeseiten.de/gsbiz/{realid}"
            detail_data = scrape_detail_page(detail_url, headers)

            results.append({
                "realid": realid,
                "name": name,
                "street": street,
                "plz": plz,
                "city": city,
                "phone": phone,
                "website": detail_data.get("website", "Keine Webseite"),
                "email": detail_data.get("email", "Keine E-Mail"),
                "contact": detail_data.get("contact", "Unbekannt"),
                "branche": detail_data.get("branche", "Keine Branche")
            })

            print(f"Gescrapt: {name}")
            time.sleep(1)  # throttle: one detail page per second
        except Exception as e:
            print(f"Fehler beim Verarbeiten eines Eintrags: {e}")

    return results
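
# The selectors above assume result markup roughly of this shape (a sketch built
# from the class names used in this file, not a verbatim copy of the live site):
#
#   <article class="mod mod-Treffer" data-realid="abc123">
#       <h2 class="mod-Treffer__name">Musterfirma GmbH</h2>
#       <div class="mod-AdresseKompakt__adress-text">Musterstraße 1, 12345 Berlin 2,3 km</div>
#       <a class="mod-TelefonnummerKompakt__phoneNumber">030 123456</a>
#   </article>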


def scrape_detail_page(url, headers):
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Fehler beim Abrufen der Detailseite: {url}")
        return {}

    soup = BeautifulSoup(response.text, "html.parser")

    website_div = soup.find("div", class_="mod-Kontaktdaten__list-item contains-icon-big-homepage")
    website = website_div.find("a")["href"] if website_div and website_div.find("a") else "Keine Webseite"

    email_tag = soup.find("a", href=lambda href: href and "mailto:" in href)
    email = email_tag["href"].replace("mailto:", "") if email_tag else "Keine E-Mail"

    # If present, prefer the address encoded in the data-link attribute of the
    # "email_versenden" element over the plain mailto link.
    email_div = soup.find("div", id="email_versenden")
    if email_div and email_div.get("data-link"):
        email_match = re.search(r"mailto:([^?]+)", email_div["data-link"])
        if email_match:
            email = email_match.group(1)

    contact_tag = soup.find("div", class_="mod-Ansprechpartner__name")
    contact = contact_tag.text.strip() if contact_tag else "Unbekannt"

    branche_tag = soup.find("div", class_="mod-TeilnehmerKopf__branchen")
    branche_span = branche_tag.find("span", {"data-selenium": "teilnehmerkopf__branche"}) if branche_tag else None
    branche = branche_span.text.strip() if branche_span else "Keine Branche"

    return {
        "website": website,
        "email": email,
        "contact": contact,
        "branche": branche
    }


if __name__ == "__main__":
    conn = connect_to_database()
    cursor = conn.cursor()

    try:
        while True:
            city_category = get_next_city_category(cursor)
            if not city_category:
                print("Alle Stadt-Rubrik-Kombinationen wurden verarbeitet.")
                break

            entry_id, city, category = city_category
            results = scrape_gelbeseiten(category, city)

            for result in results:
                if not check_if_realid_exists(cursor, result["realid"]):
                    data = (
                        result["realid"],
                        result["name"],
                        result["website"],
                        result["email"],
                        result["street"],
                        result["plz"],
                        result["city"],
                        result["phone"],
                        result["contact"],
                        result["branche"]
                    )
                    insert_into_gelbeseiten(cursor, data)

            conn.commit()
            update_status(cursor, conn, entry_id)

    except Exception as e:
        print(f"Fehler: {e}")
    finally:
        cursor.close()
        conn.close()