79 lines
2.7 KiB
Python
79 lines
2.7 KiB
Python
import os
from urllib.parse import urljoin

import mysql.connector
import requests
from bs4 import BeautifulSoup
|
|
|
|
# MySQL connection details.
# Each value can be overridden through an environment variable so that
# credentials do not have to live in the source file; the literals below are
# only the fallback defaults used when the variable is not set.
# NOTE(review): the default password is still checked in here for backward
# compatibility - rotate it and drop the default once the environment
# variables are configured on every deployment.
DB_HOST = os.environ.get("GELBESEITEN_DB_HOST", "192.168.178.201")
DB_USER = os.environ.get("GELBESEITEN_DB_USER", "gelbeseiten")
DB_PASSWORD = os.environ.get("GELBESEITEN_DB_PASSWORD", "Gm4bBE62gXCSVVY2")
DB_NAME = os.environ.get("GELBESEITEN_DB_NAME", "domainchecker")
|
|
|
|
def connect_to_database():
    """Open a new MySQL connection using the module-level DB_* settings.

    Returns:
        The connection object produced by ``mysql.connector.connect``.
    """
    connection_settings = {
        "host": DB_HOST,
        "user": DB_USER,
        "password": DB_PASSWORD,
        "database": DB_NAME,
    }
    return mysql.connector.connect(**connection_settings)
|
|
|
|
def get_valid_data_realids(cursor):
    """Fetch the rows that still need a branche and have a usable data-realid.

    Args:
        cursor: An open DB-API cursor on the database that holds the
            `gelbeseiten` table.

    Returns:
        The result of ``cursor.fetchall()``: one ``(id, data-realid)`` row for
        every entry whose `branche` is NULL or empty and whose `data-realid`
        is neither NULL nor empty.
    """
    # Rows without a data-realid cannot be turned into a detail-page URL, so
    # they are filtered out here instead of being scraped as ".../gsbiz/None".
    cursor.execute(
        "SELECT `id`, `data-realid` FROM `gelbeseiten` "
        "WHERE (`branche` IS NULL OR `branche` = '') "
        "AND `data-realid` IS NOT NULL AND `data-realid` <> ''"
    )
    return cursor.fetchall()
|
|
|
|
def update_branche_in_database(cursor, record_id, branche):
    """Write ``branche`` into the `gelbeseiten` row whose `id` is ``record_id``.

    The statement is only executed on ``cursor``; nothing is committed here.
    """
    statement = "UPDATE `gelbeseiten` SET `branche` = %s WHERE `id` = %s"
    parameters = (branche, record_id)
    cursor.execute(statement, parameters)
|
|
|
|
def scrape_branche_by_realid(data_realid):
    """Extract the branche (industry) text for a data-realid.

    Requests ``https://www.gelbeseiten.de/gsbiz/<data_realid>`` and reads the
    text of the ``<span data-selenium="teilnehmerkopf__branche">`` element.

    Args:
        data_realid: Identifier appended to the ``/gsbiz/`` URL.

    Returns:
        The stripped branche text, or ``None`` when ``data_realid`` is missing
        or blank, the response status is not 200, the element is absent or
        empty, or any exception is raised while fetching or parsing.
    """
    # Without an identifier the URL would be ".../gsbiz/None" or ".../gsbiz/",
    # so skip the HTTP request entirely.
    if data_realid is None or not str(data_realid).strip():
        print("Keine data-realid angegeben, Abruf wird übersprungen.")
        return None

    base_url = f"https://www.gelbeseiten.de/gsbiz/{data_realid}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }
    try:
        response = requests.get(base_url, headers=headers, timeout=10)
        if response.status_code != 200:
            print(f"Fehler beim Abrufen von {base_url}: {response.status_code}")
            return None

        soup = BeautifulSoup(response.text, "html.parser")
        # Look up the branche element.
        branche_tag = soup.find("span", {"data-selenium": "teilnehmerkopf__branche"})
        # An element that exists but contains only whitespace counts as
        # "not found" so callers never receive an empty string.
        branche = branche_tag.text.strip() if branche_tag else ""
        if branche:
            return branche

        print(f"Keine Branche auf {base_url} gefunden.")
        return None
    except Exception as e:
        # Deliberately broad: one failing page must not abort the caller's
        # loop over all records; the error is reported and None returned.
        print(f"Fehler beim Verarbeiten von {base_url}: {e}")
        return None
|
|
|
|
# Script entry point: scrape the branche for every pending record and store it.
if __name__ == "__main__":
    conn = connect_to_database()
    cursor = conn.cursor()

    try:
        # Fetch the (id, data-realid) rows that still lack a branche.
        records = get_valid_data_realids(cursor)

        for record_id, data_realid in records:
            print(f"Scrape Branche für data-realid: {data_realid}")
            branche = scrape_branche_by_realid(data_realid)

            if not branche:
                print(f"Keine Branche für data-realid {data_realid} gefunden.")
                continue

            # Store the branche and commit right away: a later error or an
            # interrupted run then keeps everything saved so far, and a re-run
            # only picks up the rows whose branche is still empty.
            update_branche_in_database(cursor, record_id, branche)
            conn.commit()
            print(f"Branche '{branche}' für ID {record_id} gespeichert.")
    except Exception as e:
        # Top-level boundary: report the failure; cleanup happens in finally.
        print(f"Fehler: {e}")
    finally:
        cursor.close()
        conn.close()
|