# Scrapes e-mail addresses (mailto: links) from websites listed in the
# `gelbeseiten` table and writes them back into the database.
import os
import re

import mysql.connector
import requests
from bs4 import BeautifulSoup
# MySQL-Verbindungsdetails
|
|
DB_HOST = "192.168.178.201"
|
|
DB_USER = "gelbeseiten"
|
|
DB_PASSWORD = "Gm4bBE62gXCSVVY2"
|
|
DB_NAME = "domainchecker"
|
|
|
|
def connect_to_database():
|
|
return mysql.connector.connect(
|
|
host=DB_HOST,
|
|
user=DB_USER,
|
|
password=DB_PASSWORD,
|
|
database=DB_NAME
|
|
)
|
|
|
|
def get_valid_websites(cursor):
|
|
# Webseiten abrufen, die nicht "Keine Webseite" enthalten
|
|
cursor.execute("SELECT `id`, `domain` FROM `gelbeseiten` WHERE `domain` != 'Keine Webseite' AND `mailadresse` = 'Keine E-Mail'")
|
|
return cursor.fetchall()
|
|
|
|
def update_email_in_database(cursor, record_id, email):
|
|
# E-Mail-Adresse in der Tabelle aktualisieren
|
|
cursor.execute("UPDATE `gelbeseiten` SET `mailadresse` = %s WHERE `id` = %s", (email, record_id))
|
|
|
|
def scrape_email_from_website(url):
|
|
try:
|
|
headers = {
|
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
|
|
}
|
|
response = requests.get(url, headers=headers, timeout=10)
|
|
if response.status_code != 200:
|
|
print(f"Fehler beim Abrufen von {url}: {response.status_code}")
|
|
return None
|
|
|
|
soup = BeautifulSoup(response.text, "html.parser")
|
|
|
|
# E-Mail-Adressen suchen (mailto:)
|
|
email_tag = soup.find("a", href=re.compile(r"^mailto:"))
|
|
if email_tag:
|
|
email = email_tag["href"].replace("mailto:", "").strip()
|
|
# Überprüfen, ob die E-Mail-Adresse ein '@' enthält
|
|
if "@" in email:
|
|
return email
|
|
else:
|
|
print(f"Ungültige E-Mail-Adresse gefunden: {email}")
|
|
return None
|
|
|
|
print(f"Keine E-Mail-Adresse auf {url} gefunden.")
|
|
return None
|
|
except Exception as e:
|
|
print(f"Fehler beim Verarbeiten der Webseite {url}: {e}")
|
|
return None
|
|
|
|
if __name__ == "__main__":
|
|
conn = connect_to_database()
|
|
cursor = conn.cursor()
|
|
|
|
try:
|
|
# Nur gültige Webseiten abrufen
|
|
websites = get_valid_websites(cursor)
|
|
|
|
for record_id, domain in websites:
|
|
# URL validieren und vorbereiten
|
|
if not domain.startswith("http"):
|
|
url = f"http://{domain}" # Standardprotokoll hinzufügen
|
|
else:
|
|
url = domain
|
|
|
|
print(f"Scrape E-Mail-Adresse von: {url}")
|
|
|
|
# E-Mail-Adresse scrappen
|
|
email = scrape_email_from_website(url)
|
|
if email:
|
|
# E-Mail-Adresse in der Datenbank aktualisieren
|
|
update_email_in_database(cursor, record_id, email)
|
|
print(f"E-Mail-Adresse {email} für ID {record_id} gespeichert.")
|
|
|
|
# Änderungen speichern
|
|
conn.commit()
|
|
except Exception as e:
|
|
print(f"Fehler: {e}")
|
|
finally:
|
|
cursor.close()
|
|
conn.close()
|