Files
html/api/weval_maghreb_scraper.py
2026-04-12 22:57:03 +02:00

95 lines
3.4 KiB
Python

#!/usr/bin/env python3
"""Maghreb B2B email harvester for WEVADS send_contacts"""
import re, sys, time, psycopg2
from playwright.sync_api import sync_playwright
# Keyword arguments unpacked into psycopg2.connect(): server, database, login.
# NOTE(review): the credentials are hard-coded in source — consider loading
# them from the environment or a secrets store instead.
DB = {
    "host": "10.1.0.3",
    "dbname": "adx_system",
    "user": "admin",
    "password": "admin123",
}
def parse_batch(argv, default=200):
    """Return the batch size given as the first command-line argument.

    Args:
        argv: Argument vector in ``sys.argv`` layout (program name first).
        default: Value used when no argument is given or it is not an integer.

    Returns:
        ``int(argv[1])`` when present and numeric, otherwise *default*.
    """
    if len(argv) < 2:
        return default
    try:
        return int(argv[1])
    except ValueError:
        # A malformed argument used to abort the script with a traceback;
        # keep the run going with the default and say so on stderr.
        print(f"WARN invalid batch size {argv[1]!r}, using {default}", file=sys.stderr)
        return default


# Number of newly inserted contacts after which the query loop stops.
batch = parse_batch(sys.argv)
# Search phrases (French) tried one after another, in this order, until the
# batch target is reached — so earlier entries are the most likely to run.
QUERIES = [
    # Morocco: general business contacts
    "entreprise maroc email contact",
    "societe casablanca email directeur",
    "startup maroc email fondateur",
    "PME maroc email contact IT",
    # Morocco: by sector
    "banque maroc email direction",
    "assurance maroc email direction",
    "telecoms maroc email",
    "industrie maroc email direction",
    "agro-alimentaire maroc email",
    "BTP maroc email direction",
    "pharma maroc email direction",
    "logistique maroc email",
    "automobile maroc email",
    "textile maroc email",
    "tourisme maroc email direction",
    # Tunisia, Algeria and Maghreb-wide
    "entreprise tunisie email contact",
    "societe tunis email directeur",
    "entreprise algerie email contact",
    "startup maghreb email fondateur",
    # Morocco: consulting and IT services
    "cabinet conseil maroc email",
    "SSII maroc email contact",
    "ESN maroc email direction",
    "integrateur SAP maroc email",
    "cloud provider maroc email",
]
# Substrings that disqualify a harvested address. They are matched against the
# whole lower-cased address (local part included), not only the domain.
bl_domains = ["google","facebook","linkedin","wikipedia","youtube","twitter","instagram","example","gov","edu"]

# Loose e-mail matcher run over raw page HTML; the final TLD part only
# matches lower-case letters. Compiled once instead of once per query.
EMAIL_RE = re.compile(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}')


def extract_emails(html, blocklist=None):
    """Return the set of lower-cased e-mail addresses found in *html*.

    Args:
        html: Page source to scan.
        blocklist: Substrings that reject an address; defaults to ``bl_domains``.

    Returns:
        A set of addresses shorter than 100 characters that contain none of
        the blocklist substrings.
    """
    if blocklist is None:
        blocklist = bl_domains
    found = set()
    for raw in EMAIL_RE.findall(html):
        address = raw.lower()
        if len(raw) < 100 and not any(term in address for term in blocklist):
            found.add(address)
    return found


def guess_country(domain):
    """Return a two-letter country code guessed from an e-mail *domain*.

    ``TN`` / ``DZ`` when the domain carries a ``tn`` / ``dz`` label after its
    first label, or contains "tunisie" / "algerie"; ``MA`` for everything else.
    """
    # Compare whole dot-separated labels: a plain substring test for ".tn" or
    # ".dz" also fires on domains such as "mail.tnt.com" or "news.dzone.com".
    labels = domain.split(".")[1:]
    if "tn" in labels or "tunisie" in domain:
        return "TN"
    if "dz" in labels or "algerie" in domain:
        return "DZ"
    return "MA"


def first_name_of(email):
    """Return a title-cased first-name guess from the local part, or ""."""
    local = email.split("@")[0]
    words = local.replace(".", " ").replace("_", " ").replace("-", " ").split()
    return words[0].title() if words else ""


def fetch_search_html(ctx, query):
    """Open the DuckDuckGo results page for *query* and return its HTML.

    The page is always closed, including when navigation fails or times out.
    """
    page = ctx.new_page()
    try:
        page.goto(f"https://duckduckgo.com/?q={query.replace(' ','+')}", timeout=12000)
        # Fixed 2.5 s pause before reading the DOM (presumably lets results render).
        page.wait_for_timeout(2500)
        return page.content()
    finally:
        page.close()


def insert_contact(conn, cur, email):
    """Insert *email* into admin.send_contacts unless it is already there.

    Returns:
        True when a row was inserted and committed. False when the address
        already exists, or when the database raised an error; in that case
        the transaction is rolled back and the error is reported on stderr.
    """
    try:
        cur.execute("SELECT 1 FROM admin.send_contacts WHERE email=%s LIMIT 1", (email,))
        if cur.fetchone():
            return False
        domain = email.split("@")[1]
        country = guess_country(domain)
        cur.execute(
            """INSERT INTO admin.send_contacts
            (email,first_name,domain,country,source,status,created_at)
            VALUES(%s,%s,%s,%s,'playwright_maghreb','active',NOW())""",
            (email, first_name_of(email) or None, domain, country),
        )
        conn.commit()
    except psycopg2.Error as exc:
        # Roll back on any database error (lookup included) so the aborted
        # transaction does not make every later statement fail.
        conn.rollback()
        print(f"WARN could not store {email}: {exc}", file=sys.stderr)
        return False
    print(f"+EMAIL {email} ({country})")
    return True


def main():
    """Run every search query and store newly found addresses.

    Stops as soon as the module-level ``batch`` number of contacts has been
    inserted. A failing query is reported on stderr and skipped; Ctrl-C and
    SystemExit are no longer swallowed.

    Returns:
        The number of contacts inserted during this run.
    """
    total = 0
    conn = psycopg2.connect(**DB)
    try:
        with conn.cursor() as cur, sync_playwright() as p:
            browser = p.chromium.launch(headless=True, args=["--no-sandbox","--disable-dev-shm-usage"])
            try:
                ctx = browser.new_context(user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0")
                for q in QUERIES:
                    if total >= batch:
                        break
                    try:
                        html = fetch_search_html(ctx, q)
                        for email in extract_emails(html):
                            # Enforce the limit inside a result page too, so a
                            # single query cannot overshoot the batch size.
                            if total >= batch:
                                break
                            if insert_contact(conn, cur, email):
                                total += 1
                        # Pause between searches.
                        # NOTE(review): assumed to run once per successfully
                        # processed query — confirm against the original layout.
                        time.sleep(3)
                    except Exception as exc:
                        # Best effort: one bad query must not end the run.
                        print(f"WARN query {q!r} failed: {exc}", file=sys.stderr)
            finally:
                browser.close()
    finally:
        conn.close()
    print(f"MAGHREB_CONTACTS: +{total}")
    return total


# Run only when executed as a script; importing the module no longer starts
# a scrape, which also makes the helpers above usable on their own.
if __name__ == "__main__":
    main()