95 lines
3.4 KiB
Python
95 lines
3.4 KiB
Python
#!/usr/bin/env python3
|
|
"""Maghreb B2B email harvester for WEVADS send_contacts"""
|
|
import os
import re
import sys
import time

import psycopg2
from playwright.sync_api import sync_playwright
|
|
|
|
# Connection settings for the contacts database, passed verbatim to
# psycopg2.connect(**DB). Each value can be overridden through the environment
# so credentials do not have to live in the source file; the literals are kept
# only as backward-compatible fallbacks.
# NOTE(review): remove the password fallback once WEVADS_DB_PASSWORD is
# provisioned wherever this script runs.
DB = dict(
    host=os.environ.get("WEVADS_DB_HOST", "10.1.0.3"),
    dbname=os.environ.get("WEVADS_DB_NAME", "adx_system"),
    user=os.environ.get("WEVADS_DB_USER", "admin"),
    password=os.environ.get("WEVADS_DB_PASSWORD", "admin123"),
)

# Maximum number of new contacts to insert in one run: first CLI argument,
# default 200. argv is consulted only when this file is the entry point, so
# importing the module never tries to parse another program's arguments.
batch = int(sys.argv[1]) if __name__ == "__main__" and len(sys.argv) > 1 else 200
|
|
|
|
# Search phrases, grouped by theme. The groups are concatenated in their
# original order because the run stops as soon as the batch limit is reached,
# so earlier phrases take priority.
_MOROCCO_SECTOR_QUERIES = (
    "entreprise maroc email contact",
    "societe casablanca email directeur",
    "startup maroc email fondateur",
    "PME maroc email contact IT",
    "banque maroc email direction",
    "assurance maroc email direction",
    "telecoms maroc email",
    "industrie maroc email direction",
    "agro-alimentaire maroc email",
    "BTP maroc email direction",
    "pharma maroc email direction",
    "logistique maroc email",
    "automobile maroc email",
    "textile maroc email",
    "tourisme maroc email direction",
)

_REGIONAL_QUERIES = (
    "entreprise tunisie email contact",
    "societe tunis email directeur",
    "entreprise algerie email contact",
    "startup maghreb email fondateur",
)

_MOROCCO_IT_SERVICES_QUERIES = (
    "cabinet conseil maroc email",
    "SSII maroc email contact",
    "ESN maroc email direction",
    "integrateur SAP maroc email",
    "cloud provider maroc email",
)

# Flat list of every phrase, iterated in order by the harvesting loop.
QUERIES = [
    *_MOROCCO_SECTOR_QUERIES,
    *_REGIONAL_QUERIES,
    *_MOROCCO_IT_SERVICES_QUERIES,
]
|
|
|
|
# Substrings that disqualify an address when they appear anywhere in it
# (local part or domain).
bl_domains = ["google","facebook","linkedin","wikipedia","youtube","twitter","instagram","example","gov","edu"]

# Compiled once; the pattern is applied to every fetched results page.
EMAIL_RE = re.compile(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}')


def extract_emails(text):
    """Return the set of lower-cased addresses found in *text*.

    Addresses of 100 characters or more, and addresses containing any
    ``bl_domains`` substring, are dropped.
    """
    found = set()
    for raw in EMAIL_RE.findall(text):
        lowered = raw.lower()
        if len(raw) < 100 and not any(x in lowered for x in bl_domains):
            found.add(lowered)
    return found


def guess_country(domain):
    """Map an e-mail domain to "TN", "DZ" or the default "MA"."""
    if any(x in domain for x in (".tn", "tunisie")):
        return "TN"
    if any(x in domain for x in (".dz", "algerie")):
        return "DZ"
    return "MA"


def first_name_from(email):
    """Derive a title-cased first name from the local part, or "" if none."""
    local = email.split("@")[0]
    parts = local.replace(".", " ").replace("_", " ").replace("-", " ").split()
    return parts[0].title() if parts else ""


def fetch_results(ctx, query):
    """Load the search results page for *query* and return its HTML.

    The page is closed on every path, including navigation errors.
    """
    page = ctx.new_page()
    try:
        page.goto(f"https://duckduckgo.com/?q={query.replace(' ','+')}", timeout=12000)
        # Fixed pause so the results have a chance to render before reading.
        page.wait_for_timeout(2500)
        return page.content()
    finally:
        page.close()


def store_contact(conn, cur, email):
    """Insert *email* into admin.send_contacts unless it is already there.

    Returns the country code of the committed row, or None when the address
    already existed. Database errors propagate to the caller, which must roll
    back.
    """
    cur.execute("SELECT 1 FROM admin.send_contacts WHERE email=%s LIMIT 1", (email,))
    if cur.fetchone():
        return None

    domain = email.split("@")[1]
    country = guess_country(domain)
    first = first_name_from(email)

    cur.execute("""INSERT INTO admin.send_contacts
        (email,first_name,domain,country,source,status,created_at)
        VALUES(%s,%s,%s,%s,'playwright_maghreb','active',NOW())""",
        (email, first or None, domain, country))
    conn.commit()
    return country


def harvest(ctx, conn, cur, limit):
    """Run QUERIES in order until *limit* new contacts are stored.

    Returns the number of rows inserted. A failed page load or a failed
    database statement is reported on stderr and skipped; it does not end
    the run.
    """
    added = 0
    for q in QUERIES:
        if added >= limit:
            break
        try:
            text = fetch_results(ctx, q)
        except Exception as exc:  # boundary: one bad page must not end the run
            print(f"!QUERY {q}: {exc}", file=sys.stderr)
            continue

        for email in extract_emails(text):
            # Checked per insert so one large page cannot overshoot the limit.
            if added >= limit:
                break
            try:
                country = store_contact(conn, cur, email)
            except psycopg2.Error as exc:
                # Clear the aborted transaction, otherwise every later
                # statement on this connection would fail as well.
                conn.rollback()
                print(f"!DB {email}: {exc}", file=sys.stderr)
                continue
            if country is not None:
                added += 1
                print(f"+EMAIL {email} ({country})")

        # Pause between successful searches.
        time.sleep(3)
    return added


def main():
    """Connect to the database, run the harvest and print the summary line.

    NOTE(review): the stored addresses are personal data; confirm the legal
    basis for collecting and contacting them before running this.
    """
    conn = psycopg2.connect(**DB)
    try:
        cur = conn.cursor()
        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(headless=True, args=["--no-sandbox","--disable-dev-shm-usage"])
                try:
                    ctx = browser.new_context(user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0")
                    total = harvest(ctx, conn, cur, batch)
                finally:
                    browser.close()
        finally:
            cur.close()
    finally:
        conn.close()

    print(f"MAGHREB_CONTACTS: +{total}")


if __name__ == "__main__":
    main()
|