Files
html/api/pw_email.py
2026-04-12 22:57:03 +02:00

55 lines
2.3 KiB
Python

#!/usr/bin/env python3
import re
import sys
import time
from urllib.parse import quote_plus

import psycopg2
from playwright.sync_api import sync_playwright
# NOTE(review): database credentials are hard-coded in source; consider
# loading them from the environment or a secrets store instead.
DB = dict(host='10.1.0.3', dbname='adx_system', user='admin', password='admin123')

# Substrings that disqualify a scraped address (search engines, directories,
# placeholder domains).
BLACKLIST = (
    'google', 'bing', 'example', 'facebook', '1sante', 'dzdoc', 'lespages',
    'pagesjaunes', 'wikipedia', 'speedtest', 'annuaire',
)

# Endings that show a regex hit is really an asset file name such as
# "logo@2x.png" rather than an email address.
ASSET_SUFFIXES = ('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp', '.css', '.js')

# Search keyword appended per country code; any other code falls back to
# DEFAULT_COUNTRY_KEYWORD.
COUNTRY_KEYWORDS = {'MA': 'maroc', 'TN': 'tunisie'}
DEFAULT_COUNTRY_KEYWORD = 'algerie'

SEARCH_URL = 'https://duckduckgo.com/?q='
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0'
REQUEST_DELAY_S = 3  # pause between two searches

# IGNORECASE so that addresses written with an upper-case TLD are found too;
# matches are lower-cased afterwards.
EMAIL_RE = re.compile(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}', re.IGNORECASE)


def build_search_url(nom, prenom, spec, ville, pays):
    """Return the search URL used to look up one practitioner's email.

    Empty or ``None`` fields are skipped, and the whole query is
    percent-encoded so characters such as ``&`` or ``#`` in a name cannot
    corrupt the URL.
    """
    terms = ['Dr', nom, prenom, spec, ville, 'email',
             COUNTRY_KEYWORDS.get(pays, DEFAULT_COUNTRY_KEYWORD)]
    query = ' '.join(t.strip() for t in terms if t and t.strip())
    return SEARCH_URL + quote_plus(query)


def extract_emails(text, blacklist=BLACKLIST):
    """Return the set of lower-cased candidate emails found in *text*.

    Addresses containing any *blacklist* substring, and regex hits that are
    actually asset file names (see ``ASSET_SUFFIXES``), are dropped.
    """
    found = {hit.lower() for hit in EMAIL_RE.findall(text)}
    return {
        addr for addr in found
        if not addr.endswith(ASSET_SUFFIXES)
        and not any(bad in addr for bad in blacklist)
    }


def pick_best_email(emails, nom):
    """Choose one address from *emails*, or ``None`` when there is none.

    An address whose local part contains the lower-cased *nom* wins;
    otherwise the first address in sorted order is used. Sorting makes the
    choice reproducible instead of depending on set iteration order.
    """
    if not emails:
        return None
    ordered = sorted(emails)
    needle = (nom or '').strip().lower()
    if needle:
        for addr in ordered:
            if needle in addr.split('@', 1)[0]:
                return addr
    return ordered[0]


def fetch_page_html(ctx, url):
    """Open *url* in a new page of browser context *ctx* and return its HTML.

    The page is always closed, whether or not navigation succeeds.
    """
    page = ctx.new_page()
    try:
        page.goto(url, timeout=10000)
        page.wait_for_timeout(2000)  # give the results time to render
        return page.content()
    finally:
        page.close()


def store_email(conn, cur, rid, email):
    """Save *email* on row *rid* unless another row already uses it.

    Returns ``True`` when the row was updated and committed, else ``False``.
    """
    cur.execute("SELECT 1 FROM ethica.medecins_validated WHERE email=%s LIMIT 1", (email,))
    if cur.fetchone():
        return False
    cur.execute("UPDATE ethica.medecins_validated SET email=%s,enriched_at=NOW() WHERE id=%s", (email, rid))
    conn.commit()
    return True


def enrich_rows(conn, cur, rows, pays):
    """Search the web for each row's email and return how many were stored."""
    enriched = 0
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True, args=['--no-sandbox', '--disable-dev-shm-usage'])
        try:
            ctx = browser.new_context(user_agent=USER_AGENT)
            for rid, nom, prenom, spec, ville in rows:
                try:
                    html = fetch_page_html(ctx, build_search_url(nom, prenom, spec, ville, pays))
                    best = pick_best_email(extract_emails(html), nom)
                    if best and store_email(conn, cur, rid, best):
                        enriched += 1
                        print(f"+EMAIL {nom} {prenom}: {best}")
                except Exception as exc:
                    # A failed statement leaves the transaction aborted; roll
                    # back so the following rows can still be processed.
                    conn.rollback()
                    # stderr keeps stdout limited to the +EMAIL / summary lines.
                    print(f"!ERR {nom} {prenom}: {exc}", file=sys.stderr)
                # Throttle after every row, including failed ones.
                time.sleep(REQUEST_DELAY_S)
        finally:
            browser.close()
    return enriched


def main(argv=None):
    """Command-line entry point: ``pw_email.py [batch] [pays]``.

    *batch* (default 20) is the number of rows to process and *pays*
    (default ``'MA'``) the country code to filter on. Prints one ``+EMAIL``
    line per stored address and a final ``PW_EMAIL_<pays>:+<n>/<total>``
    summary line.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    batch = int(args[0]) if len(args) > 0 else 20
    pays = args[1] if len(args) > 1 else 'MA'

    conn = psycopg2.connect(**DB)
    try:
        cur = conn.cursor()
        try:
            cur.execute("SELECT id,nom,prenom,specialite,ville FROM ethica.medecins_validated WHERE pays=%s AND (email IS NULL OR email='') AND nom!='' AND LENGTH(nom)>2 ORDER BY RANDOM() LIMIT %s", (pays, batch))
            rows = cur.fetchall()
            # Do not start a browser when there is nothing to look up.
            en = enrich_rows(conn, cur, rows, pays) if rows else 0
        finally:
            cur.close()
    finally:
        conn.close()
    print(f"PW_EMAIL_{pays}:+{en}/{len(rows)}")


if __name__ == "__main__":
    main()