#!/usr/bin/env python3
"""Fill in missing emails in ethica.medecins_validated from a web search.

For a random batch of rows of one country that have no email, the script
runs a DuckDuckGo search in a headless Chromium (Playwright), pulls
email-looking strings out of the returned HTML, picks one, and stores it
unless that address is already present in the table.

Usage: script.py [batch] [pays]
    batch  number of rows to process (default 20)
    pays   country code used in the WHERE clause (default 'MA')
"""
import os
import re
import sys
import time
from contextlib import closing
from urllib.parse import quote_plus

# psycopg2 and playwright are imported inside main() so that the pure helpers
# below can be imported (and tested) without a DB driver or browser installed.

# Connection settings. The literal fallbacks are the values the script always
# used; the standard PG* environment variables override them.
# NOTE(review): a hard-coded password lives in source control here -- move it
# to the environment / a secrets store and drop the fallback.
DB = dict(
    host=os.environ.get("PGHOST", "10.1.0.3"),
    dbname=os.environ.get("PGDATABASE", "adx_system"),
    user=os.environ.get("PGUSER", "admin"),
    password=os.environ.get("PGPASSWORD", "admin123"),
)

SELECT_SQL = (
    "SELECT id,nom,prenom,specialite,ville FROM ethica.medecins_validated "
    "WHERE pays=%s AND (email IS NULL OR email='') AND nom!='' AND LENGTH(nom)>2 "
    "ORDER BY RANDOM() LIMIT %s"
)
EXISTS_SQL = "SELECT 1 FROM ethica.medecins_validated WHERE email=%s LIMIT 1"
UPDATE_SQL = "UPDATE ethica.medecins_validated SET email=%s,enriched_at=NOW() WHERE id=%s"

# An address containing any of these substrings is discarded.
BLOCKLIST = (
    'google', 'bing', 'example', 'facebook', '1sante', 'dzdoc', 'lespages',
    'pagesjaunes', 'wikipedia', 'speedtest', 'annuaire',
)
# "logo@2x.png"-style asset names match the email pattern; drop them.
ASSET_SUFFIXES = ('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp')

# IGNORECASE so that upper-case TLDs ("X@Y.COM") are matched too.
EMAIL_RE = re.compile(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}', re.IGNORECASE)

# Search keyword appended per country code; any other code gets the default.
COUNTRY_KEYWORD = {'MA': 'maroc', 'TN': 'tunisie'}
DEFAULT_COUNTRY_KEYWORD = 'algerie'

USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0'
PAUSE_SECONDS = 3  # delay between two searches


def build_query(nom, prenom, spec, ville, pays):
    """Return the search string for one row.

    Empty or None fields are skipped and whitespace is collapsed, so a
    missing first name does not leave a double space in the query.
    """
    keyword = COUNTRY_KEYWORD.get(pays, DEFAULT_COUNTRY_KEYWORD)
    parts = ["Dr", nom, prenom, spec, ville, "email", keyword]
    return " ".join(" ".join(p for p in parts if p).split())


def search_url(query):
    """Return the DuckDuckGo URL for *query*, properly URL-encoded."""
    return f"https://duckduckgo.com/?q={quote_plus(query)}"


def extract_emails(html):
    """Return the lower-cased candidate emails found in *html*.

    Duplicates are removed, blocklisted addresses and asset file names are
    dropped, and the order of first appearance in the page is kept so the
    later choice is deterministic.
    """
    seen = {}
    for raw in EMAIL_RE.findall(html):
        email = raw.lower()
        if email.endswith(ASSET_SUFFIXES):
            continue
        if any(term in email for term in BLOCKLIST):
            continue
        seen[email] = None  # dict keeps insertion order
    return list(seen)


def pick_best(emails, nom):
    """Choose one address from *emails* for the person named *nom*.

    The first address whose local part contains the lower-cased name wins;
    otherwise the first candidate is used. Returns None for an empty list.
    """
    needle = nom.lower()
    for email in emails:
        if needle in email.split('@')[0]:
            return email
    return emails[0] if emails else None


def fetch_html(ctx, url):
    """Load *url* in a fresh page of browser context *ctx* and return its HTML.

    The page is always closed, including when navigation fails.
    """
    page = ctx.new_page()
    try:
        page.goto(url, timeout=10000)
        page.wait_for_timeout(2000)  # give the results time to render
        return page.content()
    finally:
        page.close()


def store_email(conn, cur, rid, email):
    """Write *email* on row *rid* unless the address is already in the table.

    Returns True when the row was updated and committed, False otherwise.
    """
    cur.execute(EXISTS_SQL, (email,))
    if cur.fetchone():
        return False
    cur.execute(UPDATE_SQL, (email, rid))
    conn.commit()
    return True


def main(argv=None):
    """Run one enrichment batch; *argv* defaults to the command-line args."""
    import psycopg2
    from playwright.sync_api import sync_playwright

    args = sys.argv[1:] if argv is None else list(argv)
    batch = int(args[0]) if len(args) > 0 else 20
    pays = args[1] if len(args) > 1 else 'MA'

    enriched = 0
    # closing(): psycopg2's own "with conn" only ends the transaction, it
    # does not close the connection.
    with closing(psycopg2.connect(**DB)) as conn, closing(conn.cursor()) as cur:
        cur.execute(SELECT_SQL, (pays, batch))
        rows = cur.fetchall()

        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=['--no-sandbox', '--disable-dev-shm-usage'],
            )
            try:
                ctx = browser.new_context(user_agent=USER_AGENT)
                for rid, nom, prenom, spec, ville in rows:
                    url = search_url(build_query(nom, prenom, spec, ville, pays))
                    try:
                        best = pick_best(extract_emails(fetch_html(ctx, url)), nom)
                        if best and store_email(conn, cur, rid, best):
                            enriched += 1
                            print(f"+EMAIL {nom} {prenom}: {best}")
                    except psycopg2.Error as exc:
                        # Leave the aborted transaction, otherwise every
                        # following statement on this connection fails.
                        conn.rollback()
                        print(f"!DB {nom} {prenom}: {exc}", file=sys.stderr)
                    except Exception as exc:
                        # Best effort: one failed search must not stop the batch.
                        print(f"!ERR {nom} {prenom}: {exc}", file=sys.stderr)
                    # Pause after failures as well, so errors do not turn
                    # into a burst of requests.
                    time.sleep(PAUSE_SECONDS)
            finally:
                browser.close()

    print(f"PW_EMAIL_{pays}:+{enriched}/{len(rows)}")


if __name__ == "__main__":
    main()