#!/usr/bin/env python3
"""Fill in missing e-mail addresses of validated doctors using theHarvester.

Usage: <script> [BATCH] [PAYS]

BATCH is the maximum number of rows to process (default 20) and PAYS is the
value matched against the ``pays`` column (default ``'MA'``).

For each randomly selected row of ``ethica.medecins_validated`` that has no
e-mail, the script runs ``theHarvester``, extracts e-mail-looking strings from
its standard output, picks one candidate and stores it unless that address is
already present in the table.  A one-line summary is printed at the end.
"""
import re
import subprocess
import sys
import time

# NOTE(review): credentials are hard-coded in the source; consider loading
# them from the environment or a secrets store.
DB = dict(host='10.1.0.3', dbname='adx_system', user='admin', password='admin123')

# Compiled once instead of on every loop iteration.
EMAIL_RE = re.compile(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}')

# An address containing any of these substrings is discarded.
BLOCKLIST = ('google', 'bing', 'example', 'facebook', '1sante', 'dzdoc',
             'lespages', 'pagesjaunes', 'yahoo.com')

HARVEST_TIMEOUT = 30  # seconds allowed for a single theHarvester run
PAUSE_SECONDS = 2     # pause after each processed row


def extract_emails(text):
    """Return the lower-cased e-mail addresses found in *text*.

    Addresses containing any ``BLOCKLIST`` substring are dropped.  Order of
    appearance is preserved and duplicates are kept.
    """
    lowered = (found.lower() for found in EMAIL_RE.findall(text))
    return [addr for addr in lowered
            if not any(blocked in addr for blocked in BLOCKLIST)]


def pick_best_email(emails, nom):
    """Choose one address from *emails* for the person named *nom*.

    The first address whose local part (the text before ``@``) contains the
    lower-cased *nom* wins; otherwise the first address is returned.  Returns
    ``None`` when *emails* is empty.
    """
    if not emails:
        return None
    needle = nom.lower()
    return next((addr for addr in emails if needle in addr.split('@')[0]),
                emails[0])


def harvest_emails(query):
    """Run theHarvester for *query* and return the usable addresses it printed.

    Raises ``subprocess.TimeoutExpired`` if the tool exceeds
    ``HARVEST_TIMEOUT`` and ``OSError`` if it cannot be started.
    """
    # NOTE(review): theHarvester's -d option is documented as a domain or
    # company name, while a free-text person query is passed here -- confirm
    # this yields meaningful results.
    result = subprocess.run(
        ['theHarvester', '-d', query, '-b', 'bing,duckduckgo', '-l', '10'],
        capture_output=True,
        text=True,
        errors='replace',  # undecodable tool output must not abort the row
        timeout=HARVEST_TIMEOUT,
    )
    return extract_emails(result.stdout)


def main(argv=None):
    """Process one batch of rows and print a summary line.

    *argv* defaults to ``sys.argv``; ``argv[1]`` is the batch size and
    ``argv[2]`` the country code.  Failures while handling an individual row
    are reported on stderr and the batch continues with the next row.
    """
    # Imported here so the pure helpers above stay importable on machines
    # where the PostgreSQL driver is not installed.
    import psycopg2

    args = sys.argv if argv is None else argv
    batch = int(args[1]) if len(args) > 1 else 20
    pays = args[2] if len(args) > 2 else 'MA'

    enriched = 0
    conn = psycopg2.connect(**DB)
    try:
        cur = conn.cursor()
        cur.execute("SELECT id,nom,prenom,specialite,ville FROM ethica.medecins_validated WHERE pays=%s AND (email IS NULL OR email='') AND nom!='' ORDER BY RANDOM() LIMIT %s", (pays, batch))
        rows = cur.fetchall()

        for rid, nom, prenom, spec, ville in rows:
            query = f"{nom} {prenom or ''} {spec or ''} {ville or ''} {pays.lower()}"
            try:
                best = pick_best_email(harvest_emails(query), nom)
                if best:
                    # Skip addresses that some row already carries.
                    cur.execute("SELECT 1 FROM ethica.medecins_validated WHERE email=%s LIMIT 1", (best,))
                    if not cur.fetchone():
                        cur.execute("UPDATE ethica.medecins_validated SET email=%s,enriched_at=NOW() WHERE id=%s", (best, rid))
                        conn.commit()
                        enriched += 1
                        print(f"+EMAIL {nom} {prenom}: {best}")
            except (subprocess.SubprocessError, OSError) as exc:
                print(f"harvest failed for id={rid}: {exc!r}", file=sys.stderr)
            except psycopg2.Error as exc:
                # Without a rollback the transaction stays aborted and every
                # later statement in this batch would fail as well.
                conn.rollback()
                print(f"database error for id={rid}: {exc!r}", file=sys.stderr)
            # Pause between rows, also after a failure, before the next
            # theHarvester run.
            time.sleep(PAUSE_SECONDS)

        cur.close()
    finally:
        conn.close()

    print(f"HARVESTER_{pays}:+{enriched}/{len(rows)}")


if __name__ == "__main__":
    main()