Files
html/api/email_harvester.py
2026-04-12 22:57:03 +02:00

36 lines
1.7 KiB
Python

#!/usr/bin/env python3
import subprocess, re, psycopg2, sys, time
# NOTE(review): connection credentials are hard-coded in source; consider
# loading them from the environment or a secrets store instead.
DB = dict(host='10.1.0.3', dbname='adx_system', user='admin', password='admin123')

# Substrings that disqualify a harvested address (search engines, directories).
BLACKLIST = ('google', 'bing', 'example', 'facebook', '1sante', 'dzdoc',
             'lespages', 'pagesjaunes', 'yahoo.com')

# Case-insensitive so that addresses with an upper-case TLD are not missed;
# matches are lower-cased afterwards anyway.
EMAIL_RE = re.compile(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}', re.IGNORECASE)


def extract_emails(text):
    """Return lower-cased e-mail addresses found in *text*, minus blacklisted ones.

    An address is dropped when any BLACKLIST substring occurs anywhere in it.
    Order of appearance is preserved; duplicates are kept.
    """
    found = (match.lower() for match in EMAIL_RE.findall(text or ''))
    return [addr for addr in found if not any(bad in addr for bad in BLACKLIST)]


def pick_best_email(emails, nom):
    """Pick the address whose local part contains *nom*, else the first one.

    Returns None when *emails* is empty.
    """
    if not emails:
        return None
    needle = nom.lower()
    return next((addr for addr in emails if needle in addr.split('@')[0]),
                emails[0])


def main():
    """Enrich doctors lacking an e-mail using theHarvester search results.

    Command line: ``[batch] [pays]`` -- number of rows to process (default 20)
    and country code (default 'MA'). Progress lines and the final
    ``HARVESTER_<pays>:+<enriched>/<total>`` summary go to stdout; failures
    are reported on stderr and do not abort the batch.
    """
    batch = int(sys.argv[1]) if len(sys.argv) > 1 else 20
    pays = sys.argv[2] if len(sys.argv) > 2 else 'MA'

    conn = psycopg2.connect(**DB)
    cur = conn.cursor()
    enriched = 0
    rows = []
    try:
        cur.execute(
            "SELECT id,nom,prenom,specialite,ville FROM ethica.medecins_validated WHERE pays=%s AND (email IS NULL OR email='') AND nom!='' ORDER BY RANDOM() LIMIT %s",
            (pays, batch),
        )
        rows = cur.fetchall()

        for rid, nom, prenom, spec, ville in rows:
            query = f"{nom} {prenom or ''} {spec or ''} {ville or ''} {pays.lower()}"
            try:
                # Use theHarvester with multiple engines.
                # NOTE(review): '-d' appears to be theHarvester's domain
                # option; a free-text person query is passed here as in the
                # original -- confirm this is intended.
                result = subprocess.run(
                    ['theHarvester', '-d', query, '-b', 'bing,duckduckgo', '-l', '10'],
                    capture_output=True, text=True, errors='replace', timeout=30,
                )
            except (subprocess.SubprocessError, OSError, ValueError) as exc:
                # Best-effort: a missing binary or a timeout skips this row.
                print(f"harvest failed for id={rid}: {exc!r}", file=sys.stderr)
                continue

            best = pick_best_email(extract_emails(result.stdout), nom)
            if best is not None:
                try:
                    # Do not assign an address that another row already holds.
                    cur.execute(
                        "SELECT 1 FROM ethica.medecins_validated WHERE email=%s LIMIT 1",
                        (best,),
                    )
                    if not cur.fetchone():
                        cur.execute(
                            "UPDATE ethica.medecins_validated SET email=%s,enriched_at=NOW() WHERE id=%s",
                            (best, rid),
                        )
                        conn.commit()
                        enriched += 1
                        print(f"+EMAIL {nom} {prenom}: {best}")
                except psycopg2.Error as exc:
                    # Without a rollback the transaction stays aborted and
                    # every later statement in this batch would fail too.
                    conn.rollback()
                    print(f"db update failed for id={rid}: {exc!r}", file=sys.stderr)

            # Throttle between harvester runs.
            time.sleep(2)
    finally:
        cur.close()
        conn.close()

    print(f"HARVESTER_{pays}:+{enriched}/{len(rows)}")


if __name__ == '__main__':
    main()