36 lines
1.7 KiB
Python
36 lines
1.7 KiB
Python
#!/usr/bin/env python3
|
|
import subprocess, re, psycopg2, sys, time
|
|
# Connection parameters passed straight to psycopg2.connect().
# NOTE(review): credentials are hard-coded in source; consider loading them
# from the environment or a secrets store instead.
DB = dict(host='10.1.0.3', dbname='adx_system', user='admin', password='admin123')

# E-mail-like tokens. Case-insensitive so upper-case domains/TLDs such as
# "X@Y.COM" are matched too (matches are lower-cased afterwards anyway).
EMAIL_RE = re.compile(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}', re.IGNORECASE)

# Any address containing one of these substrings is discarded.
EMAIL_BLACKLIST = ('google', 'bing', 'example', 'facebook', '1sante', 'dzdoc',
                   'lespages', 'pagesjaunes', 'yahoo.com')

HARVESTER_TIMEOUT = 30  # seconds allowed for one theHarvester run
PAUSE_SECONDS = 2       # pause after each processed row


def extract_emails(text):
    """Return the lower-cased e-mail addresses found in *text*.

    Addresses containing any EMAIL_BLACKLIST substring are dropped. The
    order of appearance is preserved; an empty or None *text* yields [].
    """
    candidates = (match.lower() for match in EMAIL_RE.findall(text or ''))
    return [addr for addr in candidates
            if not any(token in addr for token in EMAIL_BLACKLIST)]


def pick_best_email(nom, emails):
    """Choose one address from *emails* for the person named *nom*.

    Returns the first address whose local part (before the '@') contains
    the lower-cased *nom*; otherwise the first address; None when *emails*
    is empty.
    """
    needle = nom.lower()
    for addr in emails:
        if needle in addr.split('@')[0]:
            return addr
    return emails[0] if emails else None


def harvest(query):
    """Run theHarvester (bing + duckduckgo) for *query* and return filtered e-mails.

    Raises whatever subprocess.run raises (e.g. TimeoutExpired after
    HARVESTER_TIMEOUT seconds, OSError when the executable cannot be started).
    """
    result = subprocess.run(
        ['theHarvester', '-d', query, '-b', 'bing,duckduckgo', '-l', '10'],
        capture_output=True, text=True, timeout=HARVESTER_TIMEOUT)
    return extract_emails(result.stdout)


def main(argv=None):
    """Fill in missing e-mails for a random batch of rows.

    argv[1] is the batch size (default 20) and argv[2] the value matched
    against the ``pays`` column (default 'MA'). For each selected row,
    theHarvester is queried; the chosen address is stored only when no row
    already has that e-mail. A summary line is printed at the end.
    Per-row failures are reported on stderr and do not stop the batch.
    """
    argv = sys.argv if argv is None else argv
    batch = int(argv[1]) if len(argv) > 1 else 20
    pays = argv[2] if len(argv) > 2 else 'MA'

    conn = psycopg2.connect(**DB)
    cur = conn.cursor()
    en = 0
    try:
        cur.execute("SELECT id,nom,prenom,specialite,ville FROM ethica.medecins_validated WHERE pays=%s AND (email IS NULL OR email='') AND nom!='' ORDER BY RANDOM() LIMIT %s", (pays, batch))
        rows = cur.fetchall()

        for rid, nom, prenom, spec, ville in rows:
            q = f"{nom} {prenom or ''} {spec or ''} {ville or ''} {pays.lower()}"

            try:
                emails = harvest(q)
            except (subprocess.SubprocessError, OSError, ValueError) as exc:
                # Best-effort: report and move on instead of hiding the error.
                print(f"HARVESTER_ERROR {nom} {prenom}: {exc}", file=sys.stderr)
                continue

            best = pick_best_email(nom, emails)
            if best:
                try:
                    cur.execute("SELECT 1 FROM ethica.medecins_validated WHERE email=%s LIMIT 1", (best,))
                    if not cur.fetchone():
                        cur.execute("UPDATE ethica.medecins_validated SET email=%s,enriched_at=NOW() WHERE id=%s", (best, rid))
                        conn.commit()
                        en += 1
                        print(f"+EMAIL {nom} {prenom}: {best}")
                except psycopg2.Error as exc:
                    # Without a rollback the transaction stays aborted and
                    # every later statement on this connection would fail.
                    conn.rollback()
                    print(f"DB_ERROR {nom} {prenom}: {exc}", file=sys.stderr)
                    continue

            time.sleep(PAUSE_SECONDS)
    finally:
        # Release the cursor and connection even if the batch aborts.
        cur.close()
        conn.close()

    print(f"HARVESTER_{pays}:+{en}/{len(rows)}")


if __name__ == "__main__":
    main()
|