Files
weval-l99/ethica-enrich-ma.py
2026-04-20 11:53:11 +02:00

71 lines
2.7 KiB
Python
Executable File

#!/usr/bin/env python3
"""ETHICA MA BOOST v2 - Maroc HCPs email enrichment via SearXNG
With email validation filter to avoid false positives"""
import json, re, time, sys, urllib.request, urllib.parse, psycopg2
# Connection string handed to psycopg2.connect() in main().
# NOTE(review): host and credentials are hard-coded in source — consider
# loading them from the environment or a secrets store instead.
DB = "host=10.1.0.3 dbname=adx_system user=admin password=admin123"
# Search endpoint queried by enrich_one() (JSON output is requested there).
SEARX = "http://localhost:8080/search"
# Maximum number of rows fetched per run: first CLI argument, default 300.
BATCH = int(sys.argv[1]) if len(sys.argv) > 1 else 300
# File that log() appends every message to.
LOG = "/opt/weval-l99/logs/ethica-enrich-ma.log"
def log(msg):
    """Print a timestamped message and append it to the log file.

    The line is formatted as ``[YYYY-MM-DD HH:MM:SS] MA: <msg>``, written to
    stdout and appended (with a trailing newline) to the file named by ``LOG``.

    Args:
        msg: Text to record; it is interpolated into the line as-is.
    """
    line = f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] MA: {msg}"
    print(line)
    # Explicit UTF-8 so messages carrying non-ASCII text (e.g. names read
    # from the database) are written the same way regardless of the locale
    # the script happens to run under.
    with open(LOG, "a", encoding="utf-8") as f:
        f.write(line + "\n")
def validate_email(email, nom):
    """Return True when a scraped e-mail address looks plausible for ``nom``.

    An address is accepted when either:

    * its domain is under the ``.ma`` TLD, or is one of the trusted mailbox
      providers (exact host or a subdomain of it); or
    * its local part contains one of the name tokens of ``nom`` that is
      longer than three characters (case-insensitive).

    Args:
        email: Candidate address; may be empty or ``None``.
        nom: Name of the person the address should belong to; may be empty
            or ``None``, in which case only the domain rule can match.

    Returns:
        bool: True if the address passes one of the rules above.
    """
    # Exactly one "@" with something on both sides; anything else is not a
    # usable mailbox address.
    if not email or email.count("@") != 1:
        return False
    local, _, dom = email.lower().partition("@")
    if not local or not dom:
        return False

    # Any host under the Moroccan TLD is trusted.
    if dom.endswith(".ma"):
        return True

    trusted_hosts = (
        "gmail.com", "hotmail.com", "yahoo.fr", "yahoo.com",
        "outlook.com", "live.fr", "hotmail.fr", "outlook.fr", "menara.ma",
    )
    # Match on a domain-label boundary: "gmail.com" or "x.gmail.com" pass,
    # but a look-alike such as "evilgmail.com" must not.
    if dom in trusted_hosts or dom.endswith(tuple("." + h for h in trusted_hosts)):
        return True

    # Fallback: a significant part of the name appears in the local part.
    # Short tokens (<= 3 chars, e.g. "Dr", "El") are ignored as too generic.
    nom_parts = [p.lower() for p in (nom or "").split() if len(p) > 3]
    return any(p in local for p in nom_parts)
def enrich_one(nom, spec, ville):
    """Look up an e-mail address for one practitioner through the search API.

    Builds the query ``"<nom> <spec> <ville> maroc email"``, requests JSON
    results from ``SEARX`` (bing engine, 10 s timeout) and scans each
    result's content and title for e-mail-shaped strings. The first address
    that is not a generic mailbox and passes ``validate_email`` is returned.

    Args:
        nom: Practitioner name, also used to validate candidate addresses.
        spec: Specialty, used only as a search term.
        ville: City, used only as a search term.

    Returns:
        The lower-cased address, or ``None`` when nothing suitable was found
        or the lookup failed. Failures are logged, never raised.
    """
    q = f"{nom} {spec} {ville} maroc email"
    # Substrings that mark placeholder or role mailboxes, which are skipped.
    blocked = ("example.", "noreply", "admin@", "info@", "contact@", "support@")
    try:
        url = f"{SEARX}?q={urllib.parse.quote(q)}&format=json&engines=bing"
        req = urllib.request.Request(url, headers={"User-Agent": "EthicaBot/1.0"})
        # Context manager so the HTTP response is always closed.
        with urllib.request.urlopen(req, timeout=10) as r:
            data = json.loads(r.read())
        for res in data.get("results", []):
            # A field may be present but null in the JSON; treat it as empty
            # instead of letting the concatenation abort the whole lookup.
            text = (res.get("content") or "") + " " + (res.get("title") or "")
            for e in re.findall(r'[\w.+-]+@[\w-]+\.[\w.]+', text):
                # The pattern can swallow a sentence-ending dot; trim it.
                e = e.lower().strip(".")
                if any(x in e for x in blocked):
                    continue
                if validate_email(e, nom):
                    return e
    except Exception as exc:
        # Best-effort per record: one failed lookup must not stop the batch,
        # but it should be visible in the log rather than silently dropped.
        log(f" ERR lookup failed for {nom}: {exc!r}")
    return None
def main():
    """Enrich up to ``BATCH`` Moroccan practitioner rows that lack an e-mail.

    Selects rows from ``ethica.medecins_real`` with ``pays='MA'`` and an
    empty/NULL/'N/A' e-mail, calls ``enrich_one`` for each, and stores any
    address found. Each successful update is committed immediately so an
    interrupted run keeps its progress. Start, per-row success and the final
    tally are written through ``log``.
    """
    conn = psycopg2.connect(DB)
    try:
        cur = conn.cursor()
        cur.execute(
            """SELECT id, nom, specialite, ville FROM ethica.medecins_real
            WHERE pays='MA' AND (email IS NULL OR email='' OR email='N/A')
            ORDER BY google_verified DESC NULLS LAST, id LIMIT %s""",
            (BATCH,),
        )
        rows = cur.fetchall()
        log(f"START {len(rows)} MA HCPs")
        ok = 0
        for hid, nom, spec, ville in rows:
            # Fall back to generic search terms when the columns are empty.
            email = enrich_one(nom, spec or "medecin", ville or "maroc")
            if email:
                # The WHERE guard avoids overwriting an address that was
                # filled in after the SELECT above.
                cur.execute(
                    "UPDATE ethica.medecins_real SET email=%s WHERE id=%s AND (email IS NULL OR email='' OR email='N/A')",
                    (email, hid),
                )
                conn.commit()
                # Only count and report rows the guarded UPDATE really changed.
                if cur.rowcount:
                    ok += 1
                    log(f" OK id={hid} {nom} -> {email}")
            # Pause between lookups to avoid hammering the search backend.
            time.sleep(2)
        log(f"DONE {ok}/{len(rows)}")
    finally:
        # Release the connection even if a query or lookup raised.
        conn.close()
# Run the enrichment only when executed as a script, not when imported.
if __name__ == "__main__":
    main()