#!/usr/bin/env python3
"""ETHICA CNAM TN Scraper — Conseil National Ordre des Médecins Tunisie.

Scrapes www.cnom.nat.tn for registered doctors, indirectly via a local
SearXNG instance (safer than hitting the registry directly).
SAFE: UPSERT only, never overwrite existing data.

Usage: script.py [BATCH]   (BATCH = max number of specialities to scan)
"""
import json
import re
import sys
import time
import urllib.parse
import urllib.request

import psycopg2

# NOTE(review): DB credentials are hard-coded in source — move to env vars
# or a secrets store; anyone with read access to this file owns the DB.
DB = "host=10.1.0.3 dbname=adx_system user=admin password=admin123"
SEARX = "http://localhost:8080/search"
BATCH = int(sys.argv[1]) if len(sys.argv) > 1 else 100
LOG = "/opt/weval-l99/logs/ethica-scraper-cnam.log"

# CNAM TN base URL
CNAM_BASE = "http://www.cnom.nat.tn"
# CROMC MA base URL (unused in this script; presumably kept for sibling
# scrapers — confirm before removing)
CROMC_BASE = "https://www.ordremedecins.ma"

SPECIALITES = [
    "generaliste", "cardiologue", "pediatre", "dentiste",
    "gastro-enterologue", "allergologue", "pneumologue", "gynecologue",
    "orthopediste", "rhumatologue", "dermatologue", "ophtalmologue",
    "neurologue", "urologue",
]

# Precompiled extraction patterns (hoisted — the original recompiled them
# for every search result).
_RE_NOM = re.compile(r"Dr\.?\s*([\w\s]+)")
_RE_EMAIL = re.compile(r'[\w.+-]+@[\w-]+\.[\w.]+')
_RE_PHONE = re.compile(r'[\+]?[0-9\s\-]{8,15}')


def log(msg):
    """Print *msg* with a timestamp and best-effort append it to LOG."""
    line = f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] CNAM: {msg}"
    print(line)
    try:
        with open(LOG, "a") as f:
            f.write(line + "\n")
    except OSError:
        # File logging is best-effort only (stdout copy already printed);
        # was a bare `except:` which also swallowed KeyboardInterrupt.
        pass


def search_cnam(specialite, ville="", page=1):
    """Search CNAM via SearXNG (safer than direct scraping).

    Returns a list of HCP dicts ready for upsert_hcp(); an empty list on
    any network/parse error (logged, never raised).
    """
    q = f"site:cnom.nat.tn {specialite} {ville} medecin tunisie"
    try:
        url = f"{SEARX}?q={urllib.parse.quote(q)}&format=json&engines=google,bing"
        req = urllib.request.Request(url, headers={"User-Agent": "EthicaBot/1.0"})
        # FIX: context manager closes the HTTP response (original leaked it).
        with urllib.request.urlopen(req, timeout=15) as r:
            data = json.loads(r.read())
        results = []
        for res in data.get("results", []):
            title = res.get("title", "")
            content = res.get("content", "")
            # FIX: was bound to `url`, shadowing the request URL above.
            result_url = res.get("url", "")
            # Extract doctor info from search-result snippets.
            nom_match = _RE_NOM.search(title + " " + content)
            email_match = _RE_EMAIL.findall(content)
            phone_match = _RE_PHONE.findall(content)
            if nom_match:
                results.append({
                    "nom": nom_match.group(1).strip()[:100],
                    "specialite": specialite,
                    "pays": "TN",
                    "source": "cnam_search",
                    "source_url": result_url,
                    "email": email_match[0] if email_match else None,
                    "telephone": phone_match[0].strip() if phone_match else None,
                })
        return results
    except Exception as e:
        log(f" ERROR search: {e}")
        return []


def upsert_hcp(conn, hcp):
    """Insert or update HCP - NEVER overwrite existing email/phone.

    Returns True on success, False on any DB error (rolled back and logged).
    Rows with source = 'manual' are never updated.
    """
    try:
        # psycopg2 cursor as context manager — closed even on error.
        with conn.cursor() as cur:
            cur.execute("""
                INSERT INTO ethica.medecins_real
                    (nom, specialite, pays, source, source_url,
                     email, telephone, created_at)
                VALUES (%s, %s, %s, %s, %s, %s, %s, NOW())
                ON CONFLICT (nom, pays) DO UPDATE SET
                    source_url = COALESCE(EXCLUDED.source_url,
                                          ethica.medecins_real.source_url),
                    email = COALESCE(ethica.medecins_real.email,
                                     EXCLUDED.email),
                    telephone = COALESCE(ethica.medecins_real.telephone,
                                         EXCLUDED.telephone)
                WHERE ethica.medecins_real.source != 'manual'
            """, (hcp["nom"], hcp["specialite"], hcp["pays"], hcp["source"],
                  hcp.get("source_url"), hcp.get("email"),
                  hcp.get("telephone")))
        conn.commit()
        return True
    except Exception as e:
        conn.rollback()
        # FIX: original swallowed the error silently — failures were
        # indistinguishable from "nothing to do".
        log(f" ERROR upsert {hcp.get('nom')}: {e}")
        return False


def main():
    """Scan up to BATCH specialities and upsert every doctor found."""
    conn = psycopg2.connect(DB)
    log(f"START CNAM TN scraper batch={BATCH}")
    total_found = 0
    total_inserted = 0
    try:
        for spec in SPECIALITES[:BATCH]:
            results = search_cnam(spec)
            total_found += len(results)
            for hcp in results:
                if upsert_hcp(conn, hcp):
                    total_inserted += 1
            log(f"  {spec}: {len(results)} found")
            time.sleep(3)  # Rate limit
        log(f"DONE found={total_found} inserted={total_inserted}")
    finally:
        # FIX: connection now closed even if the loop raises.
        conn.close()


if __name__ == "__main__":
    main()