Files
weval-l99/ethica-scraper-cnam.py
2026-04-15 01:38:46 +02:00

106 lines
3.9 KiB
Python
Executable File

#!/usr/bin/env python3
"""ETHICA CNAM TN Scraper — Conseil National Ordre des Médecins Tunisie
Scrapes www.cnom.nat.tn for registered doctors
SAFE: UPSERT only, never overwrite existing data"""
import json, re, time, sys, urllib.request, urllib.parse
import psycopg2
# NOTE(review): credentials hard-coded in source — move to environment
# variables or a secrets store; do not commit passwords.
DB = "host=10.1.0.3 dbname=adx_system user=admin password=admin123"
# Local SearXNG metasearch endpoint, queried instead of scraping target
# sites directly (see search_cnam).
SEARX = "http://localhost:8080/search"
# Max number of specialities processed per run; optional first CLI argument.
BATCH = int(sys.argv[1]) if len(sys.argv) > 1 else 100
# Append-only log file; writing is best-effort (see log()).
LOG = "/opt/weval-l99/logs/ethica-scraper-cnam.log"
# CNAM TN base URL
CNAM_BASE = "http://www.cnom.nat.tn"
# CROMC MA base URL
CROMC_BASE = "https://www.ordremedecins.ma"
# Speciality keywords used as search terms (one query each in main()).
SPECIALITES = [
"generaliste", "cardiologue", "pediatre", "dentiste",
"gastro-enterologue", "allergologue", "pneumologue",
"gynecologue", "orthopediste", "rhumatologue",
"dermatologue", "ophtalmologue", "neurologue", "urologue"
]
def log(msg):
    """Print a timestamped scraper message and append it to the log file.

    File logging is best-effort: if the log file cannot be written
    (missing directory, permissions, disk full) the message is still
    printed and the error is ignored so scraping continues.

    Args:
        msg: message text to record.
    """
    line = f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] CNAM: {msg}"
    print(line)
    try:
        with open(LOG, "a") as f:
            f.write(line + "\n")
    except OSError:
        # Narrowed from a bare `except:` which also swallowed
        # KeyboardInterrupt/SystemExit and programming errors; only
        # file-system failures are expected and ignorable here.
        pass
def search_cnam(specialite, ville="", page=1):
    """Search the CNOM TN registry indirectly via a local SearXNG instance.

    Going through the metasearch engine avoids direct scraping of
    cnom.nat.tn (per the module docstring's "safer" intent).

    Args:
        specialite: speciality keyword to query.
        ville: optional city term appended to the query.
        page: accepted for interface compatibility; currently unused.

    Returns:
        List of dicts with keys nom/specialite/pays/source/source_url/
        email/telephone. Empty list on any failure (error is logged).
    """
    query = f"site:cnom.nat.tn {specialite} {ville} medecin tunisie"
    # Patterns hoisted out of the per-result loop and compiled once.
    name_re = re.compile(r"Dr\.?\s*([\w\s]+)")
    email_re = re.compile(r'[\w.+-]+@[\w-]+\.[\w.]+')
    phone_re = re.compile(r'[\+]?[0-9\s\-]{8,15}')
    try:
        search_url = f"{SEARX}?q={urllib.parse.quote(query)}&format=json&engines=google,bing"
        req = urllib.request.Request(search_url, headers={"User-Agent": "EthicaBot/1.0"})
        # Context manager closes the HTTP response; the original leaked
        # the connection object.
        with urllib.request.urlopen(req, timeout=15) as resp:
            data = json.loads(resp.read())
        results = []
        for res in data.get("results", []):
            title = res.get("title", "")
            content = res.get("content", "")
            # Distinct name: the original rebound `url`, shadowing the
            # request URL inside the loop.
            result_url = res.get("url", "")
            # Extract doctor info from search-result snippets.
            nom_match = name_re.search(title + " " + content)
            emails = email_re.findall(content)
            phones = phone_re.findall(content)
            if nom_match:
                results.append({
                    "nom": nom_match.group(1).strip()[:100],
                    "specialite": specialite,
                    "pays": "TN",
                    "source": "cnam_search",
                    "source_url": result_url,
                    "email": emails[0] if emails else None,
                    "telephone": phones[0].strip() if phones else None,
                })
        return results
    except Exception as e:
        log(f" ERROR search: {e}")
        return []
def upsert_hcp(conn, hcp):
    """Insert or update one HCP row in ethica.medecins_real.

    Safety contract preserved from the original: stored email/telephone
    win over incoming values (COALESCE keeps existing data), and rows
    whose source is 'manual' are never updated.

    Args:
        conn: open psycopg2 connection.
        hcp: dict with required keys nom/specialite/pays/source and
            optional source_url/email/telephone.

    Returns:
        True on success; False if the statement failed, in which case
        the transaction is rolled back and the error is logged.
    """
    try:
        # psycopg2 cursors are context managers: closed on block exit,
        # fixing the cursor leak in the original.
        with conn.cursor() as cur:
            cur.execute("""
INSERT INTO ethica.medecins_real (nom, specialite, pays, source, source_url, email, telephone, created_at)
VALUES (%s, %s, %s, %s, %s, %s, %s, NOW())
ON CONFLICT (nom, pays) DO UPDATE SET
source_url = COALESCE(EXCLUDED.source_url, ethica.medecins_real.source_url),
email = COALESCE(ethica.medecins_real.email, EXCLUDED.email),
telephone = COALESCE(ethica.medecins_real.telephone, EXCLUDED.telephone)
WHERE ethica.medecins_real.source != 'manual'
""", (hcp["nom"], hcp["specialite"], hcp["pays"], hcp["source"],
      hcp.get("source_url"), hcp.get("email"), hcp.get("telephone")))
        conn.commit()
        return True
    except Exception as e:
        conn.rollback()
        # The original swallowed the error silently, hiding schema or
        # data problems; surface it in the log.
        log(f" ERROR upsert {hcp.get('nom')}: {e}")
        return False
def main():
    """Entry point: query each speciality via SearXNG and upsert results.

    Processes at most BATCH specialities, rate-limited to one query
    every 3 seconds, and logs per-speciality and total counts.
    """
    conn = psycopg2.connect(DB)
    log(f"START CNAM TN scraper batch={BATCH}")
    total_found = 0
    total_inserted = 0
    try:
        for spec in SPECIALITES[:BATCH]:
            results = search_cnam(spec)
            total_found += len(results)
            for hcp in results:
                if upsert_hcp(conn, hcp):
                    total_inserted += 1
            log(f" {spec}: {len(results)} found")
            time.sleep(3)  # Rate limit
        log(f"DONE found={total_found} inserted={total_inserted}")
    finally:
        # Always release the connection, even if a search/upsert raises;
        # the original leaked it on any mid-loop exception.
        conn.close()
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()