#!/usr/bin/env python3
"""ETHICA CNAM TN Scraper — Conseil National Ordre des Médecins Tunisie

Scrapes www.cnom.nat.tn for registered doctors

SAFE: UPSERT only, never overwrite existing data"""

import json
import re
import sys
import time
import urllib.parse
import urllib.request

import psycopg2

# SECURITY(review): hard-coded DB credentials in source — move to environment
# variables or a secrets manager before deploying/committing.
DB = "host=10.1.0.3 dbname=adx_system user=admin password=admin123"

# SearXNG metasearch endpoint used instead of scraping the registry directly.
SEARX = "http://localhost:8080/search"

# Max number of specialities to process this run (optional CLI arg 1).
BATCH = int(sys.argv[1]) if len(sys.argv) > 1 else 100

# Best-effort append-only log file (failures to write are ignored).
LOG = "/opt/weval-l99/logs/ethica-scraper-cnam.log"

# CNAM TN base URL
CNAM_BASE = "http://www.cnom.nat.tn"
# CROMC MA base URL (not referenced in this file — presumably for a sibling
# scraper; verify before removing)
CROMC_BASE = "https://www.ordremedecins.ma"

# Speciality search terms fed one-by-one into search_cnam().
SPECIALITES = [
    "generaliste", "cardiologue", "pediatre", "dentiste",
    "gastro-enterologue", "allergologue", "pneumologue",
    "gynecologue", "orthopediste", "rhumatologue",
    "dermatologue", "ophtalmologue", "neurologue", "urologue",
]
|
|
|
|
def log(msg):
    """Print *msg* with a timestamp prefix and best-effort append it to LOG.

    File-write failures are swallowed deliberately: logging must never
    kill the scraper.
    """
    line = f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] CNAM: {msg}"
    print(line)
    try:
        with open(LOG, "a") as f:
            f.write(line + "\n")
    except OSError:
        # Narrowed from a bare `except:` which also swallowed
        # KeyboardInterrupt/SystemExit; only file-I/O errors are expected here.
        pass
|
|
|
|
def search_cnam(specialite, ville="", page=1):
    """Search CNAM via SearXNG (safer than direct scraping).

    Args:
        specialite: speciality search term (see SPECIALITES).
        ville: optional city name added to the query.
        page: accepted for interface compatibility but currently unused.

    Returns:
        List of dicts with keys nom/specialite/pays/source/source_url/
        email/telephone. Returns [] on any error (error is logged).
    """
    q = f"site:cnom.nat.tn {specialite} {ville} medecin tunisie"
    try:
        search_url = f"{SEARX}?q={urllib.parse.quote(q)}&format=json&engines=google,bing"
        req = urllib.request.Request(search_url, headers={"User-Agent": "EthicaBot/1.0"})
        # `with` closes the HTTP response deterministically (original leaked it).
        with urllib.request.urlopen(req, timeout=15) as r:
            data = json.loads(r.read())
        results = []
        for res in data.get("results", []):
            title = res.get("title", "")
            content = res.get("content", "")
            # Renamed from `url` — the original shadowed the request URL variable.
            result_url = res.get("url", "")
            # Extract doctor info from search-result snippets.
            nom_match = re.search(r"Dr\.?\s*([\w\s]+)", title + " " + content)
            email_match = re.findall(r'[\w.+-]+@[\w-]+\.[\w.]+', content)
            # NOTE(review): loose pattern — may match dates/IDs as phone numbers.
            phone_match = re.findall(r'[\+]?[0-9\s\-]{8,15}', content)
            if nom_match:
                results.append({
                    "nom": nom_match.group(1).strip()[:100],
                    "specialite": specialite,
                    "pays": "TN",
                    "source": "cnam_search",
                    "source_url": result_url,
                    "email": email_match[0] if email_match else None,
                    "telephone": phone_match[0].strip() if phone_match else None,
                })
        return results
    except Exception as e:
        # Boundary handler: any failure degrades to "no results" but is logged.
        log(f" ERROR search: {e}")
        return []
|
|
|
|
def upsert_hcp(conn, hcp):
    """Insert or update HCP - NEVER overwrite existing email/phone.

    Existing email/telephone win via COALESCE(existing, new); rows whose
    source is 'manual' are never updated. Parameterized SQL throughout.

    Args:
        conn: open psycopg2 connection.
        hcp: dict with at least nom/specialite/pays/source keys.

    Returns:
        True on commit, False on any DB error (transaction rolled back).
    """
    try:
        # Cursor as context manager so it is always closed (original leaked it).
        with conn.cursor() as cur:
            cur.execute("""
            INSERT INTO ethica.medecins_real (nom, specialite, pays, source, source_url, email, telephone, created_at)
            VALUES (%s, %s, %s, %s, %s, %s, %s, NOW())
            ON CONFLICT (nom, pays) DO UPDATE SET
                source_url = COALESCE(EXCLUDED.source_url, ethica.medecins_real.source_url),
                email = COALESCE(ethica.medecins_real.email, EXCLUDED.email),
                telephone = COALESCE(ethica.medecins_real.telephone, EXCLUDED.telephone)
            WHERE ethica.medecins_real.source != 'manual'
            """, (hcp["nom"], hcp["specialite"], hcp["pays"], hcp["source"],
                  hcp.get("source_url"), hcp.get("email"), hcp.get("telephone")))
        conn.commit()
        return True
    except Exception as e:
        conn.rollback()
        # Original swallowed the error silently; log it for diagnosability.
        log(f" ERROR upsert {hcp.get('nom')}: {e}")
        return False
|
|
|
|
def main():
    """Run the scraper: search each speciality and upsert results.

    Processes at most BATCH specialities, sleeping 3 s between searches
    to rate-limit the SearXNG instance.
    """
    conn = psycopg2.connect(DB)
    log(f"START CNAM TN scraper batch={BATCH}")

    total_found = 0
    total_inserted = 0

    try:
        for spec in SPECIALITES[:BATCH]:
            results = search_cnam(spec)
            total_found += len(results)
            for hcp in results:
                if upsert_hcp(conn, hcp):
                    total_inserted += 1
            log(f" {spec}: {len(results)} found")
            time.sleep(3)  # Rate limit

        log(f"DONE found={total_found} inserted={total_inserted}")
    finally:
        # Release the connection even if a search/upsert raises
        # (original leaked it on any uncaught exception).
        conn.close()
|
|
|
|
# Script entry point: run the scraper only when executed directly.
if __name__ == "__main__":
    main()