# Source file: weval-l99/ethica-cromc-playwright.py
# Snapshot: 2026-04-15 01:38:46 +02:00
# Size: 118 lines, 4.4 KiB (Python)

#!/usr/bin/env python3
"""ETHICA CROMC MA Scraper via Playwright - scrapes cromc.ma doctor directory"""
import json, re, time, sys, subprocess
try:
import psycopg2
except ImportError:
subprocess.run(['pip3', 'install', 'psycopg2-binary', '--break-system-packages'], capture_output=True)
import psycopg2
# libpq connection string passed to psycopg2.connect().
# NOTE(review): credentials are hard-coded in source; consider loading them
# from the environment or a secrets store.
DB = 'host=10.1.0.3 dbname=adx_system user=admin password=admin123'

# File that log() appends every message to (in addition to stdout).
LOG = '/opt/weval-l99/logs/ethica-cromc-pw.log'

# Maximum number of result pages to visit; optional first CLI argument.
_DEFAULT_BATCH = 50
try:
    BATCH = int(sys.argv[1]) if len(sys.argv) > 1 else _DEFAULT_BATCH
except ValueError:
    # A non-numeric argument used to abort the script with a traceback at
    # import time; fall back to the default and say so instead.
    print(
        f"CROMC-PW: invalid batch size {sys.argv[1]!r}, using {_DEFAULT_BATCH}",
        file=sys.stderr,
    )
    BATCH = _DEFAULT_BATCH

# Site root: used to build search URLs and stored as source_url on each row.
BASE = 'https://cromc.ma'
# Keyword -> normalised specialty slug.
# Insertion order is significant: mapspec() walks this dict in order and
# returns the slug of the first keyword found as a substring of the label.
SPECS_MAP = {
    'medecine generale': 'generaliste',
    'generaliste': 'generaliste',
    'cardiologie': 'cardiologue',
    'dermatologie': 'dermatologue',
    'gastro': 'gastro-enterologue',
    'gynecologie': 'gynecologue',
    'neurologie': 'neurologue',
    'ophtalmologie': 'ophtalmologue',
    'orl': 'orl',
    'pediatrie': 'pediatre',
    'psychiatrie': 'psychiatre',
    'pneumologie': 'pneumologue',
    'rhumatologie': 'rhumatologue',
    'urologie': 'urologue',
    'chirurgie': 'chirurgien',
    'radiologie': 'radiologue',
    'endocrinologie': 'endocrinologue',
    'nephrologie': 'nephrologue',
    'oncologie': 'oncologue',
    'hematologie': 'hematologue',
    'allergologie': 'allergologue',
    'anesthesie': 'anesthesiste',
    'dentiste': 'dentiste',
    'pharmacie': 'pharmacien',
    'medecine interne': 'medecin-interne',
    'medecine physique': 'medecin-physique',
}
def log(msg):
    """Print a timestamped message and append it to the LOG file.

    Writing to the file is best-effort: if the log file cannot be opened or
    written (missing directory, permissions, full disk) the message is still
    printed and the failure is ignored so the scrape keeps running.

    Args:
        msg: Text to record; it is interpolated into the log line as-is.
    """
    line = f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] CROMC-PW: {msg}"
    print(line, flush=True)
    try:
        # Explicit UTF-8 so accented names do not depend on the locale;
        # errors='replace' keeps odd characters from aborting the write.
        with open(LOG, 'a', encoding='utf-8', errors='replace') as f:
            f.write(line + '\n')
    except OSError:
        # Only filesystem failures are tolerated; KeyboardInterrupt and
        # programming errors are no longer swallowed.
        pass
def mapspec(raw, mapping=None):
    """Map a free-text specialty label to a normalised slug.

    The label is lower-cased and stripped, then compared against the
    keywords of *mapping* in insertion order; the slug of the first keyword
    that occurs as a substring is returned. Matching is done on an
    accent-folded copy of the label, so accented input such as "Pédiatrie"
    matches the unaccented keyword "pediatrie".

    Args:
        raw: Specialty text as scraped; ``None`` is treated as empty.
        mapping: Optional keyword -> slug dict. Defaults to SPECS_MAP.

    Returns:
        The mapped slug, or, when no keyword matches, the lower-cased label
        with spaces replaced by hyphens, truncated to 50 characters.
    """
    import unicodedata

    if mapping is None:
        mapping = SPECS_MAP
    label = (raw or '').lower().strip()
    # Decompose accented letters and drop the combining marks so that the
    # ASCII keywords can match accented French labels.
    decomposed = unicodedata.normalize('NFKD', label)
    folded = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    for keyword, slug in mapping.items():
        if keyword in folded:
            return slug
    # Fallback keeps the original (unfolded) text, as before.
    return label.replace(' ', '-')[:50]
def scrape_cromc_page(page, page_num):
    """Load one search-result page and extract doctor records from it.

    Args:
        page: Playwright page object used for navigation and DOM queries.
        page_num: Number substituted into the ``?page=`` query parameter.

    Returns:
        A list of dicts with keys ``nom``, ``specialite``, ``ville``,
        ``secteur``, ``pays`` and ``source``. Any exception raised while
        loading or parsing is logged and an empty list is returned.
    """
    url = f"{BASE}/recherche/?page={page_num}"
    log(f" Fetching page {page_num}: {url}")
    try:
        page.goto(url, timeout=30000, wait_until='domcontentloaded')
        time.sleep(2)  # fixed pause after navigation before querying the DOM

        card_selector = '.doctor-card, .result-item, tr.doctor, .media, .list-group-item'
        # Fall back to plain table rows when none of the card selectors hit.
        elements = page.query_selector_all(card_selector) or page.query_selector_all('table tr')

        doctors = []
        for element in elements:
            # One field per non-empty line of the element's visible text.
            fields = [part.strip() for part in element.inner_text().split('\n') if part.strip()]
            if len(fields) < 2:
                continue
            name_tokens = fields[0].replace('Dr.', '').replace('DR.', '').split()
            if len(name_tokens) < 2:
                continue
            # Pad so the optional 3rd/4th fields default to ''.
            padded = fields + ['', '']
            doctors.append({
                'nom': ' '.join(name_tokens[:3])[:100],
                'specialite': mapspec(fields[1]),
                'ville': padded[2],
                'secteur': padded[3],
                'pays': 'MA',
                'source': 'cromc_playwright',
            })
        log(f" Page {page_num}: {len(doctors)} doctors found")
        return doctors
    except Exception as e:
        log(f" ERROR page {page_num}: {e}")
        return []
def upsert(conn, hcp):
    """Insert one scraped doctor into ethica.cromc_reference.

    The row is committed immediately. Rows that hit a conflict are skipped
    by ``ON CONFLICT DO NOTHING``.

    Args:
        conn: Open psycopg2 connection.
        hcp: Dict as produced by scrape_cromc_page(); missing keys default
            to the empty string. ``prenom`` is always stored empty and
            ``source_url`` is always BASE.

    Returns:
        True only if a row was actually inserted; False if the row was
        skipped as a conflict or a database error occurred (the error is
        logged and the transaction rolled back).
    """
    # The context manager closes the cursor on every path; previously it
    # was never closed.
    with conn.cursor() as cur:
        try:
            cur.execute(
                """INSERT INTO ethica.cromc_reference (nom, prenom, specialite, ville, secteur, source_url, source, created_at)
                VALUES (%s, %s, %s, %s, %s, %s, %s, NOW())
                ON CONFLICT DO NOTHING""",
                (
                    hcp.get('nom', ''),
                    '',
                    hcp.get('specialite', ''),
                    hcp.get('ville', ''),
                    hcp.get('secteur', ''),
                    BASE,
                    hcp.get('source', ''),
                ),
            )
            # rowcount is 0 when ON CONFLICT DO NOTHING skipped the row, so
            # duplicates are no longer reported as inserts.
            was_inserted = cur.rowcount > 0
            conn.commit()
            return was_inserted
        except Exception as e:
            conn.rollback()
            log(f" DB ERROR: {e}")
            return False
def main():
    """Scrape up to BATCH result pages and store every doctor found.

    Pages are visited in order starting at 1. The run stops early at the
    first page that yields no records (scrape_cromc_page also returns an
    empty list on error). The browser and the database connection are
    released even if the run fails part-way.
    """
    # Imported here so the module can be loaded without Playwright installed.
    from playwright.sync_api import sync_playwright

    conn = psycopg2.connect(DB)
    try:
        log(f"START CROMC Playwright scraper batch={BATCH}")
        total = 0
        inserted = 0
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                page.set_extra_http_headers({"Accept-Language": "fr-FR,fr;q=0.9"})
                for pg in range(1, BATCH + 1):
                    results = scrape_cromc_page(page, pg)
                    if not results:
                        log(f" No more results at page {pg}, stopping")
                        break
                    total += len(results)
                    for hcp in results:
                        if upsert(conn, hcp):
                            inserted += 1
                    # Pause between pages to avoid hammering the site.
                    time.sleep(3)
            finally:
                browser.close()
        log(f"DONE total={total} inserted={inserted}")
    finally:
        # Always release the connection, including on errors above.
        conn.close()


if __name__ == "__main__":
    main()