118 lines
4.4 KiB
Python
118 lines
4.4 KiB
Python
#!/usr/bin/env python3
|
|
"""ETHICA CROMC MA Scraper via Playwright - scrapes cromc.ma doctor directory"""
|
|
import json, re, time, sys, subprocess
|
|
try:
|
|
import psycopg2
|
|
except ImportError:
|
|
subprocess.run(['pip3', 'install', 'psycopg2-binary', '--break-system-packages'], capture_output=True)
|
|
import psycopg2
|
|
|
|
# libpq connection string handed to psycopg2.connect() in main().
# NOTE(review): credentials are hard-coded in source; consider loading them
# from the environment or a secrets store instead.
DB = 'host=10.1.0.3 dbname=adx_system user=admin password=admin123'

# File that log() appends every message to.
LOG = '/opt/weval-l99/logs/ethica-cromc-pw.log'

# Maximum number of directory pages to scrape: first CLI argument, default 50.
# A non-integer argument falls back to the default instead of raising
# ValueError at import time, so the module stays importable when another
# program (test runner, scheduler) owns sys.argv.
try:
    BATCH = int(sys.argv[1]) if len(sys.argv) > 1 else 50
except ValueError:
    print(f"CROMC-PW: invalid batch size {sys.argv[1]!r}, using 50", file=sys.stderr)
    BATCH = 50

# Site root; used to build page URLs and stored as source_url on every row.
BASE = 'https://cromc.ma'
|
|
|
|
# Keyword -> canonical specialty label.
# mapspec() walks this mapping in insertion order and returns the value of the
# first key found as a substring of the input, so entry order is significant.
SPECS_MAP = {
    'medecine generale': 'generaliste',
    'generaliste': 'generaliste',
    'cardiologie': 'cardiologue',
    'dermatologie': 'dermatologue',
    'gastro': 'gastro-enterologue',
    'gynecologie': 'gynecologue',
    'neurologie': 'neurologue',
    'ophtalmologie': 'ophtalmologue',
    'orl': 'orl',
    'pediatrie': 'pediatre',
    'psychiatrie': 'psychiatre',
    'pneumologie': 'pneumologue',
    'rhumatologie': 'rhumatologue',
    'urologie': 'urologue',
    'chirurgie': 'chirurgien',
    'radiologie': 'radiologue',
    'endocrinologie': 'endocrinologue',
    'nephrologie': 'nephrologue',
    'oncologie': 'oncologue',
    'hematologie': 'hematologue',
    'allergologie': 'allergologue',
    'anesthesie': 'anesthesiste',
    'dentiste': 'dentiste',
    'pharmacie': 'pharmacien',
    'medecine interne': 'medecin-interne',
    'medecine physique': 'medecin-physique',
}
|
|
|
|
def log(msg):
    """Print a timestamped message and append it to the LOG file.

    File logging is best-effort: when the log file cannot be opened or
    written (missing directory, permissions, full disk) the message is still
    printed and the I/O error is ignored.

    Args:
        msg: Text to record; it is prefixed with the local time and
            the "CROMC-PW:" tag.
    """
    line = f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] CROMC-PW: {msg}"
    print(line, flush=True)
    try:
        # Explicit UTF-8 so accented text is written the same way whatever
        # the process locale is; unencodable characters are replaced rather
        # than aborting the write.
        with open(LOG, 'a', encoding='utf-8', errors='replace') as f:
            f.write(line + '\n')
    except OSError:
        # Only I/O failures are swallowed. A bare ``except`` would also hide
        # KeyboardInterrupt, SystemExit and programming errors.
        pass
|
|
|
|
def mapspec(raw):
    """Map a free-text specialty label to a canonical specialty name.

    The label is lower-cased, trimmed and accent-folded, then compared with
    the keys of SPECS_MAP in insertion order; the value of the first key found
    as a substring is returned. The keys are unaccented, so folding is what
    lets labels such as "Pédiatrie" or "Médecine générale" match.

    Args:
        raw: Specialty text as scraped from the page.

    Returns:
        The mapped specialty, or, when no key matches, the lower-cased and
        trimmed label with spaces replaced by hyphens, cut to 50 characters.
    """
    # Function-scope import keeps this block self-contained.
    import unicodedata

    raw = raw.lower().strip()
    # NFKD splits "é" into "e" + combining accent; dropping the combining
    # marks leaves the plain ASCII letters used by the SPECS_MAP keys.
    folded = ''.join(
        ch for ch in unicodedata.normalize('NFKD', raw)
        if not unicodedata.combining(ch)
    )
    for k, v in SPECS_MAP.items():
        if k in folded:
            return v
    # Unknown specialty: keep the original (unfolded) text as a slug.
    return raw.replace(' ', '-')[:50]
|
|
|
|
def _parse_cromc_row(text):
    """Build a doctor record from the visible text of one result row.

    Returns None when the row has fewer than two non-empty fields or when the
    first field, after removing "Dr."/"DR.", has fewer than two words.
    """
    # innerText separates table cells with TAB and block-level elements with
    # newlines, so both are field separators. Splitting on newlines only
    # collapses every <tr> into a single field and drops the whole row.
    fields = [part.strip() for part in re.split(r'[\t\n]+', text) if part.strip()]
    if len(fields) < 2:
        return None
    nom_parts = fields[0].replace('Dr.', '').replace('DR.', '').strip().split()
    if len(nom_parts) < 2:
        return None
    return {
        # At most three name tokens are kept, capped at 100 characters.
        'nom': ' '.join(nom_parts[:3])[:100],
        'specialite': mapspec(fields[1]),
        'ville': fields[2] if len(fields) > 2 else '',
        'secteur': fields[3] if len(fields) > 3 else '',
        'pays': 'MA',
        'source': 'cromc_playwright',
    }


def scrape_cromc_page(page, page_num):
    """Load one page of the directory search and extract doctor records.

    Args:
        page: Browser page object exposing ``goto`` and
            ``query_selector_all`` (main() passes a Playwright page).
        page_num: Page number placed in the ``page`` query parameter.

    Returns:
        A list of dicts with the keys nom, specialite, ville, secteur, pays
        and source. The list is empty both when the page holds no usable rows
        and when any error occurred; errors are logged, never raised.
    """
    url = f"{BASE}/recherche/?page={page_num}"
    log(f" Fetching page {page_num}: {url}")
    try:
        page.goto(url, timeout=30000, wait_until='domcontentloaded')
        # Fixed pause after DOMContentLoaded before querying the DOM.
        time.sleep(2)
        rows = page.query_selector_all('.doctor-card, .result-item, tr.doctor, .media, .list-group-item')
        if not rows:
            # No known card/list markup: fall back to plain table rows.
            rows = page.query_selector_all('table tr')
        results = []
        for row in rows:
            record = _parse_cromc_row(row.inner_text())
            if record is not None:
                results.append(record)
        log(f" Page {page_num}: {len(results)} doctors found")
        return results
    except Exception as e:
        log(f" ERROR page {page_num}: {e}")
        return []
|
|
|
|
def upsert(conn, hcp):
    """Insert one scraped doctor into ethica.cromc_reference.

    The statement uses ON CONFLICT DO NOTHING, so a row that collides with an
    existing one is silently skipped by the database.

    Args:
        conn: Open DB-API connection (main() passes a psycopg2 connection).
        hcp: Record dict; the keys nom, specialite, ville, secteur and source
            are read, each defaulting to ''. prenom is always stored as ''
            and source_url as BASE.

    Returns:
        True only when a row was actually inserted. False when the row was
        skipped as a duplicate, or when the statement failed (the transaction
        is then rolled back and the error logged).
    """
    cur = conn.cursor()
    try:
        cur.execute(
            """INSERT INTO ethica.cromc_reference (nom, prenom, specialite, ville, secteur, source_url, source, created_at)
            VALUES (%s, %s, %s, %s, %s, %s, %s, NOW())
            ON CONFLICT DO NOTHING""",
            (
                hcp.get('nom', ''), '', hcp.get('specialite', ''), hcp.get('ville', ''),
                hcp.get('secteur', ''), BASE, hcp.get('source', ''),
            ),
        )
        # rowcount is 0 when ON CONFLICT DO NOTHING skipped the row; reporting
        # that as success would inflate the caller's "inserted" counter.
        inserted = cur.rowcount > 0
        conn.commit()
        return inserted
    except Exception as e:
        conn.rollback()
        log(f" DB ERROR: {e}")
        return False
    finally:
        # Release the cursor on every path; it was previously never closed.
        cur.close()
|
|
|
|
def main():
    """Scrape up to BATCH directory pages and store every doctor found.

    Pages are requested in order starting at 1. The loop stops at the first
    page for which scrape_cromc_page() returns no records; because that
    function also returns an empty list on errors, a failed page ends the run
    the same way as the real end of the listing.

    The browser and the database connection are released even when an
    exception interrupts the run.
    """
    # Imported here so the module can be loaded without Playwright installed.
    from playwright.sync_api import sync_playwright

    conn = psycopg2.connect(DB)
    try:
        log(f"START CROMC Playwright scraper batch={BATCH}")
        total = 0
        inserted = 0
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                page.set_extra_http_headers({"Accept-Language": "fr-FR,fr;q=0.9"})
                for pg in range(1, BATCH + 1):
                    results = scrape_cromc_page(page, pg)
                    if not results:
                        log(f" No more results at page {pg}, stopping")
                        break
                    total += len(results)
                    for hcp in results:
                        if upsert(conn, hcp):
                            inserted += 1
                    # Pause between pages to limit the load on the site.
                    time.sleep(3)
            finally:
                browser.close()
        log(f"DONE total={total} inserted={inserted}")
    finally:
        conn.close()
|
|
|
|
# Script entry point: run the scraper only when executed directly,
# not when the module is imported.
if __name__ == "__main__":
    main()
|