#!/usr/bin/env python3
"""Playwright DabaDoc deep scraper - bypasses JS, gets all doctors.

Walks every specialty listing page for one country on dabadoc.com, extracts
doctor cards inside the browser via a JS snippet, and inserts previously
unseen (nom, prenom, specialite, pays) rows into ethica.medecins_validated.

CLI:
    scraper.py [country] [batch]
        country  DabaDoc country path segment (default "ma")
        batch    stop after this many new rows (default 99999, i.e. all)
"""
import os
import re
import sys
import time

import psycopg2
from playwright.sync_api import sync_playwright

# Connection settings are overridable via environment variables so credentials
# need not live in source control; the historical hard-coded values remain the
# defaults for backward compatibility.
DB = dict(
    host=os.environ.get("ADX_DB_HOST", "10.1.0.3"),
    dbname=os.environ.get("ADX_DB_NAME", "adx_system"),
    user=os.environ.get("ADX_DB_USER", "admin"),
    password=os.environ.get("ADX_DB_PASSWORD", "admin123"),
)

country = sys.argv[1] if len(sys.argv) > 1 else "ma"
batch = int(sys.argv[2]) if len(sys.argv) > 2 else 99999
pays_code = country.upper()

# URL slugs for every specialty listing to crawl.
SPECS = [
    "medecin-generaliste", "dentiste", "cardiologue", "pediatre", "gynecologue",
    "dermatologue", "ophtalmologue", "orl", "gastro-enterologue", "pneumologue",
    "rhumatologue", "endocrinologue", "neurologue", "urologue", "nephrologue",
    "psychiatre", "allergologue", "orthopediste", "radiologue", "pharmacien",
    "chirurgien", "anesthesiste", "hematologue", "oncologue", "nutritionniste",
    "medecin-du-travail", "kinesitherapeute", "sage-femme",
]

# Maps a URL slug to the canonical specialty name stored in the DB.
SPEC_MAP = {"medecin-generaliste": "generaliste"}

# Runs in the page context: collects {name, href, specialty, city} for every
# doctor card (h2/h3 link pointing back at dabadoc.com) on a listing page.
EXTRACT_JS = """() => {
    const results = [];
    // Get all h2/h3 links (doctor names)
    document.querySelectorAll('h2 a, h3 a').forEach(el => {
        const name = el.textContent.trim();
        const href = el.href || '';
        if (href.includes('dabadoc.com') && name.length > 3) {
            // Try to get specialty and location from parent card
            const card = el.closest('article, .card, div[class*=card], div[class*=result]')
                || el.parentElement.parentElement;
            let specialty = '', city = '', phone = '';
            if (card) {
                const texts = card.innerText.split('\\n').map(t => t.trim()).filter(t => t.length > 0);
                // Usually: Name, Specialty, City
                for (const t of texts) {
                    if (t.match(/^(Dr|Pr|Prof)/i)) continue;
                    if (!specialty && t.match(/(médecin|dentiste|cardio|pédiatre|gynéco|dermato|ophtalmo|orl|gastro|pneumo|rhumato|endocrino|neuro|uro|néphro|psychiatre|allergo|orthopéd|radio|pharma|chirurg|anesthés|hémato|onco|nutri|kiné|sage)/i)) {
                        specialty = t;
                    }
                }
            }
            // Extract city from URL
            const urlParts = href.split('/');
            const cityFromUrl = urlParts.length > 5 ? urlParts[4] : '';
            results.push({name, href, specialty, city: cityFromUrl});
        }
    });
    return results;
}"""

# Compiled once: strips a leading "Dr"/"Pr"/"Prof" title (optional dot) from a name.
_TITLE_RE = re.compile(r'^(Dr|Pr|Prof)\.?\s*', re.I)


def _split_name(raw):
    """Split a scraped display name into (NOM, Prenom).

    Strips any Dr/Pr/Prof prefix; the first token becomes the upper-cased
    family name and the remainder the title-cased given name(s).  Returns
    None when the name is unusable (empty, or family name shorter than 2).
    """
    name = _TITLE_RE.sub('', raw).strip()
    parts = name.split(None, 1)
    if not parts or len(parts[0]) < 2:
        return None
    return parts[0].upper(), (parts[1].title() if len(parts) > 1 else "")


def _insert_doctor(cur, conn, nom, prenom, canon_spec, ville, profile_url):
    """Insert one doctor row unless an equivalent row already exists.

    Commits on success; rolls back and skips on any DB error (e.g. a
    constraint violation).  Returns True iff a new row was inserted.
    """
    cur.execute(
        "SELECT 1 FROM ethica.medecins_validated "
        "WHERE LOWER(TRIM(nom))=LOWER(TRIM(%s)) AND LOWER(TRIM(prenom))=LOWER(TRIM(%s)) "
        "AND specialite=%s AND pays=%s LIMIT 1",
        (nom, prenom, canon_spec, pays_code),
    )
    if cur.fetchone():
        return False
    try:
        cur.execute(
            "INSERT INTO ethica.medecins_validated "
            "(nom,prenom,specialite,ville,pays,source,profile_url,created_at) "
            "VALUES(%s,%s,%s,%s,%s,'playwright_deep',%s,NOW())",
            (nom, prenom, canon_spec, ville, pays_code, profile_url),
        )
        conn.commit()
        return True
    except psycopg2.Error:
        # Row-level failure (duplicate key, bad value) must not poison the
        # transaction for subsequent inserts.
        conn.rollback()
        return False


def main():
    """Crawl all specialties for the configured country and load new doctors."""
    conn = psycopg2.connect(**DB)
    cur = conn.cursor()
    total_new = 0
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--no-sandbox", "--disable-dev-shm-usage"],
            )
            ctx = browser.new_context(
                user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36"
            )
            for spec in SPECS:
                if total_new >= batch:
                    break
                page_num = 1
                consecutive_empty = 0
                # Hard cap of 100 pages per specialty guards against loops.
                while page_num <= 100 and total_new < batch:
                    url = f"https://www.dabadoc.com/{country}/{spec}"
                    if page_num > 1:
                        url += f"/page/{page_num}"
                    try:
                        page = ctx.new_page()
                        try:
                            page.goto(url, timeout=15000, wait_until="domcontentloaded")
                            page.wait_for_timeout(2000)
                            doctors = page.evaluate(EXTRACT_JS)
                        finally:
                            # Always release the page, even on navigation errors.
                            page.close()
                        if not doctors:
                            consecutive_empty += 1
                            if consecutive_empty >= 2:
                                # Two empty pages in a row: specialty exhausted.
                                break
                            page_num += 1
                            continue
                        consecutive_empty = 0
                        for doc in doctors:
                            split = _split_name(doc['name'])
                            if split is None:
                                continue
                            nom, prenom = split
                            canon_spec = SPEC_MAP.get(spec, spec)
                            city = doc.get('city', '')
                            ville = city.replace('-', ' ').title() if city else ''
                            if _insert_doctor(cur, conn, nom, prenom, canon_spec,
                                              ville, doc.get('href', '')):
                                total_new += 1
                        page_num += 1
                        time.sleep(1)  # be polite between listing pages
                    except Exception:
                        # Timeouts/navigation failures are expected in bulk
                        # scraping; skip to the next page rather than abort.
                        page_num += 1
                        continue
                print(f"[{spec}] +{total_new} total ({pays_code})")
            browser.close()
    finally:
        # Release DB resources even if the crawl aborts mid-way.
        cur.close()
        conn.close()
    print(f"PLAYWRIGHT_{pays_code}: +{total_new} new doctors")


if __name__ == "__main__":
    main()