Files
html/api/pw_scraper.py
2026-04-12 22:57:03 +02:00

137 lines
6.0 KiB
Python

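#!/usr/bin/env python3
"""Playwright DabaDoc deep scraper.

Loads DabaDoc listing pages in headless Chromium so that JavaScript-rendered
results are visible, extracts the doctors listed for each specialty and
inserts the ones not already present into ethica.medecins_validated.

Usage: pw_scraper.py [country] [batch]
"""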
import json
import os
import re
import sys
import time

import psycopg2
from playwright.sync_api import sync_playwright
DB = dict(host="10.1.0.3", dbname="adx_system", user="admin", password="admin123")
country = sys.argv[1] if len(sys.argv) > 1 else "ma"
batch = int(sys.argv[2]) if len(sys.argv) > 2 else 99999
pays_code = country.upper()
# DabaDoc specialty slugs, in crawl order. Each slug is appended to
# https://www.dabadoc.com/<country>/ to form a listing URL.
SPECS = """
    medecin-generaliste dentiste cardiologue pediatre gynecologue
    dermatologue ophtalmologue orl gastro-enterologue pneumologue
    rhumatologue endocrinologue neurologue urologue nephrologue
    psychiatre allergologue orthopediste radiologue pharmacien
    chirurgien anesthesiste hematologue oncologue nutritionniste
    medecin-du-travail kinesitherapeute sage-femme
""".split()

# Slugs whose stored specialty name differs from the URL slug; any slug not
# listed here is stored unchanged.
SPEC_MAP = {
    "medecin-generaliste": "generaliste",
}
# ---------------------------------------------------------------------------
# Main scraping run.
#
# For every specialty slug in SPECS, listing pages 1..100 are opened in
# headless Chromium, the doctor links are extracted with the JavaScript below
# and each doctor not yet present in ethica.medecins_validated is inserted
# (one commit per row). A specialty is abandoned after two consecutive pages
# without results; the whole run stops once `batch` new rows were inserted.
# Page and row failures are reported on stderr and skipped.
# ---------------------------------------------------------------------------

# JavaScript evaluated inside every listing page. Returns one
# {name, href, specialty, city} object per <h2>/<h3> link whose href contains
# "dabadoc.com" and whose text is longer than 3 characters.
# NOTE(review): `city` is element 4 of href.split('/'), i.e. the second path
# segment of an absolute URL - confirm this is the city and not the specialty
# slug. `specialty` is returned but not used by the Python side.
_EXTRACT_DOCTORS_JS = """() => {
    const results = [];
    // Get all h2/h3 links (doctor names)
    document.querySelectorAll('h2 a, h3 a').forEach(el => {
        const name = el.textContent.trim();
        const href = el.href || '';
        if (href.includes('dabadoc.com') && name.length > 3) {
            // Try to get specialty and location from parent card
            const card = el.closest('article, .card, div[class*=card], div[class*=result]') || el.parentElement.parentElement;
            let specialty = '', city = '', phone = '';
            if (card) {
                const texts = card.innerText.split('\\n').map(t => t.trim()).filter(t => t.length > 0);
                // Usually: Name, Specialty, City
                for (const t of texts) {
                    if (t.match(/^(Dr|Pr|Prof)/i)) continue;
                    if (!specialty && t.match(/(médecin|dentiste|cardio|pédiatre|gynéco|dermato|ophtalmo|orl|gastro|pneumo|rhumato|endocrino|neuro|uro|néphro|psychiatre|allergo|orthopéd|radio|pharma|chirurg|anesthés|hémato|onco|nutri|kiné|sage)/i)) {
                        specialty = t;
                    }
                }
            }
            // Extract city from URL
            const urlParts = href.split('/');
            const cityFromUrl = urlParts.length > 5 ? urlParts[4] : '';
            results.push({name, href, specialty, city: cityFromUrl});
        }
    });
    return results;
}"""

# Leading honorific to strip from a scraped name. "Prof" is tried before "Pr"
# so it is consumed as a whole, and the word boundary keeps untitled names
# that merely start with those letters (e.g. "Driss ...") intact.
_TITLE_RE = re.compile(r'^(?:Prof|Dr|Pr)\b\.?\s*', re.I)

# A doctor counts as already known when name, first name, specialty and
# country all match (names compared case- and whitespace-insensitively).
_DEDUP_SQL = (
    "SELECT 1 FROM ethica.medecins_validated WHERE LOWER(TRIM(nom))=LOWER(TRIM(%s)) AND LOWER(TRIM(prenom))=LOWER(TRIM(%s)) AND specialite=%s AND pays=%s LIMIT 1"
)
_INSERT_SQL = (
    "INSERT INTO ethica.medecins_validated (nom,prenom,specialite,ville,pays,source,profile_url,created_at) VALUES(%s,%s,%s,%s,%s,'playwright_deep',%s,NOW())"
)

conn = psycopg2.connect(**DB)
cur = conn.cursor()
total_new = 0  # rows inserted so far in this run, across all specialties

try:
    with sync_playwright() as p:
        browser = p.chromium.launch(
            headless=True,
            args=["--no-sandbox", "--disable-dev-shm-usage"],
        )
        try:
            ctx = browser.new_context(
                user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36"
            )

            for spec in SPECS:
                if total_new >= batch:
                    break

                canon_spec = SPEC_MAP.get(spec, spec)
                page_num = 1
                consecutive_empty = 0

                while page_num <= 100 and total_new < batch:
                    url = f"https://www.dabadoc.com/{country}/{spec}"
                    if page_num > 1:
                        url += f"/page/{page_num}"

                    # Fetch one listing page; a failure skips just this page.
                    page = None
                    try:
                        page = ctx.new_page()
                        page.goto(url, timeout=15000, wait_until="domcontentloaded")
                        # Give client-side rendering time to fill in the results.
                        page.wait_for_timeout(2000)
                        doctors = page.evaluate(_EXTRACT_DOCTORS_JS)
                    except Exception as exc:
                        print(f"[{spec}] page {page_num} failed: {exc}", file=sys.stderr)
                        page_num += 1
                        continue
                    finally:
                        if page is not None:
                            try:
                                page.close()
                            except Exception:
                                # Best effort: the page may already be gone.
                                pass

                    if not doctors:
                        consecutive_empty += 1
                        if consecutive_empty >= 2:
                            break
                        page_num += 1
                        continue
                    consecutive_empty = 0

                    for doc in doctors:
                        # Enforce the limit per row so one page cannot overshoot it.
                        if total_new >= batch:
                            break

                        name = _TITLE_RE.sub('', doc['name']).strip()
                        parts = name.split(None, 1)
                        if not parts or len(parts[0]) < 2:
                            continue
                        # First token is stored as the family name, the rest
                        # as the given name(s).
                        nom = parts[0].upper()
                        prenom = parts[1].title() if len(parts) > 1 else ""
                        ville = (doc.get('city') or '').replace('-', ' ').title()
                        profile_url = doc.get('href', '')

                        try:
                            cur.execute(_DEDUP_SQL, (nom, prenom, canon_spec, pays_code))
                            if cur.fetchone():
                                continue
                            cur.execute(
                                _INSERT_SQL,
                                (nom, prenom, canon_spec, ville, pays_code, profile_url),
                            )
                            conn.commit()
                        except Exception as exc:
                            # A failed statement leaves the transaction aborted;
                            # roll back so the following rows can still be
                            # processed. Ctrl-C / SystemExit are not caught here.
                            conn.rollback()
                            print(f"[{spec}] row {nom!r} {prenom!r} skipped: {exc}", file=sys.stderr)
                            continue
                        total_new += 1

                    page_num += 1
                    time.sleep(1)  # throttle between successfully processed pages

                print(f"[{spec}] +{total_new} total ({pays_code})")
        finally:
            browser.close()
finally:
    # Release the database resources even when the run is interrupted.
    cur.close()
    conn.close()

print(f"PLAYWRIGHT_{pays_code}: +{total_new} new doctors")