#!/usr/bin/env python3
"""Playwright DabaDoc deep scraper - loads listing pages in a real browser (so JS-rendered content is visible) and collects all listed doctors."""

import json
import os
import re
import sys
import time

import psycopg2
from playwright.sync_api import sync_playwright


# PostgreSQL connection parameters, passed as keyword arguments to
# psycopg2.connect().  Each value can be overridden through an environment
# variable so credentials do not have to live in the source file; when a
# variable is unset the historical default is used, so existing invocations
# keep working unchanged.
DB = dict(
    host=os.environ.get("ADX_DB_HOST", "10.1.0.3"),
    dbname=os.environ.get("ADX_DB_NAME", "adx_system"),
    user=os.environ.get("ADX_DB_USER", "admin"),
    password=os.environ.get("ADX_DB_PASSWORD", "admin123"),
)

# Command line: [country] [batch] -- both positional arguments are optional.
_cli_args = sys.argv[1:]

# Country code interpolated into the listing URLs (defaults to "ma").
country = _cli_args[0] if _cli_args else "ma"

# Maximum number of new rows to insert during this run.
batch = int(_cli_args[1]) if len(_cli_args) >= 2 else 99999

# Upper-cased country code, used as the `pays` value in the database queries.
pays_code = country.upper()


# Specialty slugs, visited in this order; each one becomes the last path
# segment of a listing URL.
SPECS = [
    "medecin-generaliste",
    "dentiste",
    "cardiologue",
    "pediatre",
    "gynecologue",
    "dermatologue",
    "ophtalmologue",
    "orl",
    "gastro-enterologue",
    "pneumologue",
    "rhumatologue",
    "endocrinologue",
    "neurologue",
    "urologue",
    "nephrologue",
    "psychiatre",
    "allergologue",
    "orthopediste",
    "radiologue",
    "pharmacien",
    "chirurgien",
    "anesthesiste",
    "hematologue",
    "oncologue",
    "nutritionniste",
    "medecin-du-travail",
    "kinesitherapeute",
    "sage-femme",
]

# Slugs whose stored specialty name differs from the URL slug; any slug not
# listed here is stored as-is.
SPEC_MAP = {
    "medecin-generaliste": "generaliste",
}


# Upper bound on listing pages visited per specialty.
MAX_PAGES_PER_SPEC = 100

# Honorific prefix at the start of a scraped name.  The title must be followed
# by a dot or by whitespace, so that real names such as "Driss" are left
# intact, and "Prof" is tried before "Pr" so that "Prof." is removed whole.
TITLE_RE = re.compile(r'^(?:Prof|Dr|Pr)(?:\.\s*|\s+)', re.I)

DEDUP_SQL = "SELECT 1 FROM ethica.medecins_validated WHERE LOWER(TRIM(nom))=LOWER(TRIM(%s)) AND LOWER(TRIM(prenom))=LOWER(TRIM(%s)) AND specialite=%s AND pays=%s LIMIT 1"

INSERT_SQL = "INSERT INTO ethica.medecins_validated (nom,prenom,specialite,ville,pays,source,profile_url,created_at) VALUES(%s,%s,%s,%s,%s,'playwright_deep',%s,NOW())"

# JavaScript evaluated inside the listing page.  It returns one
# {name, href, specialty, city} object per <h2>/<h3> link that points to
# dabadoc.com and whose text is longer than 3 characters.
# NOTE(review): `city` is taken from urlParts[4], i.e. the first path segment
# after the country code -- confirm against real profile URLs that this is the
# city and not the specialty slug.
EXTRACT_DOCTORS_JS = """() => {
    const results = [];
    // Get all h2/h3 links (doctor names)
    document.querySelectorAll('h2 a, h3 a').forEach(el => {
        const name = el.textContent.trim();
        const href = el.href || '';
        if (href.includes('dabadoc.com') && name.length > 3) {
            // Try to get specialty and location from parent card
            const card = el.closest('article, .card, div[class*=card], div[class*=result]') || el.parentElement.parentElement;
            let specialty = '', city = '', phone = '';
            if (card) {
                const texts = card.innerText.split('\\n').map(t => t.trim()).filter(t => t.length > 0);
                // Usually: Name, Specialty, City
                for (const t of texts) {
                    if (t.match(/^(Dr|Pr|Prof)/i)) continue;
                    if (!specialty && t.match(/(médecin|dentiste|cardio|pédiatre|gynéco|dermato|ophtalmo|orl|gastro|pneumo|rhumato|endocrino|neuro|uro|néphro|psychiatre|allergo|orthopéd|radio|pharma|chirurg|anesthés|hémato|onco|nutri|kiné|sage)/i)) {
                        specialty = t;
                    }
                }
            }
            // Extract city from URL
            const urlParts = href.split('/');
            const cityFromUrl = urlParts.length > 5 ? urlParts[4] : '';

            results.push({name, href, specialty, city: cityFromUrl});
        }
    });
    return results;
}"""


def split_doctor_name(raw):
    """Split a scraped display name into ``(nom, prenom)``.

    A leading "Dr", "Pr" or "Prof" title (optionally followed by a dot) is
    removed.  The first remaining word becomes the upper-cased ``nom``; the
    rest, title-cased, becomes ``prenom`` (empty string when absent).

    Returns ``None`` when nothing usable remains or when the first word is
    shorter than two characters.
    """
    name = TITLE_RE.sub('', raw.strip()).strip()
    parts = name.split(None, 1)
    if not parts or len(parts[0]) < 2:
        return None
    nom = parts[0].upper()
    prenom = parts[1].title() if len(parts) > 1 else ""
    return nom, prenom


def format_city(slug):
    """Turn a URL slug such as ``"el-jadida"`` into ``"El Jadida"``.

    Returns an empty string for an empty or missing slug.
    """
    if not slug:
        return ''
    return slug.replace('-', ' ').title()


def listing_url(country_code, spec, page_num):
    """Build the listing URL of one specialty page.

    Page 1 is the bare specialty URL; later pages get a ``/page/<n>`` suffix.
    """
    url = f"https://www.dabadoc.com/{country_code}/{spec}"
    if page_num > 1:
        url += f"/page/{page_num}"
    return url


def fetch_doctors(ctx, url):
    """Open *url* in a new page of browser context *ctx* and extract doctors.

    Returns whatever ``EXTRACT_DOCTORS_JS`` evaluates to.  The page is closed
    even when navigation or evaluation raises, so failed pages do not pile up
    in the browser.
    """
    page = ctx.new_page()
    try:
        page.goto(url, timeout=15000, wait_until="domcontentloaded")
        # Fixed grace period for client-side rendering after DOMContentLoaded.
        page.wait_for_timeout(2000)
        return page.evaluate(EXTRACT_DOCTORS_JS)
    finally:
        page.close()


def insert_if_new(conn, cur, doc, canon_spec):
    """Insert one scraped doctor unless a matching row already exists.

    A row matches when nom, prenom (both compared trimmed and
    case-insensitively), specialite and pays are equal.  Reads the module
    global ``pays_code``.

    Returns ``True`` when a row was inserted and committed, ``False`` when the
    name was unusable, the doctor already existed, or the database rejected
    the statement.
    """
    parsed = split_doctor_name(doc.get('name') or '')
    if parsed is None:
        return False
    nom, prenom = parsed
    ville = format_city(doc.get('city'))
    profile_url = doc.get('href', '')

    try:
        cur.execute(DEDUP_SQL, (nom, prenom, canon_spec, pays_code))
        if cur.fetchone():
            return False
        cur.execute(INSERT_SQL, (nom, prenom, canon_spec, ville, pays_code, profile_url))
        conn.commit()
    except psycopg2.Error as exc:
        # Roll back so the connection leaves the aborted-transaction state;
        # otherwise every later statement on it would fail as well.
        conn.rollback()
        print(f"[{canon_spec}] DB error for {nom} {prenom}: {exc}", file=sys.stderr)
        return False
    return True


def scrape_specialty(ctx, conn, cur, spec, remaining):
    """Walk the listing pages of one specialty and store unseen doctors.

    Stops after ``MAX_PAGES_PER_SPEC`` pages, after two consecutive pages
    without any doctor, or as soon as *remaining* rows have been inserted.
    Reads the module globals ``country`` and ``SPEC_MAP``.

    Returns the number of rows inserted (never more than *remaining*).
    """
    canon_spec = SPEC_MAP.get(spec, spec)
    added = 0
    consecutive_empty = 0

    for page_num in range(1, MAX_PAGES_PER_SPEC + 1):
        if added >= remaining:
            break

        url = listing_url(country, spec, page_num)
        try:
            doctors = fetch_doctors(ctx, url)
        except Exception as exc:
            # Best effort: a page that fails to load is reported and skipped,
            # the run continues with the next page.
            print(f"[{spec}] page {page_num} failed: {exc}", file=sys.stderr)
            continue

        if not doctors:
            consecutive_empty += 1
            if consecutive_empty >= 2:
                break
            continue
        consecutive_empty = 0

        for doc in doctors:
            # Enforce the limit per row, not only per page.
            if added >= remaining:
                break
            if insert_if_new(conn, cur, doc, canon_spec):
                added += 1

        # Pause between successfully processed pages.
        time.sleep(1)

    return added


def main():
    """Scrape every specialty in ``SPECS`` and insert unseen doctors.

    Reads the module globals ``DB``, ``SPECS``, ``batch`` and ``pays_code``.
    Prints a running total after each specialty and a final summary line.
    The browser, cursor and connection are released even when the run is
    interrupted or fails.
    """
    conn = psycopg2.connect(**DB)
    cur = conn.cursor()
    total_new = 0
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True, args=["--no-sandbox", "--disable-dev-shm-usage"])
            try:
                ctx = browser.new_context(user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36")
                for spec in SPECS:
                    if total_new >= batch:
                        break
                    total_new += scrape_specialty(ctx, conn, cur, spec, batch - total_new)
                    print(f"[{spec}] +{total_new} total ({pays_code})")
            finally:
                browser.close()
    finally:
        cur.close()
        conn.close()

    print(f"PLAYWRIGHT_{pays_code}: +{total_new} new doctors")


if __name__ == "__main__":
    main()