#!/usr/bin/env python3
"""
ETHICA Mega Scraper v1.0 - Souverain
Sources: algerie-docto.com, med.tn, DabaDoc DZ, DZDoc, medecinsalgerie.org
Enriches existing HCPs and adds new ones with email/phone.

Usage: pass an optional batch size as the first argument (default: 100).
"""
import re
import sys
import asyncio

import psycopg2
from playwright.async_api import async_playwright

DB = dict(host='10.1.0.3', port=5432, dbname='adx_system',
          user='admin', password='admin123')
BATCH = int(sys.argv[1]) if len(sys.argv) > 1 else 100

SOURCES = {
    'algerie_docto': {
        'base': 'https://algerie-docto.com',
        'pages': [
            '/generaliste/alger', '/generaliste/oran', '/generaliste/constantine',
            '/generaliste/annaba', '/generaliste/setif', '/generaliste/blida',
            '/cardiologue/alger', '/cardiologue/oran', '/cardiologue/constantine',
            '/pediatre/alger', '/pediatre/oran',
            '/gynecologue/alger', '/ophtalmologue/alger', '/dermatologue/alger',
            '/dentiste/alger', '/psychiatre/alger', '/neurologue/alger',
            '/pneumologue/alger',
        ],
        'pays': 'DZ',
    },
    'med_tn': {
        'base': 'https://www.med.tn',
        'pages': [
            '/medecin/generaliste/tunis', '/medecin/generaliste/sfax',
            '/medecin/generaliste/sousse',
            '/medecin/cardiologue/tunis', '/medecin/pediatre/tunis',
            '/medecin/gynecologue/tunis', '/medecin/ophtalmologue/tunis',
            '/medecin/dermatologue/tunis', '/medecin/dentiste/tunis',
            '/medecin/psychiatre/tunis',
        ],
        'pays': 'TN',
    },
    'dabadoc_dz': {
        'base': 'https://www.dabadoc.com',
        'pages': [
            '/dz/medecin-generaliste/alger', '/dz/medecin-generaliste/oran',
            '/dz/medecin-generaliste/constantine',
            '/dz/cardiologue/alger', '/dz/pediatre/alger', '/dz/gynecologue/alger',
            '/dz/ophtalmologue/alger', '/dz/dermatologue/alger',
        ],
        'pays': 'DZ',
    },
    'medecinsalgerie': {
        'base': 'https://www.medecinsalgerie.org',
        'pages': [
            '/fr/medecins/alger/', '/fr/medecins/oran/', '/fr/medecins/constantine/',
            '/fr/medecins/annaba/', '/fr/medecins/setif/', '/fr/medecins/blida/',
        ],
        'pays': 'DZ',
    },
}


def extract_contacts(text, url):
    """Extract candidate emails and phone numbers from page text."""
    emails = re.findall(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}', text)
    # Drop image artefacts and generic/role addresses.
    emails = [e.lower() for e in emails
              if not e.endswith(('.png', '.jpg', '.gif'))
              and 'example' not in e and 'noreply' not in e
              and 'support@' not in e and 'contact@' not in e
              and 'info@' not in e and 'admin@' not in e]

    phones = []
    # DZ phones (mobile and international formats)
    phones += re.findall(r'(?:\+213|0)[5-7]\s*[\d\s.-]{8,12}', text)
    phones += re.findall(r'(?:\+213)\s*[\d\s.-]{9,13}', text)
    # TN phones
    phones += re.findall(r'(?:\+216|[2-9])\s*[\d\s.-]{7,10}', text)
    # MA phones
    phones += re.findall(r'(?:\+212|0[5-7])\s*[\d\s.-]{8,12}', text)
    # Normalise separators, keep plausible lengths, de-duplicate.
    phones = list(set(re.sub(r'[\s.-]', '', p)[:20] for p in phones
                      if len(re.sub(r'[\s.-]', '', p)) >= 8))
    return emails[:5], phones[:5]


def extract_doctors(text, source_name, pays):
    """Extract doctor names from page text.

    Matches patterns like "Dr. Name Surname", "Docteur NAME" or "Pr Name".
    """
    doctors = []
    seen = set()
    dr_patterns = [
        r'(?:Dr\.?|Docteur|Pr\.?)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3})',
        r'(?:Dr\.?|Docteur|Pr\.?)\s+([A-Z]{2,}(?:\s+[A-Z]{2,}){0,2}(?:\s+[a-z]+){0,2})',
    ]
    for pat in dr_patterns:
        for m in re.findall(pat, text):
            name = m.strip()
            if 3 < len(name) < 60:
                parts = name.split()
                if len(parts) >= 2:
                    # De-duplicate names matched by both patterns.
                    key = (parts[-1].upper(), ' '.join(parts[:-1]).title())
                    if key in seen:
                        continue
                    seen.add(key)
                    doctors.append({
                        'nom': key[0],
                        'prenom': key[1],
                        'pays': pays,
                    })
    return doctors[:50]


async def scrape_source(page, source_name, config, conn, cur, batch_limit):
    """Scrape a single source and upsert doctors into ethica.medecins_validated."""
    base = config['base']
    pays = config['pays']
    total_new = 0
    total_enriched = 0

    spec_map = {
        'generaliste': 'generaliste', 'cardiologue': 'cardiologue',
        'pediatre': 'pediatre', 'gynecologue': 'gynecologue',
        'ophtalmologue': 'ophtalmologue', 'dermatologue': 'dermatologue',
        'dentiste': 'dentiste', 'psychiatre': 'psychiatre',
        'neurologue': 'neurologue', 'pneumologue': 'pneumologue',
    }
    villes = ['alger', 'oran', 'constantine', 'annaba', 'setif', 'blida',
              'tunis', 'sfax', 'sousse', 'casablanca', 'rabat']

    for path in config['pages']:
        if total_new + total_enriched >= batch_limit:
            break
        url = base + path
        try:
            await page.goto(url, timeout=15000)
            await page.wait_for_timeout(3000)
            body = await page.inner_text('body')

            emails, phones = extract_contacts(body, url)
            doctors = extract_doctors(body, source_name, pays)

            # Specialty and city are inferred from the URL path.
            spec = next((v for k, v in spec_map.items() if k in path.lower()), '')
            ville = next((v.capitalize() for v in villes if v in path.lower()), '')

            print("[%s] %s -> %d doctors, %d emails, %d phones" % (
                source_name, path[:30], len(doctors), len(emails), len(phones)))

            for doc in doctors:
                nom = doc['nom']
                prenom = doc['prenom']

                # Check whether this doctor already exists for the country.
                cur.execute(
                    """SELECT id, email, telephone, specialite
                       FROM ethica.medecins_validated
                       WHERE UPPER(nom)=UPPER(%s) AND pays=%s LIMIT 1""",
                    (nom, pays))
                existing = cur.fetchone()

                if existing:
                    # Enrich only the fields that are still missing.
                    mid, ex_email, ex_tel, ex_spec = existing
                    updates, params = [], []
                    if not ex_email and emails:
                        updates.append("email=%s")
                        params.append(emails[0])
                    if not ex_tel and phones:
                        updates.append("telephone=%s")
                        params.append(phones[0])
                    if not ex_spec and spec:
                        updates.append("specialite=%s")
                        params.append(spec)
                    if ville:
                        updates.append("ville=%s")
                        params.append(ville)
                    if updates:
                        params.append(mid)
                        cur.execute(
                            "UPDATE ethica.medecins_validated SET %s, enriched_at=NOW() "
                            "WHERE id=%%s" % ', '.join(updates), params)
                        conn.commit()
                        total_enriched += 1
                else:
                    # Insert a new record.
                    cur.execute(
                        """INSERT INTO ethica.medecins_validated
                           (nom, prenom, specialite, ville, pays, email, telephone,
                            source, source_url, created_at)
                           VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, NOW())""",
                        (nom, prenom, spec, ville, pays,
                         emails[0] if emails else None,
                         phones[0] if phones else None,
                         'mega_scraper_%s' % source_name, url))
                    conn.commit()
                    total_new += 1

            await page.wait_for_timeout(2000)
        except Exception as e:
            print("  ERR %s: %s" % (path[:20], str(e)[:50]))
            await page.wait_for_timeout(3000)

    return total_new, total_enriched


async def main():
    conn = psycopg2.connect(**DB)
    cur = conn.cursor()
    grand_new = 0
    grand_enriched = 0

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        ctx = await browser.new_context(
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
        page = await ctx.new_page()

        # Split the batch budget evenly across sources (at least one per source).
        per_source = max(1, BATCH // len(SOURCES))
        for name, config in SOURCES.items():
            print("\n=== SOURCE: %s ===" % name)
            new, enriched = await scrape_source(page, name, config, conn, cur, per_source)
            grand_new += new
            grand_enriched += enriched
            print("  Result: +%d new, +%d enriched" % (new, enriched))

        await browser.close()

    # Final per-country stats
    cur.execute("SELECT pays, count(*) AS n FROM ethica.medecins_validated "
                "GROUP BY pays ORDER BY n DESC")
    stats = cur.fetchall()
    cur.close()
    conn.close()

    print("\n" + "=" * 50)
    print("MEGA SCRAPER: +%d new, +%d enriched" % (grand_new, grand_enriched))
    for pays, n in stats:
        print("  %s: %d" % (pays, n))


if __name__ == '__main__':
    asyncio.run(main())
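# ---------------------------------------------------------------------------
# Assumed target schema (sketch only). The column names below are exactly the
# ones the SELECT/INSERT/UPDATE statements above rely on; the types and
# constraints are assumptions for illustration, not a confirmed definition of
# ethica.medecins_validated.
#
#   CREATE TABLE IF NOT EXISTS ethica.medecins_validated (
#       id           SERIAL PRIMARY KEY,
#       nom          TEXT NOT NULL,
#       prenom       TEXT,
#       specialite   TEXT,
#       ville        TEXT,
#       pays         TEXT,
#       email        TEXT,
#       telephone    TEXT,
#       source       TEXT,
#       source_url   TEXT,
#       created_at   TIMESTAMPTZ,
#       enriched_at  TIMESTAMPTZ
#   );
# ---------------------------------------------------------------------------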