Files
html/api/ethica-mega-scraper.py
2026-04-12 22:57:03 +02:00

233 lines
8.9 KiB
Python

#!/usr/bin/env python3
"""
ETHICA Mega Scraper v1.0 — Souverain
Sources: algerie-docto.com, med.tn, DabaDoc DZ, DZDoc, medecinsalgerie.org
Enriches existing + adds new HCPs with email/phone
"""
import re, sys, time, psycopg2, asyncio, json
from playwright.async_api import async_playwright
# PostgreSQL connection parameters for the ADX system database (psycopg2).
# NOTE(review): credentials are hard-coded in source — move to environment
# variables or a secrets store before this runs anywhere shared.
DB = dict(host='10.1.0.3', port=5432, dbname='adx_system', user='admin', password='admin123')
# Overall batch budget (new + enriched rows) for one run; optional first CLI
# argument, defaults to 100. Split evenly across sources in main().
BATCH = int(sys.argv[1]) if len(sys.argv) > 1 else 100
# Directory sites to scrape. Each source entry holds:
#   'base'  — site root URL,
#   'pages' — listing-page paths, one per specialty/city combination,
#   'pays'  — ISO country code stamped onto every record from this source.
SOURCES = {
    'algerie_docto': {
        'base': 'https://algerie-docto.com',
        'pages': [
            '/generaliste/alger', '/generaliste/oran', '/generaliste/constantine',
            '/generaliste/annaba', '/generaliste/setif', '/generaliste/blida',
            '/cardiologue/alger', '/cardiologue/oran', '/cardiologue/constantine',
            '/pediatre/alger', '/pediatre/oran', '/gynecologue/alger',
            '/ophtalmologue/alger', '/dermatologue/alger', '/dentiste/alger',
            '/psychiatre/alger', '/neurologue/alger', '/pneumologue/alger',
        ],
        'pays': 'DZ',
    },
    'med_tn': {
        'base': 'https://www.med.tn',
        'pages': [
            '/medecin/generaliste/tunis', '/medecin/generaliste/sfax',
            '/medecin/generaliste/sousse', '/medecin/cardiologue/tunis',
            '/medecin/pediatre/tunis', '/medecin/gynecologue/tunis',
            '/medecin/ophtalmologue/tunis', '/medecin/dermatologue/tunis',
            '/medecin/dentiste/tunis', '/medecin/psychiatre/tunis',
        ],
        'pays': 'TN',
    },
    'dabadoc_dz': {
        'base': 'https://www.dabadoc.com',
        'pages': [
            '/dz/medecin-generaliste/alger', '/dz/medecin-generaliste/oran',
            '/dz/medecin-generaliste/constantine', '/dz/cardiologue/alger',
            '/dz/pediatre/alger', '/dz/gynecologue/alger',
            '/dz/ophtalmologue/alger', '/dz/dermatologue/alger',
        ],
        'pays': 'DZ',
    },
    'medecinsalgerie': {
        'base': 'https://www.medecinsalgerie.org',
        'pages': [
            '/fr/medecins/alger/', '/fr/medecins/oran/',
            '/fr/medecins/constantine/', '/fr/medecins/annaba/',
            '/fr/medecins/setif/', '/fr/medecins/blida/',
        ],
        'pays': 'DZ',
    },
}
def extract_contacts(text, url):
    """Extract candidate email addresses and phone numbers from page text.

    Args:
        text: Raw visible text of a scraped listing page.
        url: URL of the page (kept for interface compatibility; unused).

    Returns:
        Tuple (emails, phones): up to 5 lowercased, deduplicated emails and
        up to 5 phone numbers with separators stripped, both in first-seen
        order.
    """
    # Generic mailbox markers that are almost never a doctor's own address.
    generic = ('example', 'noreply', 'support@', 'contact@', 'info@', 'admin@')
    emails = []
    for e in re.findall(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}', text):
        # Fix: lowercase BEFORE filtering, so 'Support@x.dz' is rejected too
        # (the original checked the raw match and let mixed-case ones through).
        e = e.lower()
        # The regex also matches asset names like 'icon@2x.png' — drop those.
        if e.endswith(('.png', '.jpg', '.gif')):
            continue
        if any(g in e for g in generic):
            continue
        if e not in emails:  # dedupe, keep first-seen order
            emails.append(e)
    phones = []
    # DZ phones: +213 or national 0 prefix with mobile/landline 5-7 ranges
    phones += re.findall(r'(?:\+213|0)[5-7]\s*[\d\s.-]{8,12}', text)
    phones += re.findall(r'(?:\+213)\s*[\d\s.-]{9,13}', text)
    # TN phones (NOTE(review): bare [2-9] prefix is very permissive — confirm
    # the noise is acceptable downstream)
    phones += re.findall(r'(?:\+216|[2-9])\s*[\d\s.-]{7,10}', text)
    # MA phones
    phones += re.findall(r'(?:\+212|0[5-7])\s*[\d\s.-]{8,12}', text)
    cleaned = []
    for p in phones:
        digits = re.sub(r'[\s.-]', '', p)[:20]
        # Require at least 8 digits; dedupe deterministically (the original
        # used set(), which made the output order run-dependent).
        if len(digits) >= 8 and digits not in cleaned:
            cleaned.append(digits)
    return emails[:5], cleaned[:5]
def extract_doctors(text, source_name, pays):
    """Extract doctor name records from page text.

    Matches 'Dr'/'Dr.'/'Docteur'/'Pr' followed by a name in either Title Case
    ("Dr. Ahmed Benali") or ALL-CAPS surname form ("Dr BENALI Ahmed").

    Args:
        text: Raw page text.
        source_name: Source identifier (kept for interface compatibility; unused).
        pays: ISO country code copied onto every extracted record.

    Returns:
        Up to 50 unique dicts with keys 'nom' (upper-cased last token),
        'prenom' (title-cased leading tokens) and 'pays'.
    """
    dr_patterns = [
        # Title Case: honorific + 2-4 capitalized words
        r'(?:Dr\.?|Docteur|Pr\.?)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3})',
        # ALL-CAPS surname style, optionally followed by lowercase given names
        r'(?:Dr\.?|Docteur|Pr\.?)\s+([A-Z]{2,}(?:\s+[A-Z]{2,}){0,2}(?:\s+[a-z]+){0,2})',
    ]
    doctors = []
    seen = set()
    for pat in dr_patterns:
        for match in re.findall(pat, text):
            name = match.strip()
            # Sanity bounds on total name length
            if not (3 < len(name) < 60):
                continue
            parts = name.split()
            if len(parts) < 2:  # need at least a given name and a surname
                continue
            nom = parts[-1].upper()
            prenom = ' '.join(parts[:-1]).title()
            # Fix: the two patterns (and repeated mentions on one page) can
            # yield the same name several times — the original emitted
            # duplicate records. Dedupe on the normalized (nom, prenom) pair.
            key = (nom, prenom)
            if key in seen:
                continue
            seen.add(key)
            doctors.append({
                'nom': nom,
                'prenom': prenom,
                'pays': pays,
            })
    return doctors[:50]
async def scrape_source(page, source_name, config, conn, cur, batch_limit):
    """Scrape one directory source and upsert doctors into ethica.medecins_validated.

    For each listing page: pull the visible text, extract contacts and doctor
    names, then either enrich an existing row (fill missing email/telephone,
    refresh ville) or insert a new one.

    Args:
        page: Playwright async page used for navigation.
        source_name: Key of the source in SOURCES (used in logs and the
            'source' column tag).
        config: Dict with 'base' URL, 'pages' paths and 'pays' country code.
        conn: Open psycopg2 connection (committed per row).
        cur: Cursor on that connection.
        batch_limit: Stop once new + enriched rows reach this count.

    Returns:
        Tuple (total_new, total_enriched).
    """
    base = config['base']
    pays = config['pays']
    total_new = 0
    total_enriched = 0
    # Specialty keywords recognized in listing-page paths (loop-invariant,
    # hoisted out of the per-page loop).
    spec_map = {
        'generaliste': 'generaliste', 'cardiologue': 'cardiologue',
        'pediatre': 'pediatre', 'gynecologue': 'gynecologue',
        'ophtalmologue': 'ophtalmologue', 'dermatologue': 'dermatologue',
        'dentiste': 'dentiste', 'psychiatre': 'psychiatre',
        'neurologue': 'neurologue', 'pneumologue': 'pneumologue',
    }
    for path in config['pages']:
        if total_new + total_enriched >= batch_limit:
            break
        url = base + path
        try:
            await page.goto(url, timeout=15000)
            await page.wait_for_timeout(3000)  # let client-rendered content settle
            body = await page.inner_text('body')
            emails, phones = extract_contacts(body, url)
            doctors = extract_doctors(body, source_name, pays)
            # Derive specialty from the URL path
            spec = ''
            for k, v in spec_map.items():
                if k in path.lower():
                    spec = v
                    break
            # Derive ville from the URL path
            ville = ''
            for v in ['alger', 'oran', 'constantine', 'annaba', 'setif', 'blida',
                      'tunis', 'sfax', 'sousse', 'casablanca', 'rabat']:
                if v in path.lower():
                    ville = v.capitalize()
                    break
            print("[%s] %s -> %d doctors, %d emails, %d phones" % (
                source_name, path[:30], len(doctors), len(emails), len(phones)))
            for doc in doctors:
                nom = doc['nom']
                prenom = doc['prenom']
                # Existence check matches on surname + country only.
                # NOTE(review): this can merge distinct doctors who share a
                # surname — confirm this is acceptable for enrichment.
                cur.execute("""SELECT id, email, telephone FROM ethica.medecins_validated
WHERE UPPER(nom)=UPPER(%s) AND pays=%s LIMIT 1""", (nom, pays))
                existing = cur.fetchone()
                if existing:
                    mid, ex_email, ex_tel = existing
                    updates = []
                    params = []
                    # Only fill contact fields that are still empty.
                    if not ex_email and emails:
                        updates.append("email=%s")
                        params.append(emails[0])
                    if not ex_tel and phones:
                        updates.append("telephone=%s")
                        params.append(phones[0])
                    # NOTE(review): ville is overwritten unconditionally when
                    # derived from the URL — confirm that is intended.
                    if ville:
                        updates.append("ville=%s")
                        params.append(ville)
                    # (Removed dead `if spec and not ville: pass` — it had no
                    # effect; specialty is never updated for existing rows.)
                    if updates:
                        params.append(mid)
                        # Safe dynamic SQL: 'updates' holds fixed literal
                        # fragments only; all values go through placeholders.
                        cur.execute("UPDATE ethica.medecins_validated SET %s, enriched_at=NOW() WHERE id=%%s" % ', '.join(updates), params)
                        conn.commit()
                        total_enriched += 1
                else:
                    # Insert new record
                    cur.execute("""INSERT INTO ethica.medecins_validated
(nom, prenom, specialite, ville, pays, email, telephone, source, source_url, created_at)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, NOW())""",
                                (nom, prenom, spec, ville, pays,
                                 emails[0] if emails else None,
                                 phones[0] if phones else None,
                                 'mega_scraper_%s' % source_name, url))
                    conn.commit()
                    total_new += 1
            await page.wait_for_timeout(2000)  # polite delay between pages
        except Exception as e:
            # Fix: roll back so a failed statement doesn't leave the psycopg2
            # transaction aborted — without this, every later cur.execute in
            # the run raises InFailedSqlTransaction.
            conn.rollback()
            print(" ERR %s: %s" % (path[:20], str(e)[:50]))
            await page.wait_for_timeout(3000)
    return total_new, total_enriched
async def main():
    """Run every configured source once, then print per-country statistics."""
    conn = psycopg2.connect(**DB)
    cur = conn.cursor()
    grand_new = 0
    grand_enriched = 0
    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            # Desktop Chrome user agent so directory sites serve full pages.
            ctx = await browser.new_context(
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            )
            page = await ctx.new_page()
            # Split the global batch budget evenly across sources.
            per_source = BATCH // len(SOURCES)
            for name, config in SOURCES.items():
                print("\n=== SOURCE: %s ===" % name)
                new, enriched = await scrape_source(page, name, config, conn, cur, per_source)
                grand_new += new
                grand_enriched += enriched
                print(" Result: +%d new, +%d enriched" % (new, enriched))
            await browser.close()
        # Per-country row counts for the final summary.
        cur.execute("SELECT pays, count(*) as n FROM ethica.medecins_validated GROUP BY pays ORDER BY n DESC")
        stats = cur.fetchall()
    finally:
        # Fix: close DB resources even when scraping or the stats query
        # raises — the original leaked the connection on any exception.
        cur.close()
        conn.close()
    print("\n" + "="*50)
    print("MEGA SCRAPER: +%d new, +%d enriched" % (grand_new, grand_enriched))
    for pays, n in stats:
        print(" %s: %d" % (pays, n))


if __name__ == '__main__':
    # Fix: guard the entry point so importing this module doesn't start a scrape.
    asyncio.run(main())