# (viewer metadata removed: "233 lines, 8.9 KiB, Python")
#!/usr/bin/env python3
"""
ETHICA Mega Scraper v1.0 — Souverain
Sources: algerie-docto.com, med.tn, DabaDoc DZ, DZDoc, medecinsalgerie.org
Enriches existing + adds new HCPs with email/phone
"""
import re, sys, time, psycopg2, asyncio, json

from playwright.async_api import async_playwright

# PostgreSQL connection settings for the ADX system database.
# NOTE(review): credentials are hard-coded in source — move to environment
# variables or a secrets store before this file is shared or committed.
DB = dict(host='10.1.0.3', port=5432, dbname='adx_system', user='admin', password='admin123')

# Total record budget (new + enriched) per run; optional first CLI argument
# overrides the default of 100.
BATCH = int(sys.argv[1]) if len(sys.argv) > 1 else 100
# Directory sites to crawl.  Each entry: 'base' site URL, 'pages' — listing
# paths (specialty/city) appended to the base — and 'pays', the country code
# stamped on every record extracted from that source.
# NOTE(review): the module docstring also mentions "DZDoc" but no such source
# is configured here — confirm whether it was intentionally dropped.
SOURCES = {
    'algerie_docto': {
        'base': 'https://algerie-docto.com',
        'pages': [
            '/generaliste/alger', '/generaliste/oran', '/generaliste/constantine',
            '/generaliste/annaba', '/generaliste/setif', '/generaliste/blida',
            '/cardiologue/alger', '/cardiologue/oran', '/cardiologue/constantine',
            '/pediatre/alger', '/pediatre/oran', '/gynecologue/alger',
            '/ophtalmologue/alger', '/dermatologue/alger', '/dentiste/alger',
            '/psychiatre/alger', '/neurologue/alger', '/pneumologue/alger',
        ],
        'pays': 'DZ',
    },
    'med_tn': {
        'base': 'https://www.med.tn',
        'pages': [
            '/medecin/generaliste/tunis', '/medecin/generaliste/sfax',
            '/medecin/generaliste/sousse', '/medecin/cardiologue/tunis',
            '/medecin/pediatre/tunis', '/medecin/gynecologue/tunis',
            '/medecin/ophtalmologue/tunis', '/medecin/dermatologue/tunis',
            '/medecin/dentiste/tunis', '/medecin/psychiatre/tunis',
        ],
        'pays': 'TN',
    },
    'dabadoc_dz': {
        'base': 'https://www.dabadoc.com',
        'pages': [
            '/dz/medecin-generaliste/alger', '/dz/medecin-generaliste/oran',
            '/dz/medecin-generaliste/constantine', '/dz/cardiologue/alger',
            '/dz/pediatre/alger', '/dz/gynecologue/alger',
            '/dz/ophtalmologue/alger', '/dz/dermatologue/alger',
        ],
        'pays': 'DZ',
    },
    'medecinsalgerie': {
        'base': 'https://www.medecinsalgerie.org',
        'pages': [
            '/fr/medecins/alger/', '/fr/medecins/oran/',
            '/fr/medecins/constantine/', '/fr/medecins/annaba/',
            '/fr/medecins/setif/', '/fr/medecins/blida/',
        ],
        'pays': 'DZ',
    },
}
def extract_contacts(text, url):
    """Extract contact emails and phone numbers from raw page text.

    Parameters:
        text: full visible text of a scraped page.
        url: page URL (kept for interface compatibility; currently unused).

    Returns:
        (emails, phones): up to 5 deduplicated lowercase email addresses and
        up to 5 deduplicated, separator-stripped phone numbers, both in
        first-seen order.
    """
    raw_emails = re.findall(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}', text)
    # Drop image-asset false positives and generic/role mailboxes; the filter
    # tests the original-case match, the result is lowercased.
    filtered = [e.lower() for e in raw_emails
                if not e.endswith(('.png', '.jpg', '.gif'))
                and 'example' not in e and 'noreply' not in e and 'support@' not in e
                and 'contact@' not in e and 'info@' not in e and 'admin@' not in e]
    # Dedupe while preserving first-seen order — without this, [:5] below
    # could return the same address five times.
    emails = list(dict.fromkeys(filtered))

    candidates = []
    # DZ phones
    candidates += re.findall(r'(?:\+213|0)[5-7]\s*[\d\s.-]{8,12}', text)
    candidates += re.findall(r'(?:\+213)\s*[\d\s.-]{9,13}', text)
    # TN phones
    candidates += re.findall(r'(?:\+216|[2-9])\s*[\d\s.-]{7,10}', text)
    # MA phones
    candidates += re.findall(r'(?:\+212|0[5-7])\s*[\d\s.-]{8,12}', text)
    # Strip separators, cap length, keep only plausible (>= 8 digit) numbers.
    normalized = [re.sub(r'[\s.-]', '', p)[:20] for p in candidates]
    # dict.fromkeys gives a deterministic, order-preserving dedupe; the
    # original list(set(...)) made the [:5] selection depend on hash ordering.
    phones = list(dict.fromkeys(n for n in normalized if len(n) >= 8))

    return emails[:5], phones[:5]
def extract_doctors(text, source_name, pays):
    """Extract doctor names from page text.

    Parameters:
        text: visible page text.
        source_name: scraper source key (kept for interface compatibility;
            currently unused).
        pays: country code stamped on every result ('DZ', 'TN', ...).

    Returns:
        Up to 50 unique dicts with 'nom' (upper-cased last name token),
        'prenom' (title-cased leading tokens) and 'pays', in first-seen order.
    """
    # "Dr./Docteur/Pr. Name Surname" in Title Case or ALL-CAPS form.
    dr_patterns = [
        r'(?:Dr\.?|Docteur|Pr\.?)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3})',
        r'(?:Dr\.?|Docteur|Pr\.?)\s+([A-Z]{2,}(?:\s+[A-Z]{2,}){0,2}(?:\s+[a-z]+){0,2})',
    ]

    doctors = []
    seen = set()
    for pat in dr_patterns:
        for m in re.findall(pat, text):
            name = m.strip()
            # Sanity bounds: discard fragments and run-on matches.
            if not (3 < len(name) < 60):
                continue
            parts = name.split()
            if len(parts) < 2:
                continue
            key = (parts[-1].upper(), ' '.join(parts[:-1]).title())
            # Both patterns (and repeated page text) can hit the same doctor;
            # the original appended duplicates, causing redundant DB lookups
            # downstream.  Keep only the first occurrence.
            if key in seen:
                continue
            seen.add(key)
            doctors.append({
                'nom': key[0],
                'prenom': key[1],
                'pays': pays,
            })

    return doctors[:50]
async def scrape_source(page, source_name, config, conn, cur, batch_limit):
    """Scrape a single source"""
    # page: shared Playwright page object used for all navigations.
    # source_name: key from SOURCES (also recorded in the 'source' column).
    # config: dict with 'base' URL, 'pages' paths and 'pays' country code.
    # conn, cur: open psycopg2 connection/cursor (commits happen per record).
    # batch_limit: stop once new + enriched records reach this count.
    # Returns (total_new, total_enriched).
    base = config['base']
    pays = config['pays']
    total_new = 0
    total_enriched = 0

    for path in config['pages']:
        # Respect the per-source budget.
        if total_new + total_enriched >= batch_limit:
            break

        url = base + path
        try:
            await page.goto(url, timeout=15000)
            # Give client-side rendering time to settle before reading text.
            await page.wait_for_timeout(3000)
            body = await page.inner_text('body')

            emails, phones = extract_contacts(body, url)
            doctors = extract_doctors(body, source_name, pays)

            # Extract specialty from URL path
            spec = ''
            spec_map = {
                'generaliste': 'generaliste', 'cardiologue': 'cardiologue',
                'pediatre': 'pediatre', 'gynecologue': 'gynecologue',
                'ophtalmologue': 'ophtalmologue', 'dermatologue': 'dermatologue',
                'dentiste': 'dentiste', 'psychiatre': 'psychiatre',
                'neurologue': 'neurologue', 'pneumologue': 'pneumologue',
            }
            for k, v in spec_map.items():
                if k in path.lower():
                    spec = v
                    break

            # Extract ville from URL
            ville = ''
            for v in ['alger', 'oran', 'constantine', 'annaba', 'setif', 'blida',
                      'tunis', 'sfax', 'sousse', 'casablanca', 'rabat']:
                if v in path.lower():
                    ville = v.capitalize()
                    break

            print("[%s] %s -> %d doctors, %d emails, %d phones" % (
                source_name, path[:30], len(doctors), len(emails), len(phones)))

            for doc in doctors:
                nom = doc['nom']
                prenom = doc['prenom']

                # Check if exists
                # Matching is by (last name, country) only — homonyms in the
                # same country will be merged into one record.
                cur.execute("""SELECT id, email, telephone FROM ethica.medecins_validated
                    WHERE UPPER(nom)=UPPER(%s) AND pays=%s LIMIT 1""", (nom, pays))
                existing = cur.fetchone()

                if existing:
                    mid, ex_email, ex_tel = existing
                    updates = []
                    params = []
                    # Fill email/phone only when currently empty; ville is
                    # overwritten whenever the URL names a city.
                    if not ex_email and emails:
                        updates.append("email=%s")
                        params.append(emails[0])
                    if not ex_tel and phones:
                        updates.append("telephone=%s")
                        params.append(phones[0])
                    if spec and not ville:
                        # NOTE(review): dead branch — no update is performed
                        # here; was a specialite update intended?
                        pass
                    if ville:
                        updates.append("ville=%s")
                        params.append(ville)
                    if updates:
                        params.append(mid)
                        # 'updates' holds only fixed "col=%s" fragments, so the
                        # string-built SET clause keeps values parameterized.
                        cur.execute("UPDATE ethica.medecins_validated SET %s, enriched_at=NOW() WHERE id=%%s" % ', '.join(updates), params)
                        conn.commit()
                        total_enriched += 1
                else:
                    # Insert new
                    cur.execute("""INSERT INTO ethica.medecins_validated
                        (nom, prenom, specialite, ville, pays, email, telephone, source, source_url, created_at)
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, NOW())""",
                        (nom, prenom, spec, ville, pays,
                         emails[0] if emails else None,
                         phones[0] if phones else None,
                         'mega_scraper_%s' % source_name, url))
                    conn.commit()
                    total_new += 1

            # Polite delay between listing pages of the same source.
            await page.wait_for_timeout(2000)
        except Exception as e:
            # Best-effort scraping: log the failure, back off, then continue
            # with the next page.
            print(" ERR %s: %s" % (path[:20], str(e)[:50]))
            await page.wait_for_timeout(3000)

    return total_new, total_enriched
async def main():
    """Entry point: scrape every configured source, then print per-country stats.

    The overall BATCH budget is split evenly across sources.  The browser and
    the database connection are now released via try/finally even when a
    source raises (the original leaked both on any uncaught exception).
    """
    conn = psycopg2.connect(**DB)
    try:
        cur = conn.cursor()

        grand_new = 0
        grand_enriched = 0

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                ctx = await browser.new_context(
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                )
                page = await ctx.new_page()

                # Even split of the global budget across sources.
                per_source = BATCH // len(SOURCES)

                for name, config in SOURCES.items():
                    print("\n=== SOURCE: %s ===" % name)
                    new, enriched = await scrape_source(page, name, config, conn, cur, per_source)
                    grand_new += new
                    grand_enriched += enriched
                    print(" Result: +%d new, +%d enriched" % (new, enriched))
            finally:
                # Always release the browser, even if a source blows up.
                await browser.close()

        # Stats
        cur.execute("SELECT pays, count(*) as n FROM ethica.medecins_validated GROUP BY pays ORDER BY n DESC")
        stats = cur.fetchall()

        cur.close()
    finally:
        conn.close()

    print("\n" + "="*50)
    print("MEGA SCRAPER: +%d new, +%d enriched" % (grand_new, grand_enriched))
    for pays, n in stats:
        print(" %s: %d" % (pays, n))
if __name__ == '__main__':
    # Guard the entry point so importing this module (e.g. to reuse the
    # extract_* helpers) doesn't kick off a full scrape.
    asyncio.run(main())