Files
wevia-brain/ethica-boost-scraper.php
2026-04-12 23:01:36 +02:00

138 lines
7.9 KiB
PHP

<?php
// ETHICA BOOST SCRAPER - ALG + MA - Targets dabadoc DZ/MA + doctoralia MA
//
// Bootstrap: opens the PostgreSQL connection shared by the scrapers below,
// reads the run mode from the first CLI argument and sets the log file path.
//
// Connection settings may be overridden through the environment so that
// credentials do not have to live in this file:
//   ETHICA_DB_DSN, ETHICA_DB_USER, ETHICA_DB_PASS
// When a variable is unset or empty the historical built-in value is used,
// so existing invocations keep working unchanged.
$ethicaEnv = static function ($name, $default) {
    $value = getenv($name);
    return ($value === false || $value === '') ? $default : $value;
};
$pdo = new PDO(
    $ethicaEnv('ETHICA_DB_DSN', "pgsql:host=localhost;port=5432;dbname=adx_system"),
    $ethicaEnv('ETHICA_DB_USER', "admin"),
    $ethicaEnv('ETHICA_DB_PASS', "admin123")
);
unset($ethicaEnv);
// Make every failing query throw instead of returning false silently.
$pdo->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
// Run mode: 'alg', 'ma' or 'all' ('all' when no argument is given).
$mode = $argv[1] ?? 'all';
// File that l() appends every log line to.
$log = '/tmp/ethica-boost.log';
/**
 * Emit one log line, prefixed with the current time as [HH:MM:SS].
 *
 * The same line is appended to the file named by the global $log and
 * echoed to standard output.
 *
 * @param string $m Message text (without trailing newline).
 */
function l($m)
{
    global $log;

    $line = '[' . date('H:i:s') . '] ' . $m . "\n";

    file_put_contents($log, $line, FILE_APPEND);
    echo $line;
}
/**
 * Fetch a URL with a browser-like User-Agent, retrying on failure.
 *
 * Redirects are followed. An attempt counts as successful only when the
 * final HTTP status is 200 and the body is non-empty (per empty(), so the
 * returned string is always truthy and callers can test it with `!$h`).
 * Between two attempts the function sleeps 2-4 seconds; it does not sleep
 * after the last one, since nothing follows it.
 *
 * TLS certificates are verified. The previous version disabled peer
 * verification, which let anyone on the network path substitute the page
 * content that ends up in the database.
 *
 * @param string $u URL to fetch.
 * @param int    $r Maximum number of attempts (0 or less: no request is made).
 * @return string|false Response body, or false when every attempt failed.
 */
function g($u, $r = 3)
{
    for ($attempt = 1; $attempt <= $r; $attempt++) {
        $ch = curl_init($u);
        if ($ch === false) {
            // cURL could not even create a handle; retrying cannot help.
            return false;
        }

        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            // Fail fast on unreachable hosts instead of burning the full timeout.
            CURLOPT_CONNECTTIMEOUT => 10,
            CURLOPT_TIMEOUT => 30,
            CURLOPT_SSL_VERIFYPEER => true,
            CURLOPT_SSL_VERIFYHOST => 2,
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        ]);

        $body = curl_exec($ch);
        $status = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        if ($status === 200 && !empty($body)) {
            return $body;
        }

        if ($attempt < $r) {
            sleep(rand(2, 4));
        }
    }

    return false;
}
/**
 * Insert one scraped practitioner into ethica.medecins_real unless an
 * equivalent row already exists.
 *
 * Duplicate checks run in this order; the first hit returns 0 and nothing
 * is written:
 *   1. same email                     (only when $d['email'] is non-empty)
 *   2. same profile_url               (only when $d['profile_url'] is non-empty)
 *   3. same nom + prenom + ville + pays (only when nom and ville are non-empty)
 *
 * Check 2 exists because records scraped without a city never reach check 3,
 * so without it the same profile would be inserted again on every run.
 * Records that have neither email, profile_url nor ville are still inserted
 * unconditionally.
 *
 * Note: email is only used for de-duplication; the INSERT does not store it.
 *
 * @param PDO   $pdo Open connection to the database holding ethica.medecins_real.
 * @param array $d   Record with optional string keys: nom, prenom, specialite,
 *                   ville, pays, telephone, source, profile_url, email.
 *                   Missing keys are stored as ''.
 * @return int 1 when a row was inserted, 0 when it was skipped as a duplicate.
 */
function ins($pdo, $d)
{
    $lookups = [];

    if (!empty($d['email'])) {
        $lookups[] = [
            "SELECT 1 FROM ethica.medecins_real WHERE email=:e LIMIT 1",
            ['e' => $d['email']],
        ];
    }
    if (!empty($d['profile_url'])) {
        $lookups[] = [
            "SELECT 1 FROM ethica.medecins_real WHERE profile_url=:u LIMIT 1",
            ['u' => $d['profile_url']],
        ];
    }
    if (!empty($d['nom']) && !empty($d['ville'])) {
        $lookups[] = [
            "SELECT 1 FROM ethica.medecins_real WHERE nom=:n AND prenom=:p AND ville=:v AND pays=:c LIMIT 1",
            ['n' => $d['nom'], 'p' => $d['prenom'] ?? '', 'v' => $d['ville'], 'c' => $d['pays'] ?? ''],
        ];
    }

    foreach ($lookups as $lookup) {
        $stmt = $pdo->prepare($lookup[0]);
        $stmt->execute($lookup[1]);
        if ($stmt->fetch()) {
            return 0;
        }
    }

    $insert = $pdo->prepare("INSERT INTO ethica.medecins_real(nom,prenom,specialite,ville,pays,telephone,source,profile_url,scraped_at,created_at) VALUES(:n,:p,:s,:v,:c,:t,:src,:u,NOW(),NOW())");
    $insert->execute([
        'n' => $d['nom'] ?? '',
        'p' => $d['prenom'] ?? '',
        's' => $d['specialite'] ?? '',
        'v' => $d['ville'] ?? '',
        'c' => $d['pays'] ?? '',
        't' => $d['telephone'] ?? '',
        'src' => $d['source'] ?? '',
        'u' => $d['profile_url'] ?? '',
    ]);

    return 1;
}
// Specialty slugs to crawl. Each slug is used as a URL path segment by the
// dabadoc and doctoralia scrapers, and is turned into the stored specialty
// label with ucfirst(str_replace('-', ' ', $slug)).
$specs = [
    'medecin-generaliste',
    'dentiste',
    'gynecologue',
    'pediatre',
    'ophtalmologue',
    'cardiologue',
    'dermatologue',
    'orl',
    'psychiatre',
    'radiologue',
    'chirurgien',
    'urologue',
    'neurologue',
    'gastro-enterologue',
    'rhumatologue',
    'endocrinologue',
    'pneumologue',
    'nephrologue',
    'oncologue',
    'orthopediste',
    'kinesitherapeute',
    'sage-femme',
    'pharmacien',
    'allergologue',
    'hematologue',
    'anesthesiste',
];
// DABADOC SCRAPER
/**
 * Crawl the dabadoc.com listing pages of one country and store the doctors found.
 *
 * For each specialty slug, pages 1..50 of
 * https://www.dabadoc.com/{cc}/{slug}?page=N are fetched with g(). Doctors are
 * extracted two ways: from profile links whose text looks like a capitalised
 * multi-word name, and from JSON-LD "Physician" entries. Every candidate is
 * passed to ins(), which decides whether a row is actually written.
 *
 * Within one page a profile URL is submitted once, and a JSON-LD name that was
 * already seen as a link is skipped: these records carry no city, so ins()
 * cannot recognise such repeats on its own and the same doctor would be
 * written twice.
 *
 * Paging for a specialty stops when a fetch fails, or when a page yields no
 * profile links and either has no JSON-LD entries or contains the text 'aucun'.
 *
 * @param PDO      $pdo   Database connection handed to ins().
 * @param string   $cc    Country path segment; 'dz' is stored as pays 'ALG',
 *                        any other value as 'MA'.
 * @param string[] $specs Specialty slugs to crawl.
 * @return int Number of rows inserted.
 */
function dabadoc($pdo, $cc, $specs)
{
    $pays = $cc == 'dz' ? 'ALG' : 'MA';
    $src = "dabadoc_boost_$cc";
    $ins = 0;
    $scan = 0;

    // Normalise a scraped name: optionally decode JSON string escapes
    // (e.g. \u00e9), collapse whitespace runs (the name pattern lets words be
    // separated by newlines) and drop a leading "Dr"/"Dr." title so both
    // extraction paths produce the same prenom/nom split.
    $cleanName = static function ($raw, $isJson = false) {
        if ($isJson) {
            $decoded = json_decode('"' . $raw . '"');
            if (is_string($decoded)) {
                $raw = $decoded;
            }
        }
        $name = preg_replace('/\s+/u', ' ', $raw);
        if ($name === null) {
            // Not valid UTF-8: fall back to a byte-wise collapse.
            $name = (string) preg_replace('/\s+/', ' ', $raw);
        }
        return trim((string) preg_replace('/^\s*Dr\.?\s+/i', '', $name));
    };

    $linkPattern = '/href="(\/' . preg_quote($cc, '/') . '\/[^"]+)"[^>]*>\s*(?:Dr\.?\s*)?([A-Z\x{00C0}-\x{024F}][a-z\x{00C0}-\x{024F}]+(?:\s+[A-Z\x{00C0}-\x{024F}][a-z\x{00C0}-\x{024F}]+)+)/u';
    $jsonPattern = '/"@type"\s*:\s*"Physician"[^}]*?"name"\s*:\s*"([^"]+)"/s';

    l("[$pays] dabadoc.$cc START - " . count($specs) . " specs");

    foreach ($specs as $sp) {
        $label = ucfirst(str_replace('-', ' ', $sp));

        for ($pg = 1; $pg <= 50; $pg++) {
            $h = g("https://www.dabadoc.com/$cc/$sp?page=$pg");
            if (!$h) {
                break;
            }

            $seenUrls = [];
            $seenNames = [];

            // Extract doctor names + profile links
            preg_match_all($linkPattern, $h, $m, PREG_SET_ORDER);
            foreach ($m as $doc) {
                $url = 'https://www.dabadoc.com' . $doc[1];
                $name = $cleanName($doc[2]);
                $seenNames[$name] = true;
                if (isset($seenUrls[$url])) {
                    continue;
                }
                $seenUrls[$url] = true;

                $parts = explode(' ', $name, 2);
                $ins += ins($pdo, [
                    'pays' => $pays,
                    'source' => $src,
                    'specialite' => $label,
                    'prenom' => $parts[0],
                    'nom' => $parts[1] ?? $name,
                    'profile_url' => $url,
                ]);
                $scan++;
            }

            // Also try JSON-LD
            preg_match_all($jsonPattern, $h, $jm);
            $jsonNames = $jm[1] ?? [];
            foreach ($jsonNames as $n) {
                $name = $cleanName($n, true);
                if ($name === '' || isset($seenNames[$name])) {
                    continue;
                }
                $seenNames[$name] = true;

                $parts = explode(' ', $name, 2);
                $ins += ins($pdo, [
                    'pays' => $pays,
                    'source' => $src,
                    'specialite' => $label,
                    'prenom' => $parts[0],
                    'nom' => $parts[1] ?? $name,
                ]);
                $scan++;
            }

            // Nothing link-shaped on this page, and either nothing at all or
            // the page carries the 'aucun' marker: end of this specialty.
            if (count($m) === 0 && (count($jsonNames) === 0 || strpos($h, 'aucun') !== false)) {
                break;
            }

            usleep(rand(500000, 1500000));
        }

        // Running totals after each specialty.
        l("[$pays] dabadoc $sp: scan=$scan ins=$ins");
    }

    l("[$pays] dabadoc DONE: scanned=$scan inserted=$ins");

    return $ins;
}
// 1SANTE.DZ DEEP (48 wilayas)
/**
 * Crawl the 1sante.dz doctor directory for each of the 48 listed wilayas.
 *
 * For every wilaya, pages 1..20 of
 * https://www.1sante.dz/annuaire/medecin/{wilaya}?page=N are fetched with g().
 * Anything on the page that looks like a capitalised multi-word name is
 * treated as a doctor, stored through ins() with pays 'ALG', specialite
 * 'Medecin' and the wilaya as city.
 *
 * The name is taken from the regex capture group, i.e. WITHOUT an optional
 * leading "Dr"/"Dr." title, and whitespace runs are collapsed before the
 * prenom/nom split, so a title or a line break never ends up as the prenom.
 *
 * Paging for a wilaya stops when a fetch fails, the body is shorter than 500
 * bytes, or a page after the first inserts nothing.
 *
 * @param PDO      $pdo   Database connection handed to ins().
 * @param string[] $specs Not used by this function.
 * @return int Number of rows inserted.
 */
function sante1dz($pdo, $specs)
{
    $src = '1sante_boost';
    $ins = 0;
    $wilayas = [
        'adrar', 'chlef', 'laghouat', 'oum-el-bouaghi', 'batna', 'bejaia', 'biskra', 'bechar', 'blida', 'bouira',
        'tamanrasset', 'tebessa', 'tlemcen', 'tiaret', 'tizi-ouzou', 'alger', 'djelfa', 'jijel', 'setif', 'saida', 'skikda',
        'sidi-bel-abbes', 'annaba', 'guelma', 'constantine', 'medea', 'mostaganem', 'msila', 'mascara', 'ouargla', 'oran',
        'el-bayadh', 'illizi', 'bordj-bou-arreridj', 'boumerdes', 'el-tarf', 'tindouf', 'tissemsilt', 'el-oued', 'khenchela',
        'souk-ahras', 'tipaza', 'mila', 'ain-defla', 'naama', 'ain-temouchent', 'ghardaia', 'relizane',
    ];

    l("[ALG] 1sante.dz START - " . count($wilayas) . " wilayas");

    foreach ($wilayas as $w) {
        $ville = ucfirst(str_replace('-', ' ', $w));

        for ($pg = 1; $pg <= 20; $pg++) {
            $h = g("https://www.1sante.dz/annuaire/medecin/$w?page=$pg");
            if (!$h || strlen($h) < 500) {
                break;
            }

            preg_match_all('/(?:Dr\.?\s+)?([A-Z\x{00C0}-\x{024F}][a-z\x{00C0}-\x{024F}]+(?:\s+[A-Z\x{00C0}-\x{024F}][a-z\x{00C0}-\x{024F}]+)+)/u', $h, $names);
            preg_match_all('/(?:0[5-7]\d{2}[\s.\-]?\d{2}[\s.\-]?\d{2}[\s.\-]?\d{2})/', $h, $phones);

            $pi = 0;
            // Group 1 is the bare name; group 0 would include the "Dr." title.
            foreach (($names[1] ?? []) as $idx => $fn) {
                $name = trim((string) preg_replace('/\s+/u', ' ', $fn));
                $parts = explode(' ', $name, 2);
                $pi += ins($pdo, [
                    'pays' => 'ALG',
                    'source' => $src,
                    'specialite' => 'Medecin',
                    'ville' => $ville,
                    'prenom' => $parts[0],
                    'nom' => $parts[1] ?? $name,
                    // NOTE(review): the i-th phone on the page is attached to
                    // the i-th name; this assumes both appear in lockstep in
                    // the markup - verify against a real page.
                    'telephone' => $phones[0][$idx] ?? '',
                ]);
            }
            $ins += $pi;

            // A later page that adds nothing new means the listing is exhausted.
            if ($pi == 0 && $pg > 1) {
                break;
            }

            usleep(rand(300000, 800000));
        }

        l("[ALG] 1sante $w: total=$ins");
    }

    l("[ALG] 1sante DONE: inserted=$ins");

    return $ins;
}
// DOCTORALIA.MA
/**
 * Crawl doctoralia.ma for every (city, specialty) pair and store the doctors found.
 *
 * Pages 1..20 of https://www.doctoralia.ma/{specialty}/{city}?page=N are
 * fetched with g(). Doctors are extracted from relative links whose text looks
 * like a capitalised multi-word name and from JSON-LD "Physician" entries, and
 * stored through ins() with pays 'MA'.
 *
 * Link-derived records are submitted first: they carry the profile URL, and
 * ins() rejects a later record with the same nom/prenom/ville/pays. Submitting
 * the URL-less JSON-LD record first would make the richer one be discarded.
 * JSON-LD names are normalised (JSON escapes decoded, "Dr" title dropped,
 * whitespace collapsed) so that they split into the same prenom/nom as the
 * link text and are recognised as the same person.
 *
 * Paging for a pair stops when a fetch fails, the body is shorter than 1000
 * bytes, or a page yields neither JSON-LD entries nor matching links.
 *
 * @param PDO      $pdo   Database connection handed to ins().
 * @param string[] $specs Specialty slugs to crawl.
 * @return int Number of rows inserted.
 */
function doctoralia($pdo, $specs)
{
    $src = 'doctoralia_ma';
    $ins = 0;
    $villes = [
        'casablanca', 'rabat', 'marrakech', 'fes', 'tanger', 'agadir', 'meknes', 'oujda', 'kenitra', 'tetouan',
        'safi', 'el-jadida', 'nador', 'beni-mellal', 'khouribga', 'settat', 'mohammedia', 'taza', 'sale', 'temara',
    ];

    // Normalise a scraped name; see the function header for the rationale.
    $cleanName = static function ($raw, $isJson = false) {
        if ($isJson) {
            $decoded = json_decode('"' . $raw . '"');
            if (is_string($decoded)) {
                $raw = $decoded;
            }
        }
        $name = preg_replace('/\s+/u', ' ', $raw);
        if ($name === null) {
            // Not valid UTF-8: fall back to a byte-wise collapse.
            $name = (string) preg_replace('/\s+/', ' ', $raw);
        }
        return trim((string) preg_replace('/^\s*Dr\.?\s+/i', '', $name));
    };

    l("[MA] doctoralia.ma START - " . count($villes) . " cities");

    foreach ($villes as $v) {
        $ville = ucfirst(str_replace('-', ' ', $v));

        foreach ($specs as $sp) {
            $label = ucfirst(str_replace('-', ' ', $sp));

            for ($pg = 1; $pg <= 20; $pg++) {
                $h = g("https://www.doctoralia.ma/$sp/$v?page=$pg");
                if (!$h || strlen($h) < 1000) {
                    break;
                }

                preg_match_all('/"@type"\s*:\s*"Physician"[^}]*?"name"\s*:\s*"([^"]+)"/s', $h, $jm);
                preg_match_all('/href="\/([^"]+)"[^>]*>\s*(?:Dr\.?\s*)?([A-Z\x{00C0}-\x{024F}][a-z\x{00C0}-\x{024F}]+(?:\s+[A-Z\x{00C0}-\x{024F}][a-z\x{00C0}-\x{024F}]+)+)/u', $h, $lnk, PREG_SET_ORDER);
                $jsonNames = $jm[1] ?? [];

                // Links first: these records include the profile URL.
                foreach ($lnk as $l2) {
                    $name = $cleanName($l2[2]);
                    $parts = explode(' ', $name, 2);
                    $ins += ins($pdo, [
                        'pays' => 'MA',
                        'source' => $src,
                        'specialite' => $label,
                        'prenom' => $parts[0],
                        'nom' => $parts[1] ?? $name,
                        'ville' => $ville,
                        'profile_url' => 'https://www.doctoralia.ma/' . $l2[1],
                    ]);
                }

                // JSON-LD entries fill in doctors that had no matching link.
                foreach ($jsonNames as $n) {
                    $name = $cleanName($n, true);
                    if ($name === '') {
                        continue;
                    }
                    $parts = explode(' ', $name, 2);
                    $ins += ins($pdo, [
                        'pays' => 'MA',
                        'source' => $src,
                        'specialite' => $label,
                        'prenom' => $parts[0],
                        'nom' => $parts[1] ?? $name,
                        'ville' => $ville,
                    ]);
                }

                if (count($jsonNames) == 0 && count($lnk) == 0) {
                    break;
                }

                usleep(rand(500000, 1500000));
            }
        }

        l("[MA] doctoralia $v: total=$ins");
    }

    l("[MA] doctoralia DONE: inserted=$ins");

    return $ins;
}
// MAIN
// Entry point: log the per-country row counts, run the scrapers selected by
// $mode, then log the number of inserted rows and the counts again.

// Logs one " <pays>: <count>" line per country, largest count first.
$printCounts = function () use ($pdo) {
    $res = $pdo->query("SELECT pays,COUNT(*) c FROM ethica.medecins_real GROUP BY pays ORDER BY c DESC");
    while ($row = $res->fetch(PDO::FETCH_ASSOC)) {
        l(" {$row['pays']}: {$row['c']}");
    }
};

l("=== BOOST START mode=$mode ===");
if (!in_array($mode, ['alg', 'ma', 'all'], true)) {
    // Without this a mistyped mode would run to completion and report
    // zero inserts with no hint as to why.
    l("WARNING: unknown mode '$mode' (expected alg, ma or all) - nothing will be scraped");
}
$printCounts();

$tot = 0;
if ($mode == 'alg' || $mode == 'all') {
    l("--- ALG ---");
    $tot += dabadoc($pdo, 'dz', $specs);
    $tot += sante1dz($pdo, $specs);
}
if ($mode == 'ma' || $mode == 'all') {
    l("--- MA ---");
    $tot += dabadoc($pdo, 'ma', $specs);
    $tot += doctoralia($pdo, $specs);
}

l("=== TOTAL INSERTED: $tot ===");
$printCounts();
l("=== DONE ===");