138 lines
7.9 KiB
PHP
138 lines
7.9 KiB
PHP
<?php
|
|
// ETHICA BOOST SCRAPER - ALG + MA - Targets dabadoc DZ/MA + 1sante.dz (DZ) + doctoralia MA
|
|
// Database connection. The DSN, user and password can be overridden through the
// ETHICA_DB_DSN / ETHICA_DB_USER / ETHICA_DB_PASS environment variables so that
// real credentials do not have to live in the source; when a variable is unset
// the values this script has always used are applied.
$dbDsn = getenv('ETHICA_DB_DSN');
if ($dbDsn === false || $dbDsn === '') {
    $dbDsn = "pgsql:host=localhost;port=5432;dbname=adx_system";
}
$dbUser = getenv('ETHICA_DB_USER');
if ($dbUser === false || $dbUser === '') {
    $dbUser = "admin";
}
$dbPass = getenv('ETHICA_DB_PASS');
if ($dbPass === false) {
    // An explicitly empty password is respected; only "unset" falls back.
    $dbPass = "admin123";
}

// Errors are reported as exceptions from the moment the handle exists.
$pdo = new PDO($dbDsn, $dbUser, $dbPass, [PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION]);

// Run mode taken from the first CLI argument; the driver at the bottom of the
// script reacts to 'alg', 'ma' and 'all' (the default).
$mode = $argv[1] ?? 'all';

// Path of the log file that l() appends to.
$log = '/tmp/ethica-boost.log';
|
|
|
|
/**
 * Log a message: append it to the file named by the global $log and echo it.
 *
 * Both destinations receive the same line, formatted as "[HH:MM:SS] message"
 * followed by a newline.
 *
 * @param mixed $m Message; it is interpolated into the line as a string.
 */
function l($m) {
    global $log;

    // Build the line once so the file and stdout always carry the same timestamp.
    $line = '[' . date('H:i:s') . '] ' . $m . "\n";

    file_put_contents($log, $line, FILE_APPEND);
    echo $line;
}
|
|
/**
 * Fetch a URL with cURL, retrying when an attempt fails.
 *
 * An attempt counts as successful only when the final HTTP status is 200 and
 * the body is non-empty. Redirects are followed, each attempt times out after
 * 30 seconds, and a desktop-browser user agent is sent.
 *
 * NOTE(review): CURLOPT_SSL_VERIFYPEER is switched off, so the TLS peer
 * certificate is not verified. This is kept to preserve existing behaviour,
 * but it should be re-enabled if the host has a working CA bundle.
 *
 * @param string $u URL to fetch.
 * @param int    $r Maximum number of attempts (default 3).
 * @return string|false Response body, or false when every attempt failed.
 */
function g($u, $r = 3) {
    for ($i = 0; $i < $r; $i++) {
        $ch = curl_init($u);

        // curl_init() can fail; treat that as a failed attempt instead of
        // passing false into the other curl_* calls.
        if ($ch !== false) {
            curl_setopt_array($ch, [
                CURLOPT_RETURNTRANSFER => 1,
                CURLOPT_FOLLOWLOCATION => 1,
                CURLOPT_TIMEOUT        => 30,
                CURLOPT_SSL_VERIFYPEER => 0,
                CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            ]);
            $h = curl_exec($ch);
            $c = curl_getinfo($ch, CURLINFO_HTTP_CODE);
            curl_close($ch);

            if ($c == 200 && !empty($h)) {
                return $h;
            }
        }

        // Back off only when another attempt follows. Callers use a false
        // return to detect the end of a listing, so pausing after the final
        // attempt would just add dead time to every pagination loop.
        if ($i < $r - 1) {
            sleep(rand(2, 4));
        }
    }

    return false;
}
|
|
/**
 * Insert one scraped practitioner into ethica.medecins_real unless it already exists.
 *
 * Duplicate detection happens in two steps:
 *  1. when $d['email'] is non-empty, a row with the same email blocks the insert;
 *  2. when $d['nom'] is non-empty, a row with the same nom / prenom / ville / pays
 *     blocks the insert. Missing fields are compared as '' (the value the INSERT
 *     below stores for them), so records scraped without a city are
 *     de-duplicated too instead of being inserted again on every page and run.
 *
 * NOTE(review): email is only used for the lookup; the INSERT does not write it.
 *
 * @param PDO|object $pdo Connection exposing prepare(); statements must offer
 *                        execute() and fetch().
 * @param array      $d   Record with optional keys: nom, prenom, specialite, ville,
 *                        pays, telephone, source, profile_url, email.
 * @return int 1 when a row was inserted, 0 when it was skipped as a duplicate.
 */
function ins($pdo, $d) {
    // Normalise once so the duplicate lookup and the INSERT see identical values.
    $nom    = $d['nom'] ?? '';
    $prenom = $d['prenom'] ?? '';
    $ville  = $d['ville'] ?? '';
    $pays   = $d['pays'] ?? '';

    if (!empty($d['email'])) {
        $s = $pdo->prepare("SELECT 1 FROM ethica.medecins_real WHERE email=:e LIMIT 1");
        $s->execute(['e' => $d['email']]);
        if ($s->fetch()) {
            return 0;
        }
    }

    if (!empty($nom)) {
        $s = $pdo->prepare("SELECT 1 FROM ethica.medecins_real WHERE nom=:n AND prenom=:p AND ville=:v AND pays=:c LIMIT 1");
        $s->execute(['n' => $nom, 'p' => $prenom, 'v' => $ville, 'c' => $pays]);
        if ($s->fetch()) {
            return 0;
        }
    }

    $pdo->prepare("INSERT INTO ethica.medecins_real(nom,prenom,specialite,ville,pays,telephone,source,profile_url,scraped_at,created_at) VALUES(:n,:p,:s,:v,:c,:t,:src,:u,NOW(),NOW())")
        ->execute([
            'n'   => $nom,
            'p'   => $prenom,
            's'   => $d['specialite'] ?? '',
            'v'   => $ville,
            'c'   => $pays,
            't'   => $d['telephone'] ?? '',
            'src' => $d['source'] ?? '',
            'u'   => $d['profile_url'] ?? '',
        ]);

    return 1;
}
|
|
|
|
// Specialty slugs. Each slug is used as a URL path segment by the dabadoc and
// doctoralia scrapers and, with dashes turned into spaces, as the stored
// speciality label.
$specs = [
    'medecin-generaliste',
    'dentiste',
    'gynecologue',
    'pediatre',
    'ophtalmologue',
    'cardiologue',
    'dermatologue',
    'orl',
    'psychiatre',
    'radiologue',
    'chirurgien',
    'urologue',
    'neurologue',
    'gastro-enterologue',
    'rhumatologue',
    'endocrinologue',
    'pneumologue',
    'nephrologue',
    'oncologue',
    'orthopediste',
    'kinesitherapeute',
    'sage-femme',
    'pharmacien',
    'allergologue',
    'hematologue',
    'anesthesiste',
];
|
|
|
|
// DABADOC SCRAPER
|
|
/**
 * Scrape dabadoc.com listings for one country and store the doctors found.
 *
 * For every specialty slug, up to 50 result pages are fetched. Each page is
 * mined twice: once for profile links whose text looks like a capitalised
 * multi-word name, and once for "Physician" JSON-LD entries. Every candidate
 * is handed to ins(), which decides whether it is new.
 *
 * @param PDO    $pdo   Database handle forwarded to ins().
 * @param string $cc    Country path segment: 'dz' is stored as pays 'ALG',
 *                      anything else as 'MA'.
 * @param array  $specs Specialty slugs used as URL path segments.
 * @return int Number of rows inserted.
 */
function dabadoc($pdo, $cc, $specs) {
    $pays = $cc == 'dz' ? 'ALG' : 'MA';
    $src  = "dabadoc_boost_$cc";
    $ins  = 0;
    $scan = 0;

    l("[$pays] dabadoc.$cc START - ".count($specs)." specs");

    // Loop-invariant patterns. preg_quote() gets the delimiter so a country
    // code containing '/' could not break the expression.
    $linkRe = '/href="(\/'.preg_quote($cc, '/').'\/[^"]+)"[^>]*>\s*(?:Dr\.?\s*)?([A-Z\x{00C0}-\x{024F}][a-z\x{00C0}-\x{024F}]+(?:\s+[A-Z\x{00C0}-\x{024F}][a-z\x{00C0}-\x{024F}]+)+)/u';
    $jsonRe = '/"@type"\s*:\s*"Physician"[^}]*?"name"\s*:\s*"([^"]+)"/s';

    foreach ($specs as $sp) {
        $label = ucfirst(str_replace('-', ' ', $sp));

        for ($pg = 1; $pg <= 50; $pg++) {
            $h = g("https://www.dabadoc.com/$cc/$sp?page=$pg");
            if (!$h) {
                break;
            }

            // Profile links: group 1 is the site-relative URL, group 2 the name.
            preg_match_all($linkRe, $h, $m, PREG_SET_ORDER);
            foreach ($m as $doc) {
                $parts = explode(' ', trim($doc[2]), 2);
                $ins += ins($pdo, [
                    'pays'        => $pays,
                    'source'      => $src,
                    'specialite'  => $label,
                    'prenom'      => $parts[0],
                    'nom'         => $parts[1] ?? $doc[2],
                    'profile_url' => 'https://www.dabadoc.com'.$doc[1],
                ]);
                $scan++;
            }

            // JSON-LD "Physician" entries carry a name but no profile URL.
            preg_match_all($jsonRe, $h, $jm);
            $jsonNames = $jm[1] ?? [];
            foreach ($jsonNames as $n) {
                $parts = explode(' ', trim($n), 2);
                $ins += ins($pdo, [
                    'pays'       => $pays,
                    'source'     => $src,
                    'specialite' => $label,
                    'prenom'     => $parts[0],
                    'nom'        => $parts[1] ?? $n,
                ]);
                $scan++;
            }

            // Stop paging when the page produced no profile links and either
            // has no JSON-LD doctors as well or contains the word "aucun".
            // The first case keeps empty pages from being requested all the
            // way to page 50, matching what the doctoralia scraper does.
            if (count($m) == 0 && (count($jsonNames) == 0 || strpos($h, 'aucun') !== false)) {
                break;
            }

            usleep(rand(500000, 1500000));
        }

        // Progress line after every specialty. It used to be gated on the
        // running total being an exact multiple of 100, which almost never held.
        l("[$pays] dabadoc $sp: scan=$scan ins=$ins");
    }

    l("[$pays] dabadoc DONE: scanned=$scan inserted=$ins");

    return $ins;
}
|
|
|
|
// 1SANTE.DZ DEEP (48 wilayas)
|
|
/**
 * Scrape the 1sante.dz doctor directory, wilaya by wilaya.
 *
 * Up to 20 pages are fetched per wilaya. Names are any run of two or more
 * capitalised words (an optional leading "Dr." is skipped); phone numbers are
 * ten-digit sequences starting with 05, 06 or 07. Paging for a wilaya stops on
 * a failed or very short response, or when a page after the first inserts nothing.
 *
 * NOTE(review): names and phones are paired purely by their position in the
 * page (n-th name with n-th phone); nothing in the code ties them to the same
 * listing entry.
 *
 * @param PDO   $pdo   Database handle forwarded to ins().
 * @param array $specs Unused here; kept so the call signature matches the
 *                     other scrapers.
 * @return int Number of rows inserted.
 */
function sante1dz($pdo, $specs) {
    $src = '1sante_boost';
    $ins = 0;

    $wilayas = ['adrar','chlef','laghouat','oum-el-bouaghi','batna','bejaia','biskra','bechar','blida','bouira',
        'tamanrasset','tebessa','tlemcen','tiaret','tizi-ouzou','alger','djelfa','jijel','setif','saida','skikda',
        'sidi-bel-abbes','annaba','guelma','constantine','medea','mostaganem','msila','mascara','ouargla','oran',
        'el-bayadh','illizi','bordj-bou-arreridj','boumerdes','el-tarf','tindouf','tissemsilt','el-oued','khenchela',
        'souk-ahras','tipaza','mila','ain-defla','naama','ain-temouchent','ghardaia','relizane'];

    l("[ALG] 1sante.dz START - ".count($wilayas)." wilayas");

    foreach ($wilayas as $w) {
        $ville = ucfirst(str_replace('-', ' ', $w));

        for ($pg = 1; $pg <= 20; $pg++) {
            $h = g("https://www.1sante.dz/annuaire/medecin/$w?page=$pg");
            if (!$h || strlen($h) < 500) {
                break;
            }

            preg_match_all('/(?:Dr\.?\s+)?([A-Z\x{00C0}-\x{024F}][a-z\x{00C0}-\x{024F}]+(?:\s+[A-Z\x{00C0}-\x{024F}][a-z\x{00C0}-\x{024F}]+)+)/u', $h, $names);
            preg_match_all('/(?:0[5-7]\d{2}[\s.\-]?\d{2}[\s.\-]?\d{2}[\s.\-]?\d{2})/', $h, $phones);

            // Use capture group 1 (the bare name). The full match in group 0
            // still contains the optional "Dr." prefix, which previously ended
            // up stored as the first name. Both groups share the same indexes,
            // so the positional phone pairing is unchanged.
            $found = $names[1] ?? [];
            $tels  = $phones[0] ?? [];

            $pi = 0;
            foreach ($found as $idx => $fn) {
                $parts = explode(' ', trim($fn), 2);
                $pi += ins($pdo, [
                    'pays'       => 'ALG',
                    'source'     => $src,
                    'specialite' => 'Medecin',
                    'ville'      => $ville,
                    'prenom'     => $parts[0],
                    'nom'        => $parts[1] ?? $fn,
                    'telephone'  => $tels[$idx] ?? '',
                ]);
            }
            $ins += $pi;

            // A later page that adds nothing new ends this wilaya.
            if ($pi == 0 && $pg > 1) {
                break;
            }

            usleep(rand(300000, 800000));
        }

        l("[ALG] 1sante $w: total=$ins");
    }

    l("[ALG] 1sante DONE: inserted=$ins");

    return $ins;
}
|
|
|
|
// DOCTORALIA.MA
|
|
/**
 * Scrape doctoralia.ma for every (city, specialty) pair and store the doctors found.
 *
 * Up to 20 pages are requested per pair. Each page is scanned first for
 * "Physician" JSON-LD names and then for links whose text looks like a
 * capitalised multi-word name; every candidate goes through ins(). Paging for
 * a pair ends on a failed or short (< 1000 bytes) response, or when a page
 * yields no candidate of either kind.
 *
 * @param PDO   $pdo   Database handle forwarded to ins().
 * @param array $specs Specialty slugs used as the first URL path segment.
 * @return int Number of rows inserted.
 */
function doctoralia($pdo, $specs) {
    $src = 'doctoralia_ma';
    $ins = 0;

    $villes = [
        'casablanca', 'rabat', 'marrakech', 'fes', 'tanger', 'agadir', 'meknes', 'oujda', 'kenitra', 'tetouan',
        'safi', 'el-jadida', 'nador', 'beni-mellal', 'khouribga', 'settat', 'mohammedia', 'taza', 'sale', 'temara',
    ];

    l("[MA] doctoralia.ma START - ".count($villes)." cities");

    foreach ($villes as $v) {
        // Human-readable city label stored with every record of this city.
        $cityLabel = ucfirst(str_replace('-', ' ', $v));

        foreach ($specs as $sp) {
            $specLabel = ucfirst(str_replace('-', ' ', $sp));

            for ($pg = 1; $pg <= 20; $pg++) {
                $html = g("https://www.doctoralia.ma/$sp/$v?page=$pg");
                if (!($html && strlen($html) >= 1000)) {
                    break;
                }

                // Pass 1: structured data, names only.
                preg_match_all('/"@type"\s*:\s*"Physician"[^}]*?"name"\s*:\s*"([^"]+)"/s', $html, $jm);
                foreach ($jm[1] as $fullName) {
                    $pieces = explode(' ', trim($fullName), 2);
                    $record = [
                        'pays'       => 'MA',
                        'source'     => $src,
                        'specialite' => $specLabel,
                        'prenom'     => $pieces[0],
                        'nom'        => $pieces[1] ?? $fullName,
                        'ville'      => $cityLabel,
                    ];
                    $ins += ins($pdo, $record);
                }

                // Pass 2: anchors; group 1 is the path after the leading slash,
                // group 2 the visible name.
                preg_match_all('/href="\/([^"]+)"[^>]*>\s*(?:Dr\.?\s*)?([A-Z\x{00C0}-\x{024F}][a-z\x{00C0}-\x{024F}]+(?:\s+[A-Z\x{00C0}-\x{024F}][a-z\x{00C0}-\x{024F}]+)+)/u', $html, $lnk, PREG_SET_ORDER);
                foreach ($lnk as $anchor) {
                    $pieces = explode(' ', trim($anchor[2]), 2);
                    $record = [
                        'pays'        => 'MA',
                        'source'      => $src,
                        'specialite'  => $specLabel,
                        'prenom'      => $pieces[0],
                        'nom'         => $pieces[1] ?? $anchor[2],
                        'ville'       => $cityLabel,
                        'profile_url' => 'https://www.doctoralia.ma/'.$anchor[1],
                    ];
                    $ins += ins($pdo, $record);
                }

                // Nothing recognisable on this page: stop paging this pair.
                if (count($jm[1]) == 0 && count($lnk) == 0) {
                    break;
                }

                usleep(rand(500000, 1500000));
            }
        }

        l("[MA] doctoralia $v: total=$ins");
    }

    l("[MA] doctoralia DONE: inserted=$ins");

    return $ins;
}
|
|
|
|
// MAIN
|
|
// Driver: log per-country row counts, run the scrapers selected by $mode,
// then log the inserted total and the per-country counts again.
l("=== BOOST START mode=$mode ===");

// Logs one line per country with its current row count, largest first.
$logCounts = function () use ($pdo) {
    $stmt = $pdo->query("SELECT pays,COUNT(*) c FROM ethica.medecins_real GROUP BY pays ORDER BY c DESC");
    while ($row = $stmt->fetch(PDO::FETCH_ASSOC)) {
        l(" {$row['pays']}: {$row['c']}");
    }
};

$logCounts();

$tot = 0;
$wantAlg = ($mode == 'all' || $mode == 'alg');
$wantMa  = ($mode == 'all' || $mode == 'ma');

if ($wantAlg) {
    l("--- ALG ---");
    $tot += dabadoc($pdo, 'dz', $specs);
    $tot += sante1dz($pdo, $specs);
}

if ($wantMa) {
    l("--- MA ---");
    $tot += dabadoc($pdo, 'ma', $specs);
    $tot += doctoralia($pdo, $specs);
}

l("=== TOTAL INSERTED: $tot ===");

$logCounts();

l("=== DONE ===");
|