Files
wevads-platform/scripts/ethica/scraper-tabibi.php

207 lines
8.9 KiB
PHP
Executable File

<?php
/**
 * TABIBI.TN Scraper v2
 * Mode: listing-based with pagination
 * Scrapes: listing pages -> profile URLs (all pages) -> visit each profile for contacts
 */
error_reporting(E_ALL);
set_time_limit(0);
ini_set('memory_limit', '512M');

// NOTE(review): database credentials are hard-coded in source; consider moving
// them to environment/configuration outside the repository.
$db = new PDO('pgsql:host=localhost;dbname=adx_system', 'admin', 'admin123');
$db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);

$LOG = '/opt/wevads/logs/ethica-tabibi-scraper.log';
$PID = '/tmp/ethica-tabibi.pid';

// Single-instance guard. The stored PID must be a positive integer before it
// is probed: an empty or corrupt PID file casts to 0, and posix_kill(0, 0)
// targets the caller's own process group and therefore always succeeds, which
// would make the script report "Already running" forever.
if (file_exists($PID)) {
    $existing_pid = (int) trim((string) @file_get_contents($PID));
    if ($existing_pid > 0 && posix_kill($existing_pid, 0)) {
        echo "Already running PID " . $existing_pid . "\n";
        exit(1);
    }
}
file_put_contents($PID, getmypid());

// Remove the PID file when the script ends through normal PHP shutdown.
register_shutdown_function(function () use ($PID) {
    @unlink($PID);
});
/**
 * Print a time-stamped message to stdout and append it to the log file.
 *
 * Stdout receives "HH:MM:SS <message>"; the file named by the global $LOG
 * receives the same line prefixed with "YYYY-MM-DD ". Both prefixes come from
 * a single clock reading so the date and time cannot disagree at midnight.
 * File logging is best-effort: write warnings are suppressed, and an unset or
 * empty $LOG skips the file write.
 *
 * @param mixed $m Message text (interpolated into the log line).
 * @return void
 */
function lg($m) {
    global $LOG;
    $now = time();
    $line = date('H:i:s', $now) . " $m\n";
    echo $line;
    if (is_string($LOG) && $LOG !== '') {
        @file_put_contents($LOG, date('Y-m-d ', $now) . $line, FILE_APPEND);
    }
}
// Pool of desktop browser User-Agent strings; fetch() picks one at random for
// every request attempt.
$AGENTS = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/122.0.0.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 Safari/605.1.15',
'Mozilla/5.0 (X11; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0',
];
/**
 * Download a page with cURL, retrying on failure.
 *
 * Each attempt uses a fresh handle and a User-Agent picked at random from the
 * global $AGENTS pool. An attempt succeeds when the final HTTP status is in
 * [200, 400) and the body is longer than 500 bytes (presumably to reject
 * stub/error pages). Between failed attempts the function sleeps 1-2 seconds.
 * A 404 or 410 status ends the attempts immediately, since repeating the
 * request cannot change a "not found"/"gone" answer.
 *
 * @param string $url     Absolute URL to fetch.
 * @param int    $retries Number of extra attempts after the first one.
 * @return string|false   Response body, or false when no attempt succeeded.
 */
function fetch($url, $retries = 2) {
    global $AGENTS;
    for ($try = 0; $try <= $retries; $try++) {
        $ch = curl_init($url);
        if ($ch === false) {
            return false; // handle could not be created; retrying will not help
        }
        // Fall back to a generic UA instead of crashing in array_rand() when
        // the pool is missing or empty.
        $agent = (is_array($AGENTS) && $AGENTS)
            ? $AGENTS[array_rand($AGENTS)]
            : 'Mozilla/5.0';
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => 1,
            CURLOPT_FOLLOWLOCATION => 1,
            CURLOPT_TIMEOUT => 20,
            CURLOPT_CONNECTTIMEOUT => 10,
            CURLOPT_USERAGENT => $agent,
            CURLOPT_HTTPHEADER => [
                'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language: fr-FR,fr;q=0.9,en;q=0.5',
                'Cache-Control: no-cache',
            ],
            CURLOPT_ENCODING => '', // empty string: accept every encoding cURL supports
            // NOTE(review): certificate verification is disabled, so responses
            // can be spoofed by a man-in-the-middle. Enable it once the host's
            // CA bundle is confirmed to validate the target site.
            CURLOPT_SSL_VERIFYPEER => 0,
            CURLOPT_COOKIESESSION => 1,
        ]);
        $html = curl_exec($ch);
        $code = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        // curl_exec() returns false on transport errors; only a real body counts.
        if (is_string($html) && $code >= 200 && $code < 400 && strlen($html) > 500) {
            return $html;
        }
        if ($code === 404 || $code === 410) {
            return false;
        }
        if ($try < $retries) {
            usleep(rand(1000000, 2000000));
        }
    }
    return false;
}
// Specialty slugs; each one is appended to https://tabibi.tn/ to form the
// listing URL that gets crawled.
$SPECIALTIES = [
    'allergologue', 'anesthesiste-reanimateur', 'cardiologue', 'chirurgien-general',
    'chirurgien-urologue', 'dentiste', 'dermatologue', 'endocrinologue',
    'gastro-enterologue', 'gynecologue', 'medecin-esthetique', 'medecine-generale',
    'nephrologue', 'neurochirurgie', 'neurologue', 'nutritionniste',
    'oncologue', 'ophtalmologue', 'orl', 'orthodontiste', 'orthopediste',
    'pediatre', 'pneumologue', 'psychiatre', 'radiologue',
    'radiotherapie-carcinologique', 'rhumatologue', 'urologue',
];

// Slug => specialty label stored in the database. Slugs without an entry are
// stored with their hyphens replaced by spaces.
$SPEC_MAP = [
    'medecine-generale' => 'generaliste',
    'chirurgien-general' => 'chirurgien',
    'chirurgien-urologue' => 'urologue',
    'radiotherapie-carcinologique' => 'oncologue',
    'anesthesiste-reanimateur' => 'anesthesiste',
    'neurochirurgie' => 'neurochirurgien',
    'medecin-esthetique' => 'esthetique',
];

// Governorate names searched for in profile pages as a city fallback.
$GOUVERNORATS = [
    'Tunis', 'Ariana', 'Ben Arous', 'Manouba', 'Nabeul', 'Zaghouan',
    'Bizerte', 'Beja', 'Jendouba', 'Le Kef', 'Siliana', 'Sousse',
    'Monastir', 'Mahdia', 'Sfax', 'Kairouan', 'Kasserine', 'Sidi Bouzid',
    'Gabes', 'Medenine', 'Tataouine', 'Gafsa', 'Tozeur', 'Kebili',
];

// Regex alternation of the governorate names. The pattern is embedded in a
// "/"-delimited expression, so the delimiter is passed to preg_quote() to keep
// the regex valid should a name ever contain a slash.
$GOV_PATTERN = implode('|', array_map(function ($name) {
    return preg_quote($name, '/');
}, $GOUVERNORATS));
// Prepared upsert into ethica.medecins_real, executed once per scraped profile.
// A row that conflicts on (nom, prenom, specialite, ville, pays) is updated
// instead of inserted: telephone, email, adresse and profile_url are replaced
// only when the incoming value is non-empty (NULLIF turns '' into NULL and
// COALESCE then keeps the stored value), and ',tabibi' is appended to source
// unless the stored value already contains 'tabibi'.
// NOTE(review): scraped_at and source_url are not refreshed on conflict, and a
// NULL stored source stays NULL (NULL NOT LIKE ... is not true, so the ELSE
// branch applies) — confirm both are intended.
$insert = $db->prepare("INSERT INTO ethica.medecins_real
(nom, prenom, specialite, ville, pays, telephone, email, adresse, source, source_url, profile_url, scraped_at)
VALUES (?,?,?,?,?,?,?,?,?,?,?,NOW())
ON CONFLICT (nom, prenom, specialite, ville, pays) DO UPDATE SET
telephone=COALESCE(NULLIF(EXCLUDED.telephone,''), ethica.medecins_real.telephone),
email=COALESCE(NULLIF(EXCLUDED.email,''), ethica.medecins_real.email),
adresse=COALESCE(NULLIF(EXCLUDED.adresse,''), ethica.medecins_real.adresse),
profile_url=COALESCE(NULLIF(EXCLUDED.profile_url,''), ethica.medecins_real.profile_url),
source=CASE WHEN ethica.medecins_real.source NOT LIKE '%tabibi%'
THEN ethica.medecins_real.source || ',tabibi' ELSE ethica.medecins_real.source END");
lg("=== TABIBI.TN SCRAPER v2 START ===");
// Run-wide counters: rows written, and how many of them carried a phone / an email.
$total = 0; $total_phones = 0; $total_emails = 0;
// ── Main crawl ──
// For every specialty slug: (1) walk the paginated listing and collect unique
// profile URLs keyed by their numeric ID, then (2) fetch each profile, extract
// name / phone / email / city / address and upsert the record via $insert.

// Lower-cased governorate name => spelling from $GOUVERNORATS, so that a
// case-insensitive hit in page text is stored with one consistent casing
// (ville is part of the upsert conflict key).
$gov_canonical = [];
foreach ($GOUVERNORATS as $gov_name) {
    $gov_canonical[strtolower($gov_name)] = $gov_name;
}

foreach ($SPECIALTIES as $spec_slug) {
    $spec_clean = $SPEC_MAP[$spec_slug] ?? str_replace('-', ' ', $spec_slug);
    lg(">> $spec_slug ($spec_clean)");

    // ── Step 1: Collect ALL profile URLs from ALL listing pages ──
    $profile_urls = [];
    $page = 1;
    $max_pages = 20; // safety cap
    $pages_scanned = 0; // pages actually downloaded (reported in the log below)
    while ($page <= $max_pages) {
        $list_url = "https://tabibi.tn/$spec_slug" . ($page > 1 ? "?page=$page" : "");
        $html = fetch($list_url);
        if (!$html) { lg(" Page $page unreachable"); break; }
        $pages_scanned++;

        // Extract profile links — deduplicate by numeric ID
        $found_on_page = 0;
        if (preg_match_all('#href="(/medecine/[^/]+/[^/]+/(\d+))"#', $html, $ms, PREG_SET_ORDER)) {
            foreach ($ms as $m) {
                if (!isset($profile_urls[$m[2]])) {
                    $profile_urls[$m[2]] = 'https://tabibi.tn' . $m[1];
                    $found_on_page++;
                }
            }
        }

        // Detect last page: no new profiles OR no next page link.
        // (?!\d) stops "page=2" from being satisfied by a "page=20" link.
        $next_link = '#/' . preg_quote($spec_slug, '#') . '\?page=' . ($page + 1) . '(?!\d)#';
        $has_next = preg_match($next_link, $html) === 1;
        if ($found_on_page === 0 || !$has_next) break;

        $page++;
        usleep(rand(500000, 1000000)); // 0.5-1s between pages
    }

    $count = count($profile_urls);
    lg(" $count unique profiles across $pages_scanned pages");
    if ($count === 0) { sleep(1); continue; }

    // ── Step 2: Visit each profile ──
    foreach ($profile_urls as $profile_url) {
        $html = fetch($profile_url);
        if (!$html) { usleep(rand(500000, 1200000)); continue; }

        $nom = ''; $prenom = ''; $phone = null; $email = null; $address = null; $ville = null;

        // Name: taken from the first <h1>, title prefix removed. Words written
        // entirely in upper case form the surname, the rest the given name.
        if (preg_match('/<h1[^>]*>([^<]+)/i', $html, $m)) {
            $raw = trim((string) preg_replace('/^(Dr|Pr|Prof)\.?\s+/iu', '', trim($m[1])));
            $parts = preg_split('/\s+/', $raw);
            $nom_p = []; $pre_p = [];
            foreach ($parts as $p) {
                if (strlen($p) > 1 && strtoupper($p) === $p) $nom_p[] = $p;
                else $pre_p[] = $p;
            }
            // Fallbacks: last word as surname, first word as given name.
            $nom = strtoupper(implode(' ', $nom_p ?: [end($parts)]));
            $prenom = ucwords(strtolower(implode(' ', $pre_p ?: [$parts[0]])));
        }
        if (!$nom || strlen($nom) < 2) { usleep(300000); continue; }

        // Phone: first usable tel: link, normalised to "+216 XX XXX XXX" when
        // it is an 8-digit number with or without the 216 country prefix.
        if (preg_match_all('/href="tel:([^"]+)"/i', $html, $ms)) {
            foreach ($ms[1] as $raw_phone) {
                $d = preg_replace('/\D/', '', $raw_phone);
                if (strlen($d) === 8) {
                    $phone = '+216 ' . substr($d, 0, 2) . ' ' . substr($d, 2, 3) . ' ' . substr($d, 5);
                } elseif (strlen($d) >= 11 && substr($d, 0, 3) === '216') {
                    $d2 = substr($d, 3);
                    $phone = '+216 ' . substr($d2, 0, 2) . ' ' . substr($d2, 2, 3) . ' ' . substr($d2, 5);
                } elseif (strlen($d) >= 7) {
                    $phone = trim($raw_phone); // unknown format: keep as published
                }
                if ($phone) break;
            }
        }

        // Email: first mailto: target that is a valid address and is not a
        // site/generic mailbox. Any "?subject=..." query is dropped first so
        // such links are not rejected by the validator.
        if (preg_match_all('/href="mailto:([^"]+)"/i', $html, $ms)) {
            foreach ($ms[1] as $c) {
                $c = strtolower(trim(explode('?', $c, 2)[0]));
                if (!preg_match('/tabibi\.tn|contact@|info@|admin@|noreply/i', $c) && filter_var($c, FILTER_VALIDATE_EMAIL)) {
                    $email = $c; break;
                }
            }
        }

        // City: prefer an element whose class mentions "gouvernorat"; otherwise
        // the first governorate name found in the page. Word boundaries keep
        // "Tunisie" from being read as "Tunis".
        if (preg_match("/class=\"[^\"]*gouvernorat[^\"]*\"[^>]*>([^<]+)/i", $html, $m)) {
            $ville = trim($m[1]);
        }
        if (!$ville && preg_match('/\b(' . $GOV_PATTERN . ')\b/iu', $html, $m)) {
            $hit = strtolower(trim($m[1]));
            $ville = $gov_canonical[$hit] ?? ucwords($hit);
        }
        if (!$ville) $ville = 'Tunisie';

        // Address: 10-100 characters following an "adresse ...:" label.
        if (preg_match('/adresse[^:]*:\s*([^\n\r<]{10,100})/i', $html, $m)) {
            $address = trim(html_entity_decode($m[1], ENT_QUOTES, 'UTF-8'));
        }

        try {
            $insert->execute([$nom, $prenom, $spec_clean, $ville, 'TN', $phone, $email, $address, 'tabibi', "https://tabibi.tn/$spec_slug", $profile_url]);
            $total++;
            if ($phone) $total_phones++;
            if ($email) $total_emails++;
            // Report only right after a successful write, so the line is not
            // emitted at total=0 or repeated while inserts are failing.
            if ($total % 100 === 0) lg(" Progress: total=$total phones=$total_phones emails=$total_emails");
        } catch (PDOException $e) {
            // Keep crawling, but leave a trace instead of discarding the error.
            lg(" Insert failed for $profile_url: " . $e->getMessage());
        }

        usleep(rand(600000, 1200000)); // 0.6-1.2s per profile
    }

    lg(" $spec_slug done — total: $total");
    sleep(rand(2, 5));
}
lg("=== TABIBI v2 COMPLETE: $total HCPs | $total_phones phones | $total_emails emails ===");