Files
wevads-platform/scripts/ethica/ethica-scraper.php
2026-02-26 04:53:11 +01:00

149 lines
6.0 KiB
PHP

<?php
/**
* ETHICA SCRAPER — Médecins Maghreb (MA/TN/DZ)
* Usage: php ethica-scraper.php [specialite] [pays]
* Cron: 0 3 1,15 * * php /opt/wevads/scripts/ethica/ethica-scraper.php
*/
// Runtime setup: report every error level and set the execution time limit to 600 seconds.
error_reporting(E_ALL);
set_time_limit(600);

// PostgreSQL connection; PDO is told to throw exceptions on SQL errors.
$db = new PDO(
    "pgsql:host=localhost;dbname=adx_system",
    "admin",
    "admin123",
    [PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION]
);

// Specialty slugs iterated when the first CLI argument is absent or 'all'.
$specialites = [
    'generaliste',
    'pharmacien',
    'gastro-enterologue',
    'pediatre',
    'pneumologue',
    'allergologue',
    'orl',
    'rhumatologue',
    'medecin-physique',
    'orthopediste',
    'dentiste',
    'medecin-travail',
    'gynecologue',
    'cardiologue',
    'dermatologue',
    'ophtalmologue',
    'urologue',
    'endocrinologue',
    'neurologue',
    'oncologue',
    'nephrologue',
    'chirurgien',
    'radiologue',
    'anesthesiste',
    'psychiatre',
    'medecin-interne',
    'hematologue',
    'infectiologue',
];

// Country codes iterated when the second CLI argument is absent or 'all',
// and the country names used when building search queries.
$pays_list = ['MA', 'TN', 'DZ'];
$pays_names = [
    'MA' => 'Maroc',
    'TN' => 'Tunisie',
    'DZ' => 'Algerie',
];

// Optional CLI filters: php ethica-scraper.php [specialite] [pays]
$target_spec = $argv[1] ?? 'all';
$target_pays = $argv[2] ?? 'all';

// Cities searched for each country code.
$cities = [
    'MA' => [
        'Casablanca', 'Rabat', 'Marrakech', 'Fes', 'Tanger', 'Meknes',
        'Agadir', 'Oujda', 'Kenitra', 'Tetouan', 'Safi', 'El Jadida',
        'Nador', 'Beni Mellal', 'Mohammedia', 'Khouribga', 'Settat',
    ],
    'TN' => [
        'Tunis', 'Sfax', 'Sousse', 'Kairouan', 'Bizerte', 'Gabes',
        'Ariana', 'Gafsa', 'Monastir', 'Ben Arous', 'Kasserine',
        'Medenine', 'Nabeul', 'Tataouine',
    ],
    'DZ' => [
        'Alger', 'Oran', 'Constantine', 'Annaba', 'Blida', 'Batna',
        'Setif', 'Djelfa', 'Biskra', 'Sidi Bel Abbes', 'Tlemcen',
        'Bejaia', 'Tiaret', 'Tizi Ouzou', 'Bouira',
    ],
];
/**
 * Echo a message prefixed with the current time (HH:MM:SS) and followed by a newline.
 *
 * @param string $msg Text to print.
 * @return void
 */
function log_msg($msg) {
    $stamp = date('H:i:s');
    echo "{$stamp} {$msg}\n";
}
/**
 * Collect e-mail addresses from a Google web-search results page.
 *
 * Despite its name, this function does not call the Google Maps / Places API:
 * it requests https://www.google.com/search with the query
 * "medecin <specialite> <city> <pays_name> email" and extracts every
 * e-mail-shaped token from the returned HTML.
 *
 * @param string $specialite Specialty slug included in the search query.
 * @param string $city       City name; may contain spaces.
 * @param string $pays_code  Country code; kept for call compatibility, not used here.
 * @param string $pays_name  Country name included in the search query.
 * @return string[] Lower-cased, de-duplicated addresses; empty when the fetch fails.
 */
function scrape_google_maps($specialite, $city, $pays_code, $pays_name) {
    // Encode the whole query so multi-word cities such as "El Jadida" cannot
    // put raw spaces (or other reserved characters) into the URL. urlencode()
    // turns spaces into "+", yielding the q=medecin+...+email form.
    $query = urlencode("medecin $specialite $city $pays_name email");

    $ch = curl_init();
    curl_setopt_array($ch, [
        CURLOPT_URL => "https://www.google.com/search?q=$query",
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_TIMEOUT => 30,
        CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        CURLOPT_HTTPHEADER => ['Accept-Language: fr-FR,fr;q=0.9'],
    ]);
    $html = curl_exec($ch);
    curl_close($ch);

    // curl_exec() returns false on transport errors; scraping is best-effort,
    // so a failed fetch simply yields no addresses.
    if (!is_string($html) || $html === '') {
        return [];
    }

    // Extract emails from HTML
    $emails = [];
    $blocked = ['google', 'example', 'sentry'];
    if (preg_match_all('/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/', $html, $matches)) {
        foreach ($matches[0] as $email) {
            $email = strtolower(trim($email));
            // Filter out junk: addresses containing any blocked word are dropped.
            foreach ($blocked as $word) {
                if (strpos($email, $word) !== false) {
                    continue 2;
                }
            }
            if (strlen($email) > 5 && strlen($email) < 100) {
                $emails[] = $email;
            }
        }
    }

    // array_unique() keeps the original keys; reindex so callers get a plain list.
    return array_values(array_unique($emails));
}
/**
 * Collect e-mail addresses from a Pages Jaunes directory search page.
 *
 * Requests https://www.<domain>/recherche/<query>/<city>, where the domain is
 * chosen from the country code, and extracts every e-mail-shaped token from
 * the HTML of a 200 response.
 *
 * @param string $specialite Specialty slug appended to "medecin" in the search term.
 * @param string $city       City name used as the location path segment.
 * @param string $pays_code  Country code; unknown codes fall back to pagesjaunes.ma.
 * @return string[] Lower-cased, de-duplicated addresses; empty on a failed or non-200 fetch.
 */
function scrape_pages_jaunes($specialite, $city, $pays_code) {
    $domains = ['MA'=>'pagesjaunes.ma','TN'=>'pagesjaunes.tn','DZ'=>'pagesjaunes-dz.com'];
    $domain = $domains[$pays_code] ?? 'pagesjaunes.ma';

    // Both values are URL *path segments*, so they need rawurlencode():
    // urlencode() is the form/query encoding and would turn spaces into "+",
    // which is a literal plus sign inside a path.
    $query = rawurlencode("medecin $specialite");
    $loc = rawurlencode($city);
    $url = "https://www.$domain/recherche/$query/$loc";

    $ch = curl_init();
    curl_setopt_array($ch, [
        CURLOPT_URL => $url,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_TIMEOUT => 30,
        CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    ]);
    $html = curl_exec($ch);
    $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    // Only a successful response with a body is parsed; curl_exec() returns
    // false on transport errors.
    if ((int) $code !== 200 || !is_string($html) || $html === '') {
        return [];
    }

    $emails = [];
    if (preg_match_all('/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/', $html, $matches)) {
        foreach ($matches[0] as $email) {
            $email = strtolower(trim($email));
            // Skip the directory's own addresses and implausibly short/long matches.
            if (strlen($email) > 5 && strlen($email) < 100 && strpos($email, 'pagesjaunes') === false) {
                $emails[] = $email;
            }
        }
    }

    // array_unique() keeps the original keys; reindex so callers get a plain list.
    return array_values(array_unique($emails));
}
// Main scrape loop: for every selected country / specialty / city, query both
// sources, insert the addresses found and record one scraping_log row.
$total_new = 0;
$total_found = 0;
$specs = ($target_spec === 'all') ? $specialites : [$target_spec];
// Country codes are matched case-insensitively so that "ma" behaves like "MA".
$pays = (strtolower($target_pays) === 'all') ? $pays_list : [strtoupper(trim($target_pays))];
$insert = $db->prepare("INSERT INTO ethica.medecins (email, specialite, ville, pays, source) VALUES (?,?,?,?,?) ON CONFLICT (email) DO NOTHING");
$log_insert = $db->prepare("INSERT INTO ethica.scraping_log (source_id, specialite, pays, emails_found, emails_new, duration_sec) VALUES (?,?,?,?,?,?)");
foreach ($pays as $p) {
    // An unknown code has no city list and no country name; report it rather
    // than silently scraping nothing.
    if (!isset($cities[$p], $pays_names[$p])) {
        log_msg("Unknown country code '$p' - skipped (expected one of: ".implode(', ', $pays_list).")");
        continue;
    }
    $city_list = $cities[$p];
    foreach ($specs as $spec) {
        foreach ($city_list as $city) {
            $start = time();
            log_msg("Scraping $spec / $city / $p...");
            // Google search
            $emails_g = scrape_google_maps($spec, $city, $p, $pays_names[$p]);
            // Pages Jaunes
            $emails_pj = scrape_pages_jaunes($spec, $city, $p);
            $all_emails = array_unique(array_merge($emails_g, $emails_pj));
            $new = 0;
            foreach ($all_emails as $email) {
                $insert->execute([$email, $spec, $city, $p, "scraper_".date('Y-m')]);
                // ON CONFLICT ... DO NOTHING: a non-zero row count is counted as a new address.
                if ($insert->rowCount() > 0) $new++;
            }
            $found = count($all_emails);
            $total_found += $found;
            $total_new += $new;
            $duration = time() - $start;
            log_msg(" Found: $found | New: $new | Time: {$duration}s");
            // Log
            // NOTE(review): source_id is hard-coded to 1 for both sources - confirm against ethica.scraping_sources.
            $log_insert->execute([1, $spec, $p, $found, $new, $duration]);
            // Rate limit: pause 0.5 s between cities.
            usleep(500000);
        }
    }
}
log_msg("=== DONE === Total found: $total_found | New: $total_new");
// Update source stats
// NOTE(review): this UPDATE has no WHERE clause, so every row of
// ethica.scraping_sources receives the same timestamp and count - confirm this is intended.
$db->exec("UPDATE ethica.scraping_sources SET last_scraped_at=NOW(), contacts_found=(SELECT COUNT(*) FROM ethica.medecins)");