149 lines
6.0 KiB
PHP
149 lines
6.0 KiB
PHP
<?php
|
|
/**
|
|
* ETHICA SCRAPER — Médecins Maghreb (MA/TN/DZ)
|
|
* Usage: php ethica-scraper.php [specialite] [pays]
|
|
* Cron: 0 3 1,15 * * php /opt/wevads/scripts/ethica/ethica-scraper.php
|
|
*/
|
|
error_reporting(E_ALL);
set_time_limit(600);

// Database connection.
// Credentials can be overridden through the environment so they do not have
// to live in the source file; the historical values remain as fallbacks so an
// existing cron entry keeps working unchanged.
// NOTE(review): the fallback password should be removed once the environment
// variables are configured on the host.
$db_dsn  = getenv('ETHICA_DB_DSN') ?: "pgsql:host=localhost;dbname=adx_system";
$db_user = getenv('ETHICA_DB_USER') ?: "admin";
$db_pass = getenv('ETHICA_DB_PASS');
if ($db_pass === false) {
    // Only fall back when the variable is absent: an empty password is a legal value.
    $db_pass = "admin123";
}

// Errors are raised as exceptions for every statement run on this connection.
$db = new PDO($db_dsn, $db_user, $db_pass, [
    PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION,
]);

// Specialities searched when the first CLI argument is omitted or 'all'.
$specialites = ['generaliste','pharmacien','gastro-enterologue','pediatre','pneumologue','allergologue','orl','rhumatologue','medecin-physique','orthopediste','dentiste','medecin-travail','gynecologue','cardiologue','dermatologue','ophtalmologue','urologue','endocrinologue','neurologue','oncologue','nephrologue','chirurgien','radiologue','anesthesiste','psychiatre','medecin-interne','hematologue','infectiologue'];

// Country codes searched when the second CLI argument is omitted or 'all',
// and the country names appended to search queries.
$pays_list = ['MA','TN','DZ'];
$pays_names = ['MA'=>'Maroc','TN'=>'Tunisie','DZ'=>'Algerie'];

// CLI arguments: [specialite] [pays]; both default to 'all'.
$target_spec = $argv[1] ?? 'all';
$target_pays = $argv[2] ?? 'all';

// Cities queried for each country code.
$cities = [
    'MA' => ['Casablanca','Rabat','Marrakech','Fes','Tanger','Meknes','Agadir','Oujda','Kenitra','Tetouan','Safi','El Jadida','Nador','Beni Mellal','Mohammedia','Khouribga','Settat'],
    'TN' => ['Tunis','Sfax','Sousse','Kairouan','Bizerte','Gabes','Ariana','Gafsa','Monastir','Ben Arous','Kasserine','Medenine','Nabeul','Tataouine'],
    'DZ' => ['Alger','Oran','Constantine','Annaba','Blida','Batna','Setif','Djelfa','Biskra','Sidi Bel Abbes','Tlemcen','Bejaia','Tiaret','Tizi Ouzou','Bouira'],
];
|
|
|
|
/**
 * Write one line to standard output: the current time (H:i:s), a space,
 * the given message, and a trailing newline.
 *
 * @param mixed $msg Value interpolated into the output line.
 */
function log_msg($msg)
{
    $stamp = date('H:i:s');
    echo "{$stamp} {$msg}\n";
}
|
|
|
|
/**
 * Collect e-mail addresses from a Google web-search results page.
 *
 * Despite its name, this function does not call the Google Maps / Places API:
 * it downloads the HTML of a google.com search for
 * "medecin <specialite> <city> <pays_name> email" and extracts every substring
 * that looks like an e-mail address.
 *
 * NOTE(review): automated fetching of search result pages may be throttled or
 * disallowed by the provider - confirm this is acceptable before relying on it.
 *
 * @param string $specialite Speciality keyword added to the query.
 * @param string $city       City name; may contain spaces.
 * @param string $pays_code  Country code; not used here, kept so the signature
 *                           stays compatible with existing callers.
 * @param string $pays_name  Country name added to the query.
 * @return array Sequentially indexed list of unique, lower-cased addresses;
 *               empty when the request fails or nothing matches.
 */
function scrape_google_maps($specialite, $city, $pays_code, $pays_name) {
    // Encode the whole query so that multi-word cities ("El Jadida",
    // "Sidi Bel Abbes", ...) produce a valid URL. urlencode() maps spaces to
    // "+", so single-word inputs yield exactly the URL used previously.
    $query = urlencode("medecin $specialite $city $pays_name email");

    $ch = curl_init();
    curl_setopt_array($ch, [
        CURLOPT_URL => "https://www.google.com/search?q=$query",
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_TIMEOUT => 30,
        CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        CURLOPT_HTTPHEADER => ['Accept-Language: fr-FR,fr;q=0.9'],
    ]);
    $html = curl_exec($ch);
    curl_close($ch);

    // curl_exec() returns false on a transport error; there is nothing to parse then.
    if (!is_string($html) || $html === '') {
        return [];
    }

    // Extract emails from HTML
    $emails = [];
    if (preg_match_all('/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/', $html, $matches)) {
        foreach ($matches[0] as $email) {
            $email = strtolower(trim($email));

            // Filter out junk: addresses containing these substrings are skipped.
            if (strpos($email, 'google') !== false) continue;
            if (strpos($email, 'example') !== false) continue;
            if (strpos($email, 'sentry') !== false) continue;

            if (strlen($email) > 5 && strlen($email) < 100) {
                $emails[] = $email;
            }
        }
    }

    // array_unique() keeps the original keys; re-index so callers get a plain list.
    return array_values(array_unique($emails));
}
|
|
|
|
// Pages Jaunes scraping
|
|
/**
 * Collect e-mail addresses from a Pages Jaunes search results page.
 *
 * The directory host is chosen from the country code (falling back to the
 * Moroccan site), the page for "medecin <specialite>" in the given city is
 * downloaded, and every e-mail-looking substring is returned unless it
 * contains "pagesjaunes" or falls outside the accepted length range.
 *
 * NOTE(review): urlencode() encodes spaces as "+" inside the URL path -
 * confirm the directory sites accept that form for multi-word cities.
 *
 * @param string $specialite Speciality keyword added to the search term.
 * @param string $city       City used as the location path segment.
 * @param string $pays_code  Country code used to select the directory host.
 * @return array Unique lower-cased addresses (keys as left by array_unique());
 *               empty unless the response is HTTP 200 with a non-empty body.
 */
function scrape_pages_jaunes($specialite, $city, $pays_code) {
    $hosts = [
        'MA' => 'pagesjaunes.ma',
        'TN' => 'pagesjaunes.tn',
        'DZ' => 'pagesjaunes-dz.com',
    ];
    $host = isset($hosts[$pays_code]) ? $hosts[$pays_code] : 'pagesjaunes.ma';

    $term = urlencode("medecin $specialite");
    $place = urlencode($city);

    $handle = curl_init();
    curl_setopt_array($handle, [
        CURLOPT_URL => "https://www.$host/recherche/$term/$place",
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_TIMEOUT => 30,
        CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    ]);
    $body = curl_exec($handle);
    $status = curl_getinfo($handle, CURLINFO_HTTP_CODE);
    curl_close($handle);

    // Anything other than a successful, non-empty response yields no addresses.
    if ($status != 200 || !$body) {
        return [];
    }

    $found = [];
    if (preg_match_all('/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/', $body, $hits)) {
        foreach ($hits[0] as $raw) {
            $candidate = strtolower(trim($raw));
            $length = strlen($candidate);

            // Accept lengths 6..99 only.
            if ($length <= 5 || $length >= 100) {
                continue;
            }
            // Skip the directory's own addresses.
            if (strpos($candidate, 'pagesjaunes') !== false) {
                continue;
            }
            $found[] = $candidate;
        }
    }

    return array_unique($found);
}
|
|
|
|
// Main scrape loop
|
|
// Running totals across the whole run.
$total_new = 0;
$total_found = 0;

// Resolve the CLI selection into the lists to iterate over.
$specs = ($target_spec === 'all') ? $specialites : [$target_spec];
// The lookup tables are keyed by upper-case codes, so accept "ma" as well as "MA".
$pays = ($target_pays === 'all') ? $pays_list : [strtoupper($target_pays)];

// Rows whose e-mail already exists are skipped by the ON CONFLICT clause, so
// rowCount() after execute() tells whether the address was new.
$insert = $db->prepare("INSERT INTO ethica.medecins (email, specialite, ville, pays, source) VALUES (?,?,?,?,?) ON CONFLICT (email) DO NOTHING");
$log_insert = $db->prepare("INSERT INTO ethica.scraping_log (source_id, specialite, pays, emails_found, emails_new, duration_sec) VALUES (?,?,?,?,?,?)");

// Source tag stored with every inserted row; computed once so a run that
// crosses a month boundary is tagged consistently.
$source_tag = "scraper_".date('Y-m');

foreach ($pays as $p) {
    // An unknown code used to result in a silent no-op; report it instead.
    if (!isset($cities[$p], $pays_names[$p])) {
        log_msg("Unknown country code '$p' - skipping (expected one of: ".implode(', ', $pays_list).")");
        continue;
    }
    $city_list = $cities[$p];
    $pays_name = $pays_names[$p];

    foreach ($specs as $spec) {
        foreach ($city_list as $city) {
            $start = time();
            log_msg("Scraping $spec / $city / $p...");

            // Google search
            $emails_g = scrape_google_maps($spec, $city, $p, $pays_name);

            // Pages Jaunes
            $emails_pj = scrape_pages_jaunes($spec, $city, $p);

            // Merge both sources and drop duplicates found by both.
            $all_emails = array_unique(array_merge($emails_g, $emails_pj));
            $new = 0;

            foreach ($all_emails as $email) {
                $insert->execute([$email, $spec, $city, $p, $source_tag]);
                if ($insert->rowCount() > 0) $new++;
            }

            $found = count($all_emails);
            $total_found += $found;
            $total_new += $new;
            $duration = time() - $start;

            log_msg("  Found: $found | New: $new | Time: {$duration}s");

            // Log
            // NOTE(review): source_id is hard-coded to 1 - confirm this is the intended source row.
            $log_insert->execute([1, $spec, $p, $found, $new, $duration]);

            // Rate limit: pause half a second between city queries.
            usleep(500000);
        }
    }
}

log_msg("=== DONE === Total found: $total_found | New: $total_new");

// Update source stats
// NOTE(review): this UPDATE has no WHERE clause, so every row of
// ethica.scraping_sources receives the same timestamp and count - confirm
// whether it should be limited to the source row used in the log above.
$db->exec("UPDATE ethica.scraping_sources SET last_scraped_at=NOW(), contacts_found=(SELECT COUNT(*) FROM ethica.medecins)");
|