Files
wevads-platform/scripts/auto-scraper.php
2026-03-02 03:00:02 +01:00

146 lines
5.8 KiB
PHP

<?php
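/**
* AUTO-SCRAPER: scrapes the configured source pages for e-mail addresses and
* injects them into send_contacts.
* Sources are read from admin.scraping_targets.
*
* Cron: run every 6 hours, e.g.
*   0 0,6,12,18 * * * php /opt/wevads/scripts/auto-scraper.php
* (the step form of the hour field cannot be written here because it would
* terminate this comment block).
*/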
// --- Bootstrap: timezone, logger, database connection, target list ---------

date_default_timezone_set('Europe/Paris');

// Timestamped line logger; writes to stdout so cron can capture/redirect it.
$log = function($msg) { echo '['.date('Y-m-d H:i:s').'] '.$msg."\n"; };

// NOTE(review): the credentials are hard-coded in source. Consider loading
// them from the environment or from a config file kept out of version control.
$db = pg_connect('host=localhost dbname=adx_system user=admin password=admin123');
if ($db === false) {
    // Every later pg_* call needs a live connection; stop with a clear message
    // instead of failing on the first query.
    $log('ERROR: unable to connect to PostgreSQL');
    exit(1);
}

// Unqualified table names below resolve against the admin schema first.
if (pg_query($db, 'SET search_path TO admin,public') === false) {
    $log('ERROR: could not set search_path: '.pg_last_error($db));
    exit(1);
}

// Load the active targets, least recently run first; targets that have never
// run (NULL last_run) are treated as 2000-01-01 and therefore come first.
$targets = [];
$r = pg_query($db, "SELECT * FROM scraping_targets WHERE status='active' ORDER BY COALESCE(last_run,'2000-01-01') ASC");
if ($r === false) {
    $log('ERROR: could not load scraping targets: '.pg_last_error($db));
    exit(1);
}
while (($row = pg_fetch_assoc($r)) !== false) {
    $targets[] = $row;
}
$log(count($targets).' scraping targets loaded');

// Running total of contacts actually inserted across all targets.
$total_new = 0;
// --- Main loop: fetch each target's source pages and store found e-mails ----

// ISP label => mail domains. Defined once here (it is loop-invariant) and
// flattened into a domain => label lookup so detection is a single isset().
$ispMap = [
    'gmail'=>['gmail.com','googlemail.com'],
    'hotmail'=>['hotmail.com','hotmail.de','hotmail.fr','hotmail.co.uk','live.com','live.de','live.se','live.co.uk','outlook.com','outlook.de','outlook.fr','msn.com'],
    'yahoo'=>['yahoo.com','yahoo.fr','yahoo.de','yahoo.co.uk','yahoo.ca','ymail.com','rocketmail.com'],
    'aol'=>['aol.com','aol.de','aol.fr'],
    'gmx'=>['gmx.de','gmx.net','gmx.at','gmx.ch'],
    'tonline'=>['t-online.de'],
    'webde'=>['web.de'],
    'orange'=>['orange.fr','wanadoo.fr'],
    'free'=>['free.fr'],
    'sfr'=>['sfr.fr','neuf.fr'],
    'laposte'=>['laposte.net'],
    'comcast'=>['comcast.net','xfinity.com'],
    'att'=>['att.net','sbcglobal.net','bellsouth.net'],
    'bluewin'=>['bluewin.ch'],
    'videotron'=>['videotron.ca'],
    'proton'=>['protonmail.com','proton.me'],
];
$domainToIsp = [];
foreach ($ispMap as $ispName => $ispDomains) {
    foreach ($ispDomains as $ispDomain) {
        $domainToIsp[$ispDomain] = $ispName;
    }
}

foreach ($targets as $t) {
    $isp = $t['isp_target'];

    // keywords and source_urls are parsed as PostgreSQL array literals
    // ({a,b,"c"}): braces and quotes are stripped, then split on commas.
    // Target domains are lower-cased because extracted e-mails are too.
    $targetDomains = [];
    foreach (explode(',', str_replace(['{','}','"'], '', (string) $t['keywords'])) as $d) {
        $d = strtolower(trim($d));
        if ($d !== '') {
            $targetDomains[$d] = true;
        }
    }
    $sources = explode(',', str_replace(['{','}','"'], '', (string) $t['source_urls']));

    $log("Scraping target: {$t['name']} (ISP: $isp)");

    // Contacts actually inserted for this target, summed over all its sources.
    $found = 0;

    foreach ($sources as $url) {
        $url = trim($url);
        if (empty($url)) continue;

        // Fetch the page.
        $ch = curl_init($url);
        if ($ch === false) {
            $log(" SKIP $url (curl init failed)");
            continue;
        }
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_TIMEOUT => 30,
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
            // NOTE(review): certificate verification is disabled, so responses
            // can be tampered with in transit. Left unchanged to keep the set
            // of reachable sources identical; enable it if the hosts allow.
            CURLOPT_SSL_VERIFYPEER => false,
        ]);
        $html = curl_exec($ch);
        $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        // curl_exec() yields false on transport errors; empty() covers that too.
        if ($httpCode !== 200 || empty($html)) {
            $log(" SKIP $url (HTTP $httpCode)");
            continue;
        }

        // Extract every e-mail-looking token. Lower-case BEFORE de-duplicating
        // so that case variants of one address collapse into a single entry.
        preg_match_all('/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/', $html, $matches);
        $emails = array_unique(array_map('strtolower', $matches[0]));

        // Count (for the log line only) how many addresses are on a target domain.
        $targetHits = 0;
        foreach ($emails as $email) {
            $emailDomain = substr($email, strpos($email, '@') + 1);
            if (isset($targetDomains[$emailDomain])) {
                $targetHits++;
            }
        }

        // Keep all syntactically valid addresses regardless of domain, minus
        // obvious placeholders / role accounts.
        // NOTE(review): the exclusion pattern is an unanchored substring match,
        // so e.g. "contest@..." is dropped because it contains "test" —
        // confirm that this breadth is intended.
        $allEmails = [];
        foreach ($emails as $email) {
            if (filter_var($email, FILTER_VALIDATE_EMAIL) && !preg_match('/(example|test|noreply|no-reply|info@|admin@|support@|webmaster@)/', $email)) {
                $allEmails[] = $email;
            }
        }

        // Fix: the URL and the count used to be fused together in this message.
        $log(" $url: ".count($allEmails)." emails (".count($targetHits ? array_fill(0, $targetHits, 0) : [])." target ISP)");

        foreach ($allEmails as $email) {
            $emailDomain = substr($email, strpos($email, '@') + 1);
            $detectedIsp = isset($domainToIsp[$emailDomain]) ? $domainToIsp[$emailDomain] : 'OTHER';

            // Raw extraction record. Parameterised so that no scraped value
            // is ever spliced into the SQL text.
            $resRaw = pg_query_params(
                $db,
                'INSERT INTO scrapping_results (target_id, email, source_url, confidence_score, is_verified, extracted_at)
                 VALUES ($1, $2, $3, 60, true, NOW())
                 ON CONFLICT DO NOTHING',
                [$t['id'], $email, $url]
            );
            if ($resRaw === false) {
                $log(' DB error (scrapping_results): '.pg_last_error($db));
            }

            // Contact record; an address that already exists is left untouched.
            $resContact = pg_query_params(
                $db,
                "INSERT INTO send_contacts (email, isp, domain, country, status, source)
                 VALUES (\$1, \$2, \$3, '', 'active', 'auto_scrape')
                 ON CONFLICT (email) DO NOTHING",
                [$email, $detectedIsp, $emailDomain]
            );
            if ($resContact === false) {
                $log(' DB error (send_contacts): '.pg_last_error($db));
                continue;
            }

            // Fix: count only rows this INSERT really created. The previous
            // check inspected an unrelated "SELECT 1" with ">= 0", which is
            // always true, so skipped duplicates were counted as new.
            if (pg_affected_rows($resContact) > 0) {
                $found++;
            }
        }
    }

    // Fix: add the per-target count once, after all sources are processed.
    // It used to be added inside the source loop, where $found is cumulative,
    // which inflated the total for targets with several sources.
    $total_new += $found;

    // Record the run time and add this run's new contacts to the target's counter.
    $resUpdate = pg_query_params(
        $db,
        'UPDATE scraping_targets SET last_run=NOW(), emails_found=emails_found+$1 WHERE id=$2',
        [$found, $t['id']]
    );
    if ($resUpdate === false) {
        $log(' DB error (scraping_targets): '.pg_last_error($db));
    }

    $log("$found new contacts injected for $isp");
}
// --- Summary ----------------------------------------------------------------

$log("TOTAL: $total_new new contacts");

// Final stats: number of active contacts per ISP label, largest group first.
$r = pg_query($db, "SELECT isp, COUNT(*) as c FROM send_contacts WHERE status='active' GROUP BY isp ORDER BY c DESC");
$log("=== SEND CONTACTS BY ISP ===");
if ($r === false) {
    // A failed query returns false; report it instead of passing false on
    // to pg_fetch_assoc().
    $log('ERROR: could not read contact stats: '.pg_last_error($db));
} else {
    while (($row = pg_fetch_assoc($r)) !== false) {
        $log(" {$row['isp']}: {$row['c']}");
    }
}