146 lines
5.8 KiB
PHP
146 lines
5.8 KiB
PHP
<?php
|
|
/**
 * AUTO-SCRAPER: scrapes the missing ISPs and injects the results into send_contacts.
 * Uses admin.scraping_targets for the sources.
 *
 * Cron: every 6 hours (e.g. `0 0,6,12,18 * * *`) php /opt/wevads/scripts/auto-scraper.php
 */
|
|
|
|
// Timezone and logger are set up first so that a connection failure can be
// reported with a correctly timestamped message.
date_default_timezone_set('Europe/Paris');

// Timestamped logger: prints "[Y-m-d H:i:s] message" to stdout.
$log = function($msg) { echo '['.date('Y-m-d H:i:s').'] '.$msg."\n"; };

// NOTE(review): credentials are hard-coded in the script; consider loading
// them from the environment or a config file kept out of version control.
$db = pg_connect('host=localhost dbname=adx_system user=admin password=admin123');
if ($db === false) {
    // Every later pg_* call needs a live connection; stop here instead of
    // cascading into warnings/TypeErrors further down.
    $log('FATAL: database connection failed');
    exit(1);
}

// Unqualified table names below resolve against the admin schema first.
pg_query($db, 'SET search_path TO admin,public');
|
|
|
|
// Load active targets, least recently run first. Targets that have never run
// (last_run IS NULL) sort as 2000-01-01, i.e. ahead of everything else.
$targets = [];
$r = pg_query($db, "SELECT * FROM scraping_targets WHERE status='active' ORDER BY COALESCE(last_run,'2000-01-01') ASC");
if ($r === false) {
    // pg_fetch_assoc(false) is a TypeError on PHP 8; report the failure and
    // carry on with an empty target list instead.
    $log('ERROR: could not load scraping targets: '.pg_last_error($db));
} else {
    while (($row = pg_fetch_assoc($r)) !== false) {
        $targets[] = $row;
    }
}

$log(count($targets).' scraping targets loaded');
|
|
|
|
// Main pass: for every target, fetch each source URL, extract e-mail
// addresses, record them in scrapping_results and inject them into
// send_contacts. $found counts rows actually inserted into send_contacts for
// the current target; $total_new is the sum over all targets.
//
// NOTE(review): harvested addresses are stored as is_verified=true and
// status='active' without any verification or consent step visible here;
// confirm this complies with the applicable anti-spam / privacy rules.
$total_new = 0;

// ISP label => mailbox domains. Flattened below into a domain => label lookup
// so it is built once instead of once per e-mail.
$ispMap = [
    'gmail'=>['gmail.com','googlemail.com'],
    'hotmail'=>['hotmail.com','hotmail.de','hotmail.fr','hotmail.co.uk','live.com','live.de','live.se','live.co.uk','outlook.com','outlook.de','outlook.fr','msn.com'],
    'yahoo'=>['yahoo.com','yahoo.fr','yahoo.de','yahoo.co.uk','yahoo.ca','ymail.com','rocketmail.com'],
    'aol'=>['aol.com','aol.de','aol.fr'],
    'gmx'=>['gmx.de','gmx.net','gmx.at','gmx.ch'],
    'tonline'=>['t-online.de'],
    'webde'=>['web.de'],
    'orange'=>['orange.fr','wanadoo.fr'],
    'free'=>['free.fr'],
    'sfr'=>['sfr.fr','neuf.fr'],
    'laposte'=>['laposte.net'],
    'comcast'=>['comcast.net','xfinity.com'],
    'att'=>['att.net','sbcglobal.net','bellsouth.net'],
    'bluewin'=>['bluewin.ch'],
    'videotron'=>['videotron.ca'],
    'proton'=>['protonmail.com','proton.me'],
];
$ispByDomain = [];
foreach ($ispMap as $ispName => $ispDomains) {
    foreach ($ispDomains as $ispDomain) {
        $ispByDomain[$ispDomain] = $ispName;
    }
}

// Splits a brace-wrapped, comma-separated column value (presumably a Postgres
// array literal - TODO confirm column types) into trimmed, non-empty items.
// A NULL column yields an empty list.
$parsePgList = function ($raw) {
    $clean = str_replace(['{', '}', '"'], '', (string) $raw);
    $items = [];
    foreach (explode(',', $clean) as $item) {
        $item = trim($item);
        if ($item !== '') {
            $items[] = $item;
        }
    }
    return $items;
};

// Parameterised statements: values are bound by the driver, never interpolated.
$sqlInsertResult = 'INSERT INTO scrapping_results (target_id, email, source_url, confidence_score, is_verified, extracted_at)
    VALUES ($1, $2, $3, 60, true, NOW())
    ON CONFLICT DO NOTHING';
$sqlInsertContact = 'INSERT INTO send_contacts (email, isp, domain, country, status, source)
    VALUES ($1, $2, $3, $4, $5, $6)
    ON CONFLICT (email) DO NOTHING';
$sqlUpdateTarget = 'UPDATE scraping_targets SET last_run=NOW(), emails_found=emails_found+$1 WHERE id=$2';

foreach ($targets as $t) {
    $isp = $t['isp_target'];
    // The "keywords" column is used as the list of mailbox domains of the target ISP.
    $targetDomains = array_flip($parsePgList($t['keywords']));
    $sources = $parsePgList($t['source_urls']);

    $log("Scraping target: {$t['name']} (ISP: $isp)");

    $found = 0;
    foreach ($sources as $url) {
        // Fetch the page
        $ch = curl_init($url);
        if ($ch === false) {
            $log(" SKIP $url (curl init failed)");
            continue;
        }
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_TIMEOUT => 30,
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
            // NOTE(review): TLS peer verification is disabled, so fetched
            // content can be tampered with in transit. Kept as-is to avoid
            // changing which pages load; enable it if the sources allow.
            CURLOPT_SSL_VERIFYPEER => false,
        ]);
        $html = curl_exec($ch);
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        if ($httpCode != 200 || empty($html)) {
            $log(" SKIP $url (HTTP $httpCode)");
            continue;
        }

        // Extract ALL e-mails from the page. Lower-case before de-duplicating
        // so that case variants of one address collapse into a single entry.
        preg_match_all('/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/', $html, $matches);
        $emails = array_unique(array_map('strtolower', $matches[0]));

        // One pass: count addresses on the target ISP's domains (reporting
        // only) and keep every syntactically valid address that does not look
        // like a placeholder or role mailbox.
        $targetCount = 0;
        $allEmails = [];
        foreach ($emails as $email) {
            $emailDomain = substr($email, strpos($email, '@') + 1);
            if (isset($targetDomains[$emailDomain])) {
                $targetCount++;
            }
            if (filter_var($email, FILTER_VALIDATE_EMAIL) && !preg_match('/(example|test|noreply|no-reply|info@|admin@|support@|webmaster@)/', $email)) {
                $allEmails[] = $email;
            }
        }

        $log(" $url → ".count($allEmails)." emails (".$targetCount." target ISP)");

        foreach ($allEmails as $email) {
            // Insert into scrapping_results (duplicates are silently skipped).
            pg_query_params($db, $sqlInsertResult, [$t['id'], $email, $url]);

            // Detect ISP from the mailbox domain; unknown domains are 'OTHER'.
            $emailDomain = substr($email, strpos($email, '@') + 1);
            $detectedIsp = isset($ispByDomain[$emailDomain]) ? $ispByDomain[$emailDomain] : 'OTHER';

            // Insert into send_contacts
            $res = pg_query_params($db, $sqlInsertContact, [$email, $detectedIsp, $emailDomain, '', 'active', 'auto_scrape']);

            // ON CONFLICT DO NOTHING reports 0 affected rows for an existing
            // address, so only genuinely new contacts are counted.
            if ($res !== false && pg_affected_rows($res) > 0) {
                $found++;
            }
        }
    }

    // $found is cumulative across this target's sources, so it is added to
    // the grand total exactly once per target.
    $total_new += $found;

    // Update last_run
    pg_query_params($db, $sqlUpdateTarget, [$found, $t['id']]);
    $log(" → $found new contacts injected for $isp");
}

$log("TOTAL: $total_new new contacts");
|
|
|
|
// Final stats: number of active contacts per ISP, largest group first.
$r = pg_query($db, "SELECT isp, COUNT(*) as c FROM send_contacts WHERE status='active' GROUP BY isp ORDER BY c DESC");
$log("=== SEND CONTACTS BY ISP ===");
if ($r === false) {
    // pg_fetch_assoc(false) is a TypeError on PHP 8; report the failure instead.
    $log('ERROR: could not read send_contacts stats: '.pg_last_error($db));
} else {
    while (($row = pg_fetch_assoc($r)) !== false) {
        $log(" {$row['isp']}: {$row['c']}");
    }
}
|