true, CURLOPT_FOLLOWLOCATION => true, CURLOPT_TIMEOUT => 30, CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', CURLOPT_SSL_VERIFYPEER => false, ]); $html = curl_exec($ch); $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE); curl_close($ch); if($httpCode != 200 || empty($html)) { $log(" SKIP $url (HTTP $httpCode)"); continue; } // Extract ALL emails from the page preg_match_all('/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/', $html, $matches); $emails = array_unique($matches[0]); // Filter by target domains $filtered = []; foreach($emails as $email) { $email = strtolower(trim($email)); $emailDomain = substr($email, strpos($email, '@') + 1); foreach($domains as $d) { if(trim($d) === $emailDomain) { $filtered[] = $email; break; } } } // Also grab ALL emails regardless of domain (valuable data) $allEmails = []; foreach($emails as $email) { $email = strtolower(trim($email)); if(filter_var($email, FILTER_VALIDATE_EMAIL) && !preg_match('/(example|test|noreply|no-reply|info@|admin@|support@|webmaster@)/', $email)) { $allEmails[] = $email; } } $log(" $url → ".count($allEmails)." emails (".count($filtered)." 
target ISP)"); // Insert into scrapping_results foreach($allEmails as $email) { $emailDomain = substr($email, strpos($email, '@') + 1); $esc_email = pg_escape_string($db, $email); $esc_url = pg_escape_string($db, $url); pg_query($db, "INSERT INTO scrapping_results (target_id, email, source_url, confidence_score, is_verified, extracted_at) VALUES ({$t['id']}, '$esc_email', '$esc_url', 60, true, NOW()) ON CONFLICT DO NOTHING"); } // Insert into send_contacts foreach($allEmails as $email) { $emailDomain = substr($email, strpos($email, '@') + 1); $esc_email = pg_escape_string($db, $email); // Detect ISP $detectedIsp = 'OTHER'; $ispMap = [ 'gmail'=>['gmail.com','googlemail.com'], 'hotmail'=>['hotmail.com','hotmail.de','hotmail.fr','hotmail.co.uk','live.com','live.de','live.se','live.co.uk','outlook.com','outlook.de','outlook.fr','msn.com'], 'yahoo'=>['yahoo.com','yahoo.fr','yahoo.de','yahoo.co.uk','yahoo.ca','ymail.com','rocketmail.com'], 'aol'=>['aol.com','aol.de','aol.fr'], 'gmx'=>['gmx.de','gmx.net','gmx.at','gmx.ch'], 'tonline'=>['t-online.de'], 'webde'=>['web.de'], 'orange'=>['orange.fr','wanadoo.fr'], 'free'=>['free.fr'], 'sfr'=>['sfr.fr','neuf.fr'], 'laposte'=>['laposte.net'], 'comcast'=>['comcast.net','xfinity.com'], 'att'=>['att.net','sbcglobal.net','bellsouth.net'], 'bluewin'=>['bluewin.ch'], 'videotron'=>['videotron.ca'], 'proton'=>['protonmail.com','proton.me'], ]; foreach($ispMap as $ispName => $ispDomains) { if(in_array($emailDomain, $ispDomains)) { $detectedIsp = $ispName; break; } } pg_query($db, "INSERT INTO send_contacts (email, isp, domain, country, status, source) VALUES ('$esc_email', '$detectedIsp', '$emailDomain', '', 'active', 'auto_scrape') ON CONFLICT (email) DO NOTHING"); if(pg_affected_rows(pg_query($db, "SELECT 1")) >= 0) $found++; } $total_new += $found; } // Update last_run pg_query($db, "UPDATE scraping_targets SET last_run=NOW(), emails_found=emails_found+$found WHERE id={$t['id']}"); $log(" → $found new contacts injected for $isp"); } 
// Log the grand total accumulated in $total_new by the loops above.
$log("TOTAL: $total_new new contacts");

// Final stats: count of active rows in send_contacts per ISP, largest group first.
$r = pg_query($db, "SELECT isp, COUNT(*) as c FROM send_contacts WHERE status='active' GROUP BY isp ORDER BY c DESC");
$log("=== SEND CONTACTS BY ISP ===");

if ($r === false) {
    // pg_query() returns false on failure, and that value must not be handed to
    // pg_fetch_assoc(); report the database error instead of crashing the report.
    $log(" stats query failed: " . pg_last_error($db));
} else {
    // One log line per ISP; pg_fetch_assoc() returns false once the rows are exhausted.
    while ($row = pg_fetch_assoc($r)) {
        $log(" {$row['isp']}: {$row['c']}");
    }
}