<?php
/**
 * DARK SCRAPER CRON
 *
 * Script that, in order:
 *   1. loads every row of scrapping_targets,
 *   2. inserts a 'running' dark_scraper_jobs row for each target whose URL has
 *      no running job yet,
 *   3. fetches the URL of up to 10 running jobs with cURL, extracts e-mail
 *      addresses and phone-like strings from the response body, and writes
 *      the addresses to scrapping_results and send_contacts,
 *   4. stamps scrapping_targets.last_scraped and prints per-status job counts.
 *
 * All progress is written to standard output, one timestamped line per event.
 */

// Timestamped line logger; defined first so connection failures can be reported through it.
$log = function($m) { echo '['.date('Y-m-d H:i:s').'] '.$m."\n"; };

// NOTE(review): the credentials are hard-coded in source. Consider loading them
// from the environment or a config file that is kept out of version control.
$db = pg_connect('host=localhost dbname=adx_system user=admin password=admin123');
if ($db === false) {
    // pg_connect() returns false on failure; every later call needs a live connection.
    $log('FATAL: could not connect to PostgreSQL');
    exit(1);
}

// Resolve unqualified table names against the admin schema first, then public.
if (pg_query($db, 'SET search_path TO admin,public') === false) {
    $log('FATAL: could not set search_path: '.pg_last_error($db));
    exit(1);
}

// 1. Load all scrapping targets, ordered by ascending priority value.
$targets = [];
$r = pg_query($db, "SELECT id, name, url_pattern, category, daily_limit FROM scrapping_targets ORDER BY priority ASC");
if ($r === false) {
    // pg_query() returns false on failure; passing that to pg_fetch_assoc() would
    // raise an error, so report it and carry on with an empty target list.
    $log('ERROR loading scrapping_targets: '.pg_last_error($db));
} else {
    while (($row = pg_fetch_assoc($r)) !== false) {
        $targets[] = $row;
    }
}
$log(count($targets).' scrapping targets loaded');

// 2. Create one 'running' dark_scraper_jobs row per target, unless a running
//    job already exists for the same URL.
$created = 0;
foreach ($targets as $t) {
    // Cast to string so a NULL column is stored as '' (as the escaped-string
    // version did) instead of being bound as SQL NULL.
    $url = (string) $t['url_pattern'];
    $cat = (string) $t['category'];

    // Skip targets that already have a running job. Values are bound as
    // parameters rather than interpolated into the SQL text.
    $check = pg_query_params(
        $db,
        'SELECT COUNT(*) FROM dark_scraper_jobs WHERE source_url=$1 AND status=\'running\'',
        [$url]
    );
    if ($check === false) {
        $log(" ERROR {$t['name']}: ".pg_last_error($db));
        continue;
    }
    if ((int) pg_fetch_result($check, 0, 0) > 0) {
        $log(" SKIP {$t['name']} (already running)");
        continue;
    }

    $inserted = pg_query_params(
        $db,
        'INSERT INTO dark_scraper_jobs (source_url, source_type, emails_found, phones_found, status, results, created_at) VALUES ($1, $2, 0, 0, \'running\', \'[]\', NOW())',
        [$url, $cat]
    );
    if ($inserted === false) {
        // Only count jobs that were actually written.
        $log(" ERROR {$t['name']}: ".pg_last_error($db));
        continue;
    }

    $created++;
    // Logs the real URL (the previous version printed the SQL-escaped form).
    $log(" JOB created: {$t['name']} → $url");
}
$log("$created new jobs created");

// 3. Process running jobs: fetch each job URL, extract addresses from the
//    response body, and record the results. At most 10 jobs per run, newest first.
$jobs = [];
$r = pg_query($db, "SELECT id, source_url, source_type FROM dark_scraper_jobs WHERE status='running' ORDER BY created_at DESC LIMIT 10");
if ($r === false) {
    $log('ERROR loading running jobs: '.pg_last_error($db));
} else {
    while (($row = pg_fetch_assoc($r)) !== false) {
        $jobs[] = $row;
    }
}
$log(count($jobs).' running jobs to process');

// Mail domain => ISP label. Built once: it does not change between jobs or addresses.
$ispByDomain = [
    'gmail.com' => 'gmail',
    'hotmail.com' => 'hotmail',
    'outlook.com' => 'hotmail',
    'live.com' => 'hotmail',
    'yahoo.com' => 'yahoo',
    'aol.com' => 'aol',
    'gmx.de' => 'gmx',
    'gmx.net' => 'gmx',
    't-online.de' => 'tonline',
    'web.de' => 'webde',
    'orange.fr' => 'orange',
    'free.fr' => 'free',
    'sfr.fr' => 'sfr',
    'laposte.net' => 'laposte',
    'protonmail.com' => 'proton',
    'proton.me' => 'proton',
];

// Heuristic: a phone-like match must contain at least this many digits. Without
// it the pattern below also matches plain runs of whitespace or punctuation.
$minPhoneDigits = 7;

$totalEmails = 0;
foreach ($jobs as $j) {
    $jobId = $j['id'];
    $sourceUrl = (string) $j['source_url'];

    // Fetch the page. A failed curl_init() is treated like a failed request.
    $html = false;
    $code = 0;
    $ch = curl_init($sourceUrl);
    if ($ch !== false) {
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_TIMEOUT => 20,
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            // NOTE(review): certificate verification is disabled, so responses can
            // be spoofed by a man-in-the-middle. Confirm this is intentional.
            CURLOPT_SSL_VERIFYPEER => false,
        ]);
        $html = curl_exec($ch);
        $code = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);
    }

    // Anything other than HTTP 200 with a non-empty body marks the job failed.
    if ($code !== 200 || !is_string($html) || $html === '') {
        if (pg_query_params($db, 'UPDATE dark_scraper_jobs SET status=\'failed\' WHERE id=$1', [$jobId]) === false) {
            $log(" WARN job#{$jobId} could not be marked failed: ".pg_last_error($db));
        }
        $log(" FAIL job#{$jobId} HTTP $code");
        continue;
    }

    // Extract e-mail addresses. Each candidate is lower-cased BEFORE it is
    // de-duplicated and filtered, so "A@x.com" and "a@x.com" count once and the
    // placeholder filter is effectively case-insensitive.
    $matches = [];
    preg_match_all('/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/', $html, $matches);
    $seenEmails = [];
    foreach ((isset($matches[0]) ? $matches[0] : []) as $candidate) {
        $candidate = strtolower(trim($candidate));
        if (isset($seenEmails[$candidate])) {
            continue;
        }
        if (!filter_var($candidate, FILTER_VALIDATE_EMAIL)) {
            continue;
        }
        // Drop placeholder / system addresses.
        if (preg_match('/(example|test|noreply|no-reply|wixpress|sentry)/', $candidate)) {
            continue;
        }
        $seenEmails[$candidate] = true;
    }
    $emails = array_keys($seenEmails);

    // Count distinct phone-like strings, compared by their digits only.
    $pMatches = [];
    preg_match_all('/\+?[\d\s\-\(\)]{10,}/', $html, $pMatches);
    $seenPhones = [];
    foreach ((isset($pMatches[0]) ? $pMatches[0] : []) as $rawPhone) {
        $digits = preg_replace('/\D/', '', $rawPhone);
        if (strlen($digits) >= $minPhoneDigits) {
            $seenPhones[$digits] = true;
        }
    }
    $phones = count($seenPhones);

    $emailCount = count($emails);
    $updated = pg_query_params(
        $db,
        'UPDATE dark_scraper_jobs SET status=\'completed\', emails_found=$1, phones_found=$2 WHERE id=$3',
        [$emailCount, $phones, $jobId]
    );
    if ($updated === false) {
        $log(" WARN job#{$jobId} could not be marked completed: ".pg_last_error($db));
    }

    // Store each address in scrapping_results and send_contacts. Existing rows
    // are left untouched by the ON CONFLICT clauses.
    // NOTE(review): addresses are stored as is_verified=true and status='active'
    // although nothing here verifies them or records consent. Confirm this matches
    // the intended data model and the applicable anti-spam / privacy rules.
    foreach ($emails as $email) {
        $domain = substr($email, strpos($email, '@') + 1);

        $stored = pg_query_params(
            $db,
            'INSERT INTO scrapping_results (email, source_url, confidence_score, is_verified, extracted_at) VALUES ($1, $2, 70, true, NOW()) ON CONFLICT DO NOTHING',
            [$email, $sourceUrl]
        );
        if ($stored === false) {
            $log(" WARN job#{$jobId} scrapping_results insert failed: ".pg_last_error($db));
        }

        // Map well-known mail domains to an ISP label; everything else is 'OTHER'.
        $isp = isset($ispByDomain[$domain]) ? $ispByDomain[$domain] : 'OTHER';

        // All three values are bound as parameters; the domain was previously
        // interpolated into the SQL text without escaping.
        $stored = pg_query_params(
            $db,
            'INSERT INTO send_contacts (email, isp, domain, status, source) VALUES ($1, $2, $3, \'active\', \'dark_scraper\') ON CONFLICT (email) DO NOTHING',
            [$email, $isp, $domain]
        );
        if ($stored === false) {
            $log(" WARN job#{$jobId} send_contacts insert failed: ".pg_last_error($db));
        }
    }

    $totalEmails += $emailCount;
    $log(" OK job#{$jobId} → $emailCount emails, $phones phones");
}

$log("TOTAL: $totalEmails emails scraped");

// Stamp last_scraped on every target that was loaded at the start of the run.
// NOTE(review): this includes targets that were skipped, or whose job failed or
// was not among the jobs processed in this run. Confirm that is intended.
foreach ($targets as $t) {
    // The id is bound as a parameter instead of being interpolated into the SQL.
    $stamped = pg_query_params(
        $db,
        'UPDATE scrapping_targets SET last_scraped=NOW() WHERE id=$1',
        [$t['id']]
    );
    if ($stamped === false) {
        $log(" WARN could not update last_scraped for target#{$t['id']}: ".pg_last_error($db));
    }
}

// Summary: job count per status, then the total number of stored results.
$log("=== DARK SCRAPER JOBS ===");
$r = pg_query($db, "SELECT status, COUNT(*) FROM dark_scraper_jobs GROUP BY status ORDER BY status");
if ($r === false) {
    // A failed query returns false, which must not be passed to pg_fetch_assoc().
    $log('ERROR reading job stats: '.pg_last_error($db));
} else {
    // PostgreSQL names an unaliased COUNT(*) column "count".
    while (($row = pg_fetch_assoc($r)) !== false) {
        $log(" {$row['status']}: {$row['count']}");
    }
}

$countResult = pg_query($db, "SELECT COUNT(*) FROM scrapping_results");
if ($countResult === false) {
    $log('ERROR counting scrapping_results: '.pg_last_error($db));
} else {
    $total = pg_fetch_result($countResult, 0, 0);
    $log("Total scrapping_results: $total");
}