Files
wevads-platform/scripts/dark-scraper-cron.php

106 lines
4.7 KiB
PHP
Executable File

<?php
/**
* DARK SCRAPER CRON — Creates new dark_scraper_jobs from scrapping_targets,
* then fetches each running job's URL with cURL and stores the extracted
* emails in scrapping_results and send_contacts
*/
// Timestamped logger writing to stdout. Defined before the connection attempt
// so that a connection failure can be reported through it.
$log = function($m) { echo '['.date('Y-m-d H:i:s').'] '.$m."\n"; };
// NOTE(review): credentials are hard-coded in source — consider moving them to
// configuration or the environment.
$db = pg_connect('host=localhost dbname=adx_system user=admin password=admin123');
if ($db === false) {
    // Every later step needs the connection; stop with a non-zero exit status
    // instead of passing false to each subsequent pg_* call.
    $log('ERROR: could not connect to PostgreSQL');
    exit(1);
}
// Resolve unqualified table names against the admin schema first, then public.
if (pg_query($db, 'SET search_path TO admin,public') === false) {
    $log('ERROR: could not set search_path: '.pg_last_error($db));
    exit(1);
}
// 1. Get scrapping_targets
// Loads every target row, lowest priority value first, into $targets.
// NOTE(review): daily_limit is selected but not used anywhere in the visible script.
$targets = [];
$r = pg_query($db, "SELECT id, name, url_pattern, category, daily_limit FROM scrapping_targets ORDER BY priority ASC");
if ($r === false) {
    // Continue with an empty list so jobs that are already running can still
    // be processed further down.
    $log('ERROR: could not load scrapping targets: '.pg_last_error($db));
} else {
    while (($row = pg_fetch_assoc($r)) !== false) {
        $targets[] = $row;
    }
}
$log(count($targets).' scrapping targets loaded');
// 2. Create dark_scraper_jobs from targets
// For each target, insert one job in status 'running' unless a running job
// already exists for the same URL. Values are sent as bound parameters rather
// than being spliced into the SQL text.
$created = 0;
foreach ($targets as $t) {
    // Cast to string so a NULL column still inserts '' as the escaped-string
    // version did, instead of becoming SQL NULL.
    $url = (string) $t['url_pattern'];
    $cat = (string) $t['category'];

    // Check not already running
    $check = pg_query_params(
        $db,
        'SELECT COUNT(*) FROM dark_scraper_jobs WHERE source_url = $1 AND status = $2',
        [$url, 'running']
    );
    if ($check === false) {
        // Without a reliable answer, skip rather than risk a duplicate job.
        $log(" ERROR {$t['name']}: running-job check failed: ".pg_last_error($db));
        continue;
    }
    if ((int) pg_fetch_result($check, 0, 0) > 0) {
        $log(" SKIP {$t['name']} (already running)");
        continue;
    }

    $inserted = pg_query_params(
        $db,
        'INSERT INTO dark_scraper_jobs (source_url, source_type, emails_found, phones_found, status, results, created_at) VALUES ($1, $2, 0, 0, $3, $4, NOW())',
        [$url, $cat, 'running', '[]']
    );
    if ($inserted === false) {
        // Do not count a job that was never stored.
        $log(" ERROR {$t['name']}: job insert failed: ".pg_last_error($db));
        continue;
    }

    $created++;
    // Log the raw URL, separated from the target name.
    $log(" JOB created: {$t['name']} -> $url");
}
$log("$created new jobs created");
// 3. Process running jobs (scrape URLs)
// Takes up to 10 running jobs (newest first), downloads each source URL,
// extracts e-mail addresses and a phone-number count from the body, marks the
// job completed/failed and stores the addresses in scrapping_results and
// send_contacts.
$jobs = [];
$r = pg_query($db, "SELECT id, source_url, source_type FROM dark_scraper_jobs WHERE status='running' ORDER BY created_at DESC LIMIT 10");
if ($r === false) {
    $log('ERROR: could not load running jobs: '.pg_last_error($db));
} else {
    while (($row = pg_fetch_assoc($r)) !== false) {
        $jobs[] = $row;
    }
}
$log(count($jobs).' running jobs to process');

// Mailbox domain => ISP label written to send_contacts.isp; any other domain
// is stored as 'OTHER'. Built once instead of once per e-mail.
$ispByDomain = [
    'gmail.com' => 'gmail',
    'hotmail.com' => 'hotmail',
    'outlook.com' => 'hotmail',
    'live.com' => 'hotmail',
    'yahoo.com' => 'yahoo',
    'aol.com' => 'aol',
    'gmx.de' => 'gmx',
    'gmx.net' => 'gmx',
    't-online.de' => 'tonline',
    'web.de' => 'webde',
    'orange.fr' => 'orange',
    'free.fr' => 'free',
    'sfr.fr' => 'sfr',
    'laposte.net' => 'laposte',
    'protonmail.com' => 'proton',
    'proton.me' => 'proton',
];

$totalEmails = 0;
foreach ($jobs as $j) {
    $jobId = $j['id'];
    $sourceUrl = $j['source_url'];

    // Download the page. $code stays 0 and $html stays false when the handle
    // cannot be created, which routes the job to the failure branch below.
    $html = false;
    $code = 0;
    $ch = curl_init($sourceUrl);
    if ($ch !== false) {
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_TIMEOUT => 20,
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            // NOTE(review): TLS peer verification is disabled; left unchanged
            // here, but confirm this is intentional.
            CURLOPT_SSL_VERIFYPEER => false,
        ]);
        $html = curl_exec($ch);
        $code = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);
    }

    if ($code !== 200 || !is_string($html) || $html === '') {
        if (pg_query_params($db, 'UPDATE dark_scraper_jobs SET status = $1 WHERE id = $2', ['failed', $jobId]) === false) {
            $log(" ERROR job#{$jobId}: could not mark job failed: ".pg_last_error($db));
        }
        $log(" FAIL job#{$jobId} HTTP $code");
        continue;
    }

    // Extract emails
    // Lower-case BEFORE filtering and de-duplicating, so that case variants of
    // one address are counted once and the blacklist also catches e.g. "Test@".
    preg_match_all('/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/', $html, $matches);
    $emailCandidates = isset($matches[0]) ? $matches[0] : [];
    $emailSet = [];
    foreach ($emailCandidates as $candidate) {
        $candidate = strtolower(trim($candidate));
        if (isset($emailSet[$candidate])) {
            continue;
        }
        if (!filter_var($candidate, FILTER_VALIDATE_EMAIL)) {
            continue;
        }
        if (preg_match('/(example|test|noreply|no-reply|wixpress|sentry)/', $candidate)) {
            continue;
        }
        $emailSet[$candidate] = true;
    }
    $emails = array_keys($emailSet);

    // Extract phones
    // The pattern alone also matches runs of 10+ whitespace characters (HTML
    // indentation), so each match is reduced to its digits and only kept when
    // it has 7-15 of them (15 is the E.164 maximum; 7 is a heuristic floor).
    // De-duplication is done on the digit string.
    preg_match_all('/\+?[\d\s\-\(\)]{10,}/', $html, $pMatches);
    $phoneCandidates = isset($pMatches[0]) ? $pMatches[0] : [];
    $phoneSet = [];
    foreach ($phoneCandidates as $candidate) {
        $digits = preg_replace('/\D/', '', $candidate);
        $digitCount = strlen((string) $digits);
        if ($digitCount >= 7 && $digitCount <= 15) {
            $phoneSet[$digits] = true;
        }
    }
    $phones = count($phoneSet);

    $emailCount = count($emails);
    $updated = pg_query_params(
        $db,
        'UPDATE dark_scraper_jobs SET status = $1, emails_found = $2, phones_found = $3 WHERE id = $4',
        ['completed', $emailCount, $phones, $jobId]
    );
    if ($updated === false) {
        $log(" ERROR job#{$jobId}: could not mark job completed: ".pg_last_error($db));
    }

    // Inject into scrapping_results + send_contacts
    // NOTE(review): rows are stored with is_verified = true although no
    // verification step is visible in this script — confirm intended.
    $insertErrors = 0;
    foreach ($emails as $email) {
        // The extraction pattern allows exactly one '@', so this is the domain.
        $domain = substr($email, strpos($email, '@') + 1);

        $res = pg_query_params(
            $db,
            'INSERT INTO scrapping_results (email, source_url, confidence_score, is_verified, extracted_at) VALUES ($1, $2, 70, true, NOW()) ON CONFLICT DO NOTHING',
            [$email, $sourceUrl]
        );
        if ($res === false) {
            $insertErrors++;
        }

        // Detect ISP
        $isp = isset($ispByDomain[$domain]) ? $ispByDomain[$domain] : 'OTHER';
        $res = pg_query_params(
            $db,
            'INSERT INTO send_contacts (email, isp, domain, status, source) VALUES ($1, $2, $3, $4, $5) ON CONFLICT (email) DO NOTHING',
            [$email, $isp, $domain, 'active', 'dark_scraper']
        );
        if ($res === false) {
            $insertErrors++;
        }
    }
    if ($insertErrors > 0) {
        $log(" WARN job#{$jobId}: $insertErrors insert(s) failed, last error: ".pg_last_error($db));
    }

    $totalEmails += $emailCount;
    $log(" OK job#{$jobId}: $emailCount emails, $phones phones");
}
$log("TOTAL: $totalEmails emails scraped");
// Update scrapping_targets last_scraped
// Stamps the current time on every target loaded at the start of the run,
// one UPDATE per target id.
// NOTE(review): targets are stamped regardless of whether their job was
// created, skipped or failed in this run — confirm that is intended.
$targetIds = array_column($targets, 'id');
foreach ($targetIds as $targetId) {
    pg_query($db, "UPDATE scrapping_targets SET last_scraped=NOW() WHERE id={$targetId}");
}
// Stats
// Prints the number of dark_scraper_jobs per status, followed by the total
// number of rows in scrapping_results. Query failures are logged instead of
// being passed on to the fetch functions.
$r = pg_query($db, "SELECT status, COUNT(*) FROM dark_scraper_jobs GROUP BY status ORDER BY status");
$log("=== DARK SCRAPER JOBS ===");
if ($r === false) {
    $log(' ERROR: could not read job stats: '.pg_last_error($db));
} else {
    // PostgreSQL names the unaliased COUNT(*) column "count".
    while (($row = pg_fetch_assoc($r)) !== false) {
        $log(" {$row['status']}: {$row['count']}");
    }
}
$totalResult = pg_query($db, "SELECT COUNT(*) FROM scrapping_results");
$total = ($totalResult === false) ? false : pg_fetch_result($totalResult, 0, 0);
if ($total === false) {
    $log('ERROR: could not count scrapping_results: '.pg_last_error($db));
} else {
    $log("Total scrapping_results: $total");
}