Files
wevads-platform/scripts/rich-scraper-mega.php.v1-bak

201 lines
17 KiB
Plaintext
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
/**
* RICH INTERNATIONAL SCRAPER v1 — High-Revenue Professionals
* 24 professions × 13 countries × cities = massive B2B pool
* Mode generate: create email patterns from name+profession+city+domain
* Mode google: scrape Google for real contacts (HTTP only, no SMTP)
* Usage: php rich-scraper-mega.php [pays|all] [profession|all] [mode: generate|google|all]
*   pays = two-letter country code (e.g. FR, case-insensitive); defaults are: all all generate
*/
// Runtime setup: report every error, remove the execution time limit and raise the memory cap.
error_reporting(E_ALL);
set_time_limit(0);
ini_set('memory_limit', '1G');

// Database connection.
// Credentials may be supplied through the environment (WEVADS_DB_DSN, WEVADS_DB_USER,
// WEVADS_DB_PASS) so they do not have to live in the source tree. The previous
// hard-coded values are kept as fallbacks so existing deployments keep working.
// NOTE(review): the fallback password should be removed once the environment is configured.
$db = new PDO(
    getenv('WEVADS_DB_DSN') ?: "pgsql:host=localhost;dbname=adx_system",
    getenv('WEVADS_DB_USER') ?: "admin",
    getenv('WEVADS_DB_PASS') ?: "admin123",
    [PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION] // make every failed statement throw
);

// CLI arguments: country code (or 'all'), profession code (or 'all'), run mode.
$target_pays = $argv[1] ?? 'all';
$target_prof = $argv[2] ?? 'all';
$mode = $argv[3] ?? 'generate';
/**
 * Print a timestamped message and append the same line to the scraper log file.
 *
 * The console line is prefixed with the time (H:i:s); the file line additionally
 * carries the date (Y-m-d) in front of it.
 *
 * @param mixed $m Message text.
 */
function lg($m)
{
    $stamped = date('H:i:s') . ' ' . $m . "\n";
    echo $stamped;

    // Best-effort file logging: the @ keeps an absent or unwritable log
    // directory from emitting warnings and interrupting the run.
    @file_put_contents(
        "/opt/wevads/logs/rich-scraper.log",
        date('Y-m-d ') . $stamped,
        FILE_APPEND
    );
}
/**
 * Strip the accents listed below and remove spaces, hyphens and apostrophes.
 *
 * Accented letters (upper or lower case) are replaced by their unaccented
 * lower-case letter; all other characters pass through unchanged.
 *
 * @param string $s Text to normalise.
 * @return string Normalised text.
 */
function tr($s)
{
    $map = [
        'é' => 'e', 'è' => 'e', 'ê' => 'e', 'ë' => 'e',
        'à' => 'a', 'â' => 'a',
        'î' => 'i',
        'ô' => 'o',
        'ù' => 'u', 'û' => 'u',
        'ç' => 'c',
        'ü' => 'u', 'ö' => 'o', 'ä' => 'a',
        'É' => 'e', 'È' => 'e', 'Ê' => 'e',
        'À' => 'a', 'Â' => 'a',
        'Ç' => 'c',
        'Ü' => 'u', 'Ö' => 'o', 'Ä' => 'a',
        // separators are dropped entirely
        ' ' => '', '-' => '', "'" => '',
    ];

    return str_replace(array_keys($map), array_values($map), $s);
}
// Load the profession catalogue and index it by profession code so that
// later lookups are a simple isset($professions[$code]).
$professions = [];
$profession_rows = $db->query("SELECT code,label,category FROM richscraper.professions")->fetchAll();
foreach ($profession_rows as $r) {
    $professions[$r['code']] = $r;
}
// Per-country seed data, keyed by a two-letter country code. Each entry holds:
//   cities            - city names iterated by the generate and google modes
//   domains           - mailbox domains used when composing generated addresses
//   names_m / names_f - first-name pools (one is chosen by a random gender flag)
//   surnames          - family-name pool
// Values are written without diacritics (e.g. 'Munchen', 'Francois').
// NOTE(review): the United Kingdom is keyed as 'UK' rather than ISO 3166 'GB' —
// confirm which value consumers of the pays column expect.
$pays_config = [
'FR'=>['cities'=>['Paris','Lyon','Marseille','Toulouse','Nice','Nantes','Strasbourg','Montpellier','Bordeaux','Lille','Rennes','Grenoble','Dijon','Angers','Tours','Rouen','Caen','Metz','Reims','Orleans'],
'domains'=>['gmail.com','yahoo.fr','hotmail.fr','outlook.fr','orange.fr','free.fr','sfr.fr','laposte.net'],
'names_m'=>['Jean','Pierre','Michel','Philippe','Nicolas','Francois','Laurent','Patrick','Christophe','Stephane','Frederic','David','Thomas','Julien','Alexandre','Antoine','Guillaume','Olivier','Eric','Thierry'],
'names_f'=>['Marie','Nathalie','Isabelle','Sophie','Catherine','Valerie','Sandrine','Stephanie','Veronique','Christine','Anne','Florence','Caroline','Aurelie','Sylvie','Celine','Claire','Dominique','Helene','Emilie'],
'surnames'=>['Martin','Bernard','Thomas','Petit','Robert','Richard','Durand','Dubois','Moreau','Laurent','Simon','Michel','Leroy','Roux','David','Bertrand','Morel','Fournier','Girard','Bonnet']],
'DE'=>['cities'=>['Berlin','Hamburg','Munchen','Koln','Frankfurt','Stuttgart','Dusseldorf','Leipzig','Dortmund','Essen','Bremen','Dresden','Hannover','Nurnberg','Bonn'],
'domains'=>['gmail.com','gmx.de','web.de','t-online.de','yahoo.de','hotmail.de','outlook.de','freenet.de'],
'names_m'=>['Thomas','Michael','Andreas','Stefan','Christian','Peter','Wolfgang','Markus','Martin','Jens','Frank','Klaus','Matthias','Daniel','Alexander','Tobias','Bernd','Ralf','Uwe','Joerg'],
'names_f'=>['Sabine','Andrea','Petra','Claudia','Susanne','Monika','Karin','Stefanie','Birgit','Nicole','Julia','Martina','Silke','Katharina','Heike','Anja','Christina','Daniela','Angelika','Renate'],
'surnames'=>['Muller','Schmidt','Schneider','Fischer','Weber','Meyer','Wagner','Becker','Schulz','Hoffmann','Koch','Richter','Wolf','Klein','Schroeder','Neumann','Braun','Zimmermann','Kruger','Hartmann']],
'UK'=>['cities'=>['London','Birmingham','Manchester','Leeds','Glasgow','Liverpool','Newcastle','Sheffield','Bristol','Edinburgh','Cardiff','Nottingham','Leicester','Brighton','Oxford'],
'domains'=>['gmail.com','yahoo.co.uk','hotmail.co.uk','outlook.com','btinternet.com','sky.com'],
'names_m'=>['James','John','Robert','David','William','Richard','Thomas','Mark','Paul','Andrew','Stephen','Christopher','Daniel','Michael','Peter','Matthew','Simon','Jonathan','Anthony','Stuart'],
'names_f'=>['Sarah','Emma','Laura','Charlotte','Hannah','Lucy','Rebecca','Sophie','Amy','Jessica','Rachel','Victoria','Helen','Katie','Emily','Claire','Natalie','Lisa','Anna','Jennifer'],
'surnames'=>['Smith','Jones','Williams','Taylor','Brown','Davies','Evans','Wilson','Thomas','Johnson','Roberts','Walker','Wright','Robinson','Thompson','White','Hughes','Edwards','Green','Hall']],
'US'=>['cities'=>['New York','Los Angeles','Chicago','Houston','Phoenix','Philadelphia','San Diego','Dallas','Austin','San Francisco','Seattle','Denver','Miami','Atlanta','Boston','Las Vegas','Portland','Nashville','Charlotte','Minneapolis'],
'domains'=>['gmail.com','yahoo.com','hotmail.com','outlook.com','aol.com','comcast.net','att.net','icloud.com'],
'names_m'=>['James','John','Robert','Michael','William','David','Richard','Joseph','Thomas','Charles','Daniel','Matthew','Christopher','Andrew','Brian','Kevin','Steven','Mark','Jason','Ryan'],
'names_f'=>['Mary','Jennifer','Jessica','Sarah','Elizabeth','Amanda','Ashley','Stephanie','Nicole','Heather','Lauren','Megan','Emily','Rachel','Kimberly','Michelle','Christina','Brittany','Samantha','Katherine'],
'surnames'=>['Smith','Johnson','Williams','Brown','Jones','Garcia','Miller','Davis','Rodriguez','Martinez','Wilson','Anderson','Taylor','Thomas','Moore','Jackson','Martin','Lee','Thompson','Harris']],
'MA'=>['cities'=>['Casablanca','Rabat','Marrakech','Fes','Tanger','Meknes','Agadir','Oujda','Kenitra','Tetouan','Sale','Mohammedia','Settat','Safi','Nador'],
'domains'=>['gmail.com','yahoo.fr','hotmail.com','outlook.com','menara.ma','iam.ma'],
'names_m'=>['Mohamed','Ahmed','Youssef','Karim','Omar','Ali','Hassan','Mehdi','Amine','Rachid','Samir','Nabil','Mustapha','Said','Khalid','Abdelilah','Hamza','Zakaria','Othmane','Adil'],
'names_f'=>['Fatima','Amina','Khadija','Houda','Sanaa','Naima','Laila','Meryem','Soukaina','Hind','Salma','Nadia','Karima','Samira','Zineb','Imane','Sara','Ghizlane','Hajar','Asmae'],
'surnames'=>['Alaoui','Benjelloun','Berrada','Bennis','Tazi','Filali','Fassi','Idrissi','Chraibi','Belhaj','Lahlou','Sefrioui','Kettani','Benkirane','Zniber','Amrani','Tahiri','Ouazzani','Kadiri','Bouazza']],
'TN'=>['cities'=>['Tunis','Sfax','Sousse','Kairouan','Bizerte','Gabes','Ariana','Monastir','Nabeul','Mahdia'],
'domains'=>['gmail.com','yahoo.fr','hotmail.com','outlook.com','topnet.tn','planet.tn'],
'names_m'=>['Mohamed','Ahmed','Ali','Mehdi','Amine','Youssef','Karim','Hamza','Omar','Nabil','Hichem','Sami','Riadh','Slim','Bilel','Fares','Hatem','Khaled','Marouane','Sofiane'],
'names_f'=>['Fatma','Amina','Ines','Sarra','Mariem','Hanen','Rim','Olfa','Nour','Salma','Rania','Asma','Syrine','Emna','Yosra','Amel','Dorra','Wafa','Leila','Houda'],
'surnames'=>['Ben Ali','Trabelsi','Bouazizi','Chaabane','Hamdi','Jebali','Khelifi','Meddeb','Nasri','Saidi','Souissi','Tlili','Zarrouk','Dridi','Karoui','Mansouri','Riahi','Sghaier','Turki','Zouari']],
'DZ'=>['cities'=>['Alger','Oran','Constantine','Annaba','Blida','Batna','Setif','Djelfa','Biskra','Bejaia','Tlemcen','Tiaret','Tizi Ouzou','Mostaganem'],
'domains'=>['gmail.com','yahoo.fr','hotmail.com','outlook.com','djaweb.dz'],
'names_m'=>['Mohamed','Ahmed','Youcef','Karim','Amine','Omar','Ali','Mehdi','Hamza','Rachid','Djamel','Farid','Nabil','Sofiane','Walid','Mourad','Samir','Hakim','Bilal','Redouane'],
'names_f'=>['Fatima','Amina','Aicha','Khadidja','Houria','Malika','Nadia','Souad','Samira','Djamila','Asma','Meriem','Sara','Imene','Yasmine','Lina','Chaima','Lamia','Sihem','Rym'],
'surnames'=>['Bensalem','Boudiaf','Djelloul','Hammoudi','Kaci','Larbaoui','Makhlouf','Mebarki','Rahmani','Saadi','Belkacem','Chabane','Ferhat','Gherbi','Haddad','Mahmoudi','Ouali','Sebti','Taleb','Yahi']],
'IT'=>['cities'=>['Roma','Milano','Napoli','Torino','Palermo','Genova','Bologna','Firenze','Catania','Bari','Venezia','Verona','Padova'],
'domains'=>['gmail.com','yahoo.it','hotmail.it','libero.it','virgilio.it','tiscali.it','alice.it'],
'names_m'=>['Marco','Giuseppe','Giovanni','Francesco','Andrea','Luca','Alessandro','Matteo','Lorenzo','Roberto','Stefano','Paolo','Davide','Simone','Antonio','Fabio','Daniele','Riccardo','Massimo','Claudio'],
'names_f'=>['Maria','Giulia','Francesca','Sara','Anna','Valentina','Chiara','Laura','Alessia','Federica','Silvia','Elisa','Martina','Roberta','Paola','Simona','Monica','Elena','Claudia','Cristina'],
'surnames'=>['Rossi','Russo','Ferrari','Esposito','Bianchi','Romano','Colombo','Ricci','Marino','Greco','Bruno','Gallo','Conti','DeLuca','Mancini','Costa','Giordano','Rizzo','Lombardi','Moretti']],
'CH'=>['cities'=>['Zurich','Geneve','Basel','Bern','Lausanne','Winterthur','Luzern','Lugano','St Gallen','Biel'],
'domains'=>['gmail.com','bluewin.ch','sunrise.ch','gmx.ch','hotmail.com'],
'names_m'=>['Peter','Thomas','Daniel','Martin','Andreas','Christian','Michael','Stefan','Markus','Patrick','David','Philippe','Nicolas','Laurent','Marc'],
'names_f'=>['Sandra','Andrea','Claudia','Monika','Daniela','Christine','Franziska','Barbara','Nicole','Sabine','Nathalie','Caroline','Isabelle','Sophie','Marie'],
'surnames'=>['Muller','Meier','Schmid','Keller','Weber','Huber','Schneider','Meyer','Steiner','Fischer','Gerber','Brunner','Baumann','Frei','Zimmermann']],
'NL'=>['cities'=>['Amsterdam','Rotterdam','Den Haag','Utrecht','Eindhoven','Groningen','Tilburg','Almere','Breda','Nijmegen'],
'domains'=>['gmail.com','hotmail.nl','outlook.nl','ziggo.nl','kpnmail.nl','xs4all.nl'],
'names_m'=>['Jan','Pieter','Willem','Henk','Jeroen','Mark','Bas','Tom','Sander','Dennis','Rob','Erik','Frank','Marcel','Arjan'],
'names_f'=>['Anna','Maria','Johanna','Cornelia','Elisabeth','Sandra','Monique','Ingrid','Linda','Anouk','Petra','Marieke','Esther','Wendy','Ilse'],
'surnames'=>['DeJong','Jansen','DeVries','VanDijk','Bakker','Janssen','Visser','Smit','Meijer','DeGroot','Bos','Vos','Peters','Hendriks','VanLeeuwen']],
'CA'=>['cities'=>['Toronto','Montreal','Vancouver','Calgary','Edmonton','Ottawa','Winnipeg','Quebec','Hamilton','Kitchener'],
'domains'=>['gmail.com','yahoo.ca','hotmail.com','outlook.com','videotron.ca','bell.net','rogers.com'],
'names_m'=>['James','Michael','Robert','David','John','William','Daniel','Christopher','Matthew','Andrew','Ryan','Joshua','Kevin','Brian','Jason'],
'names_f'=>['Jennifer','Sarah','Jessica','Amanda','Ashley','Emily','Stephanie','Nicole','Lauren','Megan','Rachel','Samantha','Heather','Brittany','Kimberly'],
'surnames'=>['Smith','Brown','Tremblay','Martin','Roy','Wilson','Gagnon','Johnson','Taylor','MacDonald','Campbell','Anderson','Jones','Leblanc','Williams']],
'AU'=>['cities'=>['Sydney','Melbourne','Brisbane','Perth','Adelaide','Gold Coast','Canberra','Newcastle','Hobart','Cairns'],
'domains'=>['gmail.com','yahoo.com.au','hotmail.com.au','bigpond.com','optusnet.com.au','iinet.net.au'],
'names_m'=>['James','William','Jack','Thomas','Joshua','Daniel','Matthew','Oliver','Samuel','Benjamin','Liam','Noah','Ethan','Ryan','Luke'],
'names_f'=>['Charlotte','Emily','Olivia','Sophie','Jessica','Amelia','Chloe','Isabella','Emma','Mia','Grace','Lily','Hannah','Sarah','Zoe'],
'surnames'=>['Smith','Jones','Williams','Brown','Wilson','Taylor','Johnson','White','Martin','Anderson','Thompson','Nguyen','Thomas','Walker','Harris']],
'PL'=>['cities'=>['Warszawa','Krakow','Lodz','Wroclaw','Poznan','Gdansk','Szczecin','Bydgoszcz','Lublin','Katowice'],
'domains'=>['gmail.com','wp.pl','onet.pl','o2.pl','interia.pl','gazeta.pl','hotmail.com'],
'names_m'=>['Piotr','Krzysztof','Andrzej','Jan','Stanislaw','Tomasz','Pawel','Marcin','Marek','Michal','Grzegorz','Lukasz','Adam','Dariusz','Robert'],
'names_f'=>['Anna','Maria','Katarzyna','Malgorzata','Agnieszka','Barbara','Ewa','Joanna','Dorota','Monika','Magdalena','Beata','Aleksandra','Justyna','Karolina'],
'surnames'=>['Nowak','Kowalski','Wisniewski','Wojciechowski','Kaminski','Lewandowski','Zielinski','Szymanski','Wozniak','Dabrowski','Kozlowski','Jankowski','Mazur','Kwiatkowski','Krawczyk']],
];
// Email pattern generators
/**
 * Compose one address for a person by picking, at random, one of seven
 * local-part layouts built from the normalised first name and surname.
 *
 * Layouts: first.last, firstlast, first.last<1-99>, f.last, last.first,
 * firstl, flast — each followed by "@<domain>".
 *
 * @param string $prenom First name.
 * @param string $nom    Surname.
 * @param string $domain Mail domain appended after the "@".
 * @return string The composed address.
 */
function gen_email($prenom,$nom,$domain) {
    $first = strtolower(tr($prenom));
    $last = strtolower(tr($nom));
    $first_initial = substr($first, 0, 1);
    $last_initial = substr($last, 0, 1);
    // Drawn up-front so the random sequence is consumed in the same order
    // regardless of which layout ends up being chosen.
    $number = rand(1, 99);

    $local_parts = [
        "{$first}.{$last}",
        $first . $last,
        "{$first}.{$last}{$number}",
        "{$first_initial}.{$last}",
        "{$last}.{$first}",
        $first . $last_initial,
        $first_initial . $last,
    ];

    $chosen = array_rand($local_parts);
    return $local_parts[$chosen] . "@$domain";
}
// === GENERATE MODE ===
// For every selected country, city and profession, composes $per_city_prof rows
// from the country's name/domain pools and inserts them into
// richscraper.professionals. Rows whose email already exists are skipped by the
// ON CONFLICT clause (they do not raise an exception).
if($mode === 'generate' || $mode === 'all') {
    lg("=== GENERATE MODE ===");
    $insert = $db->prepare("INSERT INTO richscraper.professionals (email,nom,prenom,profession,secteur,ville,pays,domain,isp,source,email_valid,revenue_estimate) VALUES (?,?,?,?,?,?,?,?,?,?,?,?) ON CONFLICT(email) DO NOTHING");
    $total = 0;          // rows attempted
    $inserted = 0;       // rows actually written (conflicts excluded)
    $failed = 0;         // executes that raised a database error
    $first_error = null; // message of the first failure, logged once at the end
    $countries = ($target_pays === 'all') ? array_keys($pays_config) : [strtoupper($target_pays)];
    $profs = ($target_prof === 'all') ? array_keys($professions) : [$target_prof];
    $per_city_prof = 10; // contacts per city per profession

    foreach($countries as $pays) {
        if(!isset($pays_config[$pays])) {
            lg("$pays: unknown country code, skipped");
            continue;
        }
        $cfg = $pays_config[$pays];
        foreach($cfg['cities'] as $city) {
            foreach($profs as $prof_code) {
                if(!isset($professions[$prof_code])) continue;
                $prof = $professions[$prof_code];
                for($i=0; $i<$per_city_prof; $i++) {
                    $gender = rand(0,1) ? 'm' : 'f';
                    $prenom = $cfg['names_'.$gender][array_rand($cfg['names_'.$gender])];
                    $nom = $cfg['surnames'][array_rand($cfg['surnames'])];
                    $domain = $cfg['domains'][array_rand($cfg['domains'])];
                    $email = gen_email($prenom, $nom, $domain);
                    $isp = explode('.',$domain)[0]; // first label of the domain, e.g. "gmail"
                    try {
                        // NOTE(review): the professions query near the top of the file selects only
                        // code,label,category, so 'avg_revenue' is never set and this always
                        // falls back to 'high' — confirm whether that column should be loaded.
                        $insert->execute([$email,$nom,$prenom,$prof_code,$prof['category'],$city,$pays,$domain,$isp,
                            'gen_'.date('Ymd'),'unknown',$prof['avg_revenue'] ?? 'high']);
                        // rowCount() is 0 when ON CONFLICT skipped the row, so only real inserts are counted.
                        $inserted += $insert->rowCount();
                    } catch(PDOException $e) {
                        // Duplicates never reach here; this is a genuine database error.
                        // Keep going (best effort) but do not hide it.
                        $failed++;
                        if($first_error === null) $first_error = $e->getMessage();
                    }
                    $total++;
                }
            }
        }
        lg("$pays: generated for ".count($cfg['cities'])." cities × ".count($profs)." professions");
    }
    lg("GENERATE DONE: $inserted/$total inserted");
    if($failed > 0) {
        lg("GENERATE: $failed insert(s) failed; first error: $first_error");
    }
}
// === GOOGLE SCRAPE MODE ===
// For each selected country, takes the first 5 cities and the first 5 selected
// professions, fetches the Google HTML results page for
// "<profession label> <city> email contact", extracts every substring that looks
// like an email address and inserts it into richscraper.professionals.
// NOTE(review): automated querying of a search engine and harvesting of addresses
// may conflict with the provider's terms and with data-protection rules —
// confirm this use is authorized before running.
if($mode === 'google' || $mode === 'all') {
    lg("=== GOOGLE SCRAPE MODE ===");
    $countries = ($target_pays === 'all') ? array_keys($pays_config) : [strtoupper($target_pays)];
    $profs = ($target_prof === 'all') ? array_keys($professions) : [$target_prof];
    $insert = $db->prepare("INSERT INTO richscraper.professionals (email,nom,prenom,profession,secteur,ville,pays,domain,isp,source,email_valid) VALUES (?,?,?,?,?,?,?,?,?,?,?) ON CONFLICT(email) DO NOTHING");
    $scraped = 0;        // rows actually inserted, all countries
    $failed = 0;         // inserts that raised a database error
    $first_error = null; // message of the first insert failure

    foreach($countries as $pays) {
        if(!isset($pays_config[$pays])) continue;
        $cfg = $pays_config[$pays];
        $sample_cities = array_slice($cfg['cities'], 0, 5);
        $sample_profs = array_slice($profs, 0, 5);
        $pays_scraped = 0; // rows inserted for this country only

        foreach($sample_cities as $city) {
            foreach($sample_profs as $prof_code) {
                if(!isset($professions[$prof_code])) continue;
                $label = $professions[$prof_code]['label'];
                $q = urlencode("$label $city email contact");
                $url = "https://www.google.com/search?q=$q&num=10";

                $html = false;
                $ch = curl_init($url);
                if($ch === false) {
                    lg("$pays/$city/$prof_code: could not initialise cURL");
                } else {
                    curl_setopt_array($ch, [
                        CURLOPT_RETURNTRANSFER => true,
                        CURLOPT_FOLLOWLOCATION => true,
                        CURLOPT_TIMEOUT => 15,
                        CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                        CURLOPT_HTTPHEADER => ['Accept-Language: fr-FR,fr;q=0.9'],
                        // Certificate verification stays on: disabling it lets anyone on the
                        // network path tamper with the response.
                        CURLOPT_SSL_VERIFYPEER => true,
                        CURLOPT_SSL_VERIFYHOST => 2,
                    ]);
                    $html = curl_exec($ch);
                    if($html === false) {
                        lg("$pays/$city/$prof_code: request failed: ".curl_error($ch));
                    }
                    curl_close($ch);
                }

                if($html) {
                    // Extract emails from results
                    preg_match_all('/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/', $html, $emails);
                    $emails = array_unique($emails[0]);
                    foreach($emails as $email) {
                        $email = strtolower(trim($email));
                        if(strlen($email) < 6 || strlen($email) > 100) continue;
                        // Skip addresses belonging to page infrastructure rather than results.
                        if(preg_match('/@(google|gstatic|schema|example|sentry)\./i', $email)) continue;
                        $domain = substr($email, strpos($email,'@')+1);
                        $isp = explode('.',$domain)[0];
                        try {
                            $insert->execute([$email,'','', $prof_code, $professions[$prof_code]['category'],$city,$pays,$domain,$isp,'google_'.date('Ymd'),'unknown']);
                            // rowCount() is 0 when ON CONFLICT skipped an existing address,
                            // so the counters reflect genuinely new rows.
                            $new_rows = $insert->rowCount();
                            $scraped += $new_rows;
                            $pays_scraped += $new_rows;
                        } catch(PDOException $e) {
                            // Best effort: keep going, but record the failure instead of hiding it.
                            $failed++;
                            if($first_error === null) $first_error = $e->getMessage();
                        }
                    }
                }

                // Always pause between queries, including after a failed one, so errors
                // do not turn into a burst of back-to-back requests.
                usleep(500000); // 0.5s between Google queries
            }
        }
        lg("$pays: scraped $pays_scraped emails from Google");
    }
    lg("GOOGLE DONE: $scraped new emails");
    if($failed > 0) {
        lg("GOOGLE: $failed insert(s) failed; first error: $first_error");
    }
}
// Final stats
// Row count per country, largest first, followed by the overall total.
$stats_query = "SELECT pays,COUNT(*) as c FROM richscraper.professionals GROUP BY pays ORDER BY c DESC";
$stats = $db->query($stats_query)->fetchAll(PDO::FETCH_ASSOC);

lg("=== FINAL STATS ===");

$grand = 0;
foreach ($stats as $s) {
    lg(" " . $s['pays'] . ": " . $s['c']);
    $grand = $grand + $s['c'];
}

lg("TOTAL: " . $grand . " professionals");