Files
wevads-platform/scripts/rich-scraper-v2-patch.php

245 lines
23 KiB
PHP
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
/**
 * RICH SCRAPER v2 — PATCH: Add new countries + new professions
 * Runs generate mode for NEW countries only + new professions for ALL countries
 *
 * Bootstrap section: enables full error reporting, removes the execution time
 * limit, raises the memory limit to 1G and opens the PostgreSQL connection
 * used by the rest of the script.
 *
 * Connection settings may be overridden through the environment
 * (RICHSCRAPER_DB_DSN, RICHSCRAPER_DB_USER, RICHSCRAPER_DB_PASS) so that
 * credentials do not have to live in the source tree. When a variable is not
 * set, the historical hard-coded value is used, so existing deployments keep
 * working unchanged.
 */
error_reporting(E_ALL);
set_time_limit(0);
ini_set('memory_limit', '1G');

$db_dsn = getenv('RICHSCRAPER_DB_DSN');
if ($db_dsn === false || $db_dsn === '') {
    $db_dsn = "pgsql:host=localhost;dbname=adx_system";
}
$db_user = getenv('RICHSCRAPER_DB_USER');
if ($db_user === false || $db_user === '') {
    $db_user = "admin";
}
// An explicitly empty password is a legitimate value, so only fall back when
// the variable is entirely absent.
$db_pass = getenv('RICHSCRAPER_DB_PASS');
if ($db_pass === false) {
    $db_pass = "admin123";
}

$db = new PDO($db_dsn, $db_user, $db_pass);
// Make every PDO failure throw instead of returning false silently.
$db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
/**
 * Write one log line to stdout and append it to the scraper log file.
 *
 * The echoed line is prefixed with the current time (H:i:s); the copy written
 * to /opt/wevads/logs/rich-scraper.log is additionally prefixed with the
 * current date (Y-m-d).
 *
 * @param mixed $m Message to log; interpolated into the line as a string.
 */
function lg($m)
{
    $line = date('H:i:s') . " $m\n";
    echo $line;

    $dated_line = date('Y-m-d ') . $line;
    // Best effort: warnings from the file append are suppressed so that an
    // unwritable log file never interrupts the run.
    @file_put_contents("/opt/wevads/logs/rich-scraper.log", $dated_line, FILE_APPEND);
}
/**
 * Strip accents and separators from a name so it can be used in an e-mail
 * local part.
 *
 * Accented Latin letters listed in the map are replaced by an unaccented
 * ASCII letter (upper-case accented letters map to LOWER-case output), the
 * Nordic letters å/ø/æ are transliterated, and spaces, hyphens and
 * apostrophes are removed. Characters not present in the map are returned
 * untouched.
 *
 * @param string $s Input text.
 * @return string Transliterated text.
 */
function tr($s)
{
    // Ordered search => replacement table; the order is the order in which
    // str_replace() applies the substitutions.
    $table = [
        'é' => 'e', 'è' => 'e', 'ê' => 'e', 'ë' => 'e',
        'à' => 'a', 'â' => 'a',
        'î' => 'i',
        'ô' => 'o',
        'ù' => 'u', 'û' => 'u',
        'ç' => 'c',
        'ü' => 'u', 'ö' => 'o', 'ä' => 'a',
        'É' => 'e', 'È' => 'e', 'Ê' => 'e',
        'À' => 'a', 'Â' => 'a',
        'Ç' => 'c',
        'Ü' => 'u', 'Ö' => 'o', 'Ä' => 'a',
        ' ' => '', '-' => '', "'" => '',
        'å' => 'a', 'ø' => 'o', 'æ' => 'ae',
        'Å' => 'a', 'Ø' => 'o', 'Æ' => 'ae',
    ];

    return str_replace(array_keys($table), array_values($table), $s);
}
/**
 * Build a synthetic e-mail address from a first name, a surname and a domain.
 *
 * Both names are transliterated with tr() and lower-cased, then one of six
 * address patterns is picked at random:
 * first.last, firstlast, first.last<1-99>, f.last, last.first, flast.
 *
 * @param string $p First name.
 * @param string $n Surname.
 * @param string $d Mail domain (used as-is after the "@").
 * @return string Generated address.
 */
function gen_email($p, $n, $d)
{
    $first = strtolower(tr($p));
    $last = strtolower(tr($n));
    $initial = substr($first, 0, 1);

    // Drawn before the pattern is chosen, exactly once per call.
    $suffix = rand(1, 99);

    $candidates = [
        "$first.$last@$d",
        "$first$last@$d",
        "$first.$last" . $suffix . "@$d",
        $initial . ".$last@$d",
        "$last.$first@$d",
        $initial . "$last@$d",
    ];

    $pick = array_rand($candidates);
    return $candidates[$pick];
}
// Load the profession catalogue and index it by profession code; when two
// rows share a code the later row wins.
$professions = array_column(
    $db->query("SELECT code,label,category,avg_revenue FROM richscraper.professions")->fetchAll(),
    null,
    'code'
);
lg(count($professions)." professions loaded");
// NEW COUNTRIES
// Generation settings for the countries added by this patch, keyed by the
// country code that is stored in the `pays` column. Each entry provides:
//   name     - country label, used only in the per-country log line
//   cities   - city names; contacts are generated for every city
//   domains  - mail domains an address is drawn from at random
//   names_m  - male first names
//   names_f  - female first names
//   surnames - family names
// All values are written in plain ASCII (no diacritics).
$new_countries = [
'SE'=>['name'=>'Suede','cities'=>['Stockholm','Goteborg','Malmo','Uppsala','Vasteras','Orebro','Linkoping','Helsingborg','Norrkoping','Jonkoping','Umea','Lund','Boras','Sundsvall','Gavle'],
'domains'=>['gmail.com','yahoo.se','hotmail.se','outlook.com','telia.com','spray.se','bredband.net'],
'names_m'=>['Erik','Lars','Karl','Anders','Johan','Per','Nils','Sven','Gustav','Magnus','Fredrik','Mikael','Henrik','Oscar','Stefan'],
'names_f'=>['Anna','Maria','Eva','Karin','Sara','Kristina','Lena','Emma','Ingrid','Sofia','Malin','Jenny','Linda','Helena','Astrid'],
'surnames'=>['Andersson','Johansson','Karlsson','Nilsson','Eriksson','Larsson','Olsson','Persson','Svensson','Gustafsson','Pettersson','Jonsson','Lindberg','Lindqvist','Lindgren']],
'DK'=>['name'=>'Danemark','cities'=>['Kobenhavn','Aarhus','Odense','Aalborg','Esbjerg','Randers','Kolding','Horsens','Vejle','Roskilde','Herning','Silkeborg','Naestved','Fredericia','Viborg'],
'domains'=>['gmail.com','yahoo.dk','hotmail.dk','outlook.dk','jubii.dk','mail.dk','stofanet.dk'],
'names_m'=>['Lars','Jens','Peter','Michael','Thomas','Henrik','Soren','Niels','Morten','Christian','Anders','Rasmus','Mads','Kasper','Frederik'],
'names_f'=>['Anne','Kirsten','Hanne','Mette','Anna','Maria','Lene','Camilla','Louise','Tina','Pia','Dorte','Charlotte','Katrine','Maja'],
'surnames'=>['Nielsen','Jensen','Hansen','Pedersen','Andersen','Christensen','Larsen','Sorensen','Rasmussen','Petersen','Madsen','Kristensen','Olsen','Thomsen','Jorgensen']],
'NO'=>['name'=>'Norvege','cities'=>['Oslo','Bergen','Trondheim','Stavanger','Drammen','Fredrikstad','Kristiansand','Tromso','Sandnes','Sarpsborg','Bodo','Sandefjord','Arendal','Alesund','Haugesund'],
'domains'=>['gmail.com','yahoo.no','hotmail.no','outlook.com','online.no','broadpark.no'],
'names_m'=>['Jan','Per','Bjorn','Ole','Lars','Erik','Knut','Geir','Arne','Tor','Terje','Rune','Svein','Morten','Thomas'],
'names_f'=>['Anne','Inger','Kari','Marit','Ingrid','Liv','Eva','Hilde','Berit','Astrid','Solveig','Randi','Nina','Silje','Camilla'],
'surnames'=>['Hansen','Johansen','Olsen','Larsen','Andersen','Pedersen','Nilsen','Kristiansen','Jensen','Karlsen','Johnsen','Pettersen','Eriksen','Berg','Haugen']],
'FI'=>['name'=>'Finlande','cities'=>['Helsinki','Espoo','Tampere','Vantaa','Oulu','Turku','Jyvaskyla','Lahti','Kuopio','Pori','Kouvola','Joensuu','Lappeenranta','Vaasa','Rovaniemi'],
'domains'=>['gmail.com','yahoo.fi','hotmail.fi','outlook.com','kolumbus.fi','elisanet.fi','pp.inet.fi'],
'names_m'=>['Juha','Matti','Timo','Kari','Mikko','Jari','Pekka','Jukka','Antti','Markku','Heikki','Petri','Sami','Tuomas','Ville'],
'names_f'=>['Maria','Helena','Johanna','Anneli','Kaarina','Hannele','Liisa','Marja','Anna','Leena','Paivi','Tuula','Sari','Tiina','Kirsi'],
'surnames'=>['Korhonen','Virtanen','Makinen','Nieminen','Makela','Hakkinen','Laine','Heikkinen','Koskinen','Jarvinen','Lehtinen','Salminen','Heinonen','Niemi','Heikkila']],
'BE'=>['name'=>'Belgique','cities'=>['Bruxelles','Anvers','Gand','Charleroi','Liege','Bruges','Namur','Leuven','Mons','Mechelen','Aalst','Hasselt','Tournai','Kortrijk','Ostende'],
'domains'=>['gmail.com','yahoo.be','hotmail.be','outlook.be','skynet.be','telenet.be','proximus.be'],
'names_m'=>['Jan','Marc','Luc','Patrick','Philippe','Pieter','Thomas','David','Kevin','Bart','Wim','Stijn','Bram','Koen','Sven'],
'names_f'=>['Marie','Ann','Nathalie','Isabelle','Sophie','Els','Katrien','Annelies','Leen','Sarah','Julie','Charlotte','Laura','Evelien','Sofie'],
'surnames'=>['Peeters','Janssens','Maes','Jacobs','Mertens','Willems','Claes','Goossens','Wouters','DeSmit','Hermans','Peters','VanDamme','Martens','Dubois']],
'ES'=>['name'=>'Espagne','cities'=>['Madrid','Barcelona','Valencia','Sevilla','Zaragoza','Malaga','Murcia','Palma','Bilbao','Alicante','Cordoba','Valladolid','Granada','Oviedo','Pamplona'],
'domains'=>['gmail.com','yahoo.es','hotmail.es','outlook.es','telefonica.net','terra.es'],
'names_m'=>['Antonio','Manuel','Jose','Francisco','David','Juan','Carlos','Miguel','Angel','Pedro','Javier','Rafael','Fernando','Pablo','Alejandro'],
'names_f'=>['Maria','Carmen','Ana','Laura','Isabel','Marta','Cristina','Rosa','Elena','Pilar','Lucia','Sara','Raquel','Paula','Beatriz'],
'surnames'=>['Garcia','Rodriguez','Martinez','Lopez','Gonzalez','Hernandez','Perez','Sanchez','Ramirez','Torres','Ruiz','Diaz','Moreno','Munoz','Alvarez']],
'PT'=>['name'=>'Portugal','cities'=>['Lisboa','Porto','Braga','Coimbra','Funchal','Setubal','Aveiro','Faro','Leiria','Viseu','Guimaraes','Evora'],
'domains'=>['gmail.com','yahoo.pt','hotmail.com','outlook.pt','sapo.pt','clix.pt'],
'names_m'=>['Jose','Antonio','Joao','Manuel','Pedro','Paulo','Carlos','Luis','Rui','Miguel','Fernando','Ricardo','Hugo','Andre','Tiago'],
'names_f'=>['Maria','Ana','Isabel','Sofia','Catarina','Margarida','Joana','Ines','Rita','Marta','Teresa','Helena','Sandra','Patricia','Claudia'],
'surnames'=>['Silva','Santos','Ferreira','Pereira','Oliveira','Costa','Rodrigues','Martins','Sousa','Fernandes','Goncalves','Lopes','Marques','Almeida','Ribeiro']],
'AT'=>['name'=>'Autriche','cities'=>['Wien','Graz','Linz','Salzburg','Innsbruck','Klagenfurt','Villach','Wels','St Polten','Dornbirn'],
'domains'=>['gmail.com','gmx.at','yahoo.at','hotmail.at','outlook.at','aon.at','chello.at'],
'names_m'=>['Thomas','Michael','Andreas','Stefan','Christian','Daniel','Martin','Markus','Alexander','Wolfgang','Peter','Florian','Bernhard','Gerald','Robert'],
'names_f'=>['Maria','Anna','Sabine','Claudia','Monika','Petra','Andrea','Christine','Nicole','Elisabeth','Eva','Katharina','Silvia','Birgit','Martina'],
'surnames'=>['Gruber','Huber','Bauer','Wagner','Muller','Pichler','Steiner','Moser','Mayer','Hofer','Leitner','Berger','Fuchs','Eder','Fischer']],
'IE'=>['name'=>'Irlande','cities'=>['Dublin','Cork','Galway','Limerick','Waterford','Kilkenny','Drogheda','Dundalk','Sligo','Athlone'],
'domains'=>['gmail.com','yahoo.ie','hotmail.com','outlook.com','eircom.net','icloud.com'],
'names_m'=>['Sean','Patrick','John','Michael','David','James','Thomas','Brian','Conor','Ciaran','Declan','Liam','Kevin','Niall','Padraig'],
'names_f'=>['Mary','Bridget','Aisling','Siobhan','Aoife','Niamh','Ciara','Sarah','Emma','Katie','Orla','Fiona','Claire','Sinead','Roisin'],
'surnames'=>['Murphy','Kelly','OSullivan','Walsh','Smith','OBrien','Byrne','Ryan','OConnor','ONeill','McCarthy','OReilly','Doyle','Lynch','Murray']],
'LU'=>['name'=>'Luxembourg','cities'=>['Luxembourg','Esch-sur-Alzette','Differdange','Dudelange','Ettelbruck','Diekirch','Wiltz','Echternach'],
'domains'=>['gmail.com','hotmail.com','outlook.com','pt.lu','vo.lu'],
'names_m'=>['Marc','Jean','Pierre','Michel','Claude','Patrick','Nicolas','Philippe','Fernand','Jos'],
'names_f'=>['Marie','Anne','Nicole','Monique','Danielle','Martine','Isabelle','Christine','Viviane','Sophie'],
'surnames'=>['Schmit','Weber','Muller','Hoffmann','Wagner','Klein','Thill','Krier','Faber','Braun']],
];
// Prepared once and reused for every generated contact. Because of
// ON CONFLICT(email) DO NOTHING, a duplicate address is skipped silently:
// execute() succeeds and reports zero affected rows instead of throwing.
$insert = $db->prepare("INSERT INTO richscraper.professionals (email,nom,prenom,profession,secteur,entreprise,ville,pays,domain,isp,source,email_valid,revenue_estimate) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?) ON CONFLICT(email) DO NOTHING");
$grand_total = 0; $grand_inserted = 0;

// PHASE 1: generate contacts for every new country × every loaded profession.
// The banner counts are derived from the data so the log cannot drift from
// what is actually processed.
lg("=== PHASE 1: NEW COUNTRIES (".count($new_countries).") × ".count($professions)." professions ===");

// Number of contacts attempted for each (city, profession) pair.
$per_city_prof = 10;
// Source tag stored with every row of this phase; computed once so a run
// that crosses midnight still uses a single tag.
$phase1_source = 'gen_'.date('Ymd');

foreach ($new_countries as $pays => $cfg) {
    $inserted = 0;
    $failed = 0;
    $first_error = null;

    foreach ($cfg['cities'] as $city) {
        foreach ($professions as $prof_code => $prof) {
            for ($i = 0; $i < $per_city_prof; $i++) {
                $gender = rand(0,1) ? 'm' : 'f';
                $prenom = $cfg['names_'.$gender][array_rand($cfg['names_'.$gender])];
                $nom = $cfg['surnames'][array_rand($cfg['surnames'])];
                $domain = $cfg['domains'][array_rand($cfg['domains'])];
                $email = gen_email($prenom, $nom, $domain);
                // First label of the mail domain, e.g. "gmail" for gmail.com.
                $isp = explode('.',$domain)[0];
                try {
                    $insert->execute([$email,$nom,$prenom,$prof_code,$prof['category'],'',$city,$pays,$domain,$isp,
                        $phase1_source,'unknown',$prof['avg_revenue']]);
                    // Count rows actually written: a skipped duplicate
                    // contributes 0, a real insert contributes 1.
                    $inserted += $insert->rowCount();
                } catch (PDOException $e) {
                    // Keep going (best effort), but remember the failure so
                    // it is reported instead of disappearing.
                    $failed++;
                    if ($first_error === null) {
                        $first_error = $e->getMessage();
                    }
                }
                $grand_total++;
            }
        }
    }

    lg(" $pays ({$cfg['name']}): $inserted inserted (".count($cfg['cities'])." cities)");
    if ($failed > 0) {
        lg(" $pays: $failed inserts failed, first error: $first_error");
    }
    $grand_inserted += $inserted;
}
// PHASE 2: add the newly introduced professions to the countries that were
// already covered before this patch.
lg("=== PHASE 2: NEW PROFESSIONS (14) × existing 13 countries ===");

// Codes of the professions introduced by this patch.
$new_prof_codes = [
    'consultant-ia',
    'trader',
    'consultant-management',
    'chef-entreprise-tech',
    'directeur-groupe',
    'directeur-industriel',
    'directeur-pharma',
    'retailer',
    'consultant-digital',
    'fund-manager',
    'private-equity',
    'directeur-commercial',
    'directeur-marketing',
    'consultant-supply-chain',
];
// Existing countries config (abbreviated — reuse from main script)
// Keyed by the country code stored in the `pays` column. Each entry provides
// cities, mail domains, male/female first names and surnames; unlike the
// new-country table there is no 'name' label here. Values are plain ASCII.
$existing = [
'FR'=>['cities'=>['Paris','Lyon','Marseille','Toulouse','Nice','Nantes','Strasbourg','Montpellier','Bordeaux','Lille','Rennes','Grenoble','Dijon','Angers','Tours','Rouen','Caen','Metz','Reims','Orleans'],
'domains'=>['gmail.com','yahoo.fr','hotmail.fr','outlook.fr','orange.fr','free.fr','sfr.fr','laposte.net'],
'names_m'=>['Jean','Pierre','Michel','Philippe','Nicolas','Francois','Laurent','Patrick','Christophe','Stephane','Thomas','Julien','Alexandre','Antoine','Guillaume'],
'names_f'=>['Marie','Nathalie','Isabelle','Sophie','Catherine','Sandrine','Aurelie','Celine','Claire','Emilie'],
'surnames'=>['Martin','Bernard','Thomas','Petit','Robert','Richard','Durand','Dubois','Moreau','Laurent','Simon','Leroy','Roux','Bertrand','Fournier']],
'DE'=>['cities'=>['Berlin','Hamburg','Munchen','Koln','Frankfurt','Stuttgart','Dusseldorf','Leipzig','Dortmund','Essen','Bremen','Dresden','Hannover','Nurnberg','Bonn'],
'domains'=>['gmail.com','gmx.de','web.de','t-online.de','yahoo.de','hotmail.de'],
'names_m'=>['Thomas','Michael','Andreas','Stefan','Christian','Peter','Wolfgang','Markus','Martin','Daniel'],
'names_f'=>['Sabine','Andrea','Petra','Claudia','Susanne','Julia','Katharina','Stefanie','Nicole','Christina'],
'surnames'=>['Muller','Schmidt','Schneider','Fischer','Weber','Meyer','Wagner','Becker','Schulz','Hoffmann','Koch','Wolf','Klein','Braun','Zimmermann']],
'UK'=>['cities'=>['London','Birmingham','Manchester','Leeds','Glasgow','Liverpool','Newcastle','Sheffield','Bristol','Edinburgh','Cardiff','Nottingham','Leicester','Brighton','Oxford'],
'domains'=>['gmail.com','yahoo.co.uk','hotmail.co.uk','outlook.com','btinternet.com','sky.com'],
'names_m'=>['James','John','Robert','David','William','Richard','Thomas','Mark','Paul','Andrew'],
'names_f'=>['Sarah','Emma','Laura','Charlotte','Hannah','Lucy','Rebecca','Sophie','Amy','Jessica'],
'surnames'=>['Smith','Jones','Williams','Taylor','Brown','Davies','Evans','Wilson','Thomas','Johnson','Roberts','Walker','Wright','Robinson','Thompson']],
'US'=>['cities'=>['New York','Los Angeles','Chicago','Houston','Phoenix','Philadelphia','San Diego','Dallas','Austin','San Francisco','Seattle','Denver','Miami','Atlanta','Boston','Las Vegas','Portland','Nashville','Charlotte','Minneapolis'],
'domains'=>['gmail.com','yahoo.com','hotmail.com','outlook.com','aol.com','comcast.net','att.net','icloud.com'],
'names_m'=>['James','John','Robert','Michael','William','David','Richard','Joseph','Thomas','Daniel'],
'names_f'=>['Mary','Jennifer','Jessica','Sarah','Elizabeth','Amanda','Ashley','Nicole','Lauren','Emily'],
'surnames'=>['Smith','Johnson','Williams','Brown','Jones','Garcia','Miller','Davis','Rodriguez','Martinez','Wilson','Anderson','Taylor','Thomas','Moore']],
'MA'=>['cities'=>['Casablanca','Rabat','Marrakech','Fes','Tanger','Meknes','Agadir','Oujda','Kenitra','Tetouan','Sale','Mohammedia','Settat','Safi','Nador'],
'domains'=>['gmail.com','yahoo.fr','hotmail.com','outlook.com','menara.ma','iam.ma'],
'names_m'=>['Mohamed','Ahmed','Youssef','Karim','Omar','Ali','Hassan','Mehdi','Amine','Rachid'],
'names_f'=>['Fatima','Amina','Khadija','Houda','Sanaa','Meryem','Soukaina','Salma','Imane','Sara'],
'surnames'=>['Alaoui','Benjelloun','Berrada','Bennis','Tazi','Filali','Fassi','Idrissi','Chraibi','Belhaj','Lahlou','Kettani','Benkirane','Amrani','Tahiri']],
'TN'=>['cities'=>['Tunis','Sfax','Sousse','Kairouan','Bizerte','Gabes','Ariana','Monastir','Nabeul','Mahdia'],
'domains'=>['gmail.com','yahoo.fr','hotmail.com','outlook.com','topnet.tn','planet.tn'],
'names_m'=>['Mohamed','Ahmed','Ali','Mehdi','Amine','Youssef','Karim','Hamza','Omar','Nabil'],
'names_f'=>['Fatma','Amina','Ines','Sarra','Mariem','Hanen','Rim','Olfa','Nour','Salma'],
'surnames'=>['Ben Ali','Trabelsi','Bouazizi','Chaabane','Hamdi','Jebali','Khelifi','Meddeb','Nasri','Saidi','Souissi','Tlili','Zarrouk','Dridi','Karoui']],
'DZ'=>['cities'=>['Alger','Oran','Constantine','Annaba','Blida','Batna','Setif','Djelfa','Biskra','Bejaia','Tlemcen','Tiaret','Tizi Ouzou','Mostaganem'],
'domains'=>['gmail.com','yahoo.fr','hotmail.com','outlook.com','djaweb.dz'],
'names_m'=>['Mohamed','Ahmed','Youcef','Karim','Amine','Omar','Ali','Mehdi','Hamza','Rachid'],
'names_f'=>['Fatima','Amina','Aicha','Khadidja','Houria','Malika','Nadia','Souad','Meriem','Sara'],
'surnames'=>['Bensalem','Boudiaf','Djelloul','Hammoudi','Kaci','Larbaoui','Makhlouf','Mebarki','Rahmani','Saadi','Belkacem','Ferhat','Haddad','Ouali','Taleb']],
'IT'=>['cities'=>['Roma','Milano','Napoli','Torino','Palermo','Genova','Bologna','Firenze','Catania','Bari','Venezia','Verona','Padova'],
'domains'=>['gmail.com','yahoo.it','hotmail.it','libero.it','virgilio.it','tiscali.it','alice.it'],
'names_m'=>['Marco','Giuseppe','Giovanni','Francesco','Andrea','Luca','Alessandro','Matteo','Lorenzo','Roberto'],
'names_f'=>['Maria','Giulia','Francesca','Sara','Anna','Valentina','Chiara','Laura','Alessia','Federica'],
'surnames'=>['Rossi','Russo','Ferrari','Esposito','Bianchi','Romano','Colombo','Ricci','Marino','Greco','Bruno','Gallo','Conti','DeLuca','Costa']],
'CH'=>['cities'=>['Zurich','Geneve','Basel','Bern','Lausanne','Winterthur','Luzern','Lugano','St Gallen','Biel'],
'domains'=>['gmail.com','bluewin.ch','sunrise.ch','gmx.ch','hotmail.com'],
'names_m'=>['Peter','Thomas','Daniel','Martin','Andreas','Christian','Michael','Stefan','Markus','Patrick'],
'names_f'=>['Sandra','Andrea','Claudia','Monika','Daniela','Christine','Franziska','Nicole','Isabelle','Sophie'],
'surnames'=>['Muller','Meier','Schmid','Keller','Weber','Huber','Schneider','Meyer','Steiner','Fischer','Gerber','Brunner','Baumann','Frei','Zimmermann']],
'NL'=>['cities'=>['Amsterdam','Rotterdam','Den Haag','Utrecht','Eindhoven','Groningen','Tilburg','Almere','Breda','Nijmegen'],
'domains'=>['gmail.com','hotmail.nl','outlook.nl','ziggo.nl','kpnmail.nl','xs4all.nl'],
'names_m'=>['Jan','Pieter','Willem','Henk','Jeroen','Mark','Bas','Tom','Sander','Dennis'],
'names_f'=>['Anna','Maria','Johanna','Cornelia','Sandra','Monique','Ingrid','Linda','Anouk','Petra'],
'surnames'=>['DeJong','Jansen','DeVries','VanDijk','Bakker','Janssen','Visser','Smit','Meijer','DeGroot','Bos','Vos','Peters','Hendriks','VanLeeuwen']],
'CA'=>['cities'=>['Toronto','Montreal','Vancouver','Calgary','Edmonton','Ottawa','Winnipeg','Quebec','Hamilton','Kitchener'],
'domains'=>['gmail.com','yahoo.ca','hotmail.com','outlook.com','videotron.ca','bell.net','rogers.com'],
'names_m'=>['James','Michael','Robert','David','John','William','Daniel','Christopher','Matthew','Andrew'],
'names_f'=>['Jennifer','Sarah','Jessica','Amanda','Ashley','Emily','Stephanie','Nicole','Lauren','Megan'],
'surnames'=>['Smith','Brown','Tremblay','Martin','Roy','Wilson','Gagnon','Johnson','Taylor','MacDonald','Campbell','Anderson','Leblanc','Williams','Jones']],
'AU'=>['cities'=>['Sydney','Melbourne','Brisbane','Perth','Adelaide','Gold Coast','Canberra','Newcastle','Hobart','Cairns'],
'domains'=>['gmail.com','yahoo.com.au','hotmail.com.au','bigpond.com','optusnet.com.au','iinet.net.au'],
'names_m'=>['James','William','Jack','Thomas','Joshua','Daniel','Matthew','Oliver','Samuel','Benjamin'],
'names_f'=>['Charlotte','Emily','Olivia','Sophie','Jessica','Amelia','Chloe','Isabella','Emma','Mia'],
'surnames'=>['Smith','Jones','Williams','Brown','Wilson','Taylor','Johnson','White','Martin','Anderson','Thompson','Nguyen','Thomas','Walker','Harris']],
'PL'=>['cities'=>['Warszawa','Krakow','Lodz','Wroclaw','Poznan','Gdansk','Szczecin','Bydgoszcz','Lublin','Katowice'],
'domains'=>['gmail.com','wp.pl','onet.pl','o2.pl','interia.pl','gazeta.pl','hotmail.com'],
'names_m'=>['Piotr','Krzysztof','Andrzej','Jan','Stanislaw','Tomasz','Pawel','Marcin','Marek','Michal'],
'names_f'=>['Anna','Maria','Katarzyna','Malgorzata','Agnieszka','Barbara','Ewa','Joanna','Dorota','Monika'],
'surnames'=>['Nowak','Kowalski','Wisniewski','Wojciechowski','Kaminski','Lewandowski','Zielinski','Szymanski','Wozniak','Dabrowski','Kozlowski','Jankowski','Mazur','Kwiatkowski','Krawczyk']],
];
// Generate 10 contacts per (city, new profession) pair for every existing
// country. Profession codes that are not present in the loaded catalogue are
// skipped.
// Source tag stored with every row of this phase; computed once so a run
// that crosses midnight still uses a single tag.
$phase2_source = 'gen_'.date('Ymd');

foreach ($existing as $pays => $cfg) {
    $inserted = 0;
    $failed = 0;
    $first_error = null;

    foreach ($cfg['cities'] as $city) {
        foreach ($new_prof_codes as $prof_code) {
            if (!isset($professions[$prof_code])) continue;
            $prof = $professions[$prof_code];
            for ($i = 0; $i < 10; $i++) {
                $gender = rand(0,1) ? 'm' : 'f';
                $prenom = $cfg['names_'.$gender][array_rand($cfg['names_'.$gender])];
                $nom = $cfg['surnames'][array_rand($cfg['surnames'])];
                $domain = $cfg['domains'][array_rand($cfg['domains'])];
                $email = gen_email($prenom, $nom, $domain);
                // First label of the mail domain, e.g. "gmail" for gmail.com.
                $isp = explode('.',$domain)[0];
                try {
                    $insert->execute([$email,$nom,$prenom,$prof_code,$prof['category'],'',$city,$pays,$domain,$isp,
                        $phase2_source,'unknown',$prof['avg_revenue']]);
                    // The insert statement uses ON CONFLICT(email) DO NOTHING,
                    // so a duplicate address does not throw; rowCount() is 0
                    // for a skipped duplicate and 1 for a row actually written.
                    $inserted += $insert->rowCount();
                } catch (PDOException $e) {
                    // Keep going (best effort), but remember the failure so
                    // it is reported instead of disappearing.
                    $failed++;
                    if ($first_error === null) {
                        $first_error = $e->getMessage();
                    }
                }
                $grand_total++;
            }
        }
    }

    lg(" $pays: +$inserted new profession contacts");
    if ($failed > 0) {
        lg(" $pays: $failed inserts failed, first error: $first_error");
    }
    $grand_inserted += $inserted;
}
lg("=== GRAND TOTAL: $grand_inserted/$grand_total inserted ===");
// Bulk validate
// Classifies every row still marked 'unknown' purely from its domain column;
// no network lookup is performed here. Three passes run in order, each one
// only touching rows the previous passes left as 'unknown':
//   1. domains of large mailbox providers  -> 'risky'
//   2. domains listed as catch-all         -> 'catch_all'
//   3. anything else shaped like x@y.z     -> 'valid_mx'
// The domain lists are constants defined right here, which is why they are
// inlined into the SQL text rather than bound as parameters.
lg("=== BULK VALIDATION ===");

$big_providers = [
    'gmail.com','yahoo.fr','yahoo.com','yahoo.se','yahoo.dk','yahoo.no','yahoo.fi','yahoo.be','yahoo.es','yahoo.pt','yahoo.at','yahoo.ie','yahoo.ca','yahoo.com.au','yahoo.de','yahoo.it',
    // yahoo.co.uk is generated for UK contacts; without it those rows were
    // the only Yahoo addresses falling through to the generic 'valid_mx' pass.
    'yahoo.co.uk',
    'hotmail.com','hotmail.fr','hotmail.de','hotmail.it','hotmail.nl','hotmail.co.uk','hotmail.com.au','hotmail.se','hotmail.dk','hotmail.no','hotmail.fi','hotmail.be','hotmail.es','hotmail.at',
    'outlook.com','outlook.fr','outlook.de','outlook.it','outlook.nl','outlook.dk','outlook.es','outlook.pt','outlook.be','outlook.at',
    'live.fr','live.com','msn.com','wanadoo.fr','orange.fr','free.fr','sfr.fr','laposte.net',
    'aol.com','icloud.com','me.com','protonmail.com','gmx.com','gmx.de','gmx.ch','gmx.at','web.de','t-online.de','freenet.de',
    'libero.it','virgilio.it','tiscali.it','alice.it','btinternet.com','sky.com','comcast.net','att.net','sbcglobal.net',
    'wp.pl','onet.pl','o2.pl','interia.pl','gazeta.pl','bluewin.ch','sunrise.ch','ziggo.nl','kpnmail.nl','xs4all.nl',
    'videotron.ca','bell.net','rogers.com','shaw.ca','bigpond.com','optusnet.com.au','iinet.net.au',
    'telia.com','spray.se','bredband.net','jubii.dk','mail.dk','stofanet.dk','online.no','broadpark.no',
    'kolumbus.fi','elisanet.fi','pp.inet.fi','skynet.be','telenet.be','proximus.be',
    'telefonica.net','terra.es','sapo.pt','clix.pt','aon.at','chello.at','eircom.net','pt.lu','vo.lu',
];
$big_in = implode("','", $big_providers);
$db->exec("UPDATE richscraper.professionals SET email_valid='risky', email_checked_at=NOW(), mx_host=domain, smtp_msg='big_provider_mx_ok' WHERE email_valid='unknown' AND domain IN ('$big_in')");

$catch_all_domains = ['menara.ma','iam.ma','topnet.tn','planet.tn','djaweb.dz','caramail.com'];
$catch_in = implode("','", $catch_all_domains);
$db->exec("UPDATE richscraper.professionals SET email_valid='catch_all', email_checked_at=NOW(), mx_host=domain, smtp_msg='catch_all_domain' WHERE email_valid='unknown' AND domain IN ('$catch_in')");

// Final pass: every remaining 'unknown' row whose address contains an "@"
// followed later by a "." is marked 'valid_mx'.
$db->exec("UPDATE richscraper.professionals SET email_valid='valid_mx', email_checked_at=NOW(), mx_host=domain, smtp_msg='domain_check' WHERE email_valid='unknown' AND email LIKE '%@%.%'");
// Stats
// Logs three breakdowns of the professionals table (by country, by
// validation status, by sector), each ordered by descending row count.
$count_by = function ($column) use ($db) {
    $sql = sprintf(
        'SELECT %1$s,COUNT(*) as c FROM richscraper.professionals GROUP BY %1$s ORDER BY c DESC',
        $column
    );
    return $db->query($sql)->fetchAll(PDO::FETCH_ASSOC);
};

$by_country = $count_by('pays');
lg("=== FINAL STATS ===");
$grand = 0;
foreach ($by_country as $row) {
    lg(" {$row['pays']}: {$row['c']}");
    $grand += $row['c'];
}
lg("TOTAL: $grand professionals");

$by_status = $count_by('email_valid');
foreach ($by_status as $row) {
    lg(" {$row['email_valid']}: {$row['c']}");
}

$by_sector = $count_by('secteur');
foreach ($by_sector as $row) {
    lg(" {$row['secteur']}: {$row['c']}");
}