Files
wevads-platform/scripts/enrich.php
2026-02-26 04:53:11 +01:00

145 lines
6.3 KiB
PHP

<?php
/* Contact Enrichment Pipeline - Offline Methods
* 1. Email pattern → Name
* 2. LinkedIn source → Name
* 3. Domain → Company guess
* 4. TLD → Country
* 5. ISP detection
*/
// Open the PostgreSQL connection used for the whole enrichment run.
// NOTE(review): the credentials are hard-coded in source; consider loading them
// from the environment or a config file kept out of version control.
$db = new PDO('pgsql:host=localhost;dbname=adx_system', 'admin', 'admin123', [
    // Make query()/prepare()/execute() failures throw instead of returning false.
    // This is already the default on PHP 8, but not on older versions, where a
    // failed query would otherwise only surface later as a call on a boolean.
    PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION,
]);
// Resolve unqualified table names against the "admin" schema first, then "public".
$db->exec("SET search_path TO admin, public");
// Mailbox names that denote a role or system address rather than a person.
// Flipped to name => position so membership is a constant-time isset() lookup.
$GENERIC = array_flip([
    'info', 'contact', 'admin', 'support', 'sales', 'hello', 'hi', 'office', 'team',
    'noreply', 'no-reply', 'webmaster', 'postmaster', 'marketing', 'billing', 'help',
    'service', 'mail', 'email', 'newsletter', 'abuse', 'spam', 'security', 'feedback',
    'customerservice', 'developer', 'dev', 'test', 'demo', 'api', 'bot', 'system',
    'root', 'www', 'subscribe', 'unsubscribe', 'reply', 'bounce', 'return',
]);

// Consumer mailbox domains; an address on one of these says nothing about the
// contact's employer. Same flipped layout as $GENERIC.
$FREEMAIL = array_flip([
    'gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com', 'aol.com', 'icloud.com',
    'mail.com', 'protonmail.com', 'gmx.de', 'gmx.net', 'web.de', 't-online.de',
    'freenet.de', 'arcor.de', 'hotmail.de', 'hotmail.fr', 'yahoo.de', 'yahoo.fr',
    'live.com', 'live.de', 'live.fr', 'msn.com', 'videotron.ca', 'bell.net',
    'rogers.com', 'shaw.ca', 'telus.net', 'orange.fr', 'free.fr', 'sfr.fr',
    'laposte.net', 'wanadoo.fr', 'btinternet.com', 'sky.com', 'virgin.net', 'ntlworld.com',
    'talktalk.net', 'bluewin.ch', 'hotmail.ch', 'gmx.ch', 'sunrise.ch', 'hotmail.co.uk',
    'yahoo.co.uk', 'googlemail.com', 'comcast.net', 'verizon.net', 'att.net', 'cox.net',
    'sbcglobal.net', 'earthlink.net', 'mail.ru', 'inbox.ru', 'list.ru', 'bk.ru',
    'libero.it', 'virgilio.it', 'alice.it',
]);
// Mailbox domain => provider label, used to tag contacts on known mail providers.
$ISP_MAP = [
    't-online.de'    => 'T-Online',
    'web.de'         => 'Web.de',
    'gmx.de'         => 'GMX',
    'gmx.net'        => 'GMX',
    'freenet.de'     => 'Freenet',
    'gmail.com'      => 'Gmail',
    'googlemail.com' => 'Gmail',
    'hotmail.com'    => 'Hotmail',
    'hotmail.de'     => 'Hotmail',
    'hotmail.fr'     => 'Hotmail',
    'hotmail.co.uk'  => 'Hotmail',
    'hotmail.ch'     => 'Hotmail',
    'outlook.com'    => 'Outlook',
    'live.com'       => 'Outlook',
    'live.de'        => 'Outlook',
    'yahoo.com'      => 'Yahoo',
    'yahoo.de'       => 'Yahoo',
    'yahoo.fr'       => 'Yahoo',
    'yahoo.co.uk'    => 'Yahoo',
    'videotron.ca'   => 'Videotron',
    'bell.net'       => 'Bell',
    'rogers.com'     => 'Rogers',
    'orange.fr'      => 'Orange',
    'free.fr'        => 'Free',
    'sfr.fr'         => 'SFR',
    'bluewin.ch'     => 'Bluewin',
    'comcast.net'    => 'Comcast',
    'verizon.net'    => 'Verizon',
    'att.net'        => 'AT&T',
    'aol.com'        => 'AOL',
    'icloud.com'     => 'iCloud',
    'btinternet.com' => 'BT',
    'sky.com'        => 'Sky',
    'ntlworld.com'   => 'NTL',
    'virgin.net'     => 'Virgin Media',
    'talktalk.net'   => 'TalkTalk',
];
// Domain suffix (with leading dot) => country label.
// '.co.uk' is listed ahead of '.uk' so that a first-match scan over this map
// meets the more specific suffix first.
$TLD_COUNTRY = [
    '.co.uk' => 'UK',
    '.de'    => 'Germany',
    '.fr'    => 'France',
    '.uk'    => 'UK',
    '.ca'    => 'Canada',
    '.ch'    => 'Switzerland',
    '.at'    => 'Austria',
    '.nl'    => 'Netherlands',
    '.be'    => 'Belgium',
    '.it'    => 'Italy',
    '.es'    => 'Spain',
    '.pt'    => 'Portugal',
    '.pl'    => 'Poland',
    '.se'    => 'Sweden',
    '.no'    => 'Norway',
    '.dk'    => 'Denmark',
    '.fi'    => 'Finland',
    '.ie'    => 'Ireland',
    '.us'    => 'USA',
    '.au'    => 'Australia',
    '.nz'    => 'New Zealand',
    '.jp'    => 'Japan',
    '.br'    => 'Brazil',
    '.ma'    => 'Morocco',
    '.tn'    => 'Tunisia',
    '.dz'    => 'Algeria',
    '.ae'    => 'UAE',
    '.sa'    => 'Saudi Arabia',
    '.za'    => 'South Africa',
    '.ru'    => 'Russia',
];
/**
 * Guess a person's first and last name from the local part of an e-mail address.
 *
 * The local part is lowercased, any "+tag" subaddress is dropped, and the rest
 * is split on '.', '_' and '-'. Digits are removed from every piece; pieces that
 * end up shorter than two characters or that are listed in $GENERIC are ignored.
 *
 * @param string $email   Address to analyse; only the text before the first '@' is used.
 * @param array  $GENERIC Set of role/system mailbox names, keyed by lowercase name.
 * @return array [first, last]; last is null when only one usable piece exists,
 *               and both are null when no name can be derived.
 */
function nameFromEmail($email, $GENERIC) {
    $local = strtolower(explode('@', $email)[0]);

    // Drop a "+tag" subaddress ("john.smith+news" -> "john.smith"); otherwise the
    // tag leaks into the last name or makes a single-word name fail the alpha check.
    $plus = strpos($local, '+');
    if ($plus !== false) {
        $local = substr($local, 0, $plus);
    }

    // Reject role mailboxes, ID-like locals such as "ab12345", and very short locals.
    if (isset($GENERIC[$local]) || preg_match('/^[a-z]{1,2}\d{3,}/', $local) || strlen($local) < 3) {
        return [null, null];
    }

    $parts = [];
    foreach (preg_split('/[._\-]/', $local) as $piece) {
        // Strip digits once, then both test and keep the cleaned piece.
        $piece = preg_replace('/\d+/', '', $piece);
        if (strlen($piece) > 1 && !isset($GENERIC[$piece])) {
            $parts[] = ucfirst($piece);
        }
    }

    $count = count($parts);
    if ($count >= 2) {
        // First and last usable pieces; anything in between is ignored.
        return [$parts[0], $parts[$count - 1]];
    }
    if ($count === 1 && ctype_alpha($parts[0])) {
        return [$parts[0], null];
    }
    return [null, null];
}
/**
 * Derive a first/last name from a LinkedIn profile URL.
 *
 * Only slugs of the form "<name>-at-<something>" are understood; the part in
 * front of the first "-at-" is split on '.', '-' and '_' and capitalised.
 *
 * @param string|null $src Source URL of the contact.
 * @return array [first, last]; last is null for a single-word name, and both
 *               are null when the URL is not such a profile link.
 */
function nameFromLinkedIn($src) {
    $none = [null, null];

    if (!$src) {
        return $none;
    }
    if (strpos($src, 'linkedin.com/in/') === false) {
        return $none;
    }
    // Profile slug: everything after "/in/" up to the next '/', '?' or '#'.
    if (!preg_match('/linkedin\.com\/in\/([^\/?#]+)/', $src, $urlMatch)) {
        return $none;
    }
    // Non-greedy, so the name ends at the first "-at-" in the slug.
    if (!preg_match('/^(.+?)-at-(.+)$/', $urlMatch[1], $slugMatch)) {
        return $none;
    }

    $words = [];
    foreach (preg_split('/[\.\-_]/', $slugMatch[1]) as $piece) {
        // Single characters (initials, empty fragments) are not treated as names.
        if (strlen($piece) > 1) {
            $words[] = ucfirst($piece);
        }
    }

    switch (count($words)) {
        case 0:
            return $none;
        case 1:
            return [$words[0], null];
        default:
            return [$words[0], $words[count($words) - 1]];
    }
}
/**
 * Guess a company name from the domain of an e-mail address.
 *
 * Takes the first label of the domain, turns '-' and '_' into spaces, splits
 * camelCase boundaries and title-cases the result. Freemail domains, missing
 * domains and labels shorter than three characters give null.
 *
 * @param string $email    Address to analyse; the segment after the first '@' is the domain.
 * @param array  $FREEMAIL Set of consumer mailbox domains, keyed by lowercase domain.
 * @return string|null Title-cased company guess, or null when none can be made.
 */
function companyFromDomain($email, $FREEMAIL) {
    $rawDom = explode('@', $email)[1] ?? '';
    $dom = strtolower($rawDom);
    if (!$dom || isset($FREEMAIL[$dom])) {
        return null;
    }

    // Keep the original casing for now: the camelCase split below can never
    // match once the label has been lowercased.
    $name = explode('.', $rawDom)[0];
    if (strlen($name) < 3) {
        return null;
    }

    $name = str_replace(['-', '_'], ' ', $name);
    $name = preg_replace('/([a-z])([A-Z])/', '$1 $2', $name);

    // Normalise case after splitting so "ACME" and "acme" both become "Acme".
    return ucwords(strtolower(trim($name)));
}
/**
 * Guess a country from the domain suffix of an e-mail address.
 *
 * The longest matching suffix wins, independent of the order of $TLD_COUNTRY,
 * so a '.co.uk' entry is always preferred over a '.uk' entry.
 *
 * @param string $email       Address to analyse; the segment after the first '@' is the domain.
 * @param array  $TLD_COUNTRY Map of lowercase suffix (with leading dot) => country label.
 * @return string|null Country label, or null when there is no domain or no suffix matches.
 */
function countryFromTLD($email, $TLD_COUNTRY) {
    $dom = strtolower(explode('@', $email)[1] ?? '');
    if ($dom === '') {
        return null;
    }

    $best = null;
    $bestLen = 0;
    foreach ($TLD_COUNTRY as $tld => $country) {
        $tld = (string) $tld;
        $len = strlen($tld);
        // Only a strictly longer suffix can replace the current best match.
        if ($len > $bestLen && substr($dom, -$len) === $tld) {
            $best = $country;
            $bestLen = $len;
        }
    }
    return $best;
}
// ── MAIN ──
// Load every contact that has no name yet, derive whatever can be derived
// offline from its e-mail address and source URL, and write the result back.
$stmt = $db->query("SELECT id, email, source_url FROM scrapping_results WHERE full_name IS NULL OR full_name = '' ORDER BY id");
$rows = $stmt->fetchAll(PDO::FETCH_ASSOC);
$total = count($rows);
echo "Contacts to enrich: $total\n";
$nName = 0; $nComp = 0; $nLoc = 0; $nTotal = 0;
// COALESCE keeps whatever is already stored when this run cannot derive a value;
// a plain assignment would overwrite existing company/location data with NULL.
// full_name is assigned directly because only rows with an empty name are selected.
// NOTE(review): the ISP/company label is stored in `revenue_range` - confirm that
// this column is intentionally reused for it.
$update = $db->prepare("UPDATE scrapping_results SET full_name=:name, company=COALESCE(:comp, company), location=COALESCE(:loc, location), revenue_range=COALESCE(:isp, revenue_range) WHERE id=:id");
foreach ($rows as $i => $r) {
    // Scraped addresses may carry stray whitespace that would break the
    // domain-based lookups below.
    $email = trim($r['email'] ?? '');
    if (!$email || strpos($email, '@') === false) continue;
    $src = $r['source_url'] ?? '';
    $dom = strtolower(explode('@', $email)[1]);

    // Name: LinkedIn first, then email
    [$fn, $ln] = nameFromLinkedIn($src);
    if (!$fn) [$fn, $ln] = nameFromEmail($email, $GENERIC);
    $fullName = $fn ? ($ln ? "$fn $ln" : $fn) : null;

    // Company from domain
    $company = companyFromDomain($email, $FREEMAIL);
    // ISP for freemail
    $isp = $ISP_MAP[$dom] ?? null;
    // Country from TLD
    $country = countryFromTLD($email, $TLD_COUNTRY);

    // Only touch the row when at least one field was derived.
    if ($fullName || $company || $country) {
        $nTotal++;
        if ($fullName) $nName++;
        if ($company) $nComp++;
        if ($country) $nLoc++;
        $update->execute([
            ':name' => $fullName,
            ':comp' => $company,
            ':loc' => $country,
            // Provider label when known, otherwise the company guess.
            ':isp' => $isp ?: $company,
            ':id' => $r['id']
        ]);
    }
    // Progress marker every 500 input rows (skipped rows count too).
    if ($i > 0 && $i % 500 == 0) echo " $i/$total...\n";
}
echo "\n=== ENRICHMENT DONE ===\n";
echo "Total: $total\n";
echo "Enriched: $nTotal (" . ($total > 0 ? round($nTotal*100/$total) : 0) . "%)\n";
echo "Names: $nName\n";
echo "Companies: $nComp\n";
echo "Locations: $nLoc\n";