104 lines
4.4 KiB
PHP
104 lines
4.4 KiB
PHP
<?php
|
|
/* Enrichment Cleanup v2 - Fix bad names:
|
|
* 1. Remove numbers from names
|
|
* 2. Filter non-personal names (businesses, generic)
|
|
* 3. Fix single-word names where we have first.last in email
|
|
* 4. Ensure Prénom Nom order (not reversed)
|
|
*/
|
|
$db = new PDO('pgsql:host=localhost;dbname=adx_system','admin','admin123');
|
|
$db->exec("SET search_path TO admin, public");
|
|
|
|
// Words that indicate NOT a person name
|
|
$BAD_WORDS = array_flip(explode(',','praxis,technik,service,info,team,office,contact,support,pension,zahnarzt,immobilien,versicherung,beratung,verlag,handel,marketing,consulting,media,design,solutions,studio,werkstatt,agentur,kanzlei,fotografie,fotograf,reisen,smart,post,mail,web,net,online,digital,shop,store'));
|
|
|
|
$stmt = $db->query("SELECT id, email, full_name, source_url FROM scrapping_results WHERE full_name IS NOT NULL AND full_name != '' ORDER BY id");
|
|
$rows = $stmt->fetchAll(PDO::FETCH_ASSOC);
|
|
echo "Checking " . count($rows) . " contacts for name quality...\n";
|
|
|
|
$upd = $db->prepare("UPDATE scrapping_results SET full_name = :name WHERE id = :id");
|
|
$fixed = 0; $cleared = 0;
|
|
|
|
foreach ($rows as $r) {
|
|
$name = $r['full_name'];
|
|
$email = $r['email'] ?? '';
|
|
$orig = $name;
|
|
|
|
// 1. Strip numbers from name parts
|
|
$parts = explode(' ', $name);
|
|
$parts = array_map(function($p) { return preg_replace('/\d+/', '', $p); }, $parts);
|
|
$parts = array_filter($parts, function($p) { return strlen($p) > 1; });
|
|
$name = implode(' ', $parts);
|
|
|
|
// 2. Check if any part is a "bad word" (business name)
|
|
$isBusiness = false;
|
|
foreach ($parts as $p) {
|
|
if (isset($BAD_WORDS[strtolower($p)])) { $isBusiness = true; break; }
|
|
}
|
|
|
|
// 3. Single char names or too short
|
|
if (strlen($name) < 3) $name = null;
|
|
|
|
// 4. Names that are clearly not personal (all lowercase single word matching email prefix)
|
|
if ($name && strpos($name, ' ') === false) {
|
|
$local = strtolower(explode('@', $email)[0] ?? '');
|
|
// If name == email local part (just capitalized), it's not a real name extraction
|
|
if (strtolower($name) === $local) {
|
|
// Try to split by common patterns
|
|
if (preg_match('/^([a-z]+)[._\-]([a-z]+)$/i', $local, $m)) {
|
|
$name = ucfirst($m[1]) . ' ' . ucfirst($m[2]);
|
|
} else {
|
|
$name = null; // Can't determine real name
|
|
}
|
|
}
|
|
}
|
|
|
|
// 5. If business name, move to company field instead
|
|
if ($isBusiness) {
|
|
$db->prepare("UPDATE scrapping_results SET full_name = NULL, company = COALESCE(NULLIF(company,''), :biz) WHERE id = :id")
|
|
->execute([':biz' => $orig, ':id' => $r['id']]);
|
|
$cleared++;
|
|
continue;
|
|
}
|
|
|
|
// 6. For LinkedIn sources with -at- pattern, try to get better name
|
|
$src = $r['source_url'] ?? '';
|
|
if ($name && strpos($src, '-at-') !== false && strpos($name, ' ') === false) {
|
|
// Single word name from LinkedIn, try email for better extraction
|
|
if (preg_match('/^([a-z]+)[._\-]([a-z]+)/i', explode('@', $email)[0], $m)) {
|
|
$candidate = ucfirst($m[1]) . ' ' . ucfirst($m[2]);
|
|
if (strlen($candidate) > strlen($name)) {
|
|
$name = $candidate;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Apply fix if changed
|
|
if ($name !== $orig) {
|
|
$upd->execute([':name' => $name, ':id' => $r['id']]);
|
|
$fixed++;
|
|
}
|
|
}
|
|
|
|
echo "\n=== CLEANUP DONE ===\n";
|
|
echo "Fixed names: $fixed\n";
|
|
echo "Moved to company: $cleared\n";
|
|
|
|
// Stats
|
|
$stats = $db->query("SELECT
|
|
COUNT(*) as total,
|
|
SUM(CASE WHEN full_name IS NOT NULL AND full_name != '' THEN 1 ELSE 0 END) as names,
|
|
SUM(CASE WHEN full_name LIKE '% %' THEN 1 ELSE 0 END) as full_names,
|
|
SUM(CASE WHEN company IS NOT NULL AND company != '' THEN 1 ELSE 0 END) as companies,
|
|
SUM(CASE WHEN location IS NOT NULL AND location != '' THEN 1 ELSE 0 END) as locations
|
|
FROM scrapping_results")->fetch(PDO::FETCH_ASSOC);
|
|
|
|
echo "\nFinal stats:\n";
|
|
foreach ($stats as $k => $v) echo " $k: $v\n";
|
|
|
|
// Show samples
|
|
echo "\nSample enriched contacts:\n";
|
|
$samples = $db->query("SELECT email, full_name, company, location, revenue_range FROM scrapping_results WHERE full_name IS NOT NULL AND full_name LIKE '% %' ORDER BY random() LIMIT 15")->fetchAll(PDO::FETCH_ASSOC);
|
|
foreach ($samples as $s) {
|
|
echo " {$s['email']} → {$s['full_name']} | {$s['company']} | {$s['location']} | {$s['revenue_range']}\n";
|
|
}
|