Files
html/api/pw_li_email.py
2026-04-12 22:57:03 +02:00

49 lines
2.0 KiB
Python

#!/usr/bin/env python3
import re
import sys
import time
from urllib.parse import quote_plus

import psycopg2
from playwright.sync_api import sync_playwright
# NOTE(review): connection credentials are hard-coded in source; consider
# loading them from the environment or a secrets store instead.
DB = dict(host='10.1.0.3', dbname='adx_system', user='admin', password='admin123')

# Any address containing one of these substrings is discarded.
BLOCKLIST = (
    'google', 'bing', 'example', 'facebook', 'linkedin', 'twitter',
    'wikipedia', 'noreply', 'mailer-daemon',
)

# Retina-style asset names ("logo@2x.png") match the e-mail pattern but are
# file names, not addresses; none of these suffixes is a real TLD.
ASSET_SUFFIXES = ('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp', '.css', '.js')

# IGNORECASE so that upper-case TLDs ("X@Y.COM") are recognised as well.
EMAIL_RE = re.compile(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}', re.IGNORECASE)

SELECT_SQL = "SELECT id,full_name,company,headline FROM admin.linkedin_profiles WHERE (email IS NULL OR email='') AND full_name!='' ORDER BY RANDOM() LIMIT %s"
UPDATE_SQL = "UPDATE admin.linkedin_profiles SET email=%s,enriched_at=NOW(),email_source='duckduckgo' WHERE id=%s"


def build_search_url(name, company=None):
    """Return the DuckDuckGo search URL for *name* (and optional *company*).

    The query is percent-encoded so characters such as ``&``, ``#``, ``+``
    or quotes inside a name/company cannot break the query string.
    """
    query = f'"{name}" email'
    if company:
        query += f' {company}'
    return f"https://duckduckgo.com/?q={quote_plus(query)}"


def extract_emails(text):
    """Return the set of lower-cased candidate e-mail addresses in *text*.

    Addresses containing a BLOCKLIST substring, and matches that are really
    asset file names (see ASSET_SUFFIXES), are dropped.
    """
    found = set()
    for match in EMAIL_RE.findall(text):
        addr = match.lower()
        if any(term in addr for term in BLOCKLIST):
            continue
        if addr.endswith(ASSET_SUFFIXES):
            continue
        found.add(addr)
    return found


def pick_best_email(emails, name):
    """Choose one address from *emails* for the person called *name*.

    Prefers an address whose local part contains the first word of *name*;
    otherwise falls back to the first address in sorted order so the result
    is deterministic. Returns ``None`` when *emails* is empty.
    """
    ordered = sorted(emails)
    if not ordered:
        return None
    # A whitespace-only name yields no tokens; treat it like a missing name.
    tokens = name.lower().split() if name else []
    first = tokens[0] if tokens else ''
    if first:
        for addr in ordered:
            if first in addr.split('@')[0]:
                return addr
    return ordered[0]


def fetch_page_html(ctx, url):
    """Open *url* in a new page of browser context *ctx* and return its HTML.

    The page is always closed, whether or not navigation succeeds.
    """
    page = ctx.new_page()
    try:
        page.goto(url, timeout=10000)
        page.wait_for_timeout(2000)
        return page.content()
    finally:
        page.close()


def main(argv=None):
    """Look up e-mail addresses for profiles that have none and store them.

    ``argv[1]`` (default 50) is the number of randomly selected profiles to
    process. For each one a DuckDuckGo search is rendered in headless
    Chromium, candidate addresses are extracted from the page HTML, and the
    chosen address is written back to ``admin.linkedin_profiles``.
    """
    argv = sys.argv if argv is None else argv
    batch = int(argv[1]) if len(argv) > 1 else 50

    conn = psycopg2.connect(**DB)
    cur = conn.cursor()
    rows = []
    enriched = 0
    try:
        cur.execute(SELECT_SQL, (batch,))
        rows = cur.fetchall()
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=['--no-sandbox', '--disable-dev-shm-usage'],
            )
            try:
                ctx = browser.new_context(user_agent='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0')
                for rid, name, company, _headline in rows:
                    try:
                        html = fetch_page_html(ctx, build_search_url(name, company))
                        best = pick_best_email(extract_emails(html), name)
                        if best:
                            cur.execute(UPDATE_SQL, (best, rid))
                            conn.commit()
                            enriched += 1
                            print(f"+EMAIL {name}: {best}")
                    except Exception as exc:
                        # Best-effort per profile: report, reset the
                        # transaction so later UPDATEs are not rejected,
                        # and carry on with the next row.
                        conn.rollback()
                        print(f"!ERROR {name}: {exc}", file=sys.stderr)
                    # Throttle between searches.
                    time.sleep(3)
            finally:
                browser.close()
    finally:
        cur.close()
        conn.close()
    print(f"LI_EMAIL:+{enriched}/{len(rows)}")


if __name__ == "__main__":
    main()