49 lines
2.0 KiB
Python
49 lines
2.0 KiB
Python
#!/usr/bin/env python3
|
|
import re
import sys
import time
from urllib.parse import quote_plus

import psycopg2
from playwright.sync_api import sync_playwright
|
|
|
|
# NOTE(review): database credentials are hard-coded in source. Values are left
# unchanged to preserve behavior; consider loading them from the environment
# or a secrets store instead.
DB = dict(host='10.1.0.3', dbname='adx_system', user='admin', password='admin123')

DEFAULT_BATCH = 50          # rows processed when no CLI argument is given
SEARCH_TIMEOUT_MS = 10000   # navigation timeout passed to page.goto()
RENDER_WAIT_MS = 2000       # pause after navigation before reading the page
REQUEST_DELAY_S = 3         # pause between consecutive searches

# An address is discarded when any of these substrings appears anywhere in it.
BLOCKLIST = (
    'google', 'bing', 'example', 'facebook', 'linkedin',
    'twitter', 'wikipedia', 'noreply', 'mailer-daemon',
)

# The email pattern is run over raw page markup, where asset names such as
# "logo@2x.png" look like addresses; matches ending in these are dropped.
ASSET_SUFFIXES = ('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp', '.css', '.js')

# Case-insensitive so that upper-case domains/TLDs ("X@Y.COM") are matched.
EMAIL_RE = re.compile(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}', re.IGNORECASE)

USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0'

SELECT_SQL = (
    "SELECT id,full_name,company,headline FROM admin.linkedin_profiles "
    "WHERE (email IS NULL OR email='') AND full_name!='' ORDER BY RANDOM() LIMIT %s"
)
UPDATE_SQL = (
    "UPDATE admin.linkedin_profiles SET email=%s,enriched_at=NOW(),"
    "email_source='duckduckgo' WHERE id=%s"
)


def build_search_url(name, company=None):
    """Return the DuckDuckGo search URL for *name* (and optional *company*).

    The query is ``"<name>" email`` followed by the company when one is
    given. It is percent-encoded so characters such as ``&``, ``#`` or ``"``
    in the input cannot alter the URL's structure.
    """
    query = f'"{name}" email'
    if company:
        query += f' {company}'
    return f"https://duckduckgo.com/?q={quote_plus(query)}"


def extract_emails(text):
    """Return the set of lower-cased candidate addresses found in *text*.

    Addresses containing a BLOCKLIST substring, or ending in one of
    ASSET_SUFFIXES, are excluded.
    """
    found = {match.lower() for match in EMAIL_RE.findall(text)}
    return {
        addr for addr in found
        if not addr.endswith(ASSET_SUFFIXES)
        and not any(blocked in addr for blocked in BLOCKLIST)
    }


def pick_best_email(emails, name):
    """Choose one address from *emails* for the person called *name*.

    Prefers an address whose local part contains the first word of *name*
    (lower-cased). Falls back to the alphabetically first address, so the
    result does not depend on set iteration order. Returns ``None`` when
    *emails* is empty.
    """
    if not emails:
        return None
    candidates = sorted(emails)
    # A whitespace-only name splits to an empty list; treat it as "no hint".
    words = name.lower().split() if name else []
    first_word = words[0] if words else ''
    if first_word:
        for addr in candidates:
            if first_word in addr.split('@')[0]:
                return addr
    return candidates[0]


def fetch_page_html(ctx, url):
    """Open *url* in a new page of browser context *ctx* and return its HTML.

    The page is always closed, including when navigation fails.
    """
    page = ctx.new_page()
    try:
        page.goto(url, timeout=SEARCH_TIMEOUT_MS)
        page.wait_for_timeout(RENDER_WAIT_MS)
        return page.content()
    finally:
        try:
            page.close()
        except Exception:
            # Best effort: a failed close must not mask the original error.
            pass


def main(argv=None):
    """Fill in missing emails for a random batch of profiles.

    *argv* defaults to ``sys.argv[1:]``; its first element, if present, is
    the batch size. For each selected row a web search is run, an address is
    chosen from the result page, and the row is updated and committed.
    Per-row failures are reported on stderr and skipped. A summary line
    ``LI_EMAIL:+<updated>/<selected>`` is printed at the end.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    batch = int(args[0]) if args else DEFAULT_BATCH

    enriched = 0
    conn = psycopg2.connect(**DB)
    try:
        cur = conn.cursor()
        try:
            cur.execute(SELECT_SQL, (batch,))
            # The headline column is selected but not used below.
            rows = cur.fetchall()

            with sync_playwright() as p:
                browser = p.chromium.launch(
                    headless=True,
                    args=['--no-sandbox', '--disable-dev-shm-usage'],
                )
                try:
                    ctx = browser.new_context(user_agent=USER_AGENT)
                    for rid, name, company, _headline in rows:
                        try:
                            html = fetch_page_html(ctx, build_search_url(name, company))
                            best = pick_best_email(extract_emails(html), name)
                            if best:
                                cur.execute(UPDATE_SQL, (best, rid))
                                conn.commit()
                                enriched += 1
                                print(f"+EMAIL {name}: {best}")
                        except Exception as exc:
                            # Keep the run best-effort, but roll back so a
                            # failed statement does not leave the transaction
                            # aborted for every following row.
                            conn.rollback()
                            print(f"-SKIP {name}: {exc!r}", file=sys.stderr)
                        # Throttle after every search, failed ones included.
                        time.sleep(REQUEST_DELAY_S)
                finally:
                    browser.close()
        finally:
            cur.close()
    finally:
        conn.close()

    print(f"LI_EMAIL:+{enriched}/{len(rows)}")


if __name__ == "__main__":
    main()
|