136 lines
5.2 KiB
Python
136 lines
5.2 KiB
Python
#!/usr/bin/env python3
|
|
"""Playwright B2B Email Finder v1.0 - DDG headless search for emails"""
|
|
import re, sys, time, psycopg2, asyncio
|
|
from playwright.async_api import async_playwright
|
|
|
|
DB = dict(host='10.1.0.3', port=5432, dbname='adx_system', user='admin', password='admin123')
|
|
BATCH = int(sys.argv[1]) if len(sys.argv) > 1 else 20
|
|
|
|
async def find_email_ddg(page, query):
|
|
"""Search DDG for email"""
|
|
emails = []
|
|
phones = []
|
|
try:
|
|
url = 'https://duckduckgo.com/?q=%s&ia=web' % query.replace(' ', '+')
|
|
await page.goto(url, timeout=12000)
|
|
await page.wait_for_timeout(3000)
|
|
|
|
# Get all text on page
|
|
body = await page.inner_text('body')
|
|
|
|
# Extract emails
|
|
found = re.findall(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}', body)
|
|
for em in found:
|
|
em = em.lower()
|
|
if em.endswith(('.png', '.jpg', '.gif', '.svg', '.css')):
|
|
continue
|
|
if 'example' in em or 'noreply' in em or 'spam' in em or 'test@' in em:
|
|
continue
|
|
emails.append(em)
|
|
|
|
# Extract phones (Morocco/Algeria/Tunisia)
|
|
ph = re.findall(r'(?:\+212|0[5-7])\s*[\d\s.-]{8,12}', body)
|
|
ph += re.findall(r'(?:\+213|0[5-7])\s*[\d\s.-]{8,12}', body)
|
|
ph += re.findall(r'(?:\+216)\s*[\d\s.-]{7,10}', body)
|
|
for p in ph:
|
|
phones.append(re.sub(r'[\s.-]', '', p)[:20])
|
|
|
|
except Exception as e:
|
|
print(" DDG ERR: %s" % str(e)[:40])
|
|
|
|
return list(set(emails))[:3], list(set(phones))[:2]
|
|
|
|
async def main():
|
|
conn = psycopg2.connect(**DB)
|
|
cur = conn.cursor()
|
|
|
|
# Get profiles without email
|
|
cur.execute("""SELECT id, full_name, company, location, linkedin_url
|
|
FROM admin.linkedin_profiles
|
|
WHERE (email IS NULL OR email='')
|
|
ORDER BY scraped_at DESC LIMIT %s""", (BATCH,))
|
|
profiles = cur.fetchall()
|
|
|
|
print("Playwright Email Finder: %d profiles" % len(profiles))
|
|
enriched_e = 0
|
|
enriched_p = 0
|
|
|
|
async with async_playwright() as p:
|
|
browser = await p.chromium.launch(headless=True)
|
|
ctx = await browser.new_context(user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
|
|
page = await ctx.new_page()
|
|
|
|
for pid, name, company, location, li_url in profiles:
|
|
if not name or len(name) < 3:
|
|
continue
|
|
|
|
# Strategy 1: Name + Company + email
|
|
queries = []
|
|
if company:
|
|
queries.append('"%s" "%s" email' % (name, company))
|
|
queries.append('"%s" email contact %s' % (name, location or 'maroc'))
|
|
queries.append('"%s" @gmail OR @yahoo OR @hotmail %s' % (name, company or 'maroc'))
|
|
|
|
found_email = ''
|
|
found_phone = ''
|
|
|
|
for q in queries[:2]:
|
|
emails, phones = await find_email_ddg(page, q)
|
|
|
|
# Pick best email (one matching name parts)
|
|
name_parts = [p.lower() for p in name.split() if len(p) > 2]
|
|
for em in emails:
|
|
prefix = em.split('@')[0].lower()
|
|
if any(part in prefix for part in name_parts):
|
|
found_email = em
|
|
break
|
|
if not found_email and emails:
|
|
found_email = emails[0]
|
|
|
|
if not found_phone and phones:
|
|
found_phone = phones[0]
|
|
|
|
if found_email:
|
|
break
|
|
await page.wait_for_timeout(2000)
|
|
|
|
# Update DB
|
|
if found_email or found_phone:
|
|
sets = []
|
|
params = []
|
|
if found_email:
|
|
sets.append("email=%s")
|
|
sets.append("email_source='playwright_ddg'")
|
|
sets.append("enriched_at=NOW()")
|
|
params.append(found_email)
|
|
enriched_e += 1
|
|
if found_phone:
|
|
sets.append("phone=%s")
|
|
params.append(found_phone)
|
|
enriched_p += 1
|
|
params.append(pid)
|
|
cur.execute("UPDATE admin.linkedin_profiles SET %s WHERE id=%%s" % ', '.join(sets), params)
|
|
conn.commit()
|
|
|
|
# Also update weval_leads
|
|
if li_url and found_email:
|
|
cur.execute("UPDATE admin.weval_leads SET email=%s WHERE linkedin_url=%s AND (email IS NULL OR email='')", (found_email, li_url))
|
|
conn.commit()
|
|
if li_url and found_phone:
|
|
cur.execute("UPDATE admin.weval_leads SET phone=%s WHERE linkedin_url=%s AND (phone IS NULL OR phone='')", (found_phone, li_url))
|
|
conn.commit()
|
|
|
|
print(" +%s: email=%s tel=%s" % (name[:30], found_email or '-', found_phone or '-'))
|
|
else:
|
|
print(" -%s" % name[:30])
|
|
|
|
await page.wait_for_timeout(3000)
|
|
|
|
await browser.close()
|
|
|
|
cur.close()
|
|
conn.close()
|
|
print("\nPW_EMAIL: +%d emails, +%d phones" % (enriched_e, enriched_p))
|
|
|
|
asyncio.run(main())
|