Files
html/api/pw_b2b_email.py
2026-04-12 22:57:03 +02:00

136 lines
5.2 KiB
Python

#!/usr/bin/env python3
"""Playwright B2B Email Finder v1.0 - DDG headless search for emails"""
import re, sys, time, psycopg2, asyncio
from playwright.async_api import async_playwright
DB = dict(host='10.1.0.3', port=5432, dbname='adx_system', user='admin', password='admin123')
BATCH = int(sys.argv[1]) if len(sys.argv) > 1 else 20
async def find_email_ddg(page, query):
"""Search DDG for email"""
emails = []
phones = []
try:
url = 'https://duckduckgo.com/?q=%s&ia=web' % query.replace(' ', '+')
await page.goto(url, timeout=12000)
await page.wait_for_timeout(3000)
# Get all text on page
body = await page.inner_text('body')
# Extract emails
found = re.findall(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}', body)
for em in found:
em = em.lower()
if em.endswith(('.png', '.jpg', '.gif', '.svg', '.css')):
continue
if 'example' in em or 'noreply' in em or 'spam' in em or 'test@' in em:
continue
emails.append(em)
# Extract phones (Morocco/Algeria/Tunisia)
ph = re.findall(r'(?:\+212|0[5-7])\s*[\d\s.-]{8,12}', body)
ph += re.findall(r'(?:\+213|0[5-7])\s*[\d\s.-]{8,12}', body)
ph += re.findall(r'(?:\+216)\s*[\d\s.-]{7,10}', body)
for p in ph:
phones.append(re.sub(r'[\s.-]', '', p)[:20])
except Exception as e:
print(" DDG ERR: %s" % str(e)[:40])
return list(set(emails))[:3], list(set(phones))[:2]
async def main():
conn = psycopg2.connect(**DB)
cur = conn.cursor()
# Get profiles without email
cur.execute("""SELECT id, full_name, company, location, linkedin_url
FROM admin.linkedin_profiles
WHERE (email IS NULL OR email='')
ORDER BY scraped_at DESC LIMIT %s""", (BATCH,))
profiles = cur.fetchall()
print("Playwright Email Finder: %d profiles" % len(profiles))
enriched_e = 0
enriched_p = 0
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
ctx = await browser.new_context(user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
page = await ctx.new_page()
for pid, name, company, location, li_url in profiles:
if not name or len(name) < 3:
continue
# Strategy 1: Name + Company + email
queries = []
if company:
queries.append('"%s" "%s" email' % (name, company))
queries.append('"%s" email contact %s' % (name, location or 'maroc'))
queries.append('"%s" @gmail OR @yahoo OR @hotmail %s' % (name, company or 'maroc'))
found_email = ''
found_phone = ''
for q in queries[:2]:
emails, phones = await find_email_ddg(page, q)
# Pick best email (one matching name parts)
name_parts = [p.lower() for p in name.split() if len(p) > 2]
for em in emails:
prefix = em.split('@')[0].lower()
if any(part in prefix for part in name_parts):
found_email = em
break
if not found_email and emails:
found_email = emails[0]
if not found_phone and phones:
found_phone = phones[0]
if found_email:
break
await page.wait_for_timeout(2000)
# Update DB
if found_email or found_phone:
sets = []
params = []
if found_email:
sets.append("email=%s")
sets.append("email_source='playwright_ddg'")
sets.append("enriched_at=NOW()")
params.append(found_email)
enriched_e += 1
if found_phone:
sets.append("phone=%s")
params.append(found_phone)
enriched_p += 1
params.append(pid)
cur.execute("UPDATE admin.linkedin_profiles SET %s WHERE id=%%s" % ', '.join(sets), params)
conn.commit()
# Also update weval_leads
if li_url and found_email:
cur.execute("UPDATE admin.weval_leads SET email=%s WHERE linkedin_url=%s AND (email IS NULL OR email='')", (found_email, li_url))
conn.commit()
if li_url and found_phone:
cur.execute("UPDATE admin.weval_leads SET phone=%s WHERE linkedin_url=%s AND (phone IS NULL OR phone='')", (found_phone, li_url))
conn.commit()
print(" +%s: email=%s tel=%s" % (name[:30], found_email or '-', found_phone or '-'))
else:
print(" -%s" % name[:30])
await page.wait_for_timeout(3000)
await browser.close()
cur.close()
conn.close()
print("\nPW_EMAIL: +%d emails, +%d phones" % (enriched_e, enriched_p))
asyncio.run(main())