133 lines
6.3 KiB
Python
133 lines
6.3 KiB
Python
#!/usr/bin/env python3
"""WEVAL B2B Scraper v2.0 - LinkedIn-focused via SearXNG

Quality-first: only insert verified professional profiles"""
import re
import sys
import time
import json

import psycopg2
import requests

# SECURITY NOTE(review): DB credentials are hardcoded in source. Move them to
# environment variables or a secrets store before sharing or deploying.
DB = dict(host='10.1.0.3', dbname='adx_system', user='admin', password='admin123')

# Local SearXNG metasearch instance; all queries go through its JSON API.
SEARX = "http://localhost:8888/search"

# Target country (ISO alpha-2) from the first CLI argument; defaults to Morocco.
country = sys.argv[1] if len(sys.argv) > 1 else 'MA'

# Cities to sweep per supported country (Morocco, Algeria, Tunisia).
CITIES = {
    'MA': ['casablanca', 'rabat', 'tanger', 'marrakech', 'fes', 'agadir'],
    'DZ': ['alger', 'oran', 'constantine', 'annaba', 'blida'],
    'TN': ['tunis', 'sfax', 'sousse', 'bizerte', 'gabes']
}

# Query templates. NOTE(review): currently unused — the main loop builds its
# queries inline with f-strings. Kept for reference / future expansion.
QUERIES = [
    'site:linkedin.com/in "{role}" "{city}" "{sector}"',
    'site:linkedin.com/company "{sector}" "{city}"',
    'site:kompass.com "{sector}" "{city}"',
    'site:charika.ma "{sector}"',
    'site:rekrute.com "{role}" "{sector}"',
    '"{role}" "{sector}" "{city}" email @gmail OR @yahoo OR @hotmail',
]

# Decision-maker job titles targeted by the searches (only the first 5 are
# actually queried by the main loop).
ROLES = ['directeur general', 'DSI', 'CTO', 'DAF', 'directeur informatique',
         'responsable IT', 'directeur achats', 'DRH', 'directeur commercial']

# Business sectors of interest; also used to infer a lead's industry.
SECTORS = ['SAP', 'ERP', 'cloud', 'pharma', 'logistique', 'banque', 'telecom', 'energie']

# Substrings that disqualify a URL, e-mail, or name token as a real lead.
BL_WORDS = ['wikipedia', 'zhihu', 'baidu', 'google.com', 'bing.com', 'facebook.com',
            'youtube.com', 'reddit.com', 'twitter.com', 'quora.com', 'adobe', 'admob',
            'larousse', 'dictionnaire', 'definition']

# Shared DB connection/cursor and run counters, mutated by the main loop below.
conn = psycopg2.connect(**DB)
cur = conn.cursor()
added = 0
skipped = 0
|
|
|
|
def sx(q):
    """Run query *q* against the local SearXNG instance.

    Returns at most the first 10 result dicts from the JSON response,
    or [] on any network or decoding failure — the scraper is
    best-effort and simply moves on to the next query.
    """
    try:
        r = requests.get(SEARX, params={'q': q, 'format': 'json'}, timeout=15)
        return r.json().get('results', [])[:10]
    except (requests.RequestException, ValueError):
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are
        # no longer swallowed. ValueError covers JSON decode failures
        # (requests' JSONDecodeError subclasses it).
        return []
|
|
|
|
def is_valid_name(name):
    """Heuristic: True only if *name* plausibly is a real personal name.

    Rejects empty/short strings, URL-ish or symbol-bearing text, CJK
    ideographs, single-word names, initials, and blacklisted words.
    """
    if not name or len(name) < 4:
        return False
    for marker in ('http', 'www', '@', '#', '$'):
        if marker in name:
            return False
    if re.search(r'[\u4e00-\u9fff]', name):
        # CJK ideograph range — profiles in Chinese are out of scope.
        return False
    parts = name.strip().split()
    if len(parts) < 2:
        return False
    if min(len(p) for p in parts[:2]) < 2:
        # First and last name must both be at least two characters.
        return False
    return not any(p.lower() in BL_WORDS for p in parts)
|
|
|
|
def extract_linkedin(results):
    """Turn SearXNG results into LinkedIn profile leads.

    Only linkedin.com/in/ profile URLs whose title yields a plausible
    personal name are kept. Job and company come from the title's
    " - " segments; when the title has no company, it is recovered
    from a "chez/at/dans <Company>" phrase in the snippet.
    """
    leads = []
    for hit in results:
        profile_url = hit.get('url', '')
        if 'linkedin.com/in/' not in profile_url:
            continue
        title = hit.get('title', '')
        title = title.replace(' | LinkedIn', '').replace(' - LinkedIn', '').strip()
        snippet = hit.get('content', '')
        segments = [seg.strip() for seg in title.split(' - ') if seg.strip()]
        name = segments[0] if segments else ''
        if not is_valid_name(name):
            continue
        job = segments[1] if len(segments) > 1 else ''
        company = segments[2] if len(segments) > 2 else ''
        if not company:
            # Fall back to the snippet: take what follows the last
            # "chez"/"at"/"dans" up to the first '.' or ',' (max 100 chars).
            for marker in (' chez ', ' at ', ' dans '):
                if marker in snippet:
                    tail = snippet.split(marker)[-1]
                    company = tail.split('.')[0].split(',')[0].strip()[:100]
                    break
        leads.append({'name': name, 'job': job, 'company': company,
                      'url': profile_url, 'content': snippet})
    return leads
|
|
|
|
def extract_b2b(results, city, role, sector):
    """Extract B2B contact leads (email / phone / name) from search results.

    Args:
        results: SearXNG result dicts with 'url', 'title', 'content' keys.
        city: target city. NOTE(review): not used for filtering here —
              kept for interface stability (callers pass it positionally).
        role: job role the query targeted; stored as the lead's 'job'.
        sector: business sector; stored as the lead's 'sector'.

    Returns:
        List of lead dicts. A result is kept only when it yields at least
        one acceptable e-mail or a plausible personal name.
    """
    # Hoisted out of the loop: compile the patterns once per call instead of
    # re-resolving them for every result.
    email_re = re.compile(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}')
    # Maghreb numbers: +212/+213/+216 country codes or local 05/06/07 prefixes.
    phone_re = re.compile(r'(?:\+212|\+213|\+216|0[567])\d[\d\s.-]{7,11}')
    # Prefer a three-word capitalized name over a two-word one.
    name_pats = [re.compile(r'([A-Z][a-z]+ [A-Z][a-z]+ [A-Z][a-z]+)'),
                 re.compile(r'([A-Z][a-z]+ [A-Z][a-z]+)')]
    leads = []
    for r in results:
        url = r.get('url', '')
        if any(bl in url for bl in BL_WORDS):
            continue
        t = r.get('title', '')
        c = r.get('content', '')
        full = f"{t} {c}"
        # Reject generic/role mailboxes and blacklisted domains. Built once
        # per result (the original rebuilt this list once per e-mail).
        email_bl = BL_WORDS + ['noreply', 'contact@', 'info@', 'admin@']
        # sorted() makes the pick deterministic: sets iterate in hash order,
        # so `emails[0]` varied between runs under hash randomization.
        emails = [e for e in sorted(set(email_re.findall(full.lower())))
                  if not any(bl in e for bl in email_bl)]
        phones = sorted(set(phone_re.findall(full)))
        name = None
        for pat in name_pats:
            m = pat.search(t)
            if m and is_valid_name(m.group(1)):
                name = m.group(1)
                break
        if not emails and not name:
            continue  # nothing actionable in this result
        comp = None
        # Company name often trails a separator in the title.
        for kw in [' chez ', ' at ', ' - ', '| ']:
            if kw in t:
                comp = t.split(kw)[-1].strip()[:100]
                break
        leads.append({'name': name, 'email': emails[0] if emails else None,
                      'phone': phones[0] if phones else None, 'company': comp,
                      'url': url, 'job': role, 'sector': sector})
    return leads
|
|
|
|
# ---- Main scrape loop -------------------------------------------------
# Side effects: inserts into admin.linkedin_leads / admin.weval_leads and
# prints progress to stdout. Unknown country codes fall back to Morocco.
# NOTE(review): `added`/`skipped` are cumulative across the whole run, so
# the per-city line below reports running totals, not per-city counts.
cities = CITIES.get(country, CITIES['MA'])
for city in cities:
    for sector in SECTORS:
        # Only the first 5 roles are queried, bounding request volume.
        for role in ROLES[:5]:
            # LinkedIn search
            q = f'site:linkedin.com/in "{role}" "{city}"'
            for lead in extract_linkedin(sx(q)):
                # Dedupe on profile URL before inserting.
                cur.execute("SELECT 1 FROM admin.linkedin_leads WHERE lead_linkedin_url=%s",(lead['url'],))
                if cur.fetchone(): skipped += 1; continue
                # Infer industry from snippet/job/company text; fall back to
                # the sector that generated the query (`ind or sector`).
                ind = ''
                for kw in SECTORS:
                    if kw.lower() in (lead['content']+lead['job']+lead['company']).lower(): ind = kw; break
                cur.execute("""INSERT INTO admin.linkedin_leads
                    (lead_name,lead_company,lead_title,lead_industry,lead_seniority,
                    lead_linkedin_url,lead_country,lead_city,captured_at)
                    VALUES (%s,%s,%s,%s,'senior',%s,%s,%s,NOW())""",
                    (lead['name'],lead['company'],lead['job'],ind or sector,
                    lead['url'],country,city))
                # Commit per row so progress survives a mid-run crash.
                conn.commit(); added += 1
                print(f"+LI {lead['name']} @{lead['company']} [{lead['job']}]")

            # B2B email search
            q2 = f'"{role}" "{sector}" "{city}" email @gmail OR @yahoo'
            for lead in extract_b2b(sx(q2), city, role, sector):
                # Dedupe by e-mail when present, otherwise by contact name;
                # leads with neither are dropped.
                if lead.get('email'):
                    cur.execute("SELECT 1 FROM admin.weval_leads WHERE email=%s",(lead['email'],))
                    if cur.fetchone(): skipped += 1; continue
                elif lead.get('name'):
                    cur.execute("SELECT 1 FROM admin.weval_leads WHERE contact_name=%s",(lead['name'],))
                    if cur.fetchone(): skipped += 1; continue
                else: continue
                cur.execute("""INSERT INTO admin.weval_leads
                    (contact_name,email,phone,company_name,contact_title,industry,city,country,source,created_at)
                    VALUES (%s,%s,%s,%s,%s,%s,%s,%s,'searxng_b2b_v2',NOW())""",
                    (lead.get('name'),lead.get('email'),lead.get('phone'),
                    lead.get('company'),lead.get('job'),lead.get('sector'),city,country))
                conn.commit(); added += 1
                print(f"+B2B {lead.get('name','?')} {lead.get('email','')} @{lead.get('company','?')}")
            # Throttle between query batches to be polite to SearXNG.
            time.sleep(1)
    print(f"[{city}] +{added} total, {skipped} skipped")

cur.close(); conn.close()
print(f"\nFINAL {country}: +{added} leads, {skipped} dupes")
|