Files
html/api/weval_b2b_v2.py
2026-04-12 22:57:03 +02:00

133 lines
6.3 KiB
Python

#!/usr/bin/env python3
"""WEVAL B2B Scraper v2.0 - LinkedIn-focused via SearXNG
Quality-first: only insert verified professional profiles"""
import re, sys, time, psycopg2, requests, json
# NOTE(review): DB credentials are hardcoded in source — move to env vars / config.
DB = dict(host='10.1.0.3', dbname='adx_system', user='admin', password='admin123')
# Local SearXNG instance, queried with format=json by sx() below.
SEARX = "http://localhost:8888/search"
# Target country code from argv[1]; defaults to Morocco.
country = sys.argv[1] if len(sys.argv) > 1 else 'MA'
# Cities searched per country code (MA=Morocco, DZ=Algeria, TN=Tunisia).
CITIES = {
    'MA': ['casablanca','rabat','tanger','marrakech','fes','agadir'],
    'DZ': ['alger','oran','constantine','annaba','blida'],
    'TN': ['tunis','sfax','sousse','bizerte','gabes']
}
# NOTE(review): these query templates appear unused — the main loop builds its
# queries inline (see the LinkedIn/B2B f-strings below). Confirm before removing.
QUERIES = [
    'site:linkedin.com/in "{role}" "{city}" "{sector}"',
    'site:linkedin.com/company "{sector}" "{city}"',
    'site:kompass.com "{sector}" "{city}"',
    'site:charika.ma "{sector}"',
    'site:rekrute.com "{role}" "{sector}"',
    '"{role}" "{sector}" "{city}" email @gmail OR @yahoo OR @hotmail',
]
# Decision-maker job titles searched for (French market wording).
ROLES = ['directeur general','DSI','CTO','DAF','directeur informatique',
    'responsable IT','directeur achats','DRH','directeur commercial']
# Industry keywords: used both to build queries and to tag leads.
SECTORS = ['SAP','ERP','cloud','pharma','logistique','banque','telecom','energie']
# Blacklist: substrings that disqualify a URL, a name token, or an email.
BL_WORDS = ['wikipedia','zhihu','baidu','google.com','bing.com','facebook.com',
    'youtube.com','reddit.com','twitter.com','quora.com','adobe','admob',
    'larousse','dictionnaire','definition']
# Single shared connection/cursor; each insert is committed individually below.
conn = psycopg2.connect(**DB); cur = conn.cursor(); added = 0; skipped = 0
def sx(q):
    """Run query *q* against the local SearXNG JSON API.

    Returns at most the first 10 result dicts, or [] on any failure
    (network error, HTTP error status, unparseable JSON).
    """
    try:
        r = requests.get(SEARX, params={'q': q, 'format': 'json'}, timeout=15)
        r.raise_for_status()  # surface HTTP errors instead of parsing an error page
        return r.json().get('results', [])[:10]
    except Exception:
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt — making the scraper impossible to Ctrl-C
        # out of mid-request. Exception keeps the best-effort behavior
        # while letting interrupts propagate.
        return []
def is_valid_name(name):
    """Heuristic filter: True only for plausible multi-word personal names.

    Rejects empty/short strings, URL-ish or symbol-bearing strings, CJK
    ideographs, single words, very short leading words, and anything whose
    words hit the BL_WORDS blacklist.
    """
    if not name or len(name) < 4:
        return False
    forbidden = ('http', 'www', '@', '#', '$')
    if any(token in name for token in forbidden):
        return False
    if re.search(r'[\u4e00-\u9fff]', name):  # No Chinese
        return False
    words = name.strip().split()
    if len(words) < 2:
        return False
    # First and last name must each be at least two characters.
    if any(len(w) < 2 for w in words[:2]):
        return False
    return not any(w.lower() in BL_WORDS for w in words)
def extract_linkedin(results):
    """Convert SearXNG hits into LinkedIn profile leads.

    Keeps only linkedin.com/in/ URLs whose title yields a valid person name;
    title is assumed to be "Name - Job - Company" (LinkedIn convention).
    Returns a list of dicts with name/job/company/url/content keys.
    """
    leads = []
    for hit in results:
        url = hit.get('url', '')
        if 'linkedin.com/in/' not in url:
            continue
        title = hit.get('title', '').replace(' | LinkedIn', '').replace(' - LinkedIn', '').strip()
        snippet = hit.get('content', '')
        pieces = [p.strip() for p in title.split(' - ') if p.strip()]
        name = pieces[0] if pieces else ''
        if not is_valid_name(name):
            continue
        job = pieces[1] if len(pieces) > 1 else ''
        company = pieces[2] if len(pieces) > 2 else ''
        if not company:
            # Fall back to "... chez <Company>" style phrases in the snippet,
            # trimming at the first period or comma, capped at 100 chars.
            for marker in (' chez ', ' at ', ' dans '):
                if marker in snippet:
                    company = snippet.split(marker)[-1].split('.')[0].split(',')[0].strip()[:100]
                    break
        leads.append({'name': name, 'job': job, 'company': company,
                      'url': url, 'content': snippet})
    return leads
def extract_b2b(results, city, role, sector):
leads = []
for r in results:
url = r.get('url','')
if any(bl in url for bl in BL_WORDS): continue
t = r.get('title','')
c = r.get('content','')
full = f"{t} {c}"
emails = [e for e in set(re.findall(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}', full.lower()))
if not any(bl in e for bl in BL_WORDS+['noreply','contact@','info@','admin@'])]
phones = list(set(re.findall(r'(?:\+212|\+213|\+216|0[567])\d[\d\s.-]{7,11}', full)))
name = None
for pat in [r'([A-Z][a-z]+ [A-Z][a-z]+ [A-Z][a-z]+)', r'([A-Z][a-z]+ [A-Z][a-z]+)']:
m = re.search(pat, t)
if m and is_valid_name(m.group(1)): name = m.group(1); break
if not emails and not name: continue
comp = None
for kw in [' chez ',' at ',' - ','| ']:
if kw in t: comp = t.split(kw)[-1].strip()[:100]; break
leads.append({'name': name, 'email': emails[0] if emails else None,
'phone': phones[0] if phones else None, 'company': comp,
'url': url, 'job': role, 'sector': sector})
return leads
# Main scrape loop: city x sector x role, two searches per combination.
cities = CITIES.get(country, CITIES['MA'])  # unknown country codes fall back to Morocco
for city in cities:
    for sector in SECTORS:
        for role in ROLES[:5]:  # only the first 5 roles, to bound query volume
            # LinkedIn search
            q = f'site:linkedin.com/in "{role}" "{city}"'
            for lead in extract_linkedin(sx(q)):
                # Dedupe on the profile URL before inserting.
                cur.execute("SELECT 1 FROM admin.linkedin_leads WHERE lead_linkedin_url=%s",(lead['url'],))
                if cur.fetchone(): skipped += 1; continue
                # Infer industry: first SECTORS keyword found in snippet+job+company,
                # falling back to the sector this query was built from.
                ind = ''
                for kw in SECTORS:
                    if kw.lower() in (lead['content']+lead['job']+lead['company']).lower(): ind = kw; break
                cur.execute("""INSERT INTO admin.linkedin_leads
                    (lead_name,lead_company,lead_title,lead_industry,lead_seniority,
                    lead_linkedin_url,lead_country,lead_city,captured_at)
                    VALUES (%s,%s,%s,%s,'senior',%s,%s,%s,NOW())""",
                    (lead['name'],lead['company'],lead['job'],ind or sector,
                    lead['url'],country,city))
                conn.commit(); added += 1  # commit per row: partial progress survives a crash
                print(f"+LI {lead['name']} @{lead['company']} [{lead['job']}]")
            # B2B email search
            q2 = f'"{role}" "{sector}" "{city}" email @gmail OR @yahoo'
            for lead in extract_b2b(sx(q2), city, role, sector):
                # Dedupe by email when present, else by contact name;
                # leads with neither are discarded.
                if lead.get('email'):
                    cur.execute("SELECT 1 FROM admin.weval_leads WHERE email=%s",(lead['email'],))
                    if cur.fetchone(): skipped += 1; continue
                elif lead.get('name'):
                    cur.execute("SELECT 1 FROM admin.weval_leads WHERE contact_name=%s",(lead['name'],))
                    if cur.fetchone(): skipped += 1; continue
                else: continue
                cur.execute("""INSERT INTO admin.weval_leads
                    (contact_name,email,phone,company_name,contact_title,industry,city,country,source,created_at)
                    VALUES (%s,%s,%s,%s,%s,%s,%s,%s,'searxng_b2b_v2',NOW())""",
                    (lead.get('name'),lead.get('email'),lead.get('phone'),
                    lead.get('company'),lead.get('job'),lead.get('sector'),city,country))
                conn.commit(); added += 1
                print(f"+B2B {lead.get('name','?')} {lead.get('email','')} @{lead.get('company','?')}")
            time.sleep(1)  # throttle requests to the SearXNG instance
    # NOTE(review): counters are cumulative across cities, so this per-city
    # line reports running totals, not this city's contribution.
    print(f"[{city}] +{added} total, {skipped} skipped")
cur.close(); conn.close()
print(f"\nFINAL {country}: +{added} leads, {skipped} dupes")