#!/usr/bin/env python3
"""WEVAL B2B Scraper v2.0 - LinkedIn-focused via SearXNG
Quality-first: only insert verified professional profiles.

Usage: scraper.py [COUNTRY_CODE]   (defaults to 'MA'; known codes: MA, DZ, TN)

Pipeline: for each (city, sector, role) combination, run two SearXNG queries
(LinkedIn profile search + open-web email search), extract candidate leads,
de-duplicate against the DB, and insert row-by-row with immediate commits.
"""
import re
import sys
import time
import json  # NOTE(review): currently unused — kept in case another entry point relies on it
import psycopg2
import requests

# NOTE(review): credentials hard-coded in source — should come from env/config (security).
DB = dict(host='10.1.0.3', dbname='adx_system', user='admin', password='admin123')
SEARX = "http://localhost:8888/search"

# Target country from argv; falls back to Morocco.
country = sys.argv[1] if len(sys.argv) > 1 else 'MA'

CITIES = {
    'MA': ['casablanca', 'rabat', 'tanger', 'marrakech', 'fes', 'agadir'],
    'DZ': ['alger', 'oran', 'constantine', 'annaba', 'blida'],
    'TN': ['tunis', 'sfax', 'sousse', 'bizerte', 'gabes'],
}

# NOTE(review): QUERIES is never referenced below — the main loop builds its own
# query strings inline. Kept untouched in case an external tool reads it.
QUERIES = [
    'site:linkedin.com/in "{role}" "{city}" "{sector}"',
    'site:linkedin.com/company "{sector}" "{city}"',
    'site:kompass.com "{sector}" "{city}"',
    'site:charika.ma "{sector}"',
    'site:rekrute.com "{role}" "{sector}"',
    '"{role}" "{sector}" "{city}" email @gmail OR @yahoo OR @hotmail',
]

ROLES = ['directeur general', 'DSI', 'CTO', 'DAF', 'directeur informatique',
         'responsable IT', 'directeur achats', 'DRH', 'directeur commercial']
SECTORS = ['SAP', 'ERP', 'cloud', 'pharma', 'logistique', 'banque', 'telecom', 'energie']

# Blacklist: skip results/emails/names touching these tokens.
BL_WORDS = ['wikipedia', 'zhihu', 'baidu', 'google.com', 'bing.com', 'facebook.com',
            'youtube.com', 'reddit.com', 'twitter.com', 'quora.com', 'adobe', 'admob',
            'larousse', 'dictionnaire', 'definition']

# Compiled once: these patterns run for every search result (perf hoist).
_EMAIL_RE = re.compile(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}')
_PHONE_RE = re.compile(r'(?:\+212|\+213|\+216|0[567])\d[\d\s.-]{7,11}')
_CJK_RE = re.compile(r'[\u4e00-\u9fff]')
_NAME_PATS = [re.compile(r'([A-Z][a-z]+ [A-Z][a-z]+ [A-Z][a-z]+)'),
              re.compile(r'([A-Z][a-z]+ [A-Z][a-z]+)')]

conn = psycopg2.connect(**DB)
cur = conn.cursor()
added = 0
skipped = 0


def sx(q):
    """Query SearXNG for `q`; return up to 10 result dicts, or [] on any failure."""
    try:
        r = requests.get(SEARX, params={'q': q, 'format': 'json'}, timeout=15)
        r.raise_for_status()  # treat HTTP errors as "no results" instead of decoding garbage
        return r.json().get('results', [])[:10]
    except (requests.RequestException, ValueError):
        # ValueError covers json decode errors; best-effort: a failed search yields no leads.
        return []


def is_valid_name(name):
    """Heuristic filter: True only for plausible '<First> <Last> [...]' person names.

    Rejects: short strings, URL/symbol fragments, CJK characters, single tokens,
    tokens shorter than 2 chars in the first two positions, and blacklisted words.
    """
    if not name or len(name) < 4:
        return False
    if any(c in name for c in ['http', 'www', '@', '#', '$']):
        return False
    if _CJK_RE.search(name):
        return False  # No Chinese
    parts = name.strip().split()
    if len(parts) < 2:
        return False
    if any(len(p) < 2 for p in parts[:2]):
        return False
    if any(p.lower() in BL_WORDS for p in parts):
        return False
    return True


def extract_linkedin(results):
    """Parse SearXNG results into LinkedIn profile leads.

    Titles look like 'Name - Job - Company | LinkedIn'; company is recovered
    from the snippet ('chez'/'at'/'dans') when the title omits it.
    Returns a list of dicts: name, job, company, url, content.
    """
    leads = []
    for r in results:
        url = r.get('url', '')
        if 'linkedin.com/in/' not in url:
            continue
        t = r.get('title', '').replace(' | LinkedIn', '').replace(' - LinkedIn', '').strip()
        c = r.get('content', '')
        parts = [p.strip() for p in t.split(' - ') if p.strip()]
        name = parts[0] if parts else ''
        if not is_valid_name(name):
            continue
        job = parts[1] if len(parts) > 1 else ''
        comp = parts[2] if len(parts) > 2 else ''
        if not comp:
            # Fall back to the snippet: text after 'chez'/'at'/'dans', clipped at
            # the first sentence/comma break, capped at 100 chars.
            for kw in [' chez ', ' at ', ' dans ']:
                if kw in c:
                    comp = c.split(kw)[-1].split('.')[0].split(',')[0].strip()[:100]
                    break
        leads.append({'name': name, 'job': job, 'company': comp, 'url': url, 'content': c})
    return leads


def extract_b2b(results, city, role, sector):
    """Parse open-web results into B2B leads carrying email/phone/name/company.

    A result is kept only if it yields at least an email or a valid person name.
    Returns a list of dicts: name, email, phone, company, url, job, sector.
    """
    leads = []
    for r in results:
        url = r.get('url', '')
        if any(bl in url for bl in BL_WORDS):
            continue
        t = r.get('title', '')
        c = r.get('content', '')
        full = f"{t} {c}"
        # Unique emails, minus blacklisted domains and generic mailbox prefixes.
        emails = [e for e in set(_EMAIL_RE.findall(full.lower()))
                  if not any(bl in e for bl in BL_WORDS + ['noreply', 'contact@', 'info@', 'admin@'])]
        # Maghreb phone formats: +212/+213/+216 international or 05/06/07 local.
        phones = list(set(_PHONE_RE.findall(full)))
        name = None
        # Prefer three-word names over two-word (tried in that order).
        for pat in _NAME_PATS:
            m = pat.search(t)
            if m and is_valid_name(m.group(1)):
                name = m.group(1)
                break
        if not emails and not name:
            continue
        comp = None
        for kw in [' chez ', ' at ', ' - ', '| ']:
            if kw in t:
                comp = t.split(kw)[-1].strip()[:100]
                break
        leads.append({'name': name,
                      'email': emails[0] if emails else None,
                      'phone': phones[0] if phones else None,
                      'company': comp, 'url': url, 'job': role, 'sector': sector})
    return leads


cities = CITIES.get(country, CITIES['MA'])
try:
    for city in cities:
        for sector in SECTORS:
            for role in ROLES[:5]:  # only the top-5 seniority roles per combination
                # LinkedIn search
                q = f'site:linkedin.com/in "{role}" "{city}"'
                for lead in extract_linkedin(sx(q)):
                    # De-dupe on profile URL.
                    cur.execute("SELECT 1 FROM admin.linkedin_leads WHERE lead_linkedin_url=%s",
                                (lead['url'],))
                    if cur.fetchone():
                        skipped += 1
                        continue
                    # Infer industry from the lead text; fall back to the query's sector.
                    ind = ''
                    for kw in SECTORS:
                        if kw.lower() in (lead['content'] + lead['job'] + lead['company']).lower():
                            ind = kw
                            break
                    cur.execute("""INSERT INTO admin.linkedin_leads
                        (lead_name,lead_company,lead_title,lead_industry,lead_seniority,
                         lead_linkedin_url,lead_country,lead_city,captured_at)
                        VALUES (%s,%s,%s,%s,'senior',%s,%s,%s,NOW())""",
                                (lead['name'], lead['company'], lead['job'], ind or sector,
                                 lead['url'], country, city))
                    conn.commit()  # commit per row: partial progress survives a crash
                    added += 1
                    print(f"+LI {lead['name']} @{lead['company']} [{lead['job']}]")

                # B2B email search
                q2 = f'"{role}" "{sector}" "{city}" email @gmail OR @yahoo'
                for lead in extract_b2b(sx(q2), city, role, sector):
                    # De-dupe on email first, then name; drop leads with neither.
                    if lead.get('email'):
                        cur.execute("SELECT 1 FROM admin.weval_leads WHERE email=%s",
                                    (lead['email'],))
                        if cur.fetchone():
                            skipped += 1
                            continue
                    elif lead.get('name'):
                        cur.execute("SELECT 1 FROM admin.weval_leads WHERE contact_name=%s",
                                    (lead['name'],))
                        if cur.fetchone():
                            skipped += 1
                            continue
                    else:
                        continue  # defensive: extract_b2b already guarantees email or name
                    cur.execute("""INSERT INTO admin.weval_leads
                        (contact_name,email,phone,company_name,contact_title,industry,city,country,source,created_at)
                        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,'searxng_b2b_v2',NOW())""",
                                (lead.get('name'), lead.get('email'), lead.get('phone'),
                                 lead.get('company'), lead.get('job'), lead.get('sector'),
                                 city, country))
                    conn.commit()
                    added += 1
                    print(f"+B2B {lead.get('name','?')} {lead.get('email','')} @{lead.get('company','?')}")

                time.sleep(1)  # be polite to the SearXNG instance
        print(f"[{city}] +{added} total, {skipped} skipped")
finally:
    # Always release DB resources, even if a search/insert iteration raises.
    cur.close()
    conn.close()

print(f"\nFINAL {country}: +{added} leads, {skipped} dupes")