Files
weval-consulting/api/weval_b2b_scraper.py

64 lines
3.2 KiB
Python

#!/usr/bin/env python3
"""WEVAL B2B Scraper v1.1 - SearXNG lead gen for Maghreb"""
import os
import re
import sys
import time

import psycopg2
import requests
# PostgreSQL connection settings. Each value may be overridden through the
# environment (WEVAL_DB_HOST / WEVAL_DB_NAME / WEVAL_DB_USER /
# WEVAL_DB_PASSWORD) so credentials need not live in the source tree; the
# defaults keep the previously hard-coded configuration working unchanged.
DB = dict(
    host=os.environ.get('WEVAL_DB_HOST', '10.1.0.3'),
    dbname=os.environ.get('WEVAL_DB_NAME', 'adx_system'),
    user=os.environ.get('WEVAL_DB_USER', 'admin'),
    password=os.environ.get('WEVAL_DB_PASSWORD', 'admin123'),
)

# SearXNG search endpoint; queried with format=json.
SEARX = "http://localhost:8888/search"

# Cities to search, keyed by two-letter country code.
CITIES = {
    'MA': ['casablanca', 'rabat', 'tanger', 'marrakech', 'fes', 'agadir',
           'meknes', 'oujda', 'kenitra'],
    'DZ': ['alger', 'oran', 'constantine', 'annaba', 'blida', 'tlemcen',
           'setif', 'batna'],
    'TN': ['tunis', 'sfax', 'sousse', 'bizerte', 'gabes', 'kairouan',
           'monastir', 'nabeul'],
}

# Contact titles used as the first quoted term of each search query.
ROLES = ['directeur general', 'DSI', 'directeur informatique', 'DAF', 'CTO',
         'responsable IT', 'directeur achats', 'supply chain',
         'directeur commercial', 'DRH']

# Industry keywords used as the second quoted term of each search query.
SECTORS = ['SAP', 'ERP', 'cloud', 'pharma', 'manufacturing', 'banque',
           'assurance', 'telecom', 'energie', 'logistique']

# Substrings that disqualify an extracted e-mail address.
BL = ['google', 'bing', 'example', 'facebook', 'wikipedia', 'youtube',
      'twitter', 'zhihu']

# Target country: first CLI argument, default 'MA'. The code is normalised to
# upper case and validated here so that leads are never stored under a
# country tag that does not match the cities actually searched.
country = sys.argv[1].strip().upper() if len(sys.argv) > 1 else 'MA'
if country not in CITIES:
    print(f"Unknown country {country!r}; falling back to 'MA'", file=sys.stderr)
    country = 'MA'
# Open the PostgreSQL connection and cursor used for the whole run, and
# start the counter of inserted leads at zero.
conn = psycopg2.connect(**DB)
cur = conn.cursor()
added = 0
def sx(q):
    """Query the SearXNG endpoint and return at most ten result dicts.

    Args:
        q: Search query string, sent as the ``q`` parameter.

    Returns:
        The dict entries among the first ten items of the response's
        ``results`` list. An empty list is returned when the request fails,
        the server answers with an HTTP error status, or the body is not a
        JSON object with a ``results`` list.
    """
    try:
        r = requests.get(SEARX, params={'q': q, 'format': 'json'}, timeout=15)
        r.raise_for_status()
        data = r.json()
    except (requests.RequestException, ValueError) as exc:
        # Best-effort search: a failed query yields no results instead of
        # aborting the run, but the failure is reported rather than hidden.
        # Only network/HTTP/JSON errors are caught, so Ctrl-C and genuine
        # programming errors still propagate.
        print(f"sx: search failed for {q!r}: {exc}", file=sys.stderr)
        return []
    results = data.get('results') if isinstance(data, dict) else None
    if not isinstance(results, list):
        return []
    return [item for item in results[:10] if isinstance(item, dict)]
# Patterns are compiled once; they are applied to every search result.
_EMAIL_RE = re.compile(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}')
_PHONE_RE = re.compile(r'(?:\+212|\+213|\+216|0[567])[\d\s.-]{8,12}')
_PHONE_TAIL_RE = re.compile(r'[\s.-]+$')
_NAME_RES = (
    re.compile(r'([A-Z][a-z]+ [A-Z][A-Z]+)'),  # "Firstname LASTNAME"
    re.compile(r'([A-Z][a-z]+ [A-Z][a-z]+)'),  # "Firstname Lastname"
)
# Title fragments after which the company name is taken, in priority order.
_COMPANY_MARKERS = ('chez ', 'at ', '- ', '| ')


def _unique(items):
    """Return *items* as a list without duplicates, keeping first-seen order."""
    return list(dict.fromkeys(items))


def _parse_result(result):
    """Extract lead fields from one search-result dict.

    Args:
        result: Mapping with optional ``title``, ``content`` and ``url`` keys.

    Returns:
        ``(name, email, phone, company, linkedin_url)``, where any element may
        be ``None``; or ``None`` when neither a name nor an e-mail was found.
    """
    # ``or ''`` also covers keys that are present but null in the JSON.
    title = result.get('title') or ''
    content = result.get('content') or ''
    url = result.get('url') or ''
    text = f"{title} {content}"

    # First-seen order makes the chosen e-mail/phone reproducible between
    # runs (indexing into a set picked an arbitrary one).
    emails = [e for e in _unique(_EMAIL_RE.findall(text.lower()))
              if not any(bad in e for bad in BL)]
    # The phone pattern may swallow trailing spaces/dots/dashes; drop them.
    phones = _unique(_PHONE_TAIL_RE.sub('', p) for p in _PHONE_RE.findall(text))

    name = None
    for pattern in _NAME_RES:
        match = pattern.search(title)
        if match:
            name = match.group(1)
            break
    if not name and not emails:
        return None

    company = None
    for marker in _COMPANY_MARKERS:
        if marker in title:
            # Text after the last occurrence of the marker, capped at 100 chars.
            company = title.split(marker)[-1].strip()[:100]
            break

    email = emails[0] if emails else None
    phone = phones[0] if phones else None
    linkedin = url if 'linkedin' in url else None
    return name, email, phone, company, linkedin


def _lead_exists(cursor, email, name, company):
    """Return True when an equivalent lead is already in admin.weval_leads.

    Leads with an e-mail are matched on the e-mail alone; otherwise on
    contact name plus company. COALESCE makes a missing company (stored as
    NULL by the insert below) compare equal to another missing company;
    a plain ``company_name = ''`` test never matches NULL rows.
    """
    if email:
        cursor.execute("SELECT 1 FROM admin.weval_leads WHERE email=%s", (email,))
    else:
        cursor.execute(
            "SELECT 1 FROM admin.weval_leads "
            "WHERE contact_name=%s AND COALESCE(company_name,'')=%s",
            (name, company or ''))
    return cursor.fetchone() is not None


# Search every city x sector x role combination (first five roles only) and
# insert each new lead found in the result titles/snippets.
cities = CITIES.get(country, CITIES['MA'])
try:
    for city in cities:
        for sector in SECTORS:
            for role in ROLES[:5]:
                q = f'"{role}" "{sector}" "{city}" email'
                for r in sx(q):
                    lead = _parse_result(r)
                    if lead is None:
                        continue
                    name, em, ph, company, li = lead
                    if _lead_exists(cur, em, name, company):
                        continue
                    cur.execute(
                        """INSERT INTO admin.weval_leads
                        (contact_name,email,phone,company_name,contact_title,industry,city,country,linkedin_url,source,created_at)
                        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,'searxng_b2b',NOW())""",
                        (name, em, ph, company, role, sector, city, country, li))
                    # Commit per lead so later duplicate checks see this row
                    # and an interrupted run keeps what it already found.
                    conn.commit()
                    added += 1
                    print(f"+LEAD {name or '?'} @{company or '?'} [{sector}/{role}] {em or ''}")
                # One-second pause after each query.
                # NOTE(review): placed once per query (not per result) —
                # confirm against the deployed script's indentation.
                time.sleep(1)
finally:
    # Release the database resources even if the run fails or is interrupted.
    cur.close()
    conn.close()
print(f"B2B_{country}: +{added} leads")