64 lines
3.2 KiB
Python
64 lines
3.2 KiB
Python
#!/usr/bin/env python3
|
|
"""WEVAL B2B Scraper v1.1 - SearXNG lead gen for Maghreb"""
|
|
import os
import re
import sys
import time

import psycopg2
import requests
|
|
|
|
# PostgreSQL connection settings, passed as keyword arguments to
# psycopg2.connect(). Each value can be overridden through the environment so
# credentials do not have to live in the source tree; the literals are kept
# only as backward-compatible fallbacks.
DB = dict(
    host=os.environ.get('WEVAL_DB_HOST', '10.1.0.3'),
    dbname=os.environ.get('WEVAL_DB_NAME', 'adx_system'),
    user=os.environ.get('WEVAL_DB_USER', 'admin'),
    password=os.environ.get('WEVAL_DB_PASSWORD', 'admin123'),
)

# SearXNG search endpoint (queried with format=json).
SEARX = os.environ.get('WEVAL_SEARX_URL', "http://localhost:8888/search")

# Country code from the first CLI argument (default 'MA'). It is normalised to
# upper case so that e.g. "ma" selects the 'MA' entry of CITIES and is stored
# consistently in the database.
country = (sys.argv[1] if len(sys.argv) > 1 else 'MA').strip().upper()

# Cities used to build search queries, keyed by country code.
CITIES = {
    'MA': ['casablanca', 'rabat', 'tanger', 'marrakech', 'fes', 'agadir',
           'meknes', 'oujda', 'kenitra'],
    'DZ': ['alger', 'oran', 'constantine', 'annaba', 'blida', 'tlemcen',
           'setif', 'batna'],
    'TN': ['tunis', 'sfax', 'sousse', 'bizerte', 'gabes', 'kairouan',
           'monastir', 'nabeul'],
}

# Job titles placed (quoted) in the search query.
ROLES = [
    'directeur general', 'DSI', 'directeur informatique', 'DAF', 'CTO',
    'responsable IT', 'directeur achats', 'supply chain',
    'directeur commercial', 'DRH',
]

# Industry / technology keywords placed (quoted) in the search query.
SECTORS = [
    'SAP', 'ERP', 'cloud', 'pharma', 'manufacturing', 'banque', 'assurance',
    'telecom', 'energie', 'logistique',
]

# Blacklist: an extracted e-mail address containing any of these substrings
# is discarded.
BL = ['google', 'bing', 'example', 'facebook', 'wikipedia', 'youtube',
      'twitter', 'zhihu']

# One connection and cursor are shared by the whole run; `added` counts the
# leads inserted.
conn = psycopg2.connect(**DB)
cur = conn.cursor()
added = 0
|
|
|
|
def sx(q):
    """Run query *q* against the SearXNG endpoint and return its top results.

    Sends a GET request to ``SEARX`` with ``format=json`` and a 15 second
    timeout.

    Args:
        q: The search query string.

    Returns:
        A list with at most the first 10 entries of the response's
        ``results`` array. An empty list is returned when the request fails,
        the body is not valid JSON, or the payload does not have the expected
        shape; the search is best-effort and never raises for those cases.
    """
    try:
        r = requests.get(SEARX, params={'q': q, 'format': 'json'}, timeout=15)
        payload = r.json()
    except (requests.RequestException, ValueError) as exc:
        # Only network and JSON decoding errors are absorbed, so that
        # KeyboardInterrupt / SystemExit still stop the script.
        print(f"search failed for {q!r}: {exc}", file=sys.stderr)
        return []

    if not isinstance(payload, dict):
        return []
    results = payload.get('results')
    if not isinstance(results, list):
        return []
    return results[:10]
|
|
|
|
# Patterns are compiled once here instead of on every search result.
_EMAIL_RE = re.compile(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}')
_PHONE_RE = re.compile(r'(?:\+212|\+213|\+216|0[567])[\d\s.-]{8,12}')
# The phone pattern may swallow separators that follow the number.
_TRAILING_SEP_RE = re.compile(r'[\s.-]+$')
# Tried in order: "Firstname LASTNAME", then "Firstname Lastname".
_NAME_RES = [
    re.compile(r'([A-Z][a-z]+ [A-Z][A-Z]+)'),
    re.compile(r'([A-Z][a-z]+ [A-Z][a-z]+)'),
]
# Tried in order; the text after the last occurrence is taken as the company.
# "chez" / "at" must start a word so that titles such as "Rabat ..." or
# "Sanchez ..." do not trigger a bogus split.
_COMPANY_RES = [
    re.compile(r'\bchez '),
    re.compile(r'\bat '),
    re.compile(r'- '),
    re.compile(r'\| '),
]


def _parse_result(result):
    """Extract lead fields from one search result.

    Args:
        result: A result dict; its 'title', 'content' and 'url' entries are
            read, and a missing or null entry is treated as an empty string.

    Returns:
        ``None`` when neither a name nor an e-mail address could be found,
        otherwise the tuple ``(name, email, phone, company, linkedin_url)``
        in which every element may be ``None``.
    """
    title = result.get('title') or ''
    snippet = result.get('content') or ''
    url = result.get('url') or ''
    text = f"{title} {snippet}"

    # dict.fromkeys() removes duplicates while keeping first-seen order, so
    # the "first" e-mail / phone is the same on every run.
    emails = [
        e for e in dict.fromkeys(_EMAIL_RE.findall(text.lower()))
        if not any(bad in e for bad in BL)
    ]
    phones = list(dict.fromkeys(
        _TRAILING_SEP_RE.sub('', p) for p in _PHONE_RE.findall(text)
    ))

    name = None
    for pat in _NAME_RES:
        m = pat.search(title)
        if m:
            name = m.group(1)
            break

    if not name and not emails:
        return None

    company = None
    for pat in _COMPANY_RES:
        if pat.search(title):
            company = pat.split(title)[-1].strip()[:100]
            break

    em = emails[0] if emails else None
    ph = phones[0] if phones else None
    li = url if 'linkedin' in url else None
    return name, em, ph, company, li


def _is_known_lead(em, name, company):
    """Return True when admin.weval_leads already holds this lead.

    A lead with an e-mail address is matched on the address alone; otherwise
    it is matched on contact name plus company name. Callers guarantee that
    at least one of *em* and *name* is set.
    """
    if em:
        cur.execute("SELECT 1 FROM admin.weval_leads WHERE email=%s", (em,))
    else:
        # The INSERT stores a missing company as NULL, so NULL and '' must
        # compare equal here or name-only leads would be re-inserted on
        # every run.
        cur.execute(
            "SELECT 1 FROM admin.weval_leads "
            "WHERE contact_name=%s AND COALESCE(company_name,'')=%s",
            (name, company or ''),
        )
    return cur.fetchone() is not None


cities = CITIES.get(country)
if cities is None:
    # Unknown country codes keep the historical fallback to the MA city list;
    # the leads are still stored with the country code that was given.
    print(f"unknown country {country!r}: using MA city list", file=sys.stderr)
    cities = CITIES['MA']

try:
    for city in cities:
        for sector in SECTORS:
            # Only the first five roles are queried.
            for role in ROLES[:5]:
                q = f'"{role}" "{sector}" "{city}" email'
                for r in sx(q):
                    lead = _parse_result(r)
                    if lead is None:
                        continue
                    name, em, ph, company, li = lead
                    if _is_known_lead(em, name, company):
                        continue
                    cur.execute(
                        """INSERT INTO admin.weval_leads
                        (contact_name,email,phone,company_name,contact_title,industry,city,country,linkedin_url,source,created_at)
                        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,'searxng_b2b',NOW())""",
                        (name, em, ph, company, role, sector, city, country, li),
                    )
                    # Commit per lead so an interrupted run keeps what it found.
                    conn.commit()
                    added += 1
                    print(f"+LEAD {name or '?'} @{company or '?'} [{sector}/{role}] {em or ''}")
                # NOTE(review): the original indentation was lost; this pause
                # is assumed to throttle one search query per second - confirm.
                time.sleep(1)
finally:
    # Release the cursor and connection even when the run is interrupted or
    # a database error aborts it.
    cur.close()
    conn.close()

print(f"B2B_{country}: +{added} leads")
|