133 lines
6.3 KiB
Python
133 lines
6.3 KiB
Python
#!/usr/bin/env python3
"""WEVAL B2B Scraper v2.0 - LinkedIn-focused via SearXNG

Quality-first: only insert verified professional profiles"""
import re
import sys
import time
import json

import psycopg2
import requests

# SECURITY NOTE(review): DB credentials are hardcoded in source. Move them to
# environment variables or a secrets store before sharing or deploying.
DB = dict(host='10.1.0.3', dbname='adx_system', user='admin', password='admin123')

# Local SearXNG metasearch instance; all queries go through its JSON API.
SEARX = "http://localhost:8888/search"

# Target country (ISO alpha-2) from the first CLI argument; defaults to Morocco.
country = sys.argv[1] if len(sys.argv) > 1 else 'MA'

# Cities to sweep per supported country (Morocco, Algeria, Tunisia).
CITIES = {
    'MA': ['casablanca', 'rabat', 'tanger', 'marrakech', 'fes', 'agadir'],
    'DZ': ['alger', 'oran', 'constantine', 'annaba', 'blida'],
    'TN': ['tunis', 'sfax', 'sousse', 'bizerte', 'gabes']
}

# Query templates. NOTE(review): currently unused — the main loop builds its
# queries inline with f-strings. Kept for reference / future expansion.
QUERIES = [
    'site:linkedin.com/in "{role}" "{city}" "{sector}"',
    'site:linkedin.com/company "{sector}" "{city}"',
    'site:kompass.com "{sector}" "{city}"',
    'site:charika.ma "{sector}"',
    'site:rekrute.com "{role}" "{sector}"',
    '"{role}" "{sector}" "{city}" email @gmail OR @yahoo OR @hotmail',
]

# Decision-maker job titles targeted by the searches (only the first 5 are
# actually queried by the main loop).
ROLES = ['directeur general', 'DSI', 'CTO', 'DAF', 'directeur informatique',
         'responsable IT', 'directeur achats', 'DRH', 'directeur commercial']

# Business sectors of interest; also used to infer a lead's industry.
SECTORS = ['SAP', 'ERP', 'cloud', 'pharma', 'logistique', 'banque', 'telecom', 'energie']

# Substrings that disqualify a URL, e-mail, or name token as a real lead.
BL_WORDS = ['wikipedia', 'zhihu', 'baidu', 'google.com', 'bing.com', 'facebook.com',
            'youtube.com', 'reddit.com', 'twitter.com', 'quora.com', 'adobe', 'admob',
            'larousse', 'dictionnaire', 'definition']

# Shared DB connection/cursor and run counters, mutated by the main loop below.
conn = psycopg2.connect(**DB)
cur = conn.cursor()
added = 0
skipped = 0
|
|
|
|
def sx(q):
    """Run query *q* against the local SearXNG instance.

    Returns at most the first 10 result dicts from the JSON response,
    or [] on any network or decoding failure — the scraper is
    best-effort and simply moves on to the next query.
    """
    try:
        r = requests.get(SEARX, params={'q': q, 'format': 'json'}, timeout=15)
        return r.json().get('results', [])[:10]
    except (requests.RequestException, ValueError):
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are
        # no longer swallowed. ValueError covers JSON decode failures
        # (requests' JSONDecodeError subclasses it).
        return []
|
|
|
|
def is_valid_name(name):
    """Heuristic: True only if *name* plausibly is a real personal name.

    Rejects empty/short strings, URL-ish or symbol-bearing text, CJK
    ideographs, single-word names, initials, and blacklisted words.
    """
    if not name or len(name) < 4:
        return False
    for marker in ('http', 'www', '@', '#', '$'):
        if marker in name:
            return False
    if re.search(r'[\u4e00-\u9fff]', name):
        # CJK ideograph range — profiles in Chinese are out of scope.
        return False
    parts = name.strip().split()
    if len(parts) < 2:
        return False
    if min(len(p) for p in parts[:2]) < 2:
        # First and last name must both be at least two characters.
        return False
    return not any(p.lower() in BL_WORDS for p in parts)
|
|
|
|
def extract_linkedin(results):
    """Turn SearXNG results into LinkedIn profile leads.

    Only linkedin.com/in/ profile URLs whose title yields a plausible
    personal name are kept. Job and company come from the title's
    " - " segments; when the title has no company, it is recovered
    from a "chez/at/dans <Company>" phrase in the snippet.
    """
    leads = []
    for hit in results:
        profile_url = hit.get('url', '')
        if 'linkedin.com/in/' not in profile_url:
            continue
        title = hit.get('title', '')
        title = title.replace(' | LinkedIn', '').replace(' - LinkedIn', '').strip()
        snippet = hit.get('content', '')
        segments = [seg.strip() for seg in title.split(' - ') if seg.strip()]
        name = segments[0] if segments else ''
        if not is_valid_name(name):
            continue
        job = segments[1] if len(segments) > 1 else ''
        company = segments[2] if len(segments) > 2 else ''
        if not company:
            # Fall back to the snippet: take what follows the last
            # "chez"/"at"/"dans" up to the first '.' or ',' (max 100 chars).
            for marker in (' chez ', ' at ', ' dans '):
                if marker in snippet:
                    tail = snippet.split(marker)[-1]
                    company = tail.split('.')[0].split(',')[0].strip()[:100]
                    break
        leads.append({'name': name, 'job': job, 'company': company,
                      'url': profile_url, 'content': snippet})
    return leads
|
|
|
|
def extract_b2b(results, city, role, sector):
    """Extract B2B contact leads (email / phone / name) from search results.

    Args:
        results: SearXNG result dicts with 'url', 'title', 'content' keys.
        city: target city. NOTE(review): not used for filtering here —
              kept for interface stability (callers pass it positionally).
        role: job role the query targeted; stored as the lead's 'job'.
        sector: business sector; stored as the lead's 'sector'.

    Returns:
        List of lead dicts. A result is kept only when it yields at least
        one acceptable e-mail or a plausible personal name.
    """
    # Hoisted out of the loop: compile the patterns once per call instead of
    # re-resolving them for every result.
    email_re = re.compile(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}')
    # Maghreb numbers: +212/+213/+216 country codes or local 05/06/07 prefixes.
    phone_re = re.compile(r'(?:\+212|\+213|\+216|0[567])\d[\d\s.-]{7,11}')
    # Prefer a three-word capitalized name over a two-word one.
    name_pats = [re.compile(r'([A-Z][a-z]+ [A-Z][a-z]+ [A-Z][a-z]+)'),
                 re.compile(r'([A-Z][a-z]+ [A-Z][a-z]+)')]
    leads = []
    for r in results:
        url = r.get('url', '')
        if any(bl in url for bl in BL_WORDS):
            continue
        t = r.get('title', '')
        c = r.get('content', '')
        full = f"{t} {c}"
        # Reject generic/role mailboxes and blacklisted domains. Built once
        # per result (the original rebuilt this list once per e-mail).
        email_bl = BL_WORDS + ['noreply', 'contact@', 'info@', 'admin@']
        # sorted() makes the pick deterministic: sets iterate in hash order,
        # so `emails[0]` varied between runs under hash randomization.
        emails = [e for e in sorted(set(email_re.findall(full.lower())))
                  if not any(bl in e for bl in email_bl)]
        phones = sorted(set(phone_re.findall(full)))
        name = None
        for pat in name_pats:
            m = pat.search(t)
            if m and is_valid_name(m.group(1)):
                name = m.group(1)
                break
        if not emails and not name:
            continue  # nothing actionable in this result
        comp = None
        # Company name often trails a separator in the title.
        for kw in [' chez ', ' at ', ' - ', '| ']:
            if kw in t:
                comp = t.split(kw)[-1].strip()[:100]
                break
        leads.append({'name': name, 'email': emails[0] if emails else None,
                      'phone': phones[0] if phones else None, 'company': comp,
                      'url': url, 'job': role, 'sector': sector})
    return leads
|
|
|
|
# ---- Main scrape loop -------------------------------------------------
# Side effects: inserts into admin.linkedin_leads / admin.weval_leads and
# prints progress to stdout. Unknown country codes fall back to Morocco.
# NOTE(review): `added`/`skipped` are cumulative across the whole run, so
# the per-city line below reports running totals, not per-city counts.
cities = CITIES.get(country, CITIES['MA'])
for city in cities:
    for sector in SECTORS:
        # Only the first 5 roles are queried, bounding request volume.
        for role in ROLES[:5]:
            # LinkedIn search
            q = f'site:linkedin.com/in "{role}" "{city}"'
            for lead in extract_linkedin(sx(q)):
                # Dedupe on profile URL before inserting.
                cur.execute("SELECT 1 FROM admin.linkedin_leads WHERE lead_linkedin_url=%s",(lead['url'],))
                if cur.fetchone(): skipped += 1; continue
                # Infer industry from snippet/job/company text; fall back to
                # the sector that generated the query (`ind or sector`).
                ind = ''
                for kw in SECTORS:
                    if kw.lower() in (lead['content']+lead['job']+lead['company']).lower(): ind = kw; break
                cur.execute("""INSERT INTO admin.linkedin_leads
                    (lead_name,lead_company,lead_title,lead_industry,lead_seniority,
                    lead_linkedin_url,lead_country,lead_city,captured_at)
                    VALUES (%s,%s,%s,%s,'senior',%s,%s,%s,NOW())""",
                    (lead['name'],lead['company'],lead['job'],ind or sector,
                    lead['url'],country,city))
                # Commit per row so progress survives a mid-run crash.
                conn.commit(); added += 1
                print(f"+LI {lead['name']} @{lead['company']} [{lead['job']}]")

            # B2B email search
            q2 = f'"{role}" "{sector}" "{city}" email @gmail OR @yahoo'
            for lead in extract_b2b(sx(q2), city, role, sector):
                # Dedupe by e-mail when present, otherwise by contact name;
                # leads with neither are dropped.
                if lead.get('email'):
                    cur.execute("SELECT 1 FROM admin.weval_leads WHERE email=%s",(lead['email'],))
                    if cur.fetchone(): skipped += 1; continue
                elif lead.get('name'):
                    cur.execute("SELECT 1 FROM admin.weval_leads WHERE contact_name=%s",(lead['name'],))
                    if cur.fetchone(): skipped += 1; continue
                else: continue
                cur.execute("""INSERT INTO admin.weval_leads
                    (contact_name,email,phone,company_name,contact_title,industry,city,country,source,created_at)
                    VALUES (%s,%s,%s,%s,%s,%s,%s,%s,'searxng_b2b_v2',NOW())""",
                    (lead.get('name'),lead.get('email'),lead.get('phone'),
                    lead.get('company'),lead.get('job'),lead.get('sector'),city,country))
                conn.commit(); added += 1
                print(f"+B2B {lead.get('name','?')} {lead.get('email','')} @{lead.get('company','?')}")
            # Throttle between query batches to be polite to SearXNG.
            time.sleep(1)
    print(f"[{city}] +{added} total, {skipped} skipped")

cur.close(); conn.close()
print(f"\nFINAL {country}: +{added} leads, {skipped} dupes")
|