Files
html/api/weval_linkedin_scraper.py
2026-04-12 22:57:03 +02:00

55 lines
2.5 KiB
Python

#!/usr/bin/env python3
"""WEVAL LinkedIn Scraper v1.1 - SearXNG LinkedIn discovery"""
import re, sys, time, psycopg2, requests
# Connection settings passed to psycopg2.connect() for the database that
# stores discovered leads.
# NOTE(review): credentials are hard-coded in source; consider moving them to
# environment variables or a secrets store.
DB = dict(host='10.1.0.3', dbname='adx_system', user='admin', password='admin123')

# JSON search endpoint of the SearXNG instance queried by sx().
SEARX = "http://localhost:8888/search"

# Cities to search, keyed by two-letter country code.
CITIES = {
    'MA': ['casablanca', 'rabat', 'tanger', 'marrakech'],
    'DZ': ['alger', 'oran', 'constantine', 'annaba'],
    'TN': ['tunis', 'sfax', 'sousse', 'bizerte'],
}

# Job titles / functions combined with each city to build search queries.
ROLES = [
    'DSI', 'directeur general', 'CTO', 'directeur informatique', 'responsable IT',
    'directeur achats', 'supply chain', 'DAF', 'DRH', 'SAP consultant',
    'ERP manager', 'cloud architect', 'CISO', 'data officer', 'CDO',
]

# Country code from the first CLI argument (default 'MA').
# The code is normalised and validated here because it is later stored with
# every lead: an unrecognised or lower-case code used to fall back to the
# 'MA' city list while still being recorded as the lead's country.
country = sys.argv[1].strip().upper() if len(sys.argv) > 1 else 'MA'
if country not in CITIES:
    print(f"Unknown country code {country!r}; falling back to 'MA'", file=sys.stderr)
    country = 'MA'

# Single connection/cursor shared by the whole run; `added` counts inserted leads.
conn = psycopg2.connect(**DB)
cur = conn.cursor()
added = 0
def sx(q):
    """Run query *q* against the SearXNG endpoint and return up to 10 results.

    The search is best-effort: any network failure, HTTP error status,
    non-JSON body or unexpectedly shaped payload yields an empty list
    instead of raising, so one failed query does not abort the whole run.

    Args:
        q: Search query string sent as the ``q`` parameter.

    Returns:
        A list of at most 10 result entries taken from the ``results`` key
        of the JSON response, or ``[]`` on any failure.
    """
    try:
        r = requests.get(SEARX, params={'q': q, 'format': 'json'}, timeout=15)
        r.raise_for_status()
        payload = r.json()
    # Only swallow request/decoding failures; a bare ``except`` would also
    # trap KeyboardInterrupt and SystemExit, making the script hard to stop.
    except (requests.RequestException, ValueError):
        return []
    if not isinstance(payload, dict):
        return []
    results = payload.get('results')
    return results[:10] if isinstance(results, list) else []
# Cities to search for the selected country; unknown codes use the 'MA' list.
cities = CITIES.get(country, CITIES['MA'])

# Industry keywords in priority order (first match wins), compiled as
# whole-word, case-insensitive patterns. Plain substring matching tagged
# unrelated text: 'it' matches "digital", 'erp' matches "enterprise".
# NOTE(review): 'IT' still matches the English pronoun "it" in result text.
_INDUSTRY_PATTERNS = [
    (kw, re.compile(r'\b' + re.escape(kw) + r'\b', re.IGNORECASE))
    for kw in ['pharma', 'SAP', 'ERP', 'cloud', 'banque', 'assurance',
               'telecom', 'energie', 'IT', 'cyber']
]

try:
    for city in cities:
        for role in ROLES:
            q = f'site:linkedin.com/in "{role}" "{city}"'
            for r in sx(q):
                # Guard against fields that are missing or null in a result.
                u = r.get('url') or ''
                if 'linkedin.com/in/' not in u:
                    continue
                t = (r.get('title') or '').replace(' | LinkedIn', '').replace(' - LinkedIn', '')
                c = r.get('content') or ''

                # The title is split on ' - ' into name, job and company
                # (assumes titles shaped like "Name - Job - Company").
                parts = t.split(' - ')
                name = parts[0].strip()
                # Skip results whose first segment is not at least two words.
                if len(name.split()) < 2:
                    continue
                job = parts[1].strip() if len(parts) > 1 else role
                comp = parts[2].strip() if len(parts) > 2 else ''
                if not comp:
                    # Fall back to the text after "chez"/"at" in the snippet,
                    # cut at the first '.' and capped at 100 characters.
                    for kw in (' chez ', ' at '):
                        if kw in c:
                            comp = c.split(kw)[-1].split('.')[0].strip()[:100]
                            break

                # Skip profiles whose URL is already stored.
                cur.execute("SELECT 1 FROM admin.linkedin_leads WHERE lead_linkedin_url=%s", (u,))
                if cur.fetchone():
                    continue

                # Join with spaces so words from adjacent fields do not fuse
                # and hide a word boundary.
                haystack = ' '.join((c, t, comp))
                ind = next((kw for kw, pat in _INDUSTRY_PATTERNS if pat.search(haystack)), '')

                cur.execute("""INSERT INTO admin.linkedin_leads
(lead_name,lead_company,lead_title,lead_industry,lead_seniority,lead_linkedin_url,lead_country,lead_city,captured_at)
VALUES (%s,%s,%s,%s,'senior',%s,%s,%s,NOW())""",
                            (name, comp, job, ind, u, country, city))
                # Commit per lead so earlier inserts survive a later failure.
                conn.commit()
                added += 1
                print(f"+LI {name} @{comp} [{job}] {u}")
            # Pause between search queries.
            time.sleep(1.5)
finally:
    # Release the cursor and connection even when a query or insert raises.
    cur.close()
    conn.close()
print(f"LINKEDIN_{country}: +{added} profiles")