55 lines
2.5 KiB
Python
55 lines
2.5 KiB
Python
#!/usr/bin/env python3
|
|
"""WEVAL LinkedIn Scraper v1.1 - SearXNG LinkedIn discovery"""
|
|
import os
import re
import sys
import time

import psycopg2
import requests
|
|
|
|
# Connection settings for the leads database.  Every value can be overridden
# with a WEVAL_DB_* environment variable so credentials do not have to live in
# the source; the previously hard-coded values remain the defaults.
DB = dict(
    host=os.environ.get('WEVAL_DB_HOST', '10.1.0.3'),
    dbname=os.environ.get('WEVAL_DB_NAME', 'adx_system'),
    user=os.environ.get('WEVAL_DB_USER', 'admin'),
    password=os.environ.get('WEVAL_DB_PASSWORD', 'admin123'),
)

# Search endpoint queried with format=json by sx().
SEARX = "http://localhost:8888/search"

# Country code taken from the first CLI argument (default 'MA').  It is
# stripped and upper-cased so that e.g. "dz" selects the 'DZ' entry of CITIES
# instead of missing the lookup.
country = sys.argv[1].strip().upper() if len(sys.argv) > 1 else 'MA'

# Cities searched for each supported country code.
CITIES = {
    'MA': ['casablanca', 'rabat', 'tanger', 'marrakech'],
    'DZ': ['alger', 'oran', 'constantine', 'annaba'],
    'TN': ['tunis', 'sfax', 'sousse', 'bizerte'],
}

# Job titles combined with every city to build the search queries.
ROLES = [
    'DSI', 'directeur general', 'CTO', 'directeur informatique', 'responsable IT',
    'directeur achats', 'supply chain', 'DAF', 'DRH', 'SAP consultant',
    'ERP manager', 'cloud architect', 'CISO', 'data officer', 'CDO',
]

# Module-level connection and cursor shared by the scrape loop below.
conn = psycopg2.connect(**DB)
cur = conn.cursor()
added = 0  # number of rows inserted during this run, reported at the end
|
|
|
|
def sx(q):
    """Run one search query against the SEARX endpoint.

    Args:
        q: Query string sent as the ``q`` parameter.

    Returns:
        At most the first 10 entries of the ``results`` list found in the
        JSON response.  An empty list is returned when the request fails,
        the body is not valid JSON, or the payload carries no ``results``
        list; the search is best-effort and does not raise in those cases.
    """
    try:
        resp = requests.get(SEARX, params={'q': q, 'format': 'json'}, timeout=15)
        payload = resp.json()
    except (requests.RequestException, ValueError) as exc:
        # Only network/HTTP and JSON-decoding failures are swallowed, so
        # Ctrl-C and programming errors are no longer hidden by a bare except.
        print(f"sx: search failed for {q!r}: {exc}", file=sys.stderr)
        return []

    # Guard against unexpected payload shapes (e.g. a JSON list or a null
    # "results" field) instead of relying on an exception to bail out.
    results = payload.get('results') if isinstance(payload, dict) else None
    if not isinstance(results, list):
        return []
    return results[:10]
|
|
|
|
# Industry tags in priority order: the first keyword found wins.
_INDUSTRY_KEYWORDS = ['pharma', 'SAP', 'ERP', 'cloud', 'banque', 'assurance',
                      'telecom', 'energie', 'IT', 'cyber']


def _detect_industry(text):
    """Return the first industry keyword found in *text*, or '' if none."""
    for kw in _INDUSTRY_KEYWORDS:
        # Keywords must start on a word boundary, and all-caps acronyms must
        # also end on one.  The previous plain substring test made 'IT' match
        # nearly any text ("digital", "security", ...).
        pattern = r'\b' + re.escape(kw) + (r'\b' if kw.isupper() else '')
        if re.search(pattern, text, re.IGNORECASE):
            return kw
    return ''


def _parse_profile(result, role):
    """Split one search hit into ``(name, job, company, industry)``.

    The title is assumed to look like "Name - Job - Company | LinkedIn".
    A missing job falls back to the searched *role*; a missing company falls
    back to the text following " chez " / " at " in the result snippet.
    """
    # "or ''" also covers fields that are present but null in the JSON.
    title = (result.get('title') or '').replace(' | LinkedIn', '').replace(' - LinkedIn', '')
    snippet = result.get('content') or ''

    parts = title.split(' - ')
    name = parts[0].strip()
    job = parts[1].strip() if len(parts) > 1 else role
    comp = parts[2].strip() if len(parts) > 2 else ''
    if not comp:
        for marker in (' chez ', ' at '):
            if marker in snippet:
                # Keep the text after the last marker up to the first period,
                # capped at 100 characters.
                comp = snippet.split(marker)[-1].split('.')[0].strip()[:100]
                break

    # Join with spaces so words from different fields cannot fuse together.
    industry = _detect_industry(' '.join((snippet, title, comp)))
    return name, job, comp, industry


# An unknown country code falls back to the Moroccan city list.  Rows are
# stored under the country that was actually searched so they are not
# labelled with the unknown code.
search_country = country if country in CITIES else 'MA'
if search_country != country:
    print(f"unknown country {country!r}: searching {search_country} cities", file=sys.stderr)
cities = CITIES[search_country]

try:
    for city in cities:
        for role in ROLES:
            q = f'site:linkedin.com/in "{role}" "{city}"'
            for r in sx(q):
                u = r.get('url') or ''
                if 'linkedin.com/in/' not in u:
                    continue

                name, job, comp, ind = _parse_profile(r, role)
                # Require at least two words so single-token titles are not
                # stored as a person's name.
                if len(name.split()) < 2:
                    continue

                # Skip profiles whose URL is already in the table.
                cur.execute("SELECT 1 FROM admin.linkedin_leads WHERE lead_linkedin_url=%s", (u,))
                if cur.fetchone():
                    continue

                cur.execute("""INSERT INTO admin.linkedin_leads
                    (lead_name,lead_company,lead_title,lead_industry,lead_seniority,lead_linkedin_url,lead_country,lead_city,captured_at)
                    VALUES (%s,%s,%s,%s,'senior',%s,%s,%s,NOW())""",
                            (name, comp, job, ind, u, search_country, city))
                # Commit per row so progress survives an interrupted run.
                conn.commit()
                added += 1
                print(f"+LI {name} @{comp} [{job}] {u}")
            # Pause between search queries.
            time.sleep(1.5)
finally:
    # Release the cursor and connection even if a query or insert fails.
    cur.close()
    conn.close()

print(f"LINKEDIN_{country}: +{added} profiles")
|