151 lines
7.4 KiB
Python
151 lines
7.4 KiB
Python
#!/usr/bin/env python3
"""LinkedIn B2B lead scraper via SearXNG (sovereign search).

No browser needed: pure HTTP, much faster than Playwright.
"""
|
||
import json
import os
import re
import sys
import time

import psycopg2
import requests
|
||
# Silence urllib3 warnings globally; the search request in this script is sent
# with verify=False, which would otherwise be reported as an insecure request
# whenever an HTTPS endpoint is used.
requests.packages.urllib3.disable_warnings()
|
||
|
||
# --- Runtime configuration ---------------------------------------------------
# Connection settings can be overridden through environment variables so that
# credentials do not have to live in the source file. The defaults are the
# values this script has always used, so existing deployments keep working.
# NOTE(review): the default password is still committed here for backward
# compatibility; set ADX_DB_PASSWORD in production and rotate the credential.
DB = dict(
    host=os.environ.get("ADX_DB_HOST", "10.1.0.3"),
    dbname=os.environ.get("ADX_DB_NAME", "adx_system"),
    user=os.environ.get("ADX_DB_USER", "admin"),
    password=os.environ.get("ADX_DB_PASSWORD", "admin123"),
)

# JSON search endpoint of the SearXNG instance.
SEARX = os.environ.get("SEARX_URL", "http://localhost:8888/search")


def parse_batch(argv, default=200):
    """Return the lead quota given as the first CLI argument.

    Args:
        argv: Argument vector in ``sys.argv`` layout (program name first).
        default: Value used when no argument is given or it is not an integer.

    Returns:
        The parsed integer, or ``default``. A non-numeric argument is reported
        on stderr instead of aborting the script with a traceback.
    """
    if len(argv) <= 1:
        return default
    try:
        return int(argv[1])
    except ValueError:
        print(
            f"invalid batch size {argv[1]!r}; using default {default}",
            file=sys.stderr,
        )
        return default


# Number of new leads after which no further search query is started.
batch = parse_batch(sys.argv)
|
||
|
||
# Search queries as (query text, ISO country code) pairs. They are executed in
# order until the batch quota is reached, so earlier entries have priority.
QUERIES = [
    # MA LinkedIn
    ("DSI casablanca linkedin", "MA"),
    ("directeur informatique rabat linkedin", "MA"),
    ("CTO maroc linkedin", "MA"),
    ("IT director casablanca linkedin", "MA"),
    ("ERP SAP manager maroc linkedin", "MA"),
    ("supply chain director maroc linkedin", "MA"),
    ("directeur general casablanca linkedin", "MA"),
    ("cloud architect maroc linkedin", "MA"),
    ("cybersecurity manager maroc linkedin", "MA"),
    ("transformation digitale maroc linkedin", "MA"),
    ("directeur financier casablanca linkedin", "MA"),
    ("directeur logistique maroc linkedin", "MA"),
    ("SAP consultant maroc linkedin", "MA"),
    ("data scientist maroc linkedin", "MA"),
    # TN LinkedIn
    ("DSI tunis linkedin", "TN"),
    ("directeur informatique tunisie linkedin", "TN"),
    ("CTO tunisia linkedin", "TN"),
    ("ERP manager tunisie linkedin", "TN"),
    ("IT director tunis linkedin", "TN"),
    ("directeur general tunisie linkedin", "TN"),
    # DZ LinkedIn
    ("DSI alger linkedin", "DZ"),
    ("directeur informatique algerie linkedin", "DZ"),
    ("CTO algeria linkedin", "DZ"),
    ("ERP SAP algerie linkedin", "DZ"),
    ("IT director alger linkedin", "DZ"),
    ("directeur general algerie linkedin", "DZ"),
    # B2B company leads
    ("entreprise informatique casablanca email directeur", "MA"),
    ("societe IT rabat directeur contact", "MA"),
    ("cabinet conseil IT maroc email", "MA"),
    ("SSII maroc casablanca directeur email", "MA"),
    ("integrateur SAP maroc email", "MA"),
    ("entreprise informatique tunis email", "TN"),
    ("societe conseil IT algerie email", "DZ"),
]
|
||
|
||
# Open the PostgreSQL connection and the single cursor used by the whole script.
conn = psycopg2.connect(**DB)
cur = conn.cursor()

# Make sure the B2B company-lead table exists before scraping starts.
# NOTE(review): admin.linkedin_leads is queried and written later in this
# script but is never created here; it is assumed to exist already.
cur.execute("""CREATE TABLE IF NOT EXISTS admin.weval_leads (
    id SERIAL PRIMARY KEY, company_name VARCHAR(255), contact_name VARCHAR(255),
    contact_title VARCHAR(255), email VARCHAR(255), phone VARCHAR(100),
    website VARCHAR(255), industry VARCHAR(100), country VARCHAR(10),
    city VARCHAR(100), source VARCHAR(50), linkedin_url VARCHAR(500),
    notes TEXT, created_at TIMESTAMP DEFAULT NOW())""")
conn.commit()
|
||
|
||
# ---------------------------------------------------------------------------
# Text-extraction helpers (pure functions, no I/O)
# ---------------------------------------------------------------------------

# TLD match is case-insensitive so "Info@Acme.MA" is recognised too.
EMAIL_RE = re.compile(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}', re.I)
# Addresses containing any of these fragments belong to search engines or
# social networks rather than to the lead itself.
EMAIL_BLOCKLIST = ('google', 'bing', 'example', 'facebook', 'wikipedia', 'linkedin')

# The look-behind keeps the pattern from starting in the middle of a number,
# which used to turn year ranges such as "2020 - 2023" into phone numbers.
PHONE_RE = re.compile(r'(?<!\d)(?:\+212|\+216|\+213|0)[0-9 .-]{8,14}')
# Shortest plausible number: a leading 0 followed by 8 digits.
MIN_PHONE_DIGITS = 9

# " - ... LinkedIn" / "| LinkedIn" tail of a result title. A dash only counts
# as a separator when preceded by whitespace, so "Jean-Pierre" stays intact.
LI_SUFFIX_RE = re.compile(r'(?:\s+[-–]|\s*\|).*(linkedin|profil).*', re.I)
# Text between the first separator and the trailing "LinkedIn" marker.
# Case-insensitive: the site spells itself "LinkedIn" with a capital I.
LI_JOB_RE = re.compile(r'\s[-–]\s*(.+?)\s*[-|–]\s*linkedin', re.I)
# "chez X" / "at X" / "pour X" / "@ X"; the words must start at a word
# boundary so that "format data" is not read as "at data".
COMPANY_RE = re.compile(r'(?:\b(?:chez|at|pour)|@)\s+([^.·,\n]{3,50})', re.I)
# Search-engine branding appended to a page title.
SEARCH_SUFFIX_RE = re.compile(r'\s*[-|].*(bing|google).*', re.I)


def compile_keywords(table, whole_word=False):
    """Compile ``(keyword, label)`` pairs into ``(pattern, label)`` pairs.

    Keywords always have to start at a word boundary, which prevents hits
    inside unrelated words ("cto" in "director", "erp" in "enterprise").
    With ``whole_word=True`` they must also end at one ("Fes" no longer
    matches "professionnel", "Tunis" no longer matches "Tunisie").
    """
    tail = r'\b' if whole_word else ''
    return [
        (re.compile(r'\b' + re.escape(keyword) + tail, re.I), label)
        for keyword, label in table
    ]


def first_label(text, compiled):
    """Return the label of the first pattern found in ``text``, else ``''``.

    Table order is significant: earlier entries win.
    """
    for pattern, label in compiled:
        if pattern.search(text):
            return label
    return ''


# One city list for both lead types (the B2B branch used to know fewer cities).
CITIES = compile_keywords(
    [(c, c) for c in ('Casablanca', 'Rabat', 'Marrakech', 'Fes', 'Tanger',
                      'Agadir', 'Meknes', 'Tunis', 'Sfax', 'Sousse', 'Alger',
                      'Oran', 'Constantine')],
    whole_word=True,
)

LI_INDUSTRIES = compile_keywords([
    ('sap', 'ERP/SAP'), ('erp', 'ERP/SAP'), ('cloud', 'Cloud'),
    ('cyber', 'Cybersecurity'), ('supply', 'Supply Chain'),
    ('dsi', 'IT Management'), ('cto', 'IT Management'),
    ('directeur informatique', 'IT Management'),
    ('it director', 'IT Management'), ('data', 'Data/Analytics'),
    ('pharma', 'Life Sciences'), ('finance', 'Finance'),
    ('logistique', 'Supply Chain'), ('digital', 'Digital'),
])

B2B_INDUSTRIES = compile_keywords([
    ('sap', 'ERP/SAP'), ('erp', 'ERP/SAP'), ('cloud', 'Cloud'),
    ('cyber', 'Cybersecurity'), ('informatique', 'IT Services'),
    ('conseil', 'Consulting'), ('pharma', 'Pharma'), ('banque', 'Banking'),
    ('telecom', 'Telecom'), ('logistique', 'Supply Chain'),
])

SENIORITIES = compile_keywords([
    ('directeur', 'Director'), ('director', 'Director'), ('dsi', 'C-Suite'),
    ('cto', 'C-Suite'), ('ceo', 'C-Suite'), ('dg', 'C-Suite'), ('vp', 'VP'),
    ('manager', 'Manager'), ('consultant', 'Consultant'), ('chef', 'Manager'),
])


def as_text(value):
    """Return ``value`` if it is a string, else ``''`` (JSON null / wrong type)."""
    return value if isinstance(value, str) else ''


def extract_emails(text):
    """Return lower-cased e-mail addresses in ``text``, minus blocklisted ones."""
    candidates = (match.lower() for match in EMAIL_RE.findall(text))
    return [
        email for email in candidates
        if not any(fragment in email for fragment in EMAIL_BLOCKLIST)
    ]


def extract_phones(text):
    """Return MA/TN/DZ-looking phone numbers found in ``text``.

    Trailing separators are stripped and matches with fewer than
    ``MIN_PHONE_DIGITS`` digits are discarded.
    """
    phones = []
    for raw in PHONE_RE.findall(text):
        candidate = raw.rstrip(' .-')
        if sum(ch.isdigit() for ch in candidate) >= MIN_PHONE_DIGITS:
            phones.append(candidate)
    return phones


def parse_linkedin_title(title):
    """Split a result title like "Name - Job | LinkedIn" into ``(name, job)``.

    ``job`` is ``''`` when the title has no "- ... LinkedIn" part and is capped
    at 200 characters. The name is returned unvalidated.
    """
    name = LI_SUFFIX_RE.sub('', title).strip()
    match = LI_JOB_RE.search(title)
    job = match.group(1).strip()[:200] if match else ''
    return name, job


def guess_company(snippet):
    """Return the employer named after "chez"/"at"/"pour"/"@", or ``''``."""
    match = COMPANY_RE.search(snippet)
    return match.group(1).strip() if match else ''


# ---------------------------------------------------------------------------
# Persistence helpers
# ---------------------------------------------------------------------------

def store_linkedin_lead(conn, cur, url, title, snippet, emails, country):
    """Insert one LinkedIn profile into admin.linkedin_leads.

    Returns True when a new row was committed. Returns False when the name is
    implausible (shorter than 3 or longer than 100 characters), when a lead
    with the same name already exists, or when the insert fails (the failure
    is rolled back and reported on stderr).
    """
    name, job = parse_linkedin_title(title)
    if len(name) < 3 or len(name) > 100:
        return False

    full = f"{title} {snippet}"
    company = guess_company(snippet)
    city = first_label(full, CITIES)
    industry = first_label(full, LI_INDUSTRIES)
    seniority = first_label(f"{job} {name}", SENIORITIES)

    cur.execute("SELECT 1 FROM admin.linkedin_leads WHERE lead_name=%s LIMIT 1", (name,))
    if cur.fetchone():
        return False

    try:
        cur.execute(
            """INSERT INTO admin.linkedin_leads
            (lead_name,lead_email,lead_company,lead_title,lead_industry,lead_seniority,form_data,captured_at)
            VALUES(%s,%s,%s,%s,%s,%s,%s,NOW())""",
            (name, emails[0] if emails else None, company, job, industry, seniority,
             json.dumps({"url": url, "city": city, "country": country})),
        )
        conn.commit()
    except psycopg2.Error as exc:
        conn.rollback()
        print(f"!LI insert failed for {name!r}: {exc}", file=sys.stderr)
        return False

    print(f"+LI {name} | {job[:30]} | {company[:30]} | {country}")
    return True


def store_company_lead(conn, cur, url, title, snippet, emails, phones, country):
    """Insert one company contact into admin.weval_leads.

    Returns True when a new row was committed. Returns False when the company
    name is shorter than 3 characters, when the (company, country) pair already
    exists, or when the insert fails (rolled back and reported on stderr).
    """
    company = SEARCH_SUFFIX_RE.sub('', title).strip()[:200]
    if len(company) < 3:
        return False

    full = f"{title} {snippet}"
    city = first_label(full, CITIES)
    industry = first_label(full, B2B_INDUSTRIES)

    cur.execute(
        "SELECT 1 FROM admin.weval_leads WHERE company_name=%s AND country=%s LIMIT 1",
        (company, country),
    )
    if cur.fetchone():
        return False

    try:
        cur.execute(
            """INSERT INTO admin.weval_leads
            (company_name,email,phone,website,industry,country,city,source,created_at)
            VALUES(%s,%s,%s,%s,%s,%s,%s,'searxng_b2b',NOW())""",
            (company, emails[0] if emails else None, phones[0] if phones else None,
             url[:255], industry, country, city),
        )
        conn.commit()
    except psycopg2.Error as exc:
        conn.rollback()
        print(f"!B2B insert failed for {company!r}: {exc}", file=sys.stderr)
        return False

    print(f"+B2B {company[:40]} | {emails[0] if emails else '-'} | {country}")
    return True


# ---------------------------------------------------------------------------
# Main loop
# ---------------------------------------------------------------------------

def run(conn, cur, batch):
    """Run the search queries and store every new lead found.

    Args:
        conn: Open psycopg2 connection (used for commit / rollback).
        cur: Cursor on ``conn``.
        batch: Soft quota. It is checked before each query, so the results of
            the query in progress may push the total slightly past it.

    Returns:
        ``(linkedin_leads_added, b2b_leads_added)``.
    """
    total_li = 0
    total_b2b = 0

    for query, country in QUERIES:
        if total_li + total_b2b >= batch:
            break

        try:
            response = requests.get(
                SEARX,
                params={"q": query, "format": "json", "engines": "google,bing,duckduckgo"},
                timeout=15,
                verify=False,
            )
            payload = response.json()
        except (requests.RequestException, ValueError) as exc:
            # Network failure or a non-JSON answer: skip this query only.
            print(f"!search failed for {query!r}: {exc}", file=sys.stderr)
            continue

        results = payload.get("results") if isinstance(payload, dict) else None
        for res in (results or [])[:15]:
            if not isinstance(res, dict):
                continue
            url = as_text(res.get("url"))
            title = as_text(res.get("title"))
            snippet = as_text(res.get("content"))
            full = f"{title} {snippet}"

            # Contact details are only ever taken from the title and snippet.
            emails = extract_emails(full)
            phones = extract_phones(full)

            if 'linkedin.com/in/' in url:
                if store_linkedin_lead(conn, cur, url, title, snippet, emails, country):
                    total_li += 1
            elif emails or phones:
                if store_company_lead(conn, cur, url, title, snippet, emails, phones, country):
                    total_b2b += 1

        # Pause between queries to stay gentle on the search backend.
        time.sleep(2)

    return total_li, total_b2b


if __name__ == "__main__":
    try:
        total_li, total_b2b = run(conn, cur, batch)
    finally:
        # Release the database handles even when the run aborts.
        cur.close()
        conn.close()
    print(f"\nRESULT: +{total_li} LinkedIn leads, +{total_b2b} B2B leads")
|