#!/usr/bin/env python3
"""Playwright LinkedIn B2B lead scraper for WEVAL consulting.

Targets: DSI, DG, CTO, IT Director, ERP Manager in the Maghreb.
Uses Bing search results to find public LinkedIn profiles (no login needed).
"""
import json
import os
import re
import sys
import time
from urllib.parse import quote_plus

import psycopg2
from playwright.sync_api import sync_playwright

def parse_batch(argv, default=50):
    """Return the lead quota given as the first command-line argument.

    Args:
        argv: Argument vector in ``sys.argv`` layout (program name first).
        default: Quota used when no argument is given or it is not an integer.

    Returns:
        The parsed integer, or *default* (with a warning on stderr) when the
        argument is missing or malformed.
    """
    if len(argv) < 2:
        return default
    try:
        return int(argv[1])
    except ValueError:
        print(f"WARN invalid batch size {argv[1]!r}; using {default}", file=sys.stderr)
        return default


# PostgreSQL connection settings. The standard libpq environment variables
# take precedence so credentials do not have to live in the source; the
# literals are the previous hard-coded values, kept as fallbacks.
DB = dict(
    host=os.environ.get("PGHOST", "10.1.0.3"),
    dbname=os.environ.get("PGDATABASE", "adx_system"),
    user=os.environ.get("PGUSER", "admin"),
    password=os.environ.get("PGPASSWORD", "admin123"),
)

# Maximum number of new leads to insert in one run.
batch = parse_batch(sys.argv)

# Every search is restricted to public LinkedIn profile pages.
_PROFILE_SCOPE = 'site:linkedin.com/in '

# Search-engine queries, one per role/region combination. Each entry is the
# profile-scope prefix followed by quoted role and location terms.
QUERIES = [
    _PROFILE_SCOPE + terms
    for terms in (
        # Morocco
        '"DSI" "maroc" OR "casablanca" OR "rabat"',
        '"directeur informatique" "maroc"',
        '"CTO" "morocco" OR "casablanca"',
        '"IT director" "morocco"',
        '"ERP" "SAP" "maroc" OR "morocco"',
        '"supply chain" "director" "maroc"',
        '"directeur general" "casablanca" OR "rabat"',
        '"cloud" "manager" "maroc"',
        '"cybersecurity" "morocco"',
        # Tunisia
        '"DSI" "tunisie" OR "tunis"',
        '"directeur informatique" "tunisie"',
        '"IT manager" "tunisia"',
        '"ERP" "SAP" "tunisie"',
        # Algeria
        '"DSI" "algerie" OR "alger"',
        '"directeur informatique" "algerie"',
        '"IT director" "algeria"',
        '"ERP" "SAP" "algerie"',
        # Generic Maghreb
        '"directeur systemes information" "maghreb"',
        '"transformation digitale" "directeur" "maroc" OR "tunisie" OR "algerie"',
        '"SAP consultant" "north africa"',
    )
]

# JavaScript evaluated inside the Bing results page. It only collects the raw
# link, title and snippet of each organic result; all parsing is done in
# Python by parse_result(). The heading link is tried first and the first
# anchor of the result is the fallback.
# NOTE(review): selectors assume Bing's `.b_algo` result markup - confirm if
# results suddenly come back empty.
_EXTRACT_JS = r"""() => Array.from(document.querySelectorAll('.b_algo'), el => {
    const link = el.querySelector('h2 a') || el.querySelector('a');
    const snip = el.querySelector('.b_caption p, .b_lineclamp2');
    return {
        url: link ? link.href : '',
        title: link ? link.textContent.trim() : '',
        snippet: snip ? snip.textContent : '',
    };
})"""

# Trailing "| LinkedIn" / "- LinkedIn" marker of a result title.
_LINKEDIN_SUFFIX_RE = re.compile(r"\s*[-–—|]\s*linkedin\s*$", re.IGNORECASE)
# Separator between name and headline. Whitespace is required on both sides
# so hyphenated names such as "Jean-Pierre" are not split.
_HEADLINE_SEP_RE = re.compile(r"\s+[-–—|]\s+")
# "chez X" / "at X" / "@ X"; word boundaries keep "at" from matching inside
# other words (e.g. "great company").
_COMPANY_RE = re.compile(r"(?:\bchez|\bat|@)\s+([^.·,]+)", re.IGNORECASE)
# Known cities, matched as whole words so "Orange" is not read as "oran".
_LOCATION_RE = re.compile(
    r"\b(casablanca|rabat|marrakech|fes|tanger|tunis|sfax|alger|oran)\b",
    re.IGNORECASE,
)
_EMAIL_RE = re.compile(r"[\w.+-]+@[\w.-]+\.[a-z]{2,}", re.IGNORECASE)


def _compile_rules(*rules):
    """Compile (alternation, label) pairs into whole-word, case-insensitive rules."""
    return tuple(
        (re.compile(rf"\b(?:{alternation})\b", re.IGNORECASE), label)
        for alternation, label in rules
    )


# Rule order matters: the first matching rule wins.
_COUNTRY_RULES = _compile_rules(
    (r"casablanca|rabat|marrakech|f[eè]s|tanger|maroc\w*|morocc\w*", "MA"),
    (r"tunis\w*|sfax", "TN"),
    (r"alg[eé]r\w*|oran", "DZ"),
)
_INDUSTRY_RULES = _compile_rules(
    (r"sap|erp", "ERP/SAP"),
    (r"cloud", "Cloud"),
    (r"cyber\w*", "Cybersecurity"),
    (r"supply\s+chain", "Supply Chain"),
    (r"dsi|cto|directeur\s+informatique|it\s+director", "IT Management"),
    (r"transformation\s+digitale", "Digital Transformation"),
    (r"data\w*", "Data/Analytics"),
    (r"finance\w*", "Finance"),
    (r"pharma\w*", "Life Sciences"),
)
_SENIORITY_RULES = _compile_rules(
    (r"directeur|directrice|director", "Director"),
    (r"dsi|cto|dg|ceo", "C-Suite"),
    (r"vp", "VP"),
    (r"manager", "Manager"),
    (r"consultant\w*", "Consultant"),
)

_DEDUP_SQL = (
    "SELECT 1 FROM admin.linkedin_leads "
    "WHERE lead_name=%s AND lead_company=%s LIMIT 1"
)
_INSERT_SQL = """INSERT INTO admin.linkedin_leads
    (lead_name, lead_email, lead_company, lead_title, lead_industry, lead_seniority, form_data, captured_at)
    VALUES (%s,%s,%s,%s,%s,%s,%s,NOW())"""


def _first_label(rules, *texts):
    """Return the label of the first rule matching the earliest non-empty hit.

    Texts are tried in the order given, so evidence in an earlier text always
    outranks evidence in a later one. Returns '' when nothing matches.
    """
    for text in texts:
        for pattern, label in rules:
            if pattern.search(text):
                return label
    return ""


def classify_country(profile_text, query=""):
    """Return 'MA', 'TN', 'DZ' or '' for a lead.

    The lead's own text is checked first; the search query is only a fallback,
    so a multi-country query cannot override a location found on the profile.
    """
    return _first_label(_COUNTRY_RULES, profile_text, query)


def classify_industry(title, query=""):
    """Return an industry label derived from the job title, else from the query."""
    return _first_label(_INDUSTRY_RULES, title, query)


def classify_seniority(title):
    """Return a seniority label derived from the job title, or ''."""
    return _first_label(_SENIORITY_RULES, title)


def search_url(query):
    """Build the Bing search URL for *query* (URL-encoded, 50 results)."""
    return f"https://www.bing.com/search?q={quote_plus(query)}&count=50"


def parse_result(title, snippet, url):
    """Turn one raw search result into a lead dict.

    Args:
        title: Result link text, e.g. "First Last - Headline | LinkedIn".
        snippet: Result description text; may be empty.
        url: Result link target.

    Returns:
        A dict with keys name, title, company, location, email and url, or
        None when the URL is not a LinkedIn profile or the parsed name has
        fewer than four characters.
    """
    if "linkedin.com/in/" not in url:
        return None

    headline = _LINKEDIN_SUFFIX_RE.sub("", title).strip()
    name, _, job_title = (
        part.strip() for part in (_HEADLINE_SEP_RE.split(headline, maxsplit=1) + ["", ""])[:3]
    )
    if len(name) <= 3:
        return None

    company = _COMPANY_RE.search(snippet)
    location = _LOCATION_RE.search(snippet)
    email = _EMAIL_RE.search(snippet)
    return {
        "name": name,
        "title": _ or job_title,
        "company": company.group(1).strip() if company else "",
        "location": location.group(1) if location else "",
        "email": email.group(0) if email else "",
        "url": url,
    }


def fetch_results(ctx, query):
    """Run one Bing search in a fresh page and return its raw results.

    The page is always closed, including when navigation or evaluation raises.
    """
    page = ctx.new_page()
    try:
        # Use Bing (more permissive than Google for scraping)
        page.goto(search_url(query), timeout=15000)
        page.wait_for_timeout(3000)
        return page.evaluate(_EXTRACT_JS)
    finally:
        page.close()


def save_lead(conn, cur, lead, query):
    """Insert *lead* unless a row with the same name and company exists.

    Returns True when a row was inserted and committed. Database errors are
    rolled back and reported on stderr so one bad row does not leave the
    connection in an aborted transaction for the rest of the run.
    """
    profile_text = " ".join((lead["location"], lead["title"], lead["company"]))
    country = classify_country(profile_text, query)
    industry = classify_industry(lead["title"], query)
    seniority = classify_seniority(lead["title"])
    form_data = json.dumps(
        {"url": lead["url"], "location": lead["location"], "country": country, "query": query[:50]}
    )
    try:
        cur.execute(_DEDUP_SQL, (lead["name"], lead["company"]))
        if cur.fetchone():
            return False
        cur.execute(
            _INSERT_SQL,
            (lead["name"], lead["email"] or None, lead["company"], lead["title"],
             industry, seniority, form_data),
        )
        conn.commit()
    except psycopg2.Error as exc:
        conn.rollback()
        print(f"WARN could not store lead {lead['name']!r}: {exc}", file=sys.stderr)
        return False
    print(f"+LEAD {lead['name']} | {lead['title']} | {lead['company']} | {country}")
    return True


def main():
    """Scrape each query in QUERIES until `batch` new leads have been stored."""
    conn = psycopg2.connect(**DB)
    cur = conn.cursor()
    total = 0
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True, args=["--no-sandbox", "--disable-dev-shm-usage"]
            )
            try:
                ctx = browser.new_context(
                    user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0"
                )
                for query in QUERIES:
                    if total >= batch:
                        break
                    try:
                        results = fetch_results(ctx, query)
                    except Exception as exc:
                        # Best effort: a failed search must not stop the run.
                        print(f"WARN search failed for {query!r}: {exc}", file=sys.stderr)
                        continue

                    for raw in results:
                        # Enforce the quota per lead, not only per query.
                        if total >= batch:
                            break
                        lead = parse_result(
                            raw.get("title", ""), raw.get("snippet", ""), raw.get("url", "")
                        )
                        if lead is not None and save_lead(conn, cur, lead, query):
                            total += 1

                    # Pause between searches to stay under rate limits.
                    time.sleep(5)
            finally:
                browser.close()
    finally:
        cur.close()
        conn.close()

    print(f"\nLINKEDIN_B2B: +{total} leads")


if __name__ == "__main__":
    main()