Files
weval-consulting/api/pw_linkedin_b2b.py

164 lines
7.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""Playwright LinkedIn B2B lead scraper for WEVAL consulting
Targets: DSI, DG, CTO, IT Director, ERP Manager in Maghreb
Uses Bing search results to find LinkedIn profiles (no LinkedIn login needed)
"""
import json
import os
import re
import sys
import time
from urllib.parse import quote_plus

import psycopg2
from playwright.sync_api import sync_playwright
# Connection settings for the leads database. The historical values stay as
# defaults so existing deployments keep working, but each one can be overridden
# through the environment so credentials do not have to live in the source.
DB = dict(
    host=os.environ.get("WEVAL_DB_HOST", "10.1.0.3"),
    dbname=os.environ.get("WEVAL_DB_NAME", "adx_system"),
    user=os.environ.get("WEVAL_DB_USER", "admin"),
    password=os.environ.get("WEVAL_DB_PASSWORD", "admin123"),
)

# Number of new leads to collect when no (valid) quota is given on the CLI.
DEFAULT_BATCH = 50


def _parse_batch(argv, default=DEFAULT_BATCH):
    """Return the lead quota given as ``argv[1]``.

    Falls back to ``default`` when the argument is missing or is not an
    integer; in the latter case a warning is written to stderr instead of
    aborting the run with a traceback.
    """
    if len(argv) < 2:
        return default
    try:
        return int(argv[1])
    except ValueError:
        print(
            f"Invalid batch size {argv[1]!r}; using default {default}",
            file=sys.stderr,
        )
        return default


# Maximum number of new leads to insert during this run.
batch = _parse_batch(sys.argv)
# Bing search strings, tried in order until the batch quota is reached.
# Every query is restricted to public LinkedIn profile pages via ``site:``.
#
# Bing gives OR the lowest operator precedence, so each list of alternatives
# is parenthesised. Without the parentheses a query such as
#   site:linkedin.com/in "DSI" "maroc" OR "casablanca"
# is read as (site:... "DSI" "maroc") OR "casablanca", which drops the site
# and role filters for every alternative after the first.
QUERIES = [
    # Morocco
    'site:linkedin.com/in "DSI" ("maroc" OR "casablanca" OR "rabat")',
    'site:linkedin.com/in "directeur informatique" "maroc"',
    'site:linkedin.com/in "CTO" ("morocco" OR "casablanca")',
    'site:linkedin.com/in "IT director" "morocco"',
    'site:linkedin.com/in "ERP" "SAP" ("maroc" OR "morocco")',
    'site:linkedin.com/in "supply chain" "director" "maroc"',
    'site:linkedin.com/in "directeur general" ("casablanca" OR "rabat")',
    'site:linkedin.com/in "cloud" "manager" "maroc"',
    'site:linkedin.com/in "cybersecurity" "morocco"',
    # Tunisia
    'site:linkedin.com/in "DSI" ("tunisie" OR "tunis")',
    'site:linkedin.com/in "directeur informatique" "tunisie"',
    'site:linkedin.com/in "IT manager" "tunisia"',
    'site:linkedin.com/in "ERP" "SAP" "tunisie"',
    # Algeria
    'site:linkedin.com/in "DSI" ("algerie" OR "alger")',
    'site:linkedin.com/in "directeur informatique" "algerie"',
    'site:linkedin.com/in "IT director" "algeria"',
    'site:linkedin.com/in "ERP" "SAP" "algerie"',
    # Generic Maghreb
    'site:linkedin.com/in "directeur systemes information" "maghreb"',
    'site:linkedin.com/in "transformation digitale" "directeur" ("maroc" OR "tunisie" OR "algerie")',
    'site:linkedin.com/in "SAP consultant" "north africa"',
]
# (keyword, label) tables used to classify a lead. Each table is scanned in
# order and the first keyword that matches decides the label.
_COUNTRY_KEYWORDS = (
    ('casablanca', 'MA'), ('rabat', 'MA'), ('marrakech', 'MA'), ('fes', 'MA'),
    ('tanger', 'MA'), ('maroc', 'MA'), ('morocco', 'MA'),
    ('tunis', 'TN'), ('sfax', 'TN'), ('tunisie', 'TN'), ('tunisia', 'TN'),
    ('alger', 'DZ'), ('oran', 'DZ'), ('algerie', 'DZ'), ('algeria', 'DZ'),
)
_INDUSTRY_KEYWORDS = (
    ('sap', 'ERP/SAP'), ('erp', 'ERP/SAP'), ('cloud', 'Cloud'), ('cyber', 'Cybersecurity'),
    ('supply chain', 'Supply Chain'), ('dsi', 'IT Management'), ('cto', 'IT Management'),
    ('directeur informatique', 'IT Management'), ('it director', 'IT Management'),
    ('transformation digitale', 'Digital Transformation'), ('data', 'Data/Analytics'),
    ('finance', 'Finance'), ('pharma', 'Life Sciences'),
)
_SENIORITY_KEYWORDS = (
    ('directeur', 'Director'), ('director', 'Director'), ('dsi', 'C-Suite'), ('cto', 'C-Suite'),
    ('dg', 'C-Suite'), ('ceo', 'C-Suite'), ('vp', 'VP'), ('manager', 'Manager'),
    ('consultant', 'Consultant'),
)


def _compile_rules(keyword_table):
    """Turn (keyword, label) pairs into (compiled pattern, label) pairs.

    Keywords must start on a word boundary, so 'erp' no longer fires inside
    'enterprise' nor 'cto' inside 'director'. Keywords of four characters or
    fewer must also end on a word boundary ('oran' must not match 'Orange',
    'fes' must not match 'festival'); longer ones match as a word prefix so
    'cyber' still covers 'cybersecurity' and 'maroc' covers 'marocain'.
    """
    rules = []
    for keyword, label in keyword_table:
        pattern = r"\b" + re.escape(keyword)
        if len(keyword) <= 4:
            pattern += r"\b"
        rules.append((re.compile(pattern), label))
    return rules


_COUNTRY_RULES = _compile_rules(_COUNTRY_KEYWORDS)
_INDUSTRY_RULES = _compile_rules(_INDUSTRY_KEYWORDS)
_SENIORITY_RULES = _compile_rules(_SENIORITY_KEYWORDS)


def _first_label(text, rules):
    """Return the label of the first rule matching ``text`` (case-insensitive), or ''."""
    lowered = text.lower()
    for pattern, label in rules:
        if pattern.search(lowered):
            return label
    return ''


# Runs inside the Bing result page and returns one dict per result whose link
# points at a LinkedIn profile: {name, title, company, location, email, url}.
# The name/title separators must be preceded by whitespace so hyphenated names
# ("Jean-Pierre Martin - DSI | LinkedIn") are not cut at their own hyphen, and
# the "chez"/"at" company markers must start a word so "format " is not read
# as the marker "at".
_EXTRACT_PROFILES_JS = r"""() => {
    const data = [];
    document.querySelectorAll('li.b_algo, .b_algo').forEach(el => {
        const link = el.querySelector('a');
        const href = link ? link.href : '';
        if (!href.includes('linkedin.com/in/')) return;
        const title = (link ? link.textContent : '').trim();
        const snippet = (el.querySelector('.b_caption p, .b_lineclamp2') || {}).textContent || '';
        // Parse: "Prénom Nom - Titre | LinkedIn"
        let name = title.replace(/\s+[-|].*linkedin.*/i, '').replace(/\s*\|.*/,'').trim();
        let titleJob = '';
        const dashMatch = title.match(/\s-\s*(.+?)(?:\s*[-|]\s*LinkedIn)/i);
        if (dashMatch) titleJob = dashMatch[1].trim();
        // Extract company from snippet
        let company = '';
        const compMatch = snippet.match(/(?:\b(?:chez|at)|@)\s+([^.·,]+)/i);
        if (compMatch) company = compMatch[1].trim();
        // Extract location
        let location = '';
        const locMatch = snippet.match(/(casablanca|rabat|marrakech|fes|tanger|tunis|sfax|alger|oran)/i);
        if (locMatch) location = locMatch[1];
        // Extract email if visible
        const emails = snippet.match(/[\w.+-]+@[\w.-]+\.[a-z]{2,}/gi) || [];
        if (name.length > 3) {
            data.push({name, title: titleJob, company, location, email: emails[0]||'', url: href});
        }
    });
    return data;
}"""


def _search_profiles(ctx, query):
    """Run one Bing search in a fresh page and return the extracted profile dicts.

    The page is always closed, including when navigation or extraction raises.
    """
    page = ctx.new_page()
    try:
        # Use Bing (more permissive than Google for scraping). The query is
        # fully URL-encoded: it contains quotes, ':' and '/' besides spaces.
        page.goto(f"https://www.bing.com/search?q={quote_plus(query)}&count=50", timeout=15000)
        page.wait_for_timeout(3000)
        return page.evaluate(_EXTRACT_PROFILES_JS)
    finally:
        page.close()


def _store_lead(conn, cur, profile, query):
    """Insert one scraped profile into admin.linkedin_leads.

    Returns True when a new row was committed. Returns False when the name is
    too short, when a row with the same name and company already exists, or
    when the database rejects the statement (the transaction is rolled back
    and the error is reported on stderr).
    """
    lead_name = profile.get('name', '').strip()
    if len(lead_name) < 3:
        return False
    lead_title = profile.get('title', '')
    lead_company = profile.get('company', '')
    lead_email = profile.get('email', '')
    location = profile.get('location', '')
    url = profile.get('url', '')

    # The query text takes part in the classification because it names the
    # country and role that were searched for.
    country = _first_label(
        location + ' ' + lead_title + ' ' + lead_company + ' ' + query, _COUNTRY_RULES)
    industry = _first_label(lead_title + ' ' + query, _INDUSTRY_RULES)
    seniority = _first_label(lead_title, _SENIORITY_RULES)

    try:
        # Dedup on (name, company). Kept inside the try so a failing SELECT is
        # rolled back too; otherwise the connection stays in an aborted
        # transaction and every later statement fails.
        cur.execute(
            "SELECT 1 FROM admin.linkedin_leads WHERE lead_name=%s AND lead_company=%s LIMIT 1",
            (lead_name, lead_company))
        if cur.fetchone():
            return False
        cur.execute(
            """INSERT INTO admin.linkedin_leads
            (lead_name, lead_email, lead_company, lead_title, lead_industry, lead_seniority, form_data, captured_at)
            VALUES (%s,%s,%s,%s,%s,%s,%s,NOW())""",
            (lead_name, lead_email or None, lead_company, lead_title, industry, seniority,
             json.dumps({"url": url, "location": location, "country": country, "query": query[:50]})))
        conn.commit()
    except psycopg2.Error as exc:
        conn.rollback()
        print(f"!DB {lead_name}: {exc}", file=sys.stderr)
        return False
    print(f"+LEAD {lead_name} | {lead_title} | {lead_company} | {country}")
    return True


def _collect_leads(conn, cur, limit):
    """Work through QUERIES until ``limit`` new leads are stored; return the count.

    A failing query is reported on stderr and skipped so that one bad search
    does not end the run. The browser is closed even if an error escapes.
    """
    stored = 0
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True, args=["--no-sandbox", "--disable-dev-shm-usage"])
        try:
            ctx = browser.new_context(
                user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0")
            for query in QUERIES:
                if stored >= limit:
                    break
                try:
                    for profile in _search_profiles(ctx, query):
                        # Enforce the quota per lead as well: a single result
                        # page can hold up to 50 profiles.
                        if stored >= limit:
                            break
                        if _store_lead(conn, cur, profile, query):
                            stored += 1
                    # Pause between searches to keep the request rate low.
                    time.sleep(5)
                except Exception as exc:
                    print(f"!QUERY {query[:50]}: {exc}", file=sys.stderr)
                    continue
        finally:
            browser.close()
    return stored


conn = psycopg2.connect(**DB)
cur = conn.cursor()
try:
    total = _collect_leads(conn, cur, batch)
finally:
    # Release the database resources even when the scrape aborts.
    cur.close()
    conn.close()
print(f"\nLINKEDIN_B2B: +{total} leads")