Files
html/api/pw_weval_b2b.py
2026-04-12 22:57:03 +02:00

151 lines
7.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""LinkedIn B2B lead scraper via SearXNG (sovereign search)
No browser needed, pure HTTP, much faster than Playwright
"""
import re, sys, time, psycopg2, json, requests
requests.packages.urllib3.disable_warnings()
import os

# PostgreSQL connection settings for the lead store.
# Every value can be overridden through the environment so that credentials do
# not have to live in the source file; the literals below are only the
# fallbacks used when the corresponding WEVAL_DB_* variable is not set.
DB = dict(
    host=os.environ.get("WEVAL_DB_HOST", "10.1.0.3"),
    dbname=os.environ.get("WEVAL_DB_NAME", "adx_system"),
    user=os.environ.get("WEVAL_DB_USER", "admin"),
    password=os.environ.get("WEVAL_DB_PASSWORD", "admin123"),
)

# SearXNG JSON search endpoint queried for every search phrase.
SEARX = "http://localhost:8888/search"

# Maximum number of leads to insert in one run: first CLI argument, else 200.
# A non-numeric argument is reported on stderr and the default is used instead
# of aborting with a traceback.
_DEFAULT_BATCH = 200
try:
    batch = int(sys.argv[1]) if len(sys.argv) > 1 else _DEFAULT_BATCH
except ValueError:
    print(
        f"invalid batch size {sys.argv[1]!r}; using default {_DEFAULT_BATCH}",
        file=sys.stderr,
    )
    batch = _DEFAULT_BATCH
# Search phrases, grouped by the country code they are tagged with.
# Groups are listed in the order the phrases are sent to the search engine;
# QUERIES flattens them into (phrase, country_code) tuples.
_QUERY_GROUPS = (
    # MA LinkedIn
    ("MA", (
        "DSI casablanca linkedin",
        "directeur informatique rabat linkedin",
        "CTO maroc linkedin",
        "IT director casablanca linkedin",
        "ERP SAP manager maroc linkedin",
        "supply chain director maroc linkedin",
        "directeur general casablanca linkedin",
        "cloud architect maroc linkedin",
        "cybersecurity manager maroc linkedin",
        "transformation digitale maroc linkedin",
        "directeur financier casablanca linkedin",
        "directeur logistique maroc linkedin",
        "SAP consultant maroc linkedin",
        "data scientist maroc linkedin",
    )),
    # TN LinkedIn
    ("TN", (
        "DSI tunis linkedin",
        "directeur informatique tunisie linkedin",
        "CTO tunisia linkedin",
        "ERP manager tunisie linkedin",
        "IT director tunis linkedin",
        "directeur general tunisie linkedin",
    )),
    # DZ LinkedIn
    ("DZ", (
        "DSI alger linkedin",
        "directeur informatique algerie linkedin",
        "CTO algeria linkedin",
        "ERP SAP algerie linkedin",
        "IT director alger linkedin",
        "directeur general algerie linkedin",
    )),
    # B2B company leads
    ("MA", (
        "entreprise informatique casablanca email directeur",
        "societe IT rabat directeur contact",
        "cabinet conseil IT maroc email",
        "SSII maroc casablanca directeur email",
        "integrateur SAP maroc email",
    )),
    ("TN", (
        "entreprise informatique tunis email",
    )),
    ("DZ", (
        "societe conseil IT algerie email",
    )),
)

QUERIES = [
    (phrase, country_code)
    for country_code, phrases in _QUERY_GROUPS
    for phrase in phrases
]
# Open the single database session (connection + cursor) used by the rest of
# the script.
conn = psycopg2.connect(**DB)
cur = conn.cursor()

# Make sure the company-lead table exists; an already existing table is left
# untouched by IF NOT EXISTS.
# NOTE(review): only admin.weval_leads is created here. The script also writes
# to admin.linkedin_leads, which is presumably provisioned elsewhere - confirm.
_WEVAL_LEADS_DDL = """CREATE TABLE IF NOT EXISTS admin.weval_leads (
id SERIAL PRIMARY KEY, company_name VARCHAR(255), contact_name VARCHAR(255),
contact_title VARCHAR(255), email VARCHAR(255), phone VARCHAR(100),
website VARCHAR(255), industry VARCHAR(100), country VARCHAR(10),
city VARCHAR(100), source VARCHAR(50), linkedin_url VARCHAR(500),
notes TEXT, created_at TIMESTAMP DEFAULT NOW())"""
cur.execute(_WEVAL_LEADS_DDL)
conn.commit()
# ---------------------------------------------------------------------------
# Harvest loop
#
# For every (query, country) pair the SearXNG endpoint is queried, the first
# hits are inspected, and each hit is stored either as a LinkedIn profile lead
# (admin.linkedin_leads) or, when its text exposes an email/phone, as a company
# lead (admin.weval_leads). Leads already present are skipped. The run stops as
# soon as `batch` leads have been inserted.
# ---------------------------------------------------------------------------

_MAX_RESULTS_PER_QUERY = 15  # only the first hits of each query are examined
_QUERY_PAUSE_SECONDS = 2     # pause after each processed query

_EMAIL_RE = re.compile(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}')
# Addresses containing any of these fragments are treated as search-page noise.
_EMAIL_NOISE = ('google', 'bing', 'example', 'facebook', 'wikipedia', 'linkedin')
_PHONE_RE = re.compile(r'(?:\+212|\+216|\+213|0)[0-9 .-]{8,14}')

# In page titles a separator is a pipe, or a hyphen preceded by whitespace.
# Requiring the whitespace keeps hyphenated names ("Jean-Pierre Dupont",
# "Coca-Cola Maroc") intact instead of cutting them at their inner hyphen.
_LI_TITLE_TAIL_RE = re.compile(r'(?:\s*\||\s+-).*(linkedin|profil).*', re.I)
_LI_JOB_RE = re.compile(r'\s-\s*(.+?)(?:\s*[-|]\s*[Ll]inkedin)')
_B2B_TITLE_TAIL_RE = re.compile(r'(?:\s*\||\s+-).*(bing|google).*', re.I)
# "chez"/"at"/"pour" must start a word so that e.g. the "at" of "chat" or
# "format" is not taken as the preposition introducing a company name.
_LI_COMPANY_RE = re.compile(r'(?:\b(?:chez|at|pour)|@)\s+([^.·,\n]{3,50})', re.I)

_SELECT_LINKEDIN_SQL = "SELECT 1 FROM admin.linkedin_leads WHERE lead_name=%s LIMIT 1"
_INSERT_LINKEDIN_SQL = """INSERT INTO admin.linkedin_leads
(lead_name,lead_email,lead_company,lead_title,lead_industry,lead_seniority,form_data,captured_at)
VALUES(%s,%s,%s,%s,%s,%s,%s,NOW())"""
_SELECT_COMPANY_SQL = "SELECT 1 FROM admin.weval_leads WHERE company_name=%s AND country=%s LIMIT 1"
_INSERT_COMPANY_SQL = """INSERT INTO admin.weval_leads
(company_name,email,phone,website,industry,country,city,source,created_at)
VALUES(%s,%s,%s,%s,%s,%s,%s,'searxng_b2b',NOW())"""


def _emit(message, stream=None):
    """Print `message` to `stream` (stdout by default) without ever raising
    on characters the stream's encoding cannot represent; those are replaced."""
    stream = stream or sys.stdout
    try:
        print(message, file=stream)
    except UnicodeEncodeError:
        encoding = getattr(stream, 'encoding', None) or 'ascii'
        print(message.encode(encoding, 'replace').decode(encoding), file=stream)


def _keyword_table(pairs, whole_word=False):
    """Compile (keyword, label) pairs into (pattern, label) pairs.

    Keywords match case-insensitively and only at the start of a word, so
    "cto" no longer fires inside "director" nor "erp" inside "enterprise",
    while stems such as "cyber" still match "cybersecurity". With
    `whole_word=True` the keyword must also end at a word boundary.
    """
    tail = r'\b' if whole_word else ''
    return [(re.compile(r'\b' + re.escape(keyword) + tail, re.I), label)
            for keyword, label in pairs]


def _first_label(text, table):
    """Return the label of the first entry of `table` whose pattern occurs in
    `text`, or '' when none does. Table order is the priority order."""
    for pattern, label in table:
        if pattern.search(text):
            return label
    return ''


# Cities are matched as whole words: "Fes" must not match "professionnel",
# nor "Oran" match "Orange".
_LI_CITIES = _keyword_table(
    [(c, c) for c in ('Casablanca', 'Rabat', 'Marrakech', 'Fes', 'Tanger', 'Agadir',
                      'Meknes', 'Tunis', 'Sfax', 'Sousse', 'Alger', 'Oran', 'Constantine')],
    whole_word=True)
_B2B_CITIES = _keyword_table(
    [(c, c) for c in ('Casablanca', 'Rabat', 'Marrakech', 'Fes', 'Tanger',
                      'Tunis', 'Sfax', 'Alger', 'Oran')],
    whole_word=True)
_LI_INDUSTRIES = _keyword_table([
    ('sap', 'ERP/SAP'), ('erp', 'ERP/SAP'), ('cloud', 'Cloud'), ('cyber', 'Cybersecurity'),
    ('supply', 'Supply Chain'), ('dsi', 'IT Management'), ('cto', 'IT Management'),
    ('directeur informatique', 'IT Management'), ('it director', 'IT Management'),
    ('data', 'Data/Analytics'), ('pharma', 'Life Sciences'), ('finance', 'Finance'),
    ('logistique', 'Supply Chain'), ('digital', 'Digital'),
])
_B2B_INDUSTRIES = _keyword_table([
    ('sap', 'ERP/SAP'), ('erp', 'ERP/SAP'), ('cloud', 'Cloud'), ('cyber', 'Cybersecurity'),
    ('informatique', 'IT Services'), ('conseil', 'Consulting'), ('pharma', 'Pharma'),
    ('banque', 'Banking'), ('telecom', 'Telecom'), ('logistique', 'Supply Chain'),
])
_SENIORITIES = _keyword_table([
    ('directeur', 'Director'), ('director', 'Director'), ('dsi', 'C-Suite'),
    ('cto', 'C-Suite'), ('ceo', 'C-Suite'), ('dg', 'C-Suite'), ('vp', 'VP'),
    ('manager', 'Manager'), ('consultant', 'Consultant'), ('chef', 'Manager'),
])


def _store_linkedin_lead(url, title, snippet, emails, country):
    """Insert one LinkedIn profile hit into admin.linkedin_leads.

    Uses the module-level `cur`/`conn`. Returns True when a new row was
    committed, False when the hit was rejected, already stored (same
    lead_name), or the database refused it.
    """
    name = _LI_TITLE_TAIL_RE.sub('', title).strip()
    if len(name) < 3 or len(name) > 100:
        return False

    # Job title from "Name - Title | LinkedIn".
    job_match = _LI_JOB_RE.search(title)
    job = job_match.group(1).strip()[:200] if job_match else ''
    # Company from "... chez/at/pour/@ <company>" in the snippet.
    company_match = _LI_COMPANY_RE.search(snippet)
    company = company_match.group(1).strip() if company_match else ''

    full = title + " " + snippet
    city = _first_label(full, _LI_CITIES)
    industry = _first_label(full, _LI_INDUSTRIES)
    # Joined with a space so the end of the job and the start of the name
    # cannot fuse into a spurious keyword (e.g. "Head" + "Gabriel" -> "dg").
    seniority = _first_label(job + " " + name, _SENIORITIES)

    try:
        cur.execute(_SELECT_LINKEDIN_SQL, (name,))
        if cur.fetchone():
            return False
        cur.execute(
            _INSERT_LINKEDIN_SQL,
            (name, emails[0] if emails else None, company, job, industry, seniority,
             json.dumps({"url": url, "city": city, "country": country})))
        conn.commit()
    except psycopg2.Error as exc:
        # Keep the run going, but leave a trace of what was lost and why.
        conn.rollback()
        _emit(f"!LI {name}: {exc}", sys.stderr)
        return False
    _emit(f"+LI {name} | {job[:30]} | {company[:30]} | {country}")
    return True


def _store_company_lead(url, title, full, emails, phones, country):
    """Insert one company hit (one that exposed an email or phone) into
    admin.weval_leads.

    Uses the module-level `cur`/`conn`. Returns True when a new row was
    committed, False when the hit was rejected, already stored (same
    company_name and country), or the database refused it.
    """
    company = _B2B_TITLE_TAIL_RE.sub('', title).strip()[:200]
    if len(company) < 3:
        return False
    city = _first_label(full, _B2B_CITIES)
    industry = _first_label(full, _B2B_INDUSTRIES)

    try:
        cur.execute(_SELECT_COMPANY_SQL, (company, country))
        if cur.fetchone():
            return False
        cur.execute(
            _INSERT_COMPANY_SQL,
            (company, emails[0] if emails else None, phones[0] if phones else None,
             url[:255], industry, country, city))
        conn.commit()
    except psycopg2.Error as exc:
        conn.rollback()
        _emit(f"!B2B {company[:40]}: {exc}", sys.stderr)
        return False
    _emit(f"+B2B {company[:40]} | {emails[0] if emails else '-'} | {country}")
    return True


total_li = 0
total_b2b = 0
try:
    for query, country in QUERIES:
        if total_li + total_b2b >= batch:
            break
        try:
            r = requests.get(
                SEARX,
                params={"q": query, "format": "json", "engines": "google,bing,duckduckgo"},
                timeout=15, verify=False)
            r.raise_for_status()
            payload = r.json()
        except (requests.RequestException, ValueError) as exc:
            # Network failure, HTTP error status or a non-JSON body:
            # skip this query, keep the run alive.
            _emit(f"!SEARCH {query}: {exc}", sys.stderr)
            continue
        results = payload.get("results") if isinstance(payload, dict) else None
        if not isinstance(results, list):
            results = []

        for res in results[:_MAX_RESULTS_PER_QUERY]:
            # Enforce the cap per lead, not only per query, so a single query
            # cannot overshoot `batch`.
            if total_li + total_b2b >= batch:
                break
            # `or ""` also covers keys that are present but null.
            url = res.get("url") or ""
            title = res.get("title") or ""
            snippet = res.get("content") or ""
            full = title + " " + snippet

            # Extract emails/phones from title + snippet.
            emails = [e.lower() for e in _EMAIL_RE.findall(full)
                      if not any(noise in e.lower() for noise in _EMAIL_NOISE)]
            phones = _PHONE_RE.findall(full)

            if 'linkedin.com/in/' in url:
                # LinkedIn profile lead
                if _store_linkedin_lead(url, title, snippet, emails, country):
                    total_li += 1
            elif emails or phones:
                # B2B company lead
                if _store_company_lead(url, title, full, emails, phones, country):
                    total_b2b += 1
        time.sleep(_QUERY_PAUSE_SECONDS)
finally:
    # Release the database session even when the run is interrupted.
    cur.close()
    conn.close()
_emit(f"\nRESULT: +{total_li} LinkedIn leads, +{total_b2b} B2B leads")