Files
html/api/pw_b2b_scraper.py
2026-04-12 22:57:03 +02:00

154 lines
6.8 KiB
Python

#!/usr/bin/env python3
"""B2B Company scraper for WEVAL consulting leads
Targets: Companies in Maghreb using ERP/SAP/Cloud/IT services
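Sources: Bing (general searches plus site: queries on Kerix, Charika and job boards);
DuckDuckGo and Kompass are also named as sources, but no query in this file uses them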
Feeds into: admin.weval_leads table
"""
import re, sys, time, psycopg2, json
from playwright.sync_api import sync_playwright
# Connection settings for the PostgreSQL database that stores the leads.
# NOTE(review): credentials are hard-coded in source -- consider moving them
# to environment variables or a secrets store.
DB = {
    "host": "10.1.0.3",
    "dbname": "adx_system",
    "user": "admin",
    "password": "admin123",
}

# Target lead count for this run: first CLI argument when given, else 100.
batch = 100 if len(sys.argv) <= 1 else int(sys.argv[1])

# DDL for the destination table; IF NOT EXISTS makes it safe to run on
# every start.
_CREATE_LEADS_TABLE_SQL = """CREATE TABLE IF NOT EXISTS admin.weval_leads (
id SERIAL PRIMARY KEY,
company_name VARCHAR(255),
contact_name VARCHAR(255),
contact_title VARCHAR(255),
email VARCHAR(255),
phone VARCHAR(100),
website VARCHAR(255),
industry VARCHAR(100),
country VARCHAR(10),
city VARCHAR(100),
source VARCHAR(50),
linkedin_url VARCHAR(500),
notes TEXT,
created_at TIMESTAMP DEFAULT NOW()
)"""

# Open the shared connection/cursor and make sure the table exists before
# any scraping starts.
conn = psycopg2.connect(**DB)
cur = conn.cursor()
cur.execute(_CREATE_LEADS_TABLE_SQL)
conn.commit()
# Search plan, grouped by target country in the order the searches run.
# Each group is (ISO country code, [search phrases]).
_SEARCH_GROUPS = [
    # Moroccan companies
    ("MA", [
        "entreprise ERP SAP maroc casablanca site contactez email",
        "societe informatique maroc rabat directeur email",
        "entreprise cloud computing maroc casablanca",
        "societe conseil IT maroc transformation digitale",
        "industrie pharmaceutique maroc laboratoire directeur",
        "supply chain manager maroc logistique directeur",
        "manufacturing maroc usine directeur industriel email",
        "banque maroc DSI directeur systemes information",
        "telecom maroc IT manager email contact",
        "agroalimentaire maroc directeur general email",
    ]),
    # Kerix / Charika business directories
    ("MA", [
        "site:kerix.net informatique maroc",
        "site:charika.ma informatique technologie",
    ]),
    # Tunisia
    ("TN", [
        "entreprise informatique tunisie tunis directeur email",
        "societe ERP SAP tunisie email contact",
        "IT company tunisia tunis CTO email",
        "industrie tunisie directeur general email",
    ]),
    # Algeria
    ("DZ", [
        "entreprise informatique algerie alger directeur email",
        "societe IT algerie ERP SAP contact",
        "industrie algerie directeur general email",
    ]),
    # Job boards (companies hiring = active IT needs)
    ("MA", [
        "site:rekrute.com SAP consultant maroc",
        "site:emploi.ma ERP consultant directeur",
    ]),
    ("TN", [
        "site:tanitjobs.com IT manager tunisie",
    ]),
]

# Flattened (engine, query, country) triples consumed by the scraping loop.
# Every entry currently uses the "bing" engine.
QUERIES = []
for _country, _phrases in _SEARCH_GROUPS:
    QUERIES.extend(("bing", _phrase, _country) for _phrase in _phrases)
# JavaScript run inside the Bing results page. For every organic result
# (".b_algo") it returns the title, snippet, link, and any e-mail addresses
# or Maghreb-style phone numbers found in the title + snippet text.
_EXTRACT_RESULTS_JS = r"""() => {
const data = [];
document.querySelectorAll('li.b_algo, .b_algo').forEach(el => {
const link = el.querySelector('a');
const href = link ? link.href : '';
const title = (link ? link.textContent : '').trim();
const snippet = (el.querySelector('.b_caption p, .b_lineclamp2') || {}).textContent || '';
const full = title + ' ' + snippet;
// Extract emails
const emails = full.match(/[\w.+-]+@[\w.-]+\.[a-z]{2,}/gi) || [];
// Extract phones
const phones = full.match(/(?:\+212|\+216|\+213|0)[0-9 .-]{8,14}/g) || [];
if (title.length > 5) {
data.push({title, snippet, url: href, emails, phones});
}
});
return data;
}"""

_DEDUP_SQL = "SELECT 1 FROM admin.weval_leads WHERE company_name=%s AND country=%s LIMIT 1"

_INSERT_SQL = """INSERT INTO admin.weval_leads
(company_name, email, phone, website, industry, country, city, source, notes, created_at)
VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,NOW())"""

# E-mail addresses containing any of these fragments are discarded.
_EMAIL_BLOCKLIST = ('google', 'bing', 'example', 'facebook', 'wikipedia')

# Cities looked for (case-insensitively) in title + snippet; first hit wins.
_KNOWN_CITIES = ('Casablanca', 'Rabat', 'Marrakech', 'Fes', 'Tanger', 'Agadir',
                 'Tunis', 'Sfax', 'Sousse', 'Alger', 'Oran')

# (keyword, industry label) pairs, checked in order against the lower-cased
# query + title + snippet; first hit wins.
_INDUSTRY_KEYWORDS = (
    ('sap', 'ERP/SAP'), ('erp', 'ERP/SAP'), ('cloud', 'Cloud'), ('cyber', 'Cybersecurity'),
    ('pharma', 'Pharma'), ('banque', 'Banking'), ('telecom', 'Telecom'), ('agro', 'Agro'),
    ('logistique', 'Supply Chain'), ('supply', 'Supply Chain'),
    # 'manufactur' (not 'manufacture') so that "manufacturing" matches too.
    ('manufactur', 'Manufacturing'),
    ('informatique', 'IT Services'), ('it ', 'IT Services'), ('conseil', 'Consulting'),
)


def _search_url(query):
    """Return the Bing search URL for *query*, asking for 30 results."""
    from urllib.parse import quote_plus

    # ':' is kept literal so "site:" operators look exactly as typed; every
    # other reserved character is percent-encoded and spaces become '+'.
    return f"https://www.bing.com/search?q={quote_plus(query, safe=':')}&count=30"


def _fetch_results(ctx, query):
    """Open a page in *ctx*, run the Bing search and return the scraped results.

    The page is closed whether or not the navigation/evaluation succeeds.
    """
    page = ctx.new_page()
    try:
        page.goto(_search_url(query), timeout=15000)
        page.wait_for_timeout(3000)
        return page.evaluate(_EXTRACT_RESULTS_JS)
    finally:
        page.close()


def _company_from_title(title):
    """Derive a company name from a result title (max 200 characters).

    Everything from the first '-' or '|' separator that is followed by a
    known site name (bing, google, linkedin, rekrute, emploi) is removed.
    """
    cleaned = re.sub(r'\s*[-|].*(bing|google|linkedin|rekrute|emploi).*', '', title, flags=re.I)
    return cleaned.strip()[:200]


def _build_lead(result, query, country):
    """Convert one scraped result into the parameter tuple for ``_INSERT_SQL``.

    Returns ``None`` when the derived company name is shorter than 3
    characters.
    """
    title = result.get('title', '')
    snippet = result.get('snippet', '')

    company = _company_from_title(title)
    if len(company) < 3:
        return None

    emails = [e for e in result.get('emails', [])
              if not any(blocked in e.lower() for blocked in _EMAIL_BLOCKLIST)]
    phones = result.get('phones', [])

    city_text = (title + snippet).lower()
    city = next((c for c in _KNOWN_CITIES if c.lower() in city_text), '')

    industry_text = (query + title + snippet).lower()
    industry = next((label for kw, label in _INDUSTRY_KEYWORDS if kw in industry_text), '')

    return (
        company,
        emails[0] if emails else None,
        phones[0] if phones else None,
        result.get('url', '')[:255],
        industry,
        country,
        city,
        'bing_scraper',
        json.dumps({"query": query[:80], "snippet": snippet[:200]}),
    )


# Main scraping loop: run each query, store new leads, stop at `batch` leads.
total = 0
try:
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True, args=["--no-sandbox","--disable-dev-shm-usage"])
        try:
            ctx = browser.new_context(user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0")
            # The engine field is ignored: every search goes through Bing.
            for _engine, query, country in QUERIES:
                if total >= batch:
                    break
                try:
                    results = _fetch_results(ctx, query)
                    for result in results:
                        # Re-check per result so one query cannot overshoot
                        # the requested batch size.
                        if total >= batch:
                            break
                        row = _build_lead(result, query, country)
                        if row is None:
                            continue
                        company, email, industry = row[0], row[1], row[4]

                        # Skip companies already stored for this country.
                        cur.execute(_DEDUP_SQL, (company, country))
                        if cur.fetchone():
                            continue

                        try:
                            cur.execute(_INSERT_SQL, row)
                            conn.commit()
                            total += 1
                            print(f"+B2B {company[:40]} | {industry} | {country} | {email or '-'}")
                        except Exception as exc:
                            # Best effort: drop this lead, keep the run going.
                            conn.rollback()
                            print(f"!B2B insert failed for {company[:40]}: {exc!r}", file=sys.stderr)
                    # Pause between queries to avoid hammering the search engine.
                    time.sleep(4)
                except Exception as exc:
                    # A failed query must not stop the run, but it must be
                    # visible, and the transaction must be reset: after a
                    # failed statement psycopg2 rejects every further
                    # command until rollback() is called.
                    print(f"!B2B query failed ({query[:60]}): {exc!r}", file=sys.stderr)
                    try:
                        conn.rollback()
                    except psycopg2.Error as rb_exc:
                        print(f"!B2B rollback failed: {rb_exc!r}", file=sys.stderr)
        finally:
            browser.close()
finally:
    # Release database resources and report, even on Ctrl+C.
    cur.close()
    conn.close()
    print(f"\nWEVAL_B2B: +{total} leads")