154 lines
6.8 KiB
Python
154 lines
6.8 KiB
Python
#!/usr/bin/env python3
"""B2B Company scraper for WEVAL consulting leads
Targets: Companies in Maghreb using ERP/SAP/Cloud/IT services
Sources: Bing, DuckDuckGo, Kompass, Charika, Kerix
Feeds into: admin.weval_leads table
"""
import json
import os
import re
import sys
import time
from urllib.parse import quote_plus

import psycopg2
from playwright.sync_api import sync_playwright

def parse_batch(argv, default=100):
    """Return the lead cap requested on the command line.

    ``argv`` is a ``sys.argv``-style list.  Its second element, when present,
    is read as the maximum number of leads to insert during this run.

    Returns ``default`` when no argument was given.  A non-numeric argument
    also yields ``default`` (with a warning on stderr) instead of aborting
    the whole run with a traceback.
    """
    if len(argv) < 2:
        return default
    try:
        return int(argv[1])
    except ValueError:
        print(f"WEVAL_B2B: ignoring non-numeric batch size {argv[1]!r}; "
              f"using {default}", file=sys.stderr)
        return default


# Connection settings passed to psycopg2.connect().  Every value can be
# overridden through a WEVAL_DB_* environment variable; the fallbacks are the
# values that were previously hard-coded here.
# NOTE(review): the fallback password is still stored in source control --
# set WEVAL_DB_PASSWORD in the deployment environment and rotate it.
DB = dict(
    host=os.environ.get("WEVAL_DB_HOST", "10.1.0.3"),
    dbname=os.environ.get("WEVAL_DB_NAME", "adx_system"),
    user=os.environ.get("WEVAL_DB_USER", "admin"),
    password=os.environ.get("WEVAL_DB_PASSWORD", "admin123"),
)

# Maximum number of new leads to insert in this run (first CLI argument).
batch = parse_batch(sys.argv)

# Ensure the destination table exists before any scraping starts.
# `conn` and `cur` stay open and are reused by the scraping loop below.
conn = psycopg2.connect(**DB)
cur = conn.cursor()
cur.execute("""CREATE TABLE IF NOT EXISTS admin.weval_leads (
    id SERIAL PRIMARY KEY,
    company_name VARCHAR(255),
    contact_name VARCHAR(255),
    contact_title VARCHAR(255),
    email VARCHAR(255),
    phone VARCHAR(100),
    website VARCHAR(255),
    industry VARCHAR(100),
    country VARCHAR(10),
    city VARCHAR(100),
    source VARCHAR(50),
    linkedin_url VARCHAR(500),
    notes TEXT,
    created_at TIMESTAMP DEFAULT NOW()
)""")
# Commit right away so the table is visible even if the run later fails.
conn.commit()

# Search plan: one (engine, query text, ISO country code) tuple per search.
# The country code is stored with every lead found through that query.
QUERIES = [
    # Morocco: companies
    ("bing", "entreprise ERP SAP maroc casablanca site contactez email", "MA"),
    ("bing", "societe informatique maroc rabat directeur email", "MA"),
    ("bing", "entreprise cloud computing maroc casablanca", "MA"),
    ("bing", "societe conseil IT maroc transformation digitale", "MA"),
    ("bing", "industrie pharmaceutique maroc laboratoire directeur", "MA"),
    ("bing", "supply chain manager maroc logistique directeur", "MA"),
    ("bing", "manufacturing maroc usine directeur industriel email", "MA"),
    ("bing", "banque maroc DSI directeur systemes information", "MA"),
    ("bing", "telecom maroc IT manager email contact", "MA"),
    ("bing", "agroalimentaire maroc directeur general email", "MA"),
    # Business directories (Kerix, Charika), reached through site: filters
    ("bing", "site:kerix.net informatique maroc", "MA"),
    ("bing", "site:charika.ma informatique technologie", "MA"),
    # Tunisia
    ("bing", "entreprise informatique tunisie tunis directeur email", "TN"),
    ("bing", "societe ERP SAP tunisie email contact", "TN"),
    ("bing", "IT company tunisia tunis CTO email", "TN"),
    ("bing", "industrie tunisie directeur general email", "TN"),
    # Algeria
    ("bing", "entreprise informatique algerie alger directeur email", "DZ"),
    ("bing", "societe IT algerie ERP SAP contact", "DZ"),
    ("bing", "industrie algerie directeur general email", "DZ"),
    # Job boards (companies hiring = active IT needs)
    ("bing", "site:rekrute.com SAP consultant maroc", "MA"),
    ("bing", "site:emploi.ma ERP consultant directeur", "MA"),
    ("bing", "site:tanitjobs.com IT manager tunisie", "TN"),
]

# ---------------------------------------------------------------------------
# Scraping run: for every entry in QUERIES, load the Bing result page, pull
# title / snippet / e-mails / phones out of each organic result, derive a
# company name, city and industry, and insert companies that are not yet in
# admin.weval_leads.  Stops once `batch` leads have been inserted.
# ---------------------------------------------------------------------------

# JavaScript evaluated inside the result page.  Returns one object per
# `.b_algo` result whose title is longer than 5 characters:
# {title, snippet, url, emails, phones}.
_EXTRACT_JS = r"""() => {
    const data = [];
    document.querySelectorAll('li.b_algo, .b_algo').forEach(el => {
        const link = el.querySelector('a');
        const href = link ? link.href : '';
        const title = (link ? link.textContent : '').trim();
        const snippet = (el.querySelector('.b_caption p, .b_lineclamp2') || {}).textContent || '';
        const full = title + ' ' + snippet;

        // Extract emails
        const emails = full.match(/[\w.+-]+@[\w.-]+\.[a-z]{2,}/gi) || [];
        // Extract phones
        const phones = full.match(/(?:\+212|\+216|\+213|0)[0-9 .-]{8,14}/g) || [];

        if (title.length > 5) {
            data.push({title, snippet, url: href, emails, phones});
        }
    });
    return data;
}"""

# Strips a trailing " - <site name>" / " | <site name>" part from a result
# title when that part mentions a search engine or job board.
_COMPANY_SUFFIX_RE = re.compile(
    r'\s*[-|].*(bing|google|linkedin|rekrute|emploi).*', re.I)

# An e-mail address containing any of these substrings is discarded.
_EMAIL_BLOCKLIST = ('google', 'bing', 'example', 'facebook', 'wikipedia')

# Cities are matched as whole words: plain substring tests reported "Tunis"
# for "tunisie", "Alger" for "algerie", "Oran" for "orange" and "Fes" for
# "professionnel".
_CITY_RULES = [
    (re.compile(r'\b' + re.escape(name) + r'\b', re.I), name)
    for name in ('Casablanca', 'Rabat', 'Marrakech', 'Fes', 'Tanger', 'Agadir',
                 'Tunis', 'Sfax', 'Sousse', 'Alger', 'Oran')
]

# Industry rules, checked in order against lower-cased text; first hit wins.
# Short tokens use word boundaries ("erp" is inside "enterprise", "it " is
# inside "site "), and "manufactur" covers both "manufacture" and
# "manufacturing".
_INDUSTRY_RULES = [
    (re.compile(pattern), label)
    for pattern, label in (
        (r'\bsap\b', 'ERP/SAP'), (r'\berp\b', 'ERP/SAP'),
        ('cloud', 'Cloud'), ('cyber', 'Cybersecurity'),
        ('pharma', 'Pharma'), ('banque', 'Banking'),
        ('telecom', 'Telecom'), ('agro', 'Agro'),
        ('logistique', 'Supply Chain'), ('supply', 'Supply Chain'),
        ('manufactur', 'Manufacturing'),
        ('informatique', 'IT Services'), (r'\bit\b', 'IT Services'),
        ('conseil', 'Consulting'),
    )
]


def _first_match(rules, text):
    """Return the label of the first (pattern, label) rule found in text, or ''."""
    for pattern, label in rules:
        if pattern.search(text):
            return label
    return ''


def _build_lead(result, query, country):
    """Turn one scraped result dict into a lead dict, or None if unusable.

    `result` is one object returned by _EXTRACT_JS.  A result is unusable
    when the company name derived from its title is shorter than 3 characters.
    The industry is inferred from the query text as well as the title and
    snippet; the city from the title and snippet only.
    """
    title = result.get('title', '')
    snippet = result.get('snippet', '')

    company = _COMPANY_SUFFIX_RE.sub('', title).strip()[:200]
    if len(company) < 3:
        return None

    emails = [addr for addr in result.get('emails', [])
              if not any(bad in addr.lower() for bad in _EMAIL_BLOCKLIST)]
    # The phone pattern can end on a separator character; trim it.
    phones = [num.strip(' .-') for num in result.get('phones', [])]

    # Join with spaces so that word boundaries survive the concatenation.
    return {
        'company': company,
        'email': emails[0] if emails else None,
        'phone': phones[0] if phones else None,
        'website': result.get('url', '')[:255],
        'industry': _first_match(_INDUSTRY_RULES,
                                 ' '.join((query, title, snippet)).lower()),
        'country': country,
        'city': _first_match(_CITY_RULES, title + ' ' + snippet),
        'notes': json.dumps({"query": query[:80], "snippet": snippet[:200]}),
    }


def _store_lead(lead):
    """Insert `lead` unless the company/country pair already exists.

    Returns True when a row was inserted and committed, False for a
    duplicate.  Database errors propagate to the caller, which rolls back.
    """
    cur.execute("SELECT 1 FROM admin.weval_leads WHERE company_name=%s AND country=%s LIMIT 1",
                (lead['company'], lead['country']))
    if cur.fetchone():
        return False
    cur.execute("""INSERT INTO admin.weval_leads
        (company_name, email, phone, website, industry, country, city, source, notes, created_at)
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,NOW())""",
                (lead['company'], lead['email'], lead['phone'], lead['website'],
                 lead['industry'], lead['country'], lead['city'], 'bing_scraper',
                 lead['notes']))
    conn.commit()
    return True


def _fetch_results(ctx, query):
    """Load the Bing result page for `query` and return the extracted results."""
    page = ctx.new_page()
    try:
        # quote_plus escapes characters such as '&', '#' or ':' that a plain
        # space-to-plus replacement would leave unescaped in the URL.
        page.goto(f"https://www.bing.com/search?q={quote_plus(query)}&count=30",
                  timeout=15000)
        # Fixed wait so the result list has time to render.
        page.wait_for_timeout(3000)
        return page.evaluate(_EXTRACT_JS)
    finally:
        page.close()


total = 0

try:
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True, args=["--no-sandbox", "--disable-dev-shm-usage"])
        try:
            ctx = browser.new_context(user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0")

            for engine, query, country in QUERIES:
                if total >= batch:
                    break
                if engine != "bing":
                    # Only Bing is implemented; do not send other engines'
                    # queries to Bing silently.
                    print(f"WEVAL_B2B: unsupported engine {engine!r}, skipping {query[:60]!r}",
                          file=sys.stderr)
                    continue

                try:
                    results = _fetch_results(ctx, query)
                except Exception as exc:
                    # One failing query (timeout, navigation error, ...) must
                    # not stop the run, but it has to be visible in the logs.
                    print(f"WEVAL_B2B: query failed ({query[:60]!r}): {exc}", file=sys.stderr)
                    results = []

                for result in results:
                    # Enforce the cap per insert, not only per query.
                    if total >= batch:
                        break
                    lead = _build_lead(result, query, country)
                    if lead is None:
                        continue
                    try:
                        added = _store_lead(lead)
                    except psycopg2.Error as exc:
                        # Roll back so the connection is usable for the next row.
                        conn.rollback()
                        print(f"WEVAL_B2B: insert failed for {lead['company'][:40]!r}: {exc}",
                              file=sys.stderr)
                        continue
                    if added:
                        total += 1
                        em = lead['email'] or '-'
                        print(f"+B2B {lead['company'][:40]} | {lead['industry']} | {country} | {em}")

                # Pause between searches, after failed queries as well.
                time.sleep(4)
        finally:
            browser.close()
finally:
    cur.close()
    conn.close()

print(f"\nWEVAL_B2B: +{total} leads")