Files
weval-consulting/api/pw_b2b_proper.py

133 lines
5.2 KiB
Python

#!/usr/bin/env python3
"""WEVAL B2B Lead Scraper - targeted directories
Sources: rekrute.com and emploi.ma (companies hiring for SAP/ERP/cloud roles),
charika.ma (business registry). kerix.net is listed as a target but is not scraped here.
"""
import re,sys,time,psycopg2,json
from playwright.sync_api import sync_playwright
# NOTE(review): connection settings, including the password, are hard-coded
# here; consider loading them from the environment or a secrets store.
DB = dict(host="10.1.0.3", dbname="adx_system", user="admin", password="admin123")

DEFAULT_BATCH = 200     # lead cap used when no command-line argument is given
NAV_TIMEOUT_MS = 15000  # timeout passed to page.goto
RENDER_WAIT_MS = 3000   # fixed wait after navigation, before the page script runs
PAGE_DELAY_S = 3        # pause after each processed results page
MIN_NAME_LEN = 3
MAX_NAME_LEN = 100

USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0"

CREATE_TABLE_SQL = """CREATE TABLE IF NOT EXISTS admin.weval_leads(
id SERIAL PRIMARY KEY,company_name VARCHAR(255),contact_name VARCHAR(255),
contact_title VARCHAR(255),email VARCHAR(255),phone VARCHAR(100),
website VARCHAR(500),industry VARCHAR(100),country VARCHAR(10),
city VARCHAR(100),source VARCHAR(50),linkedin_url VARCHAR(500),
notes TEXT,created_at TIMESTAMP DEFAULT NOW())"""

DEDUP_SQL = "SELECT 1 FROM admin.weval_leads WHERE company_name=%s LIMIT 1"
REKRUTE_INSERT = "INSERT INTO admin.weval_leads(company_name,industry,country,city,source,notes,created_at) VALUES(%s,%s,'MA',%s,'rekrute',%s,NOW())"
EMPLOI_INSERT = "INSERT INTO admin.weval_leads(company_name,industry,country,city,source,created_at) VALUES(%s,%s,'MA',%s,'emploi_ma',NOW())"
CHARIKA_INSERT = "INSERT INTO admin.weval_leads(company_name,website,industry,country,source,created_at) VALUES(%s,%s,%s,'MA','charika',NOW())"

# 1. REKRUTE.COM - companies hiring SAP/ERP/Cloud = WEVAL prospects
REKRUTE_QUERIES = ["sap", "erp", "cloud", "cybersecurite", "supply+chain", "transformation+digitale",
                   "directeur+informatique", "dsi", "data+engineer", "devops", "infrastructure"]
# 2. EMPLOI.MA
EMPLOI_QUERIES = ["sap", "erp", "cloud", "cybersecurite", "data", "devops"]
# 3. CHARIKA.MA business registry
CHARIKA_SECTORS = ["informatique", "telecom", "conseil", "industrie", "pharmaceutique"]

# In-page extraction scripts. Each returns a list of plain objects that
# Playwright hands back to Python as dicts.
REKRUTE_JS = """()=>{
const r=[];
document.querySelectorAll('.post-id, .section, .result-item, article, [class*=offer], [class*=job]').forEach(el=>{
const t=el.innerText||'';
// Company name usually in bold or specific class
const co=(el.querySelector('.company, .recruiter, h3, .enterprise, b') || {}).textContent || '';
const city=(t.match(/(Casablanca|Rabat|Marrakech|Fes|Tanger|Agadir|Meknes|Kenitra|Oujda|Tetouan)/i)||[''])[0];
if(co.length>2 && co.length<100) r.push({company:co.trim(),city:city,text:t.substring(0,300)});
});
return r.slice(0,30);
}"""

EMPLOI_JS = """()=>{
const r=[];
document.querySelectorAll('.views-row, .job-item, article, .search-result').forEach(el=>{
const co=(el.querySelector('.company-name, .recruiter, .field-name-field-company a, h3 a') || {}).textContent || '';
const city=(el.innerText.match(/(Casablanca|Rabat|Marrakech|Fes|Tanger|Agadir)/i)||[''])[0];
if(co.length>2 && co.length<100) r.push({company:co.trim(),city:city});
});
return r.slice(0,20);
}"""

CHARIKA_JS = """()=>{
const r=[];
document.querySelectorAll('a[href*="/entreprise/"], .company-name, .result-title, h2 a, h3 a').forEach(el=>{
const nm=(el.textContent||'').trim();
const hr=el.href||'';
if(nm.length>2 && nm.length<100) r.push({name:nm,url:hr});
});
return r.slice(0,20);
}"""


def parse_batch(argv: list) -> int:
    """Return the lead cap from ``argv[1]``, or ``DEFAULT_BATCH`` when absent.

    Raises:
        ValueError: if ``argv[1]`` is present but is not an integer.
    """
    return int(argv[1]) if len(argv) > 1 else DEFAULT_BATCH


def clean_name(raw) -> str:
    """Strip a scraped company name; return ``""`` when it is unusable.

    A name is usable when, after stripping, it is between ``MIN_NAME_LEN``
    and ``MAX_NAME_LEN`` characters long. ``None`` is treated as empty.
    """
    name = (raw or "").strip()
    if MIN_NAME_LEN <= len(name) <= MAX_NAME_LEN:
        return name
    return ""


def industry_for_keyword(kw: str) -> str:
    """Map a rekrute.com search keyword to the industry label stored on the lead."""
    if "sap" in kw or "erp" in kw:
        return "ERP/SAP"
    if "cloud" in kw:
        return "Cloud"
    return "IT Services"


def rekrute_row(term: str, item: dict) -> tuple:
    """Build ``(name, insert_sql, params, log_line)`` for one rekrute.com result."""
    name = clean_name(item.get("company"))
    industry = industry_for_keyword(term)
    city = item.get("city", "")
    params = (name, industry, city, json.dumps({"keyword": term}))
    return name, REKRUTE_INSERT, params, f"+REKRUTE {name} | {industry} | {city}"


def emploi_row(term: str, item: dict) -> tuple:
    """Build ``(name, insert_sql, params, log_line)`` for one emploi.ma result."""
    name = clean_name(item.get("company"))
    city = item.get("city", "")
    return name, EMPLOI_INSERT, (name, "IT Services", city), f"+EMPLOI {name} | {city}"


def charika_row(term: str, item: dict) -> tuple:
    """Build ``(name, insert_sql, params, log_line)`` for one charika.ma result."""
    name = clean_name(item.get("name"))
    params = (name, item.get("url", "")[:255], term.title())
    return name, CHARIKA_INSERT, params, f"+CHARIKA {name} | {term}"


# (search terms, URL template, in-page script, row builder) per source, in
# the order they are scraped.
SOURCES = (
    (REKRUTE_QUERIES, "https://www.rekrute.com/offres-emploi-{}.html", REKRUTE_JS, rekrute_row),
    (EMPLOI_QUERIES, "https://www.emploi.ma/recherche-emploi-maroc/q-{}", EMPLOI_JS, emploi_row),
    (CHARIKA_SECTORS, "https://www.charika.ma/recherche?q={}", CHARIKA_JS, charika_row),
)


def fetch_listing(ctx, url: str, script: str) -> list:
    """Load ``url`` in a fresh page, run ``script`` in it and return the result.

    The page is closed on every path, so a failed navigation cannot leak a
    tab or leave a stale page handle behind for the next request.
    """
    page = ctx.new_page()
    try:
        page.goto(url, timeout=NAV_TIMEOUT_MS)
        page.wait_for_timeout(RENDER_WAIT_MS)
        return page.evaluate(script) or []
    finally:
        page.close()


def save_lead(conn, cur, company: str, insert_sql: str, params: tuple) -> bool:
    """Insert one lead unless ``company`` is already stored.

    Returns True only when a new row was committed. Any failure in the
    duplicate check or the insert is reported on stderr and rolled back, so
    the connection is never left in an aborted transaction.
    """
    try:
        cur.execute(DEDUP_SQL, (company,))
        if cur.fetchone():
            return False
        cur.execute(insert_sql, params)
        conn.commit()
        return True
    except Exception as exc:  # best-effort: one bad row must not stop the run
        conn.rollback()
        print(f"!DB {company}: {exc}", file=sys.stderr)
        return False


def scrape(ctx, conn, cur, batch: int) -> int:
    """Scrape every source in ``SOURCES`` and return the number of leads added.

    Stops as soon as ``batch`` leads have been inserted; the cap is checked
    before each page and before each row, so it is never exceeded.
    """
    total = 0
    for terms, url_template, script, build_row in SOURCES:
        for term in terms:
            if total >= batch:
                return total
            url = url_template.format(term)
            try:
                items = fetch_listing(ctx, url, script)
            except Exception as exc:  # best-effort: skip pages that fail to load
                print(f"!FETCH {url}: {exc}", file=sys.stderr)
                continue
            for item in items:
                if total >= batch:
                    break
                name, insert_sql, params, log_line = build_row(term, item)
                if not name:
                    continue
                if save_lead(conn, cur, name, insert_sql, params):
                    total += 1
                    print(log_line)
            # Presumably a politeness delay between result pages.
            time.sleep(PAGE_DELAY_S)
    return total


def main() -> None:
    """Create the leads table if needed, run the scrape and print the tally.

    The optional first command-line argument caps the number of leads to add.
    """
    batch = parse_batch(sys.argv)
    conn = psycopg2.connect(**DB)
    try:
        cur = conn.cursor()
        try:
            cur.execute(CREATE_TABLE_SQL)
            conn.commit()
            with sync_playwright() as p:
                browser = p.chromium.launch(headless=True, args=["--no-sandbox", "--disable-dev-shm-usage"])
                try:
                    ctx = browser.new_context(user_agent=USER_AGENT)
                    total = scrape(ctx, conn, cur, batch)
                finally:
                    browser.close()
        finally:
            cur.close()
    finally:
        conn.close()
    print(f"\nWEVAL_B2B: +{total} leads")


if __name__ == "__main__":
    main()