Files
html/api/weval_b2b.py
2026-04-12 22:57:03 +02:00

165 lines
7.4 KiB
Python

#!/usr/bin/env python3
"""WEVAL B2B Scraper - Playwright on S204 via Bing"""
import os
import re
import sys
import time
import unicodedata
from urllib.parse import quote_plus

import psycopg2
from playwright.sync_api import sync_playwright
DB = dict(host="10.1.0.3", dbname="adx_system", user="admin", password="admin123")
batch = int(sys.argv[1]) if len(sys.argv) > 1 else 300
# Bing searches restricted to LinkedIn profile pages (one per target role/region).
_LINKEDIN_QUERIES = [
    "site:linkedin.com/in DSI maroc",
    "site:linkedin.com/in CTO casablanca",
    "site:linkedin.com/in directeur informatique maroc",
    "site:linkedin.com/in SAP consultant maroc",
    "site:linkedin.com/in ERP manager morocco",
    "site:linkedin.com/in supply chain director maroc",
    "site:linkedin.com/in directeur digital maroc",
    "site:linkedin.com/in cybersecurity manager maroc",
    "site:linkedin.com/in cloud architect maroc",
    "site:linkedin.com/in directeur financier maroc",
    "site:linkedin.com/in data engineer maroc",
    "site:linkedin.com/in DevOps manager maroc",
    "site:linkedin.com/in head IT morocco",
    "site:linkedin.com/in CISO maroc",
    "site:linkedin.com/in DSI tunisie",
    "site:linkedin.com/in CTO algerie",
    "site:linkedin.com/in manufacturing director morocco",
    "site:linkedin.com/in pharma director maghreb",
    "site:linkedin.com/in directeur operations maroc",
    "site:linkedin.com/in startup CEO maroc",
    "site:linkedin.com/in chief digital officer morocco",
    "site:linkedin.com/in responsable infrastructure maroc",
    "site:linkedin.com/in directeur achats maroc",
    "site:linkedin.com/in product manager maroc",
    "site:linkedin.com/in VP technology morocco",
    "site:linkedin.com/in directeur technique rabat",
    "site:linkedin.com/in IT manager casablanca",
    "site:linkedin.com/in consultant SAP marrakech",
    "site:linkedin.com/in directeur logistique maroc",
    "site:linkedin.com/in responsable qualite maroc",
]

# Unrestricted web searches whose result snippets are mined for e-mail addresses.
_EMAIL_QUERIES = [
    "DSI casablanca email contact entreprise",
    "directeur IT maroc email",
    "SSII ESN maroc email contact directeur",
    "cabinet conseil IT maroc email",
    "integrateur SAP maroc email contact",
    "entreprise IT casablanca email",
    "startup tech maroc fondateur email",
    "banque maroc direction IT email",
    "telecom maroc email IT",
    "industrie maroc email directeur IT",
]

# Full search plan, executed in this order: LinkedIn queries first, then e-mail queries.
QUERIES = _LINKEDIN_QUERIES + _EMAIL_QUERIES
# One PostgreSQL connection and cursor are shared by the whole run.
conn = psycopg2.connect(**DB)
cur = conn.cursor()

# Running totals of rows inserted into weval_leads, linkedin_leads and
# send_contacts respectively; reported in the final DONE line.
t_leads = 0
t_li = 0
t_contacts = 0

# An extracted e-mail address is discarded when it contains any of these substrings.
bl = [
    "google",
    "facebook",
    "wikipedia",
    "youtube",
    "twitter",
    "instagram",
    "example",
    "noreply",
]
def get_country(text):
    """Guess an ISO 3166-1 alpha-2 country code from free text.

    Matching is case-insensitive and accent-insensitive, so "Algérie",
    "ALGERIA" and "Alger" are all recognised. Tunisia is checked before
    Algeria; anything else defaults to Morocco.

    Args:
        text: Free text such as a result title plus snippet. ``None`` or an
            empty string is treated as having no country hint.

    Returns:
        "TN" when the text contains "tunis", "DZ" when it contains "alger",
        otherwise "MA".
    """
    # Decompose accented letters and drop the combining marks so that
    # "algérie" becomes "algerie" before the substring checks.
    decomposed = unicodedata.normalize("NFKD", (text or "").lower())
    folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    # "tunis" also covers "tunisia"/"tunisie"; "alger" also covers
    # "algeria"/"algerie".
    if "tunis" in folded:
        return "TN"
    if "alger" in folded:
        return "DZ"
    return "MA"
# ---------------------------------------------------------------------------
# Main scrape: run every query in QUERIES against Bing with headless Chromium,
# store LinkedIn profiles in admin.weval_leads / admin.linkedin_leads and any
# e-mail address found in the result text in admin.send_contacts.
# The run stops once t_leads + t_contacts reaches `batch` (checked before each
# query, so the last page processed may push the totals past the limit).
# ---------------------------------------------------------------------------

# JavaScript evaluated in the results page: returns one {href, title, snippet}
# object per `.b_algo` result element that contains a link.
_SERP_EXTRACT_JS = (
    "() => {\n"
    "const d = [];\n"
    "document.querySelectorAll('.b_algo, li.b_algo').forEach(el => {\n"
    "const a = el.querySelector('a');\n"
    "if (!a) return;\n"
    "const href = a.href || '';\n"
    "const title = a.textContent || '';\n"
    "const snippet = el.innerText.substring(0, 500);\n"
    "d.push({href, title: title.trim(), snippet});\n"
    "});\n"
    "return d;\n"
    "}"
)

_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)

# Compiled once; these run for every search result.
_EMAIL_RE = re.compile(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}')
_PARENS_RE = re.compile(r'\(.*?\)')
_LINKEDIN_SUFFIX_RE = re.compile(r'\s*LinkedIn$')

# A "last name" equal to one of these means the title was not a person's name.
_NON_NAME_TOKENS = ("linkedin", "profil", "view", "maroc", "morocco", "join", "now")


def _parse_linkedin_title(title):
    """Split a LinkedIn result title into (full_name, role, company).

    The title is cut on " - ": the first segment is the name, the second the
    role and the third (up to any "|") the company. Returns None when the
    name has fewer than two words or its remainder is a known non-name token.
    """
    segments = title.split(" - ")
    name = segments[0].split(" | ")[0].strip()
    name = _PARENS_RE.sub("", name).strip()
    name = _LINKEDIN_SUFFIX_RE.sub("", name).strip()
    parts = name.split()
    if len(parts) < 2:
        return None
    first, last = parts[0], " ".join(parts[1:])[:50]
    if last.lower() in _NON_NAME_TOKENS:
        return None
    role, company = "", ""
    if len(segments) >= 3:
        role = segments[1].strip()[:100]
        company = segments[2].split("|")[0].strip()[:100]
    elif len(segments) == 2:
        role = segments[1].strip()[:100]
    return f"{first} {last}", role, company


def _insert_if_absent(table, exists_sql, exists_params, insert_sql, insert_params):
    """Insert one row unless the duplicate probe matches; return True if committed.

    A failing INSERT is rolled back and reported on stderr (previously it was
    swallowed silently). A failing probe SELECT propagates to the per-query
    handler, which rolls the transaction back.
    """
    cur.execute(exists_sql, exists_params)
    if cur.fetchone():
        return False
    try:
        cur.execute(insert_sql, insert_params)
        conn.commit()
    except psycopg2.Error as exc:
        conn.rollback()
        print(f"DBERR {table}: {str(exc).strip()}", file=sys.stderr)
        return False
    return True


def _fetch_results(ctx, query):
    """Load the Bing results page for `query` and return the extracted result dicts."""
    page = ctx.new_page()
    try:
        # quote_plus encodes reserved characters (":" and "/" in "site:" queries),
        # not only spaces.
        search_url = f"https://www.bing.com/search?q={quote_plus(query)}&count=30"
        page.goto(search_url, timeout=15000, wait_until="domcontentloaded")
        page.wait_for_timeout(2000)  # give late-rendered results time to appear
        return page.evaluate(_SERP_EXTRACT_JS)
    finally:
        # Best-effort cleanup: a failure to close must not mask the real error
        # or discard results that were already extracted.
        try:
            page.close()
        except Exception:
            pass


def _store_result(r):
    """Persist one search result and update the module-level counters."""
    global t_leads, t_li, t_contacts
    title = r.get("title", "")
    link = r.get("href", "")
    snippet = r.get("snippet", "")
    full = f"{title} {snippet}"
    c = get_country(full)
    emails = [
        e.lower()
        for e in _EMAIL_RE.findall(full)
        if not any(x in e.lower() for x in bl)
    ]

    if "linkedin.com/in/" in link:
        parsed = _parse_linkedin_title(title)
        if parsed is None:
            # Unusable profile title: the whole result is skipped, including
            # its e-mail addresses (same as the original `continue`).
            return
        full_name, role, company = parsed
        if _insert_if_absent(
            "weval_leads",
            "SELECT 1 FROM admin.weval_leads WHERE LOWER(contact_name)=LOWER(%s) LIMIT 1",
            (full_name,),
            "INSERT INTO admin.weval_leads\n"
            "(contact_name,company_name,contact_title,email,country,linkedin_url,source,created_at)\n"
            "VALUES(%s,%s,%s,%s,%s,%s,'pw_bing',NOW())",
            (full_name, company or None, role or None, emails[0] if emails else None, c, link),
        ):
            t_leads += 1
            print(f"+LEAD {full_name} | {company} | {role}")
        if _insert_if_absent(
            "linkedin_leads",
            "SELECT 1 FROM admin.linkedin_leads WHERE LOWER(lead_name)=LOWER(%s) LIMIT 1",
            (full_name,),
            "INSERT INTO admin.linkedin_leads\n"
            "(lead_name,lead_company,lead_title,lead_linkedin_url,lead_country,captured_at)\n"
            "VALUES(%s,%s,%s,%s,%s,NOW())",
            (full_name, company or None, role or None, link, c),
        ):
            t_li += 1

    for email in emails:
        local_part, domain = email.split("@")[0], email.split("@")[1]
        if _insert_if_absent(
            "send_contacts",
            "SELECT 1 FROM admin.send_contacts WHERE email=%s LIMIT 1",
            (email,),
            "INSERT INTO admin.send_contacts\n"
            "(email,first_name,domain,country,source,status,created_at)\n"
            "VALUES(%s,%s,%s,%s,'pw_bing','active',NOW())",
            # First name is guessed from the text before the first "." of the local part.
            (email, local_part.split(".")[0].title(), domain, c),
        ):
            t_contacts += 1
            print(f"+CONTACT {email}")


try:
    with sync_playwright() as p:
        browser = p.chromium.launch(
            headless=True, args=["--no-sandbox", "--disable-dev-shm-usage"]
        )
        try:
            ctx = browser.new_context(user_agent=_USER_AGENT)
            for q in QUERIES:
                if t_leads + t_contacts >= batch:
                    break
                try:
                    for r in _fetch_results(ctx, q):
                        _store_result(r)
                    # NOTE(review): placed once per query, on the success path only,
                    # which is the most plausible reading of the original layout.
                    time.sleep(4)
                except Exception as e:
                    # Per-query boundary: report and move on to the next query.
                    print(f"ERR {q[:30]}: {e}")
                    # A failed statement leaves the transaction aborted; without a
                    # rollback every later statement on this connection would fail.
                    try:
                        conn.rollback()
                    except psycopg2.Error as rb_exc:
                        print(f"DBERR rollback: {str(rb_exc).strip()}", file=sys.stderr)
        finally:
            browser.close()
finally:
    # Release the database resources even when the scrape aborts early.
    cur.close()
    conn.close()
print(f"\nDONE: +{t_leads} leads, +{t_li} LinkedIn, +{t_contacts} contacts")