#!/usr/bin/env python3
"""WEVAL B2B Scraper - Playwright on S204 via Bing.

Runs a fixed list of Bing searches in headless Chromium, parses every result
for LinkedIn profile details and e-mail addresses, and inserts the ones that
are not stored yet into ``admin.weval_leads``, ``admin.linkedin_leads`` and
``admin.send_contacts``.

Command line: one optional integer argument, the batch size (default 300).
No further query is started once new leads + new contacts reach that number.
"""
import re
import sys
import time
import unicodedata
from urllib.parse import quote_plus

# NOTE(review): credentials are hard-coded in source; consider moving them to
# the environment or a secrets store.
DB = dict(host="10.1.0.3", dbname="adx_system", user="admin", password="admin123")

DEFAULT_BATCH = 300
PAUSE_SECONDS = 4  # delay between two Bing queries

QUERIES = [
    "site:linkedin.com/in DSI maroc",
    "site:linkedin.com/in CTO casablanca",
    "site:linkedin.com/in directeur informatique maroc",
    "site:linkedin.com/in SAP consultant maroc",
    "site:linkedin.com/in ERP manager morocco",
    "site:linkedin.com/in supply chain director maroc",
    "site:linkedin.com/in directeur digital maroc",
    "site:linkedin.com/in cybersecurity manager maroc",
    "site:linkedin.com/in cloud architect maroc",
    "site:linkedin.com/in directeur financier maroc",
    "site:linkedin.com/in data engineer maroc",
    "site:linkedin.com/in DevOps manager maroc",
    "site:linkedin.com/in head IT morocco",
    "site:linkedin.com/in CISO maroc",
    "site:linkedin.com/in DSI tunisie",
    "site:linkedin.com/in CTO algerie",
    "site:linkedin.com/in manufacturing director morocco",
    "site:linkedin.com/in pharma director maghreb",
    "site:linkedin.com/in directeur operations maroc",
    "site:linkedin.com/in startup CEO maroc",
    "site:linkedin.com/in chief digital officer morocco",
    "site:linkedin.com/in responsable infrastructure maroc",
    "site:linkedin.com/in directeur achats maroc",
    "site:linkedin.com/in product manager maroc",
    "site:linkedin.com/in VP technology morocco",
    "site:linkedin.com/in directeur technique rabat",
    "site:linkedin.com/in IT manager casablanca",
    "site:linkedin.com/in consultant SAP marrakech",
    "site:linkedin.com/in directeur logistique maroc",
    "site:linkedin.com/in responsable qualite maroc",
    "DSI casablanca email contact entreprise",
    "directeur IT maroc email",
    "SSII ESN maroc email contact directeur",
    "cabinet conseil IT maroc email",
    "integrateur SAP maroc email contact",
    "entreprise IT casablanca email",
    "startup tech maroc fondateur email",
    "banque maroc direction IT email",
    "telecom maroc email IT",
    "industrie maroc email directeur IT",
]

USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)

# Runs in the page: one {href, title, snippet} object per Bing result entry.
RESULTS_JS = """() => {
    const d = [];
    document.querySelectorAll('.b_algo, li.b_algo').forEach(el => {
        const a = el.querySelector('a');
        if (!a) return;
        const href = a.href || '';
        const title = a.textContent || '';
        const snippet = el.innerText.substring(0, 500);
        d.push({href, title: title.trim(), snippet});
    });
    return d;
}"""

# Case-insensitive so that "X@Y.COM" is found too; matches are lowercased later.
EMAIL_RE = re.compile(r"[\w.+-]+@[\w.-]+\.[a-z]{2,}", re.IGNORECASE)

# An address containing any of these substrings is discarded.
EMAIL_BLOCKLIST = (
    "google", "facebook", "wikipedia", "youtube",
    "twitter", "instagram", "example", "noreply",
)

# If everything after the first word of a "name" equals one of these, the
# title did not contain a person's name.
NON_NAME_WORDS = frozenset(
    {"linkedin", "profil", "view", "maroc", "morocco", "join", "now"}
)

LEAD_EXISTS_SQL = (
    "SELECT 1 FROM admin.weval_leads WHERE LOWER(contact_name)=LOWER(%s) LIMIT 1"
)
LEAD_INSERT_SQL = """INSERT INTO admin.weval_leads
    (contact_name,company_name,contact_title,email,country,linkedin_url,source,created_at)
    VALUES(%s,%s,%s,%s,%s,%s,'pw_bing',NOW())"""

LINKEDIN_EXISTS_SQL = (
    "SELECT 1 FROM admin.linkedin_leads WHERE LOWER(lead_name)=LOWER(%s) LIMIT 1"
)
LINKEDIN_INSERT_SQL = """INSERT INTO admin.linkedin_leads
    (lead_name,lead_company,lead_title,lead_linkedin_url,lead_country,captured_at)
    VALUES(%s,%s,%s,%s,%s,NOW())"""

CONTACT_EXISTS_SQL = "SELECT 1 FROM admin.send_contacts WHERE email=%s LIMIT 1"
CONTACT_INSERT_SQL = """INSERT INTO admin.send_contacts
    (email,first_name,domain,country,source,status,created_at)
    VALUES(%s,%s,%s,%s,'pw_bing','active',NOW())"""


def get_country(text: str) -> str:
    """Guess a two-letter country code from free text.

    Returns "TN" if the text mentions "tunis...", otherwise "DZ" if it
    mentions "alger...", otherwise "MA". Matching ignores case and
    diacritics, so "Algérie" is recognised as well as "Algeria"/"Alger".
    """
    decomposed = unicodedata.normalize("NFKD", text.lower())
    # Drop combining marks left by the decomposition ("é" -> "e").
    folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    if "tunis" in folded:  # also covers "tunisia" / "tunisie"
        return "TN"
    if "alger" in folded:  # also covers "algeria" / "algerie"
        return "DZ"
    return "MA"


def _extract_emails(text):
    """Return the lowercased, de-duplicated, non-blocklisted e-mails in *text*."""
    lowered = (match.lower() for match in EMAIL_RE.findall(text))
    kept = [
        addr for addr in lowered
        if not any(word in addr for word in EMAIL_BLOCKLIST)
    ]
    return list(dict.fromkeys(kept))  # keeps first-seen order


def _parse_linkedin_title(title):
    """Split a result title of the form "Name - Role - Company | ...".

    Returns ``(full_name, company, role)``; company and role may be empty
    strings. Returns ``None`` when no plausible two-word name is found.
    """
    segments = title.split(" - ")
    name = segments[0].split(" | ")[0].strip()
    name = re.sub(r"\(.*?\)", "", name).strip()
    name = re.sub(r"\s*LinkedIn$", "", name).strip()

    words = name.split()
    if len(words) < 2:
        return None
    first, last = words[0], " ".join(words[1:])[:50]
    if last.lower() in NON_NAME_WORDS:
        return None

    role = segments[1].strip()[:100] if len(segments) >= 2 else ""
    company = (
        segments[2].split("|")[0].strip()[:100] if len(segments) >= 3 else ""
    )
    return f"{first} {last}", company, role


def _search(ctx, query):
    """Run one Bing search in a fresh page and return the extracted results."""
    page = ctx.new_page()
    try:
        page.goto(
            f"https://www.bing.com/search?q={quote_plus(query)}&count=30",
            timeout=15000,
            wait_until="domcontentloaded",
        )
        page.wait_for_timeout(2000)
        return page.evaluate(RESULTS_JS)
    finally:
        # Close the page whether or not navigation/evaluation succeeded.
        page.close()


def _insert_if_new(conn, cur, exists_sql, exists_args, insert_sql, insert_args):
    """Insert a row unless *exists_sql* finds one; return True if inserted.

    A failed INSERT is rolled back and reported, and the function returns
    False so the caller can carry on with the next row.
    """
    import psycopg2  # local import: see main()

    cur.execute(exists_sql, exists_args)
    if cur.fetchone():
        return False
    try:
        cur.execute(insert_sql, insert_args)
        conn.commit()
    except psycopg2.Error as exc:
        conn.rollback()
        print(f"DB-ERR {str(exc).strip()}")
        return False
    return True


def _store_result(conn, cur, result, stats):
    """Persist one search result and bump the counters in *stats*.

    *stats* is a dict with the integer keys "leads", "linkedin" and
    "contacts"; it is updated in place so that counts survive an exception
    raised part-way through a result.
    """
    title = result.get("title", "")
    url = result.get("href", "")
    snippet = result.get("snippet", "")
    text = f"{title} {snippet}"
    country = get_country(text)
    emails = _extract_emails(text)

    if "linkedin.com/in/" in url:
        lead = _parse_linkedin_title(title)
        if lead is None:
            # A LinkedIn result without a usable name is skipped entirely,
            # including any e-mails found in its snippet.
            return
        full_name, company, role = lead

        if _insert_if_new(
            conn, cur,
            LEAD_EXISTS_SQL, (full_name,),
            LEAD_INSERT_SQL,
            (full_name, company or None, role or None,
             emails[0] if emails else None, country, url),
        ):
            stats["leads"] += 1
            print(f"+LEAD {full_name} | {company} | {role}")

        if _insert_if_new(
            conn, cur,
            LINKEDIN_EXISTS_SQL, (full_name,),
            LINKEDIN_INSERT_SQL,
            (full_name, company or None, role or None, url, country),
        ):
            stats["linkedin"] += 1

    for email in emails:
        local_part, _, domain = email.partition("@")
        first_name = local_part.split(".")[0].title()
        if _insert_if_new(
            conn, cur,
            CONTACT_EXISTS_SQL, (email,),
            CONTACT_INSERT_SQL, (email, first_name, domain, country),
        ):
            stats["contacts"] += 1
            print(f"+CONTACT {email}")


def main(argv=None):
    """Scrape every query in QUERIES and store new leads and contacts.

    *argv* defaults to ``sys.argv[1:]``; its first element, if present, is
    the batch size. The batch limit is only checked before a query starts,
    so the final totals may exceed it.

    Raises ValueError if the batch argument is not an integer.
    """
    # Imported here so the module (and its pure helpers) can be imported
    # without the database driver or the browser automation package.
    import psycopg2
    from playwright.sync_api import sync_playwright

    args = sys.argv[1:] if argv is None else argv
    batch = int(args[0]) if args else DEFAULT_BATCH
    stats = {"leads": 0, "linkedin": 0, "contacts": 0}

    conn = psycopg2.connect(**DB)
    try:
        with conn.cursor() as cur, sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--no-sandbox", "--disable-dev-shm-usage"],
            )
            try:
                ctx = browser.new_context(user_agent=USER_AGENT)
                for q in QUERIES:
                    if stats["leads"] + stats["contacts"] >= batch:
                        break
                    try:
                        for result in _search(ctx, q):
                            _store_result(conn, cur, result, stats)
                    except Exception as e:  # best effort: go on to next query
                        print(f"ERR {q[:30]}: {e}")
                        # A failed statement leaves the transaction aborted;
                        # without a rollback every later statement would fail.
                        try:
                            conn.rollback()
                        except psycopg2.Error as rollback_err:
                            print(f"ERR rollback: {rollback_err}")
                    # Pause after failures too, so errors do not turn into a
                    # burst of back-to-back requests.
                    time.sleep(PAUSE_SECONDS)
            finally:
                browser.close()
    finally:
        conn.close()

    print(
        f"\nDONE: +{stats['leads']} leads, +{stats['linkedin']} LinkedIn, "
        f"+{stats['contacts']} contacts"
    )


if __name__ == "__main__":
    main()