#!/usr/bin/env python3
"""WEVAL B2B + LinkedIn + contacts scraper driven by Playwright Google searches.

Usage: ``script.py [batch]``

Each query in ``QUERIES`` is run as a Google search in headless Chromium.
For every result:

* LinkedIn profile hits (``linkedin.com/in/`` in the URL) are parsed into a
  name / role / company and inserted into ``admin.weval_leads`` and
  ``admin.linkedin_leads`` when no row with the same name exists yet.
* Every e-mail address found in the title/snippet is inserted into
  ``admin.send_contacts`` when that address is not stored yet.

The run stops issuing new queries once ``leads + contacts`` inserted in this
run reaches ``batch`` (first CLI argument, default 300). The limit is checked
once per query, so the final query may overshoot it.
"""
import contextlib
import re
import sys
import time
from urllib.parse import quote_plus

import psycopg2
from playwright.sync_api import sync_playwright

# NOTE(review): credentials are hard-coded in the source file; consider
# loading them from the environment or a secrets store.
DB = dict(host="10.1.0.3", dbname="adx_system", user="admin", password="admin123")

DEFAULT_BATCH = 300

QUERIES = [
    "site:linkedin.com/in DSI maroc",
    "site:linkedin.com/in CTO maroc casablanca",
    "site:linkedin.com/in directeur IT maroc",
    "site:linkedin.com/in directeur technique casablanca",
    "site:linkedin.com/in SAP consultant maroc",
    "site:linkedin.com/in ERP manager morocco",
    "site:linkedin.com/in supply chain director morocco",
    "site:linkedin.com/in directeur digital maroc",
    "site:linkedin.com/in cybersecurity morocco",
    "site:linkedin.com/in cloud architect maroc",
    "site:linkedin.com/in directeur financier maroc",
    "site:linkedin.com/in head of IT maroc",
    "site:linkedin.com/in data engineer maroc",
    "site:linkedin.com/in DevOps manager maroc",
    "site:linkedin.com/in directeur infrastructure maroc",
    "site:linkedin.com/in CISO maroc",
    "site:linkedin.com/in directeur systemes information rabat",
    "site:linkedin.com/in DSI tunisie",
    "site:linkedin.com/in CTO algerie",
    "site:linkedin.com/in directeur informatique casablanca",
    "site:linkedin.com/in manufacturing director morocco",
    "site:linkedin.com/in pharma director maghreb",
    "site:linkedin.com/in directeur operations maroc",
    "site:linkedin.com/in responsable achats maroc",
    "site:linkedin.com/in startup CEO maroc fondateur",
    "site:linkedin.com/in chief digital officer morocco",
    "directeur systemes information casablanca email",
    "DSI maroc email contact entreprise",
    "CTO maroc email contact",
    "cabinet conseil IT maroc email",
    "SSII ESN maroc directeur email",
    "integrateur SAP maroc email",
    "entreprise IT casablanca email contact",
    "entreprise tech rabat marrakech email",
    "startup maroc fondateur email contact",
    "PME maroc email directeur",
    "banque maroc email IT direction",
    "telecom maroc email direction IT",
    "assurance maroc email direction",
    "industrie maroc email directeur",
]

# Substrings that disqualify an extracted e-mail address.
bl = ["google", "facebook", "linkedin.com/company", "wikipedia", "youtube",
      "twitter", "instagram", "example", "noreply"]

# Second-and-later name tokens that mean the title was not a person's name.
_JUNK_LAST_NAMES = ["linkedin", "profil", "view", "maroc", "morocco", "join"]

# Compiled once; both patterns are applied to every search result.
_EMAIL_RE = re.compile(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}')
_PAREN_RE = re.compile(r'\(.*?\)')

_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0"

# Runs inside the page: collects {href, title, snippet} for every matched
# element that has a link and an <h3> title longer than 3 characters.
# The selector can match nested elements, so the same hit may be returned
# more than once; the existence checks before each INSERT absorb that.
_EXTRACT_RESULTS_JS = """() => {
    const d = [];
    document.querySelectorAll('.g, [data-hveid]').forEach(el => {
        const a = el.querySelector('a[href]');
        if (!a) return;
        const href = a.href;
        const title = (el.querySelector('h3') || {}).textContent || '';
        const snippet = el.innerText.substring(0, 500);
        if (title.length > 3) d.push({href, title, snippet});
    });
    return d;
}"""

_LEAD_EXISTS_SQL = (
    "SELECT 1 FROM admin.weval_leads "
    "WHERE LOWER(first_name)=%s AND LOWER(last_name)=%s LIMIT 1"
)
_LEAD_INSERT_SQL = """INSERT INTO admin.weval_leads
    (first_name,last_name,email,company,title,country,linkedin_url,source,created_at)
    VALUES(%s,%s,%s,%s,%s,%s,%s,'pw_google',NOW())"""

_LI_EXISTS_SQL = (
    "SELECT 1 FROM admin.linkedin_leads WHERE LOWER(lead_name)=LOWER(%s) LIMIT 1"
)
_LI_INSERT_SQL = """INSERT INTO admin.linkedin_leads
    (lead_name,lead_company,lead_title,lead_linkedin_url,lead_country,captured_at)
    VALUES(%s,%s,%s,%s,%s,NOW())"""

_CONTACT_EXISTS_SQL = "SELECT 1 FROM admin.send_contacts WHERE email=%s LIMIT 1"
_CONTACT_INSERT_SQL = """INSERT INTO admin.send_contacts
    (email,first_name,domain,country,source,status,created_at)
    VALUES(%s,%s,%s,%s,'pw_google','active',NOW())"""


def country(text):
    """Guess a two-letter country code from free text.

    Returns "TN" when the lower-cased text contains "tunis", "DZ" when it
    contains "alger", and "MA" otherwise.
    """
    tl = text.lower()
    if any(x in tl for x in ["tunis", "tunisia"]):
        return "TN"
    if any(x in tl for x in ["alger", "algeria"]):
        return "DZ"
    return "MA"


def _extract_emails(text):
    """Return lower-cased e-mail addresses in *text*, minus blocklisted ones."""
    lowered = (match.lower() for match in _EMAIL_RE.findall(text))
    return [addr for addr in lowered if not any(term in addr for term in bl)]


def _parse_profile(title):
    """Split a LinkedIn result title into ``(first, last, role, company)``.

    The title is treated as "Name - Role - Company". Returns ``None`` when
    the name part has fewer than two tokens or its remainder is a known
    non-name word.
    """
    name = title.split(" - ")[0].split(" | ")[0].strip()
    name = _PAREN_RE.sub('', name).strip()
    parts = name.split()
    if len(parts) < 2:
        return None
    first, last = parts[0], " ".join(parts[1:])[:50]
    if last.lower() in _JUNK_LAST_NAMES:
        return None

    segments = title.split(" - ")
    role = segments[1].strip()[:100] if len(segments) >= 2 else ""
    company = segments[2].strip()[:100] if len(segments) >= 3 else ""
    return first, last, role, company


def _insert_if_absent(conn, cur, exists_sql, exists_args, insert_sql, insert_args):
    """Run *insert_sql* unless *exists_sql* already finds a row.

    Returns True when a row was inserted and committed. A failing INSERT is
    rolled back and reported instead of being silently discarded; errors from
    the existence check propagate to the caller.
    """
    cur.execute(exists_sql, exists_args)
    if cur.fetchone():
        return False
    try:
        cur.execute(insert_sql, insert_args)
        conn.commit()
    except psycopg2.Error as exc:
        conn.rollback()
        print(f"DB ERR: {exc}")
        return False
    return True


def _search(ctx, query):
    """Run one Google search in a fresh page and return the extracted results."""
    page = ctx.new_page()
    try:
        page.goto(
            f"https://www.google.com/search?q={quote_plus(query)}&hl=fr&num=20",
            timeout=12000,
        )
        page.wait_for_timeout(2000)
        results = page.evaluate(_EXTRACT_RESULTS_JS)
    except Exception:
        # Best-effort cleanup: a failing close must not hide the real error.
        with contextlib.suppress(Exception):
            page.close()
        raise
    page.close()
    return results


def _store_result(conn, cur, result, totals):
    """Persist one search result and update the *totals* counters in place."""
    title = result.get("title", "")
    url = result.get("href", "")
    snippet = result.get("snippet", "")
    full = f"{title} {snippet}"
    c = country(full)
    emails = _extract_emails(full)

    # LinkedIn profiles -> weval_leads + linkedin_leads
    if "linkedin.com/in/" in url:
        profile = _parse_profile(title)
        if profile is None:
            # A LinkedIn hit without a usable name is dropped entirely,
            # including any e-mail addresses found in it.
            return
        first, last, role, company = profile

        if _insert_if_absent(
            conn, cur,
            _LEAD_EXISTS_SQL, (first.lower(), last.lower()),
            _LEAD_INSERT_SQL,
            (first, last, emails[0] if emails else None,
             company or None, role or None, c, url),
        ):
            totals["leads"] += 1
            print(f"+LEAD {first} {last} | {company} | {role}")

        full_name = f"{first} {last}"
        if _insert_if_absent(
            conn, cur,
            _LI_EXISTS_SQL, (full_name,),
            _LI_INSERT_SQL,
            (full_name, company or None, role or None, url, c),
        ):
            totals["linkedin"] += 1

    # All emails -> send_contacts
    for email in emails:
        local, domain = email.split("@", 1)
        if _insert_if_absent(
            conn, cur,
            _CONTACT_EXISTS_SQL, (email,),
            _CONTACT_INSERT_SQL,
            (email, local.split(".")[0].title(), domain, c),
        ):
            totals["contacts"] += 1
            print(f"+CONTACT {email}")


def _scrape(ctx, conn, cur, batch, totals):
    """Run the queries in order until the batch limit is reached."""
    for q in QUERIES:
        if totals["leads"] + totals["contacts"] >= batch:
            break
        try:
            for r in _search(ctx, q):
                _store_result(conn, cur, r, totals)
            time.sleep(5)
        except Exception as e:
            # Clear any aborted transaction so the next query can use the
            # connection; if the connection itself is gone, the next query
            # will report that.
            with contextlib.suppress(psycopg2.Error):
                conn.rollback()
            print(f"ERR: {e}")


def main():
    """Entry point: parse the batch size, scrape, and print the totals."""
    batch = int(sys.argv[1]) if len(sys.argv) > 1 else DEFAULT_BATCH
    totals = {"leads": 0, "linkedin": 0, "contacts": 0}

    # closing() guarantees cleanup even when the scrape aborts part-way.
    with contextlib.closing(psycopg2.connect(**DB)) as conn:
        with contextlib.closing(conn.cursor()) as cur:
            with sync_playwright() as p:
                browser = p.chromium.launch(
                    headless=True,
                    args=["--no-sandbox", "--disable-dev-shm-usage"],
                )
                with contextlib.closing(browser):
                    ctx = browser.new_context(user_agent=_USER_AGENT)
                    _scrape(ctx, conn, cur, batch, totals)

    print(f"\nDONE: +{totals['leads']} leads, +{totals['linkedin']} LinkedIn, "
          f"+{totals['contacts']} contacts")


if __name__ == "__main__":
    main()