Files
html/api/weval_google_scraper.py
2026-04-12 22:57:03 +02:00

162 lines
7.2 KiB
Python

#!/usr/bin/env python3
"""WEVAL B2B+LinkedIn+Contacts scraper via Playwright Google"""
import os
import re, sys, time, psycopg2
from urllib.parse import quote_plus
from playwright.sync_api import sync_playwright
# Database connection settings passed to psycopg2.connect().
# Each value can be overridden through the environment (WEVAL_DB_HOST,
# WEVAL_DB_NAME, WEVAL_DB_USER, WEVAL_DB_PASSWORD); the literals remain only as
# backward-compatible fallbacks.
# NOTE(review): the fallback password is a credential committed to source --
# rotate it and supply WEVAL_DB_PASSWORD from the environment instead.
DB = dict(
    host=os.environ.get("WEVAL_DB_HOST", "10.1.0.3"),
    dbname=os.environ.get("WEVAL_DB_NAME", "adx_system"),
    user=os.environ.get("WEVAL_DB_USER", "admin"),
    password=os.environ.get("WEVAL_DB_PASSWORD", "admin123"),
)

# Insert budget for one run: no new query is started once the number of
# inserted leads + contacts reaches this value. Taken from the first
# command-line argument, defaulting to 300.
try:
    batch = int(sys.argv[1]) if len(sys.argv) > 1 else 300
except ValueError:
    # A malformed argument should not kill the run with a traceback; warn on
    # stderr (stdout carries the progress lines) and use the default.
    print(f"WARN: invalid batch size {sys.argv[1]!r}, using 300", file=sys.stderr)
    batch = 300
# Google searches restricted to LinkedIn profile pages (linkedin.com/in).
_LINKEDIN_QUERIES = (
    "site:linkedin.com/in DSI maroc",
    "site:linkedin.com/in CTO maroc casablanca",
    "site:linkedin.com/in directeur IT maroc",
    "site:linkedin.com/in directeur technique casablanca",
    "site:linkedin.com/in SAP consultant maroc",
    "site:linkedin.com/in ERP manager morocco",
    "site:linkedin.com/in supply chain director morocco",
    "site:linkedin.com/in directeur digital maroc",
    "site:linkedin.com/in cybersecurity morocco",
    "site:linkedin.com/in cloud architect maroc",
    "site:linkedin.com/in directeur financier maroc",
    "site:linkedin.com/in head of IT maroc",
    "site:linkedin.com/in data engineer maroc",
    "site:linkedin.com/in DevOps manager maroc",
    "site:linkedin.com/in directeur infrastructure maroc",
    "site:linkedin.com/in CISO maroc",
    "site:linkedin.com/in directeur systemes information rabat",
    "site:linkedin.com/in DSI tunisie",
    "site:linkedin.com/in CTO algerie",
    "site:linkedin.com/in directeur informatique casablanca",
    "site:linkedin.com/in manufacturing director morocco",
    "site:linkedin.com/in pharma director maghreb",
    "site:linkedin.com/in directeur operations maroc",
    "site:linkedin.com/in responsable achats maroc",
    "site:linkedin.com/in startup CEO maroc fondateur",
    "site:linkedin.com/in chief digital officer morocco",
)

# Free-text Google searches aimed at pages that show contact e-mail addresses.
_EMAIL_QUERIES = (
    "directeur systemes information casablanca email",
    "DSI maroc email contact entreprise",
    "CTO maroc email contact",
    "cabinet conseil IT maroc email",
    "SSII ESN maroc directeur email",
    "integrateur SAP maroc email",
    "entreprise IT casablanca email contact",
    "entreprise tech rabat marrakech email",
    "startup maroc fondateur email contact",
    "PME maroc email directeur",
    "banque maroc email IT direction",
    "telecom maroc email direction IT",
    "assurance maroc email direction",
    "industrie maroc email directeur",
)

# Every search the scraper runs, in execution order: LinkedIn first, then e-mail.
QUERIES = list(_LINKEDIN_QUERIES + _EMAIL_QUERIES)
# Substrings that disqualify a scraped e-mail address (search/social platforms,
# placeholder and no-reply addresses).
bl = [
    "google",
    "facebook",
    "linkedin.com/company",
    "wikipedia",
    "youtube",
    "twitter",
    "instagram",
    "example",
    "noreply",
]

# Running totals of rows inserted during this run:
# weval_leads, linkedin_leads and send_contacts respectively.
t_leads = t_li = t_contacts = 0

# One connection and one cursor are shared by the whole run.
conn = psycopg2.connect(**DB)
cur = conn.cursor()
def country(text):
    """Guess a two-letter country code from free text.

    The search is a case-insensitive substring match. Tunisian markers are
    checked first, then Algerian ones; anything else is treated as Morocco.

    Args:
        text: Result title/snippet text to inspect.

    Returns:
        "TN", "DZ" or the default "MA".
    """
    lowered = text.lower()
    markers_by_code = (
        ("TN", ("tunis", "tunisia")),
        ("DZ", ("alger", "algeria")),
    )
    for code, markers in markers_by_code:
        for marker in markers:
            if marker in lowered:
                return code
    return "MA"
# ---------------------------------------------------------------------------
# Scrape loop: run each Google query in headless Chromium, store LinkedIn
# profiles in admin.weval_leads / admin.linkedin_leads and every e-mail found
# in admin.send_contacts, then print a summary line.
# ---------------------------------------------------------------------------

# Compiled once instead of on every search result.
_EMAIL_RE = re.compile(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}')

# If the text after the first word of a parsed "name" is one of these, the
# result title is not treated as a person name.
_JUNK_LAST_NAMES = frozenset(["linkedin", "profil", "view", "maroc", "morocco", "join"])

# Runs inside the page: collects {href, title, snippet} for every result-like
# element that has a link and an <h3> title longer than 3 characters.
_EXTRACT_RESULTS_JS = """() => {
const d = [];
document.querySelectorAll('.g, [data-hveid]').forEach(el => {
const a = el.querySelector('a[href]');
if (!a) return;
const href = a.href;
const title = (el.querySelector('h3') || {}).textContent || '';
const snippet = el.innerText.substring(0, 500);
if (title.length > 3) d.push({href, title, snippet});
});
return d;
}"""


def _report(line):
    """Print a progress line without letting an un-encodable name abort the run."""
    try:
        print(line)
    except UnicodeEncodeError:
        print(line.encode("ascii", "replace").decode("ascii"))


def _rollback_quietly():
    """Reset the current transaction; a failing rollback must not hide the first error."""
    try:
        conn.rollback()
    except psycopg2.Error:
        pass


def _extract_emails(text):
    """Return the lower-cased e-mail addresses in *text* that hit no blacklist entry."""
    return [
        addr.lower()
        for addr in _EMAIL_RE.findall(text)
        if not any(blocked in addr.lower() for blocked in bl)
    ]


def _parse_linkedin_title(title):
    """Split a LinkedIn result title into (first, last, role, company).

    The title is expected to look like "Name - Role - Company"; role and
    company are "" when the corresponding segment is missing. Returns None
    when no usable two-part person name can be extracted.
    """
    segments = title.split(" - ")
    name = segments[0].split(" | ")[0].strip()
    name = re.sub(r'\(.*?\)', '', name).strip()  # drop parenthesised additions
    words = name.split()
    if len(words) < 2:
        return None
    first, last = words[0], " ".join(words[1:])[:50]
    if last.lower() in _JUNK_LAST_NAMES:
        return None
    role = segments[1].strip()[:100] if len(segments) >= 2 else ""
    company = segments[2].strip()[:100] if len(segments) >= 3 else ""
    return first, last, role, company


def _insert_if_new(exists_sql, exists_params, insert_sql, insert_params):
    """Run *insert_sql* unless *exists_sql* already finds a row.

    Returns True only when a row was inserted and committed. A failed insert
    is rolled back and reported on stderr so the run can continue.
    """
    cur.execute(exists_sql, exists_params)
    if cur.fetchone():
        return False
    try:
        cur.execute(insert_sql, insert_params)
        conn.commit()
    except (psycopg2.Error, ValueError) as exc:
        # ValueError covers values psycopg2 refuses to adapt (e.g. NUL bytes).
        _rollback_quietly()
        print(f"DB ERR: {exc}", file=sys.stderr)
        return False
    return True


def _fetch_results(ctx, query):
    """Open a page for *query*, return the extracted results, always close the page."""
    page = ctx.new_page()
    try:
        # safe=':/' leaves the `site:linkedin.com/in` operator unescaped while
        # still escaping characters such as '&' or '#'.
        page.goto(
            f"https://www.google.com/search?q={quote_plus(query, safe=':/')}&hl=fr&num=20",
            timeout=12000,
        )
        page.wait_for_timeout(2000)
        return page.evaluate(_EXTRACT_RESULTS_JS)
    finally:
        page.close()


try:
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True, args=["--no-sandbox","--disable-dev-shm-usage"])
        try:
            ctx = browser.new_context(user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0")
            for q in QUERIES:
                # The budget is checked only between queries, so a single
                # query may push the totals past `batch`.
                if t_leads + t_contacts >= batch:
                    break
                try:
                    for r in _fetch_results(ctx, q):
                        title = r.get("title", "")
                        url = r.get("href", "")
                        full = f"{title} {r.get('snippet', '')}"
                        c = country(full)
                        emails = _extract_emails(full)

                        # LinkedIn profiles -> weval_leads + linkedin_leads
                        person = _parse_linkedin_title(title) if "linkedin.com/in/" in url else None
                        if person is not None:
                            first, last, role, company = person
                            full_name = f"{first} {last}"
                            if _insert_if_new(
                                "SELECT 1 FROM admin.weval_leads WHERE LOWER(first_name)=%s AND LOWER(last_name)=%s LIMIT 1",
                                (first.lower(), last.lower()),
                                """INSERT INTO admin.weval_leads (first_name,last_name,email,company,title,country,linkedin_url,source,created_at) VALUES(%s,%s,%s,%s,%s,%s,%s,'pw_google',NOW())""",
                                (first, last, emails[0] if emails else None, company or None, role or None, c, url),
                            ):
                                t_leads += 1
                                _report(f"+LEAD {first} {last} | {company} | {role}")
                            if _insert_if_new(
                                "SELECT 1 FROM admin.linkedin_leads WHERE LOWER(lead_name)=LOWER(%s) LIMIT 1",
                                (full_name,),
                                """INSERT INTO admin.linkedin_leads (lead_name,lead_company,lead_title,lead_linkedin_url,lead_country,captured_at) VALUES(%s,%s,%s,%s,%s,NOW())""",
                                (full_name, company or None, role or None, url, c),
                            ):
                                t_li += 1

                        # All emails -> send_contacts. This also runs for
                        # LinkedIn hits whose name could not be parsed.
                        for email in emails:
                            local_part, domain = email.split("@")[0], email.split("@")[1]
                            if _insert_if_new(
                                "SELECT 1 FROM admin.send_contacts WHERE email=%s LIMIT 1",
                                (email,),
                                """INSERT INTO admin.send_contacts (email,first_name,domain,country,source,status,created_at) VALUES(%s,%s,%s,%s,'pw_google','active',NOW())""",
                                (email, local_part.split(".")[0].title(), domain, c),
                            ):
                                t_contacts += 1
                                _report(f"+CONTACT {email}")
                except Exception as e:
                    # A failed statement leaves the transaction aborted; reset
                    # it so the next query can still use the connection.
                    _rollback_quietly()
                    print(f"ERR: {e}")
                # Pause between queries, including after a failed one.
                time.sleep(5)
        finally:
            browser.close()
finally:
    cur.close()
    conn.close()
print(f"\nDONE: +{t_leads} leads, +{t_li} LinkedIn, +{t_contacts} contacts")