162 lines
7.2 KiB
Python
162 lines
7.2 KiB
Python
#!/usr/bin/env python3
|
|
"""WEVAL B2B+LinkedIn+Contacts scraper via Playwright Google"""
|
|
import os
import re
import sys
import time
from urllib.parse import quote_plus

import psycopg2
from playwright.sync_api import sync_playwright
|
|
|
|
# Connection settings passed to psycopg2.connect(). Each value can be
# overridden through the environment so credentials do not have to live in
# the source; the literal fallbacks are the previously hard-coded values and
# are kept only so existing deployments keep working unchanged.
DB = dict(
    host=os.environ.get("WEVAL_DB_HOST", "10.1.0.3"),
    dbname=os.environ.get("WEVAL_DB_NAME", "adx_system"),
    user=os.environ.get("WEVAL_DB_USER", "admin"),
    password=os.environ.get("WEVAL_DB_PASSWORD", "admin123"),
)


def _parse_batch(argv, default=300):
    """Return the batch size given as the first command-line argument.

    Args:
        argv: Argument vector in ``sys.argv`` layout (program name first).
        default: Value used when no argument is given or it is not an integer.

    Returns:
        The parsed integer, or *default*. An unparsable argument is reported
        on stderr instead of aborting the script with a traceback.
    """
    if len(argv) < 2:
        return default
    try:
        return int(argv[1])
    except ValueError:
        print(f"WARN: invalid batch size {argv[1]!r}, using {default}", file=sys.stderr)
        return default


# Upper bound on new leads + contacts for this run (first CLI argument).
batch = _parse_batch(sys.argv)
|
|
|
|
# Google queries restricted to LinkedIn personal profiles.
_LINKEDIN_QUERIES = [
    "site:linkedin.com/in DSI maroc",
    "site:linkedin.com/in CTO maroc casablanca",
    "site:linkedin.com/in directeur IT maroc",
    "site:linkedin.com/in directeur technique casablanca",
    "site:linkedin.com/in SAP consultant maroc",
    "site:linkedin.com/in ERP manager morocco",
    "site:linkedin.com/in supply chain director morocco",
    "site:linkedin.com/in directeur digital maroc",
    "site:linkedin.com/in cybersecurity morocco",
    "site:linkedin.com/in cloud architect maroc",
    "site:linkedin.com/in directeur financier maroc",
    "site:linkedin.com/in head of IT maroc",
    "site:linkedin.com/in data engineer maroc",
    "site:linkedin.com/in DevOps manager maroc",
    "site:linkedin.com/in directeur infrastructure maroc",
    "site:linkedin.com/in CISO maroc",
    "site:linkedin.com/in directeur systemes information rabat",
    "site:linkedin.com/in DSI tunisie",
    "site:linkedin.com/in CTO algerie",
    "site:linkedin.com/in directeur informatique casablanca",
    "site:linkedin.com/in manufacturing director morocco",
    "site:linkedin.com/in pharma director maghreb",
    "site:linkedin.com/in directeur operations maroc",
    "site:linkedin.com/in responsable achats maroc",
    "site:linkedin.com/in startup CEO maroc fondateur",
    "site:linkedin.com/in chief digital officer morocco",
]

# Open-web queries whose result snippets may contain e-mail addresses.
_EMAIL_QUERIES = [
    "directeur systemes information casablanca email",
    "DSI maroc email contact entreprise",
    "CTO maroc email contact",
    "cabinet conseil IT maroc email",
    "SSII ESN maroc directeur email",
    "integrateur SAP maroc email",
    "entreprise IT casablanca email contact",
    "entreprise tech rabat marrakech email",
    "startup maroc fondateur email contact",
    "PME maroc email directeur",
    "banque maroc email IT direction",
    "telecom maroc email direction IT",
    "assurance maroc email direction",
    "industrie maroc email directeur",
]

# Full search plan, processed in order: LinkedIn queries first, then e-mail ones.
QUERIES = [*_LINKEDIN_QUERIES, *_EMAIL_QUERIES]
|
|
|
|
# Single database connection and cursor shared by the rest of the script.
conn = psycopg2.connect(**DB)
cur = conn.cursor()

# Counters for this run: weval_leads rows, linkedin_leads rows, send_contacts rows.
t_leads = t_li = t_contacts = 0

# An e-mail address containing any of these substrings is discarded.
bl = [
    "google",
    "facebook",
    "linkedin.com/company",
    "wikipedia",
    "youtube",
    "twitter",
    "instagram",
    "example",
    "noreply",
]
|
|
|
|
def country(text):
    """Guess a two-letter country code from free text.

    The text is lower-cased and searched for place-name fragments: a
    Tunisian marker yields "TN", otherwise an Algerian marker yields "DZ",
    and anything else falls back to "MA".
    """
    lowered = text.lower()
    markers = (
        ("TN", ("tunis", "tunisia")),
        ("DZ", ("alger", "algeria")),
    )
    for code, needles in markers:
        for needle in needles:
            if needle in lowered:
                return code
    return "MA"
|
|
|
|
# ---------------------------------------------------------------------------
# Scraping run: Google result pages -> weval_leads / linkedin_leads /
# send_contacts. Helpers below rely on the module-level `cur`, `conn` and `bl`.
# ---------------------------------------------------------------------------

# TLD part is matched case-insensitively; addresses are lower-cased afterwards.
_EMAIL_RE = re.compile(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}', re.IGNORECASE)

# "Last names" that show the result title was not actually a person's name.
_BOGUS_LAST_NAMES = {"linkedin", "profil", "view", "maroc", "morocco", "join"}

# Runs in the page: collects {href, title, snippet} for each result element.
_EXTRACT_RESULTS_JS = """() => {
    const d = [];
    document.querySelectorAll('.g, [data-hveid]').forEach(el => {
        const a = el.querySelector('a[href]');
        if (!a) return;
        const href = a.href;
        const title = (el.querySelector('h3') || {}).textContent || '';
        const snippet = el.innerText.substring(0, 500);
        if (title.length > 3) d.push({href, title, snippet});
    });
    return d;
}"""


def _fetch_results(ctx, q):
    """Run one Google search for *q* and return the extracted result dicts.

    The page is always closed, whether or not navigation or extraction fails.
    """
    page = ctx.new_page()
    try:
        # quote_plus so characters such as '&' or '#' cannot break the URL.
        page.goto(f"https://www.google.com/search?q={quote_plus(q)}&hl=fr&num=20", timeout=12000)
        page.wait_for_timeout(2000)
        return page.evaluate(_EXTRACT_RESULTS_JS)
    finally:
        page.close()


def _extract_emails(text):
    """Return unique, lower-cased e-mail addresses found in *text*.

    Addresses containing any fragment listed in ``bl`` are dropped; the
    order of first appearance is preserved.
    """
    lowered = (match.lower() for match in _EMAIL_RE.findall(text))
    kept = [addr for addr in lowered if not any(frag in addr for frag in bl)]
    return list(dict.fromkeys(kept))


def _parse_profile(title):
    """Split a LinkedIn result title into (first, last, role, company).

    Titles are expected as "Name - Role - Company"; role and company are
    empty strings when their segment is missing. Returns None when the name
    has fewer than two words or its remainder is a known non-name word.
    """
    segments = title.split(" - ")
    name = segments[0].split(" | ")[0].strip()
    name = re.sub(r'\(.*?\)', '', name).strip()
    parts = name.split()
    if len(parts) < 2:
        return None
    first, last = parts[0], " ".join(parts[1:])[:50]
    if last.lower() in _BOGUS_LAST_NAMES:
        return None
    role = segments[1].strip()[:100] if len(segments) >= 2 else ""
    company = segments[2].strip()[:100] if len(segments) >= 3 else ""
    return first, last, role, company


def _insert_if_new(exists_sql, exists_params, insert_sql, insert_params):
    """Execute *insert_sql* unless *exists_sql* already returns a row.

    Returns True when a row was inserted and committed. A failed insert is
    rolled back, reported, and yields False so the run can continue. Errors
    from the existence check itself propagate to the caller.
    """
    cur.execute(exists_sql, exists_params)
    if cur.fetchone():
        return False
    try:
        cur.execute(insert_sql, insert_params)
        conn.commit()
    except psycopg2.Error as exc:
        conn.rollback()
        print(f"DB-ERR: {exc}")
        return False
    return True


def _store_lead(profile, url, c, email):
    """Insert the profile into admin.weval_leads; return True if it was new."""
    first, last, role, company = profile
    added = _insert_if_new(
        "SELECT 1 FROM admin.weval_leads WHERE LOWER(first_name)=%s AND LOWER(last_name)=%s LIMIT 1",
        (first.lower(), last.lower()),
        """INSERT INTO admin.weval_leads (first_name,last_name,email,company,title,country,linkedin_url,source,created_at) VALUES(%s,%s,%s,%s,%s,%s,%s,'pw_google',NOW())""",
        (first, last, email, company or None, role or None, c, url),
    )
    if added:
        print(f"+LEAD {first} {last} | {company} | {role}")
    return added


def _store_linkedin_lead(profile, url, c):
    """Insert the profile into admin.linkedin_leads; return True if it was new."""
    first, last, role, company = profile
    full_name = f"{first} {last}"
    return _insert_if_new(
        "SELECT 1 FROM admin.linkedin_leads WHERE LOWER(lead_name)=LOWER(%s) LIMIT 1",
        (full_name,),
        """INSERT INTO admin.linkedin_leads (lead_name,lead_company,lead_title,lead_linkedin_url,lead_country,captured_at) VALUES(%s,%s,%s,%s,%s,NOW())""",
        (full_name, company or None, role or None, url, c),
    )


def _store_contact(email, c):
    """Insert *email* into admin.send_contacts; return True if it was new.

    The first name is derived from the local part up to its first dot.
    """
    local_part, _, domain = email.partition("@")
    added = _insert_if_new(
        "SELECT 1 FROM admin.send_contacts WHERE email=%s LIMIT 1",
        (email,),
        """INSERT INTO admin.send_contacts (email,first_name,domain,country,source,status,created_at) VALUES(%s,%s,%s,%s,'pw_google','active',NOW())""",
        (email, local_part.split(".")[0].title(), domain, c),
    )
    if added:
        print(f"+CONTACT {email}")
    return added


try:
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True, args=["--no-sandbox","--disable-dev-shm-usage"])
        ctx = browser.new_context(user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0")

        for q in QUERIES:
            if t_leads + t_contacts >= batch:
                break
            try:
                for r in _fetch_results(ctx, q):
                    # Re-check per result so one large page cannot overshoot the batch.
                    if t_leads + t_contacts >= batch:
                        break
                    title = r.get("title", "")
                    url = r.get("href", "")
                    full = f"{title} {r.get('snippet', '')}"
                    c = country(full)
                    emails = _extract_emails(full)

                    # LinkedIn profiles -> weval_leads + linkedin_leads
                    profile = _parse_profile(title) if "linkedin.com/in/" in url else None
                    if profile is not None:
                        if _store_lead(profile, url, c, emails[0] if emails else None):
                            t_leads += 1
                        if _store_linkedin_lead(profile, url, c):
                            t_li += 1

                    # All emails -> send_contacts (even when the title had no usable name)
                    for email in emails:
                        if _store_contact(email, c):
                            t_contacts += 1
            except Exception as e:
                print(f"ERR: {e}")
                # A failed statement leaves the transaction aborted; without a
                # rollback every later statement on this connection would fail.
                try:
                    conn.rollback()
                except psycopg2.Error as rb_exc:
                    print(f"ERR: rollback failed: {rb_exc}")

            # Pause between queries, including after a failure.
            time.sleep(5)

        browser.close()
finally:
    # Release the database session even if the run is interrupted.
    cur.close()
    conn.close()

print(f"\nDONE: +{t_leads} leads, +{t_li} LinkedIn, +{t_contacts} contacts")
|