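#!/usr/bin/env python3
"""WEVAL B2B Scraper - Playwright on S204 via Bing.

Runs a fixed list of Bing searches in headless Chromium, extracts LinkedIn
profile hits and e-mail addresses from the result titles/snippets, and stores
new ones in PostgreSQL. An optional first command-line argument sets the
batch size (default 300).
"""
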
import re
import sys
import time
import unicodedata
from urllib.parse import quote_plus

import psycopg2
from playwright.sync_api import sync_playwright


# Connection settings passed verbatim to psycopg2.connect().
# NOTE(review): credentials are hard-coded in source; consider loading them
# from the environment or a secrets store instead.
DB = dict(host="10.1.0.3", dbname="adx_system", user="admin", password="admin123")

# Batch size used when no (valid) command-line argument is supplied.
DEFAULT_BATCH = 300


def parse_batch(argv, default=DEFAULT_BATCH):
    """Return the batch size given as the first command-line argument.

    Args:
        argv: Argument vector in ``sys.argv`` layout (program name first).
        default: Value returned when the argument is missing or not an integer.

    Returns:
        ``int(argv[1])`` when that argument exists and parses, else ``default``.
    """
    if len(argv) < 2:
        return default
    try:
        return int(argv[1])
    except ValueError:
        # A typo in the argument should not abort the run with a traceback.
        print(f"WARN invalid batch size {argv[1]!r}, using {default}")
        return default


# The scraping loop stops once new leads + new contacts reach this number.
batch = parse_batch(sys.argv)


# Bing search queries, run in order until the batch limit is reached.
QUERIES = [
    # LinkedIn profile searches (restricted to linkedin.com/in pages).
    "site:linkedin.com/in DSI maroc",
    "site:linkedin.com/in CTO casablanca",
    "site:linkedin.com/in directeur informatique maroc",
    "site:linkedin.com/in SAP consultant maroc",
    "site:linkedin.com/in ERP manager morocco",
    "site:linkedin.com/in supply chain director maroc",
    "site:linkedin.com/in directeur digital maroc",
    "site:linkedin.com/in cybersecurity manager maroc",
    "site:linkedin.com/in cloud architect maroc",
    "site:linkedin.com/in directeur financier maroc",
    "site:linkedin.com/in data engineer maroc",
    "site:linkedin.com/in DevOps manager maroc",
    "site:linkedin.com/in head IT morocco",
    "site:linkedin.com/in CISO maroc",
    "site:linkedin.com/in DSI tunisie",
    "site:linkedin.com/in CTO algerie",
    "site:linkedin.com/in manufacturing director morocco",
    "site:linkedin.com/in pharma director maghreb",
    "site:linkedin.com/in directeur operations maroc",
    "site:linkedin.com/in startup CEO maroc",
    "site:linkedin.com/in chief digital officer morocco",
    "site:linkedin.com/in responsable infrastructure maroc",
    "site:linkedin.com/in directeur achats maroc",
    "site:linkedin.com/in product manager maroc",
    "site:linkedin.com/in VP technology morocco",
    "site:linkedin.com/in directeur technique rabat",
    "site:linkedin.com/in IT manager casablanca",
    "site:linkedin.com/in consultant SAP marrakech",
    "site:linkedin.com/in directeur logistique maroc",
    "site:linkedin.com/in responsable qualite maroc",
    # Open-web searches whose titles/snippets are scanned for e-mail addresses.
    "DSI casablanca email contact entreprise",
    "directeur IT maroc email",
    "SSII ESN maroc email contact directeur",
    "cabinet conseil IT maroc email",
    "integrateur SAP maroc email contact",
    "entreprise IT casablanca email",
    "startup tech maroc fondateur email",
    "banque maroc direction IT email",
    "telecom maroc email IT",
    "industrie maroc email directeur IT",
]


# Database connection and cursor shared by the rest of the script.
conn = psycopg2.connect(**DB)
cur = conn.cursor()

# Running totals of rows added to weval_leads, linkedin_leads and send_contacts.
t_leads, t_li, t_contacts = 0, 0, 0

# An extracted e-mail address containing any of these substrings is discarded.
bl = ["google", "facebook", "wikipedia", "youtube", "twitter", "instagram", "example", "noreply"]


def get_country(text):
    """Guess a two-letter country code from free text.

    Matching ignores case and accents, so the French spelling "Algérie" is
    recognised as well as "Algeria"/"Alger".

    Args:
        text: Text to inspect, e.g. a result title plus snippet. ``None`` or
            an empty string is treated as carrying no country hint.

    Returns:
        "TN" if the text contains "tunis", "DZ" if it contains "alger",
        otherwise the default "MA".
    """
    # Decompose accented characters and drop the combining marks, so that
    # "Algérie" folds to "algerie" and matches the "alger" needle.
    decomposed = unicodedata.normalize("NFKD", text or "")
    folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch)).lower()
    if any(needle in folded for needle in ("tunis", "tunisia")):
        return "TN"
    if any(needle in folded for needle in ("alger", "algeria")):
        return "DZ"
    return "MA"


# E-mail pattern applied to result titles/snippets. Case-insensitive so that
# upper-case domains/TLDs are matched too; addresses are lower-cased afterwards.
_EMAIL_RE = re.compile(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}', re.IGNORECASE)

# If everything after the first word of a parsed name equals one of these,
# the title is treated as boilerplate rather than a person's name.
_NAME_STOPWORDS = {"linkedin", "profil", "view", "maroc", "morocco", "join", "now"}

# JavaScript evaluated in the page: collects link, link text and the first
# 500 characters of text for every `.b_algo` result element.
_EXTRACT_RESULTS_JS = """() => {
    const d = [];
    document.querySelectorAll('.b_algo, li.b_algo').forEach(el => {
        const a = el.querySelector('a');
        if (!a) return;
        const href = a.href || '';
        const title = a.textContent || '';
        const snippet = el.innerText.substring(0, 500);
        d.push({href, title: title.trim(), snippet});
    });
    return d;
}"""

_LEAD_EXISTS_SQL = "SELECT 1 FROM admin.weval_leads WHERE LOWER(contact_name)=LOWER(%s) LIMIT 1"
_LEAD_INSERT_SQL = """INSERT INTO admin.weval_leads
    (contact_name,company_name,contact_title,email,country,linkedin_url,source,created_at)
    VALUES(%s,%s,%s,%s,%s,%s,'pw_bing',NOW())"""

_LI_EXISTS_SQL = "SELECT 1 FROM admin.linkedin_leads WHERE LOWER(lead_name)=LOWER(%s) LIMIT 1"
_LI_INSERT_SQL = """INSERT INTO admin.linkedin_leads
    (lead_name,lead_company,lead_title,lead_linkedin_url,lead_country,captured_at)
    VALUES(%s,%s,%s,%s,%s,NOW())"""

_CONTACT_EXISTS_SQL = "SELECT 1 FROM admin.send_contacts WHERE email=%s LIMIT 1"
_CONTACT_INSERT_SQL = """INSERT INTO admin.send_contacts
    (email,first_name,domain,country,source,status,created_at)
    VALUES(%s,%s,%s,%s,'pw_bing','active',NOW())"""


def _fetch_results(ctx, query):
    """Run one Bing search in a fresh page and return its results.

    Returns a list of dicts with the keys ``href``, ``title`` and ``snippet``.
    The page is closed even when navigation or evaluation fails.
    """
    # quote_plus encodes spaces as '+' and escapes reserved characters.
    search_url = f"https://www.bing.com/search?q={quote_plus(query)}&count=30"
    page = ctx.new_page()
    try:
        page.goto(search_url, timeout=15000, wait_until="domcontentloaded")
        # Fixed 2 s pause before reading the DOM (presumably lets the result
        # list finish rendering).
        page.wait_for_timeout(2000)
        return page.evaluate(_EXTRACT_RESULTS_JS)
    finally:
        page.close()


def _extract_emails(text, blacklist):
    """Return lower-cased, de-duplicated e-mail addresses found in *text*.

    Addresses containing any substring from *blacklist* are dropped. Order of
    first appearance is preserved.
    """
    found = (match.lower() for match in _EMAIL_RE.findall(text))
    kept = [addr for addr in found if not any(word in addr for word in blacklist)]
    return list(dict.fromkeys(kept))


def _parse_linkedin_title(title):
    """Split a LinkedIn result title into ``(full_name, company, role)``.

    The title is split on " - " into name, role and company. Returns ``None``
    when no first + last name can be extracted.
    """
    name = title.split(" - ")[0].split(" | ")[0].strip()
    name = re.sub(r'\(.*?\)', '', name).strip()
    name = re.sub(r'\s*LinkedIn$', '', name).strip()
    parts = name.split()
    if len(parts) < 2:
        return None
    first, last = parts[0], " ".join(parts[1:])[:50]
    if last.lower() in _NAME_STOPWORDS:
        return None

    company, role = "", ""
    segments = title.split(" - ")
    if len(segments) >= 3:
        role = segments[1].strip()[:100]
        company = segments[2].split("|")[0].strip()[:100]
    elif len(segments) == 2:
        role = segments[1].strip()[:100]
    return f"{first} {last}", company, role


def _insert_if_new(conn, cur, label, exists_sql, exists_params, insert_sql, insert_params):
    """Insert a row unless *exists_sql* already finds one.

    Both statements run under the same error handling, so a failing statement
    is rolled back and cannot leave the connection in an aborted transaction.

    Returns True when a row was inserted and committed, False otherwise.
    """
    try:
        cur.execute(exists_sql, exists_params)
        if cur.fetchone():
            return False
        cur.execute(insert_sql, insert_params)
        conn.commit()
        return True
    except Exception as exc:
        # Best effort: report the failure and keep scraping.
        conn.rollback()
        print(f"ERR {label}: {exc}")
        return False


def _store_result(conn, cur, result, blacklist):
    """Persist whatever one search result yields.

    Returns a tuple ``(new_leads, new_linkedin, new_contacts)`` with the number
    of rows inserted into weval_leads, linkedin_leads and send_contacts.
    """
    title = result.get("title", "")
    href = result.get("href", "")
    snippet = result.get("snippet", "")
    full = f"{title} {snippet}"
    country = get_country(full)
    emails = _extract_emails(full, blacklist)
    new_leads = new_li = new_contacts = 0

    if "linkedin.com/in/" in href:
        parsed = _parse_linkedin_title(title)
        if parsed is None:
            # As before, a LinkedIn hit without a usable name is skipped
            # entirely, including any e-mail addresses in its snippet.
            return 0, 0, 0
        full_name, company, role = parsed

        if _insert_if_new(
            conn, cur, f"lead {full_name}",
            _LEAD_EXISTS_SQL, (full_name,),
            _LEAD_INSERT_SQL,
            (full_name, company or None, role or None,
             emails[0] if emails else None, country, href),
        ):
            new_leads += 1
            print(f"+LEAD {full_name} | {company} | {role}")

        if _insert_if_new(
            conn, cur, f"linkedin {full_name}",
            _LI_EXISTS_SQL, (full_name,),
            _LI_INSERT_SQL,
            (full_name, company or None, role or None, href, country),
        ):
            new_li += 1

    for email in emails:
        local_part, _, domain = email.partition("@")
        # First dot-separated token of the local part, title-cased, is stored
        # as the contact's first name.
        first_name = local_part.split(".")[0].title()
        if _insert_if_new(
            conn, cur, f"contact {email}",
            _CONTACT_EXISTS_SQL, (email,),
            _CONTACT_INSERT_SQL,
            (email, first_name, domain, country),
        ):
            new_contacts += 1
            print(f"+CONTACT {email}")

    return new_leads, new_li, new_contacts


# Main flow: run each query, store its results, stop at the batch limit.
# The finally clauses release browser, cursor and connection on any exit.
try:
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True, args=["--no-sandbox", "--disable-dev-shm-usage"])
        try:
            ctx = browser.new_context(user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

            for q in QUERIES:
                if t_leads + t_contacts >= batch:
                    break
                try:
                    results = _fetch_results(ctx, q)
                    for r in results:
                        # Enforce the limit per result, not only per query.
                        if t_leads + t_contacts >= batch:
                            break
                        added_leads, added_li, added_contacts = _store_result(conn, cur, r, bl)
                        t_leads += added_leads
                        t_li += added_li
                        t_contacts += added_contacts
                    # Pause between searches.
                    time.sleep(4)
                except Exception as e:
                    # One failing query must not stop the remaining ones.
                    print(f"ERR {q[:30]}: {e}")
        finally:
            browser.close()
finally:
    cur.close()
    conn.close()

print(f"\nDONE: +{t_leads} leads, +{t_li} LinkedIn, +{t_contacts} contacts")