Files
html/api/weval_scraper.py
2026-04-12 22:57:03 +02:00

209 lines
8.0 KiB
Python

#!/usr/bin/env python3
"""WEVAL Combined Scraper: B2B Leads + LinkedIn + Maghreb Contacts via SearXNG"""
import re, sys, time, json, psycopg2, urllib.request, urllib.parse
import os

# Runtime configuration.
#
# Every value can be overridden through the environment so that credentials
# and endpoints do not have to be edited in the source; the fallbacks are the
# values the script has always used, so existing deployments are unaffected.
# NOTE(review): the fallback password is still committed in clear text --
# rotate it and drop the default once WEVAL_DB_PASSWORD is set everywhere.
DB = dict(
    host=os.environ.get("WEVAL_DB_HOST", "10.1.0.3"),
    dbname=os.environ.get("WEVAL_DB_NAME", "adx_system"),
    user=os.environ.get("WEVAL_DB_USER", "admin"),
    password=os.environ.get("WEVAL_DB_PASSWORD", "admin123"),
)
SEARXNG = os.environ.get("WEVAL_SEARXNG_URL", "http://localhost:8888/search")

# argv[1]: upper bound on rows inserted per pass (default 500).
# A non-numeric value falls back to the default with a warning instead of
# aborting the whole run with a traceback.
try:
    batch = int(sys.argv[1]) if len(sys.argv) > 1 else 500
except ValueError:
    print(f"WARN invalid batch size {sys.argv[1]!r}; using 500", file=sys.stderr)
    batch = 500

# argv[2]: which pass to run -- "b2b", "contacts" or "all" (default).
mode = sys.argv[2] if len(sys.argv) > 2 else "all"
def search(q, lang="fr"):
    """Query the SearXNG JSON endpoint and return its result list.

    Args:
        q: Free-text search query; URL-encoded before being sent.
        lang: Value of the ``language`` query parameter (default ``"fr"``).

    Returns:
        The ``results`` list of the JSON response, or ``[]`` when the request
        fails, times out, or the payload does not have the expected shape.
    """
    query = urllib.parse.quote(q)
    language = urllib.parse.quote(lang, safe="")
    url = f"{SEARXNG}?q={query}&format=json&language={language}"
    try:
        # The context manager closes the response even when parsing fails.
        with urllib.request.urlopen(url, timeout=10) as resp:
            payload = json.loads(resp.read())
        results = payload.get("results", [])
    except Exception as exc:
        # Best-effort lookup: one failed query must not stop the run, but
        # KeyboardInterrupt/SystemExit are no longer swallowed and the
        # failure is reported instead of being silently dropped.
        print(f"WARN search failed for {q!r}: {exc}", file=sys.stderr)
        return []
    # Callers iterate the return value, so never hand back null/non-list.
    return results if isinstance(results, list) else []
# ═══ QUERIES ═══
# Search phrases iterated by the B2B leads pass; each one is handed to
# search() and the returned titles/snippets are mined for names, roles,
# companies and LinkedIn profile URLs.
B2B_QUERIES = [
"directeur systemes information maroc linkedin",
"DSI casablanca linkedin email",
"CTO maroc linkedin profil",
"directeur technique casablanca linkedin",
"directeur digital maroc linkedin",
"responsable IT maroc linkedin",
"directeur supply chain maroc",
"directeur financier maroc linkedin",
"chief information officer morocco linkedin",
"SAP consultant manager maroc",
"cloud architect maroc linkedin",
"responsable cybersecurite maroc",
"directeur achats maroc",
"directeur operations maroc",
"DSI rabat linkedin",
"DSI tunis tunisie linkedin",
"CTO algerie alger linkedin",
"ERP project manager maghreb",
"responsable infrastructure IT maroc",
"CISO maroc linkedin",
"head of IT morocco linkedin",
"VP technology morocco linkedin",
"data engineer maroc linkedin",
"DevOps manager maroc linkedin",
"product manager maroc linkedin",
"manufacturing director morocco",
"pharma director maghreb",
"life sciences manager maroc",
"cabinet conseil IT maroc",
"SSII ESN maroc directeur",
"integrateur SAP maroc directeur",
"startup tech maroc fondateur CEO",
]
# Search phrases iterated by the contacts pass; the returned titles/snippets
# are scanned for e-mail addresses only.
CONTACT_QUERIES = [
"entreprise maroc email contact site officiel",
"societe casablanca contact email",
"entreprise IT maroc email",
"startup maroc email fondateur",
"PME maroc email contact",
"banque maroc email direction",
"assurance maroc email contact",
"industrie maroc email direction",
"SSII maroc email",
"ESN maroc email contact",
"telecom maroc email direction",
"pharma maroc email",
"logistique maroc email",
"BTP maroc email direction",
"entreprise tunisie email contact",
"entreprise algerie email contact",
"cabinet conseil maroc email",
"agence digitale maroc email",
]
# One PostgreSQL connection and cursor shared by every pass of the script.
conn = psycopg2.connect(**DB)
cur = conn.cursor()

# Running totals of rows inserted, reported in the final summary line.
t_leads = t_linkedin = t_contacts = 0

# An e-mail address containing any of these substrings is discarded.
bl = [
    "google",
    "facebook",
    "linkedin",
    "wikipedia",
    "youtube",
    "twitter",
    "instagram",
    "example",
    "gov.ma",
    "noreply",
]
def extract_country(text):
    """Guess a Maghreb country code from free text.

    Matching is case-insensitive and accent-insensitive, so French spellings
    such as "Algérie" are recognised as well as "Algerie"/"Algeria".

    Args:
        text: Title, snippet, domain or any other text to inspect.

    Returns:
        "TN" when a Tunisia marker is found, "DZ" when an Algeria marker is
        found, otherwise "MA" (the default, also used for empty/None input).
    """
    # Function-scope import keeps this block independent of the module's
    # import line.
    import unicodedata

    if not text:
        return "MA"
    # Decompose accented letters and drop the combining marks so that
    # "é" compares equal to "e" before the substring checks.
    decomposed = unicodedata.normalize("NFKD", text)
    folded = "".join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    ).lower()
    if any(marker in folded for marker in ("tunis", "tunisia")):
        return "TN"
    if any(marker in folded for marker in ("alger", "algeria")):
        return "DZ"
    return "MA"
# ═══ B2B LEADS ═══
# Patterns and constants are built once instead of on every search result.
_B2B_NAME_RE = re.compile(r'([A-Z][a-z]{2,}(?:\s+[A-Z][a-z]{2,}){1,2})')
_B2B_EMAIL_RE = re.compile(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}')
_B2B_PAREN_RE = re.compile(r'\s*\(.*?\)')
_B2B_SEPARATORS = (" - ", " | ", " chez ", " at ")
# "Surnames" that are really page furniture picked up by the name pattern.
_B2B_NOT_SURNAMES = frozenset(
    ["linkedin", "maroc", "morocco", "profile", "view", "join"]
)


def _b2b_names(title, full, is_li):
    """Return candidate person names for one search result.

    Capitalised 2-3 word sequences found in ``full`` win; for LinkedIn
    profile hits with no such match, the leading segment of the page title
    (minus any parenthesised suffix) is used if it has at least two words.
    """
    names = _B2B_NAME_RE.findall(full)
    if names or not is_li:
        return names
    candidate = title.split(" - ")[0].split(" | ")[0].strip()
    candidate = _B2B_PAREN_RE.sub('', candidate).strip()
    return [candidate] if len(candidate.split()) >= 2 else []


def _b2b_role_company(full):
    """Split ``full`` on the first separator that occurs; return (role, company).

    Both values are trimmed to 100 characters; either may be "".
    """
    for sep in _B2B_SEPARATORS:
        pieces = full.split(sep)
        if len(pieces) >= 3:
            role = pieces[1].strip()[:100]
            company = pieces[2].split(" - ")[0].split(" | ")[0].strip()[:100]
            return role, company
        if len(pieces) == 2:
            return pieces[1].strip()[:100], ""
    return "", ""


if mode in ("all", "b2b"):
    for q in B2B_QUERIES:
        if t_leads >= batch:
            break
        for r in search(q):
            # Enforce the batch limit per result, not only per query, so a
            # single large result page cannot overshoot it.
            if t_leads >= batch:
                break
            # `or ""` also covers keys that are present with a null value.
            title = r.get("title") or ""
            url = r.get("url") or ""
            snippet = r.get("content") or ""
            full = f"{title} {snippet}"
            # LinkedIn profiles
            is_li = "linkedin.com/in/" in url
            emails = [
                e for e in _B2B_EMAIL_RE.findall(full)
                if not any(x in e.lower() for x in bl)
            ]
            role, company = _b2b_role_company(full)
            country = extract_country(full)
            # At most two names are taken from a single result.
            for name in _b2b_names(title, full, is_li)[:2]:
                if t_leads >= batch:
                    break
                parts = name.split()
                if len(parts) < 2:
                    continue
                first, last = parts[0], " ".join(parts[1:])
                if last.lower() in _B2B_NOT_SURNAMES:
                    continue
                # Skip people already stored (case-insensitive name match).
                cur.execute("SELECT 1 FROM admin.weval_leads WHERE LOWER(first_name)=%s AND LOWER(last_name)=%s LIMIT 1",
                            (first.lower(), last.lower()))
                if cur.fetchone():
                    continue
                email = emails[0] if emails else None
                li_url = url if is_li else None
                try:
                    cur.execute("""INSERT INTO admin.weval_leads
(first_name,last_name,email,company,title,country,linkedin_url,source,created_at)
VALUES(%s,%s,%s,%s,%s,%s,%s,'searxng_b2b',NOW())""",
                                (first, last, email, company or None, role or None, country, li_url))
                    conn.commit()
                    t_leads += 1
                    print(f"+LEAD {first} {last} | {company} | {role} | {country}")
                except psycopg2.Error as exc:
                    # Only database errors are tolerated; they are reported
                    # and the failed transaction is rolled back so the
                    # connection stays usable for the next row.
                    conn.rollback()
                    print(f"WARN lead insert failed for {first} {last}: {exc}", file=sys.stderr)
                # Also add to linkedin_leads if LinkedIn URL
                if is_li:
                    cur.execute("SELECT 1 FROM admin.linkedin_leads WHERE LOWER(lead_name)=LOWER(%s) LIMIT 1",
                                (f"{first} {last}",))
                    if not cur.fetchone():
                        try:
                            cur.execute("""INSERT INTO admin.linkedin_leads
(lead_name,lead_company,lead_title,lead_linkedin_url,lead_country,captured_at)
VALUES(%s,%s,%s,%s,%s,NOW())""",
                                        (f"{first} {last}", company or None, role or None, url, country))
                            conn.commit()
                            t_linkedin += 1
                        except psycopg2.Error as exc:
                            conn.rollback()
                            print(f"WARN linkedin insert failed for {first} {last}: {exc}", file=sys.stderr)
        # Pause between consecutive search queries.
        time.sleep(2)
# ═══ MAGHREB CONTACTS ═══
# Compiled once; applied to the title + snippet of every search result.
_CONTACT_EMAIL_RE = re.compile(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}')

if mode in ("all", "contacts"):
    for q in CONTACT_QUERIES:
        if t_contacts >= batch:
            break
        for r in search(q):
            # Enforce the batch limit per result, not only per query.
            if t_contacts >= batch:
                break
            # `or ''` keeps a null title/content from becoming the text "None".
            text = f"{r.get('title') or ''} {r.get('content') or ''}"
            # Lower-cased, de-duplicated addresses that are not blacklisted.
            emails = {
                e.lower()
                for e in _CONTACT_EMAIL_RE.findall(text)
                if len(e) < 100 and not any(x in e.lower() for x in bl)
            }
            # Sorted only to make the insertion order reproducible.
            for email in sorted(emails):
                if t_contacts >= batch:
                    break
                cur.execute("SELECT 1 FROM admin.send_contacts WHERE email=%s LIMIT 1", (email,))
                if cur.fetchone():
                    continue
                # The pattern allows exactly one "@" per match.
                local_part, _, domain = email.partition("@")
                country = extract_country(domain + " " + text)
                # First token of the local part, title-cased, as a first-name
                # guess (e.g. "john.doe" -> "John"); None if nothing is left.
                tokens = local_part.replace(".", " ").replace("_", " ").split()
                first = tokens[0].title() if tokens else None
                try:
                    cur.execute("""INSERT INTO admin.send_contacts
(email,first_name,domain,country,source,status,created_at)
VALUES(%s,%s,%s,%s,'searxng_maghreb','active',NOW())""",
                                (email, first, domain, country))
                    conn.commit()
                    t_contacts += 1
                    print(f"+CONTACT {email} ({country})")
                except psycopg2.Error as exc:
                    # Only database errors are tolerated; report them and
                    # roll back so the connection stays usable.
                    conn.rollback()
                    print(f"WARN contact insert failed for {email}: {exc}", file=sys.stderr)
        # Pause between consecutive search queries.
        time.sleep(2)
# Release the database handles (cursor first, then the connection) and
# print the totals for this run.
for _handle in (cur, conn):
    _handle.close()
_summary = f"\nWEVAL_SCRAPER: +{t_leads} B2B leads, +{t_linkedin} LinkedIn, +{t_contacts} contacts"
print(_summary)