209 lines
8.0 KiB
Python
209 lines
8.0 KiB
Python
#!/usr/bin/env python3
|
|
"""WEVAL Combined Scraper: B2B Leads + LinkedIn + Maghreb Contacts via SearXNG"""
|
|
import json
import os
import re
import sys
import time
import unicodedata
import urllib.parse
import urllib.request

import psycopg2
|
|
|
|
# PostgreSQL connection settings. Every value can be overridden through the
# environment so real credentials do not have to live in the source tree; the
# historical literals stay as defaults, so existing deployments keep working.
# NOTE(review): the default password is committed in clear text — rotate it
# and supply WEVAL_DB_PASSWORD instead.
DB = dict(
    host=os.environ.get("WEVAL_DB_HOST", "10.1.0.3"),
    dbname=os.environ.get("WEVAL_DB_NAME", "adx_system"),
    user=os.environ.get("WEVAL_DB_USER", "admin"),
    password=os.environ.get("WEVAL_DB_PASSWORD", "admin123"),
)

# SearXNG search endpoint (queried with format=json).
SEARXNG = os.environ.get("WEVAL_SEARXNG_URL", "http://localhost:8888/search")

_USAGE = f"usage: {sys.argv[0]} [batch] [all|b2b|contacts]"

# argv[1]: cap on the number of rows inserted per section (default 500).
# A non-numeric value exits with a usage message instead of a raw traceback.
try:
    batch = int(sys.argv[1]) if len(sys.argv) > 1 else 500
except ValueError:
    sys.exit(f"invalid batch size {sys.argv[1]!r}\n{_USAGE}")

# argv[2]: which section(s) to run (default "all"). Unknown values used to be
# accepted and silently ran nothing; reject them up front.
mode = sys.argv[2] if len(sys.argv) > 2 else "all"
if mode not in ("all", "b2b", "contacts"):
    sys.exit(f"unknown mode {mode!r}\n{_USAGE}")
|
|
|
|
def search(q, lang="fr"):
    """Query the SearXNG endpoint and return its list of result entries.

    Args:
        q: Free-text search query.
        lang: Value sent as the ``language`` query parameter (default "fr").

    Returns:
        The ``results`` list of the JSON response, or an empty list when the
        request fails, the body is not a JSON object, or ``results`` is
        missing/null.
    """
    # urlencode escapes every parameter (the language value used to be
    # interpolated raw); quote()/safe="/" keeps the query encoded as before.
    params = urllib.parse.urlencode(
        {"q": q, "format": "json", "language": lang},
        safe="/",
        quote_via=urllib.parse.quote,
    )
    url = f"{SEARXNG}?{params}"
    try:
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(url, timeout=10) as resp:
            payload = json.loads(resp.read())
    except Exception as exc:
        # Best effort: one failed query must not abort the whole run, but the
        # failure is reported, and Ctrl-C / SystemExit are no longer swallowed
        # as they were by the former bare ``except:``.
        print(f"search failed for {q!r}: {exc}", file=sys.stderr)
        return []
    if not isinstance(payload, dict):
        return []
    return payload.get("results") or []
|
|
|
|
# ═══ QUERIES ═══
|
|
# Search phrases used to discover B2B decision-maker profiles. They are run
# in list order, so the order is kept exactly as it was.
B2B_QUERIES = [
    # IT / digital leadership, Morocco
    "directeur systemes information maroc linkedin",
    "DSI casablanca linkedin email",
    "CTO maroc linkedin profil",
    "directeur technique casablanca linkedin",
    "directeur digital maroc linkedin",
    "responsable IT maroc linkedin",
    # Business functions and specialist roles, Morocco
    "directeur supply chain maroc",
    "directeur financier maroc linkedin",
    "chief information officer morocco linkedin",
    "SAP consultant manager maroc",
    "cloud architect maroc linkedin",
    "responsable cybersecurite maroc",
    "directeur achats maroc",
    "directeur operations maroc",
    # Other cities / Maghreb countries
    "DSI rabat linkedin",
    "DSI tunis tunisie linkedin",
    "CTO algerie alger linkedin",
    "ERP project manager maghreb",
    # Infrastructure, security and engineering management
    "responsable infrastructure IT maroc",
    "CISO maroc linkedin",
    "head of IT morocco linkedin",
    "VP technology morocco linkedin",
    "data engineer maroc linkedin",
    "DevOps manager maroc linkedin",
    "product manager maroc linkedin",
    # Industry verticals
    "manufacturing director morocco",
    "pharma director maghreb",
    "life sciences manager maroc",
    # Consulting firms, integrators and startups
    "cabinet conseil IT maroc",
    "SSII ESN maroc directeur",
    "integrateur SAP maroc directeur",
    "startup tech maroc fondateur CEO",
]
|
|
|
|
# Search phrases used to harvest company contact e-mail addresses. They are
# run in list order, so the order is kept exactly as it was.
CONTACT_QUERIES = [
    # Generic company contact pages, Morocco
    "entreprise maroc email contact site officiel",
    "societe casablanca contact email",
    "entreprise IT maroc email",
    "startup maroc email fondateur",
    "PME maroc email contact",
    # Sector-specific, Morocco
    "banque maroc email direction",
    "assurance maroc email contact",
    "industrie maroc email direction",
    "SSII maroc email",
    "ESN maroc email contact",
    "telecom maroc email direction",
    "pharma maroc email",
    "logistique maroc email",
    "BTP maroc email direction",
    # Tunisia and Algeria
    "entreprise tunisie email contact",
    "entreprise algerie email contact",
    # Consulting firms and agencies
    "cabinet conseil maroc email",
    "agence digitale maroc email",
]
|
|
|
|
# One PostgreSQL connection and cursor, shared by every section of the run.
conn = psycopg2.connect(**DB)
cur = conn.cursor()

# Running totals of rows inserted per destination (leads, LinkedIn leads,
# contacts); reported in the final summary line.
t_leads = t_linkedin = t_contacts = 0

# An extracted e-mail address is discarded when it contains any of these
# substrings.
bl = [
    "google",
    "facebook",
    "linkedin",
    "wikipedia",
    "youtube",
    "twitter",
    "instagram",
    "example",
    "gov.ma",
    "noreply",
]
|
|
|
|
def extract_country(text):
    """Guess a two-letter country code from free text.

    The text is lower-cased and stripped of diacritics before matching, so
    accented French spellings ("Algérie", "Algérien") are recognised; they
    previously fell through to the default because "alger" is not a substring
    of "algérie".

    Args:
        text: Any text (title, snippet, domain, ...). Empty or None is allowed.

    Returns:
        "TN" when the text mentions Tunis/Tunisia/Tunisie, otherwise "DZ"
        when it mentions Alger/Algeria/Algérie, otherwise "MA" (the default).
    """
    # NFKD splits "é" into "e" + a combining accent; dropping the combining
    # marks leaves plain ASCII letters to match against.
    decomposed = unicodedata.normalize("NFKD", text or "")
    folded = "".join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    ).lower()

    # "tunis" also covers "tunisia"/"tunisie"; "alger" also covers
    # "algeria"/"algerie". Tunisia is tested first, as before.
    if "tunis" in folded:
        return "TN"
    if "alger" in folded:
        return "DZ"
    return "MA"
|
|
|
|
# ═══ B2B LEADS ═══
|
|
# Patterns are compiled once instead of on every search result.
_LEAD_NAME_RE = re.compile(r'([A-Z][a-z]{2,}(?:\s+[A-Z][a-z]{2,}){1,2})')
_LEAD_EMAIL_RE = re.compile(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}')
_LEAD_PAREN_RE = re.compile(r'\s*\(.*?\)')
# A candidate whose "surname" is one of these words is a false positive of
# the name pattern (e.g. "View Profile"), not a person.
_LEAD_BAD_SURNAMES = ("linkedin", "maroc", "morocco", "profile", "view", "join")


def _lead_names(title, full, is_li):
    """Return candidate person names found in a search hit.

    Capitalised 2-3 word sequences in ``full`` are preferred; for a LinkedIn
    profile hit with no such match, the leading segment of ``title`` (before
    " - " or " | ", parenthesised text removed) is used when it has at least
    two words.
    """
    names = _LEAD_NAME_RE.findall(full)
    if not names and is_li:
        candidate = title.split(" - ")[0].split(" | ")[0].strip()
        candidate = _LEAD_PAREN_RE.sub('', candidate).strip()
        if len(candidate.split()) >= 2:
            names = [candidate]
    return names


def _lead_role_company(full):
    """Split ``full`` on the first separator it contains into (role, company).

    Separators are tried in order; each value is truncated to 100 characters
    and is "" when it cannot be determined.
    """
    for sep in (" - ", " | ", " chez ", " at "):
        pieces = full.split(sep)
        if len(pieces) >= 3:
            role = pieces[1].strip()[:100]
            company = pieces[2].split(" - ")[0].split(" | ")[0].strip()[:100]
            return role, company
        if len(pieces) == 2:
            return pieces[1].strip()[:100], ""
    return "", ""


# For every B2B query, turn each search hit into candidate people and insert
# those not already known into admin.weval_leads; hits that are LinkedIn
# profile pages are mirrored into admin.linkedin_leads.
if mode in ("all", "b2b"):
    for q in B2B_QUERIES:
        if t_leads >= batch:
            break
        for r in search(q):
            # The cap used to be tested only between queries, so a single
            # query could overshoot it; test it per result and per name too.
            if t_leads >= batch:
                break
            # ``or ""`` keeps a null field from turning into the text "None".
            title = r.get("title") or ""
            url = r.get("url") or ""
            snippet = r.get("content") or ""
            full = f"{title} {snippet}"

            # LinkedIn personal profile pages carry "/in/" in their URL.
            is_li = "linkedin.com/in/" in url

            names = _lead_names(title, full, is_li)
            emails = [
                e for e in _LEAD_EMAIL_RE.findall(full)
                if not any(x in e.lower() for x in bl)
            ]
            role, company = _lead_role_company(full)
            country = extract_country(full)

            # At most two candidate names are considered per hit.
            for name in names[:2]:
                if t_leads >= batch:
                    break
                parts = name.split()
                if len(parts) < 2:
                    continue
                first, last = parts[0], " ".join(parts[1:])
                if last.lower() in _LEAD_BAD_SURNAMES:
                    continue

                # Skip people already stored (case-insensitive name match).
                cur.execute("SELECT 1 FROM admin.weval_leads WHERE LOWER(first_name)=%s AND LOWER(last_name)=%s LIMIT 1",
                            (first.lower(), last.lower()))
                if cur.fetchone():
                    continue

                email = emails[0] if emails else None
                li_url = url if is_li else None

                try:
                    cur.execute("""INSERT INTO admin.weval_leads
                        (first_name,last_name,email,company,title,country,linkedin_url,source,created_at)
                        VALUES(%s,%s,%s,%s,%s,%s,%s,'searxng_b2b',NOW())""",
                                (first, last, email, company or None, role or None, country, li_url))
                    conn.commit()
                except Exception as exc:
                    # Best effort per row: undo the failed statement, report
                    # it and move on. Unlike the former bare ``except:`` this
                    # lets Ctrl-C / SystemExit propagate.
                    conn.rollback()
                    print(f"!LEAD {first} {last}: {str(exc).strip()}", file=sys.stderr)
                else:
                    t_leads += 1
                    print(f"+LEAD {first} {last} | {company} | {role} | {country}")

                # Also add to linkedin_leads if LinkedIn URL
                if is_li:
                    cur.execute("SELECT 1 FROM admin.linkedin_leads WHERE LOWER(lead_name)=LOWER(%s) LIMIT 1",
                                (f"{first} {last}",))
                    if not cur.fetchone():
                        try:
                            cur.execute("""INSERT INTO admin.linkedin_leads
                                (lead_name,lead_company,lead_title,lead_linkedin_url,lead_country,captured_at)
                                VALUES(%s,%s,%s,%s,%s,NOW())""",
                                        (f"{first} {last}", company or None, role or None, url, country))
                            conn.commit()
                        except Exception as exc:
                            conn.rollback()
                            print(f"!LINKEDIN {first} {last}: {str(exc).strip()}", file=sys.stderr)
                        else:
                            t_linkedin += 1
        if t_leads >= batch:
            break
        # Pause between queries so the search backend is not hammered.
        time.sleep(2)
|
|
|
|
# ═══ MAGHREB CONTACTS ═══
|
|
# Compiled once instead of on every search result.
_CONTACT_EMAIL_RE = re.compile(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}')

# For every contact query, pull e-mail addresses out of each hit's title and
# snippet and insert the ones not already present into admin.send_contacts.
if mode in ("all", "contacts"):
    for q in CONTACT_QUERIES:
        if t_contacts >= batch:
            break
        for r in search(q):
            # The cap used to be tested only between queries, so a single
            # query could overshoot it; test it per result and per address.
            if t_contacts >= batch:
                break
            # ``or ""`` keeps a null field from turning into the text "None".
            text = f"{r.get('title') or ''} {r.get('content') or ''}"
            emails = {
                e.lower()
                for e in _CONTACT_EMAIL_RE.findall(text)
                if not any(x in e.lower() for x in bl) and len(e) < 100
            }

            # sorted() makes the insert/log order deterministic.
            for email in sorted(emails):
                if t_contacts >= batch:
                    break
                cur.execute("SELECT 1 FROM admin.send_contacts WHERE email=%s LIMIT 1", (email,))
                if cur.fetchone():
                    continue

                # The pattern cannot match a second "@", so this split is
                # unambiguous.
                local, _, domain = email.partition("@")
                country = extract_country(domain + " " + text)
                # First word of the local part ("jean.dupont" -> "Jean") is
                # used as a best-guess first name.
                words = local.replace(".", " ").replace("_", " ").split()
                first = words[0].title() if words else None

                try:
                    cur.execute("""INSERT INTO admin.send_contacts
                        (email,first_name,domain,country,source,status,created_at)
                        VALUES(%s,%s,%s,%s,'searxng_maghreb','active',NOW())""",
                                (email, first, domain, country))
                    conn.commit()
                except Exception as exc:
                    # Best effort per row: undo the failed statement, report
                    # it and move on. Unlike the former bare ``except:`` this
                    # lets Ctrl-C / SystemExit propagate.
                    conn.rollback()
                    print(f"!CONTACT {email}: {str(exc).strip()}", file=sys.stderr)
                else:
                    t_contacts += 1
                    print(f"+CONTACT {email} ({country})")
        if t_contacts >= batch:
            break
        # Pause between queries so the search backend is not hammered.
        time.sleep(2)
|
|
|
|
# Release the cursor first, then the connection, and report the run totals.
for handle in (cur, conn):
    handle.close()

summary = f"\nWEVAL_SCRAPER: +{t_leads} B2B leads, +{t_linkedin} LinkedIn, +{t_contacts} contacts"
print(summary)
|