#!/usr/bin/env python3
"""WEVAL Combined Scraper: B2B Leads + LinkedIn + Maghreb Contacts via SearXNG.

Usage: <script> [batch] [mode]

batch  maximum number of rows to add per section (default 500)
mode   "all" (default), "b2b" or "contacts"

The script queries a SearXNG instance with a fixed list of search strings,
extracts person names / e-mail addresses from result titles and snippets with
regular expressions, and inserts rows that are not already present into
admin.weval_leads, admin.linkedin_leads and admin.send_contacts.
"""
import http.client
import json
import re
import sys
import time
import urllib.parse
import urllib.request

# NOTE(security): the database credentials are hard-coded in the source.
# They are left untouched so the script keeps working as deployed, but they
# should be moved to configuration outside version control.
DB = dict(host="10.1.0.3", dbname="adx_system", user="admin", password="admin123")
SEARXNG = "http://localhost:8888/search"

DEFAULT_BATCH = 500
QUERY_DELAY_SECONDS = 2  # pause after each search query

# ═══ QUERIES ═══
B2B_QUERIES = [
    "directeur systemes information maroc linkedin",
    "DSI casablanca linkedin email",
    "CTO maroc linkedin profil",
    "directeur technique casablanca linkedin",
    "directeur digital maroc linkedin",
    "responsable IT maroc linkedin",
    "directeur supply chain maroc",
    "directeur financier maroc linkedin",
    "chief information officer morocco linkedin",
    "SAP consultant manager maroc",
    "cloud architect maroc linkedin",
    "responsable cybersecurite maroc",
    "directeur achats maroc",
    "directeur operations maroc",
    "DSI rabat linkedin",
    "DSI tunis tunisie linkedin",
    "CTO algerie alger linkedin",
    "ERP project manager maghreb",
    "responsable infrastructure IT maroc",
    "CISO maroc linkedin",
    "head of IT morocco linkedin",
    "VP technology morocco linkedin",
    "data engineer maroc linkedin",
    "DevOps manager maroc linkedin",
    "product manager maroc linkedin",
    "manufacturing director morocco",
    "pharma director maghreb",
    "life sciences manager maroc",
    "cabinet conseil IT maroc",
    "SSII ESN maroc directeur",
    "integrateur SAP maroc directeur",
    "startup tech maroc fondateur CEO",
]

CONTACT_QUERIES = [
    "entreprise maroc email contact site officiel",
    "societe casablanca contact email",
    "entreprise IT maroc email",
    "startup maroc email fondateur",
    "PME maroc email contact",
    "banque maroc email direction",
    "assurance maroc email contact",
    "industrie maroc email direction",
    "SSII maroc email",
    "ESN maroc email contact",
    "telecom maroc email direction",
    "pharma maroc email",
    "logistique maroc email",
    "BTP maroc email direction",
    "entreprise tunisie email contact",
    "entreprise algerie email contact",
    "cabinet conseil maroc email",
    "agence digitale maroc email",
]

# An e-mail address containing any of these substrings is discarded.
EMAIL_BLOCKLIST = (
    "google", "facebook", "linkedin", "wikipedia", "youtube",
    "twitter", "instagram", "example", "gov.ma", "noreply",
)

# A "last name" equal to one of these is search-result boilerplate, not a person.
NAME_STOPWORDS = frozenset(
    ["linkedin", "maroc", "morocco", "profile", "view", "join"]
)

# Separators tried, in order, to split "<name><sep><role><sep><company>".
ROLE_SEPARATORS = (" - ", " | ", " chez ", " at ")

# Compiled once; these are used inside the per-result loops.
NAME_RE = re.compile(r'([A-Z][a-z]{2,}(?:\s+[A-Z][a-z]{2,}){1,2})')
EMAIL_RE = re.compile(r'[\w.+-]+@[\w.-]+\.[a-z]{2,}')
PAREN_RE = re.compile(r'\s*\(.*?\)')

LEAD_EXISTS_SQL = (
    "SELECT 1 FROM admin.weval_leads "
    "WHERE LOWER(first_name)=%s AND LOWER(last_name)=%s LIMIT 1"
)
LEAD_INSERT_SQL = (
    "INSERT INTO admin.weval_leads "
    "(first_name,last_name,email,company,title,country,linkedin_url,source,created_at) "
    "VALUES(%s,%s,%s,%s,%s,%s,%s,'searxng_b2b',NOW())"
)
LINKEDIN_EXISTS_SQL = (
    "SELECT 1 FROM admin.linkedin_leads WHERE LOWER(lead_name)=LOWER(%s) LIMIT 1"
)
LINKEDIN_INSERT_SQL = (
    "INSERT INTO admin.linkedin_leads "
    "(lead_name,lead_company,lead_title,lead_linkedin_url,lead_country,captured_at) "
    "VALUES(%s,%s,%s,%s,%s,NOW())"
)
CONTACT_EXISTS_SQL = "SELECT 1 FROM admin.send_contacts WHERE email=%s LIMIT 1"
CONTACT_INSERT_SQL = (
    "INSERT INTO admin.send_contacts "
    "(email,first_name,domain,country,source,status,created_at) "
    "VALUES(%s,%s,%s,%s,'searxng_maghreb','active',NOW())"
)


def search(q, lang="fr"):
    """Run query *q* against SearXNG and return its list of result dicts.

    Search is best-effort: on a network/HTTP failure, an unparsable body or
    an unexpected JSON shape, a message is written to stderr (for request
    failures) and an empty list is returned so the caller moves on to the
    next query.
    """
    url = f"{SEARXNG}?q={urllib.parse.quote(q)}&format=json&language={lang}"
    try:
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(url, timeout=10) as resp:
            payload = json.loads(resp.read())
    except (OSError, ValueError, http.client.HTTPException) as exc:
        # OSError covers URLError/HTTPError/timeouts; ValueError covers
        # invalid JSON and undecodable bytes.
        print(f"!SEARCH failed for {q!r}: {exc}", file=sys.stderr)
        return []
    if not isinstance(payload, dict):
        return []
    results = payload.get("results", [])
    if not isinstance(results, list):
        return []
    # Callers use dict access on every item; drop anything else.
    return [item for item in results if isinstance(item, dict)]


def extract_country(text):
    """Return "TN", "DZ" or "MA" based on keywords found in *text*.

    Tunisia keywords are tested first, then Algeria; "MA" is the default
    when neither matches.
    """
    lowered = text.lower()
    if any(key in lowered for key in ("tunis", "tunisia")):
        return "TN"
    if any(key in lowered for key in ("alger", "algeria")):
        return "DZ"
    return "MA"


def find_emails(text):
    """Return e-mail addresses found in *text*, in order, minus blocklisted ones."""
    return [
        addr
        for addr in EMAIL_RE.findall(text)
        if not any(blocked in addr.lower() for blocked in EMAIL_BLOCKLIST)
    ]


def find_contact_emails(text):
    """Return the sorted, lower-cased, de-duplicated addresses in *text*.

    Addresses of 100 characters or more are dropped. Sorting makes the
    insertion order (and therefore which rows fit under the batch limit)
    deterministic.
    """
    return sorted({addr.lower() for addr in find_emails(text) if len(addr) < 100})


def find_names(title, full, is_li):
    """Return candidate person names for one search result.

    Names are runs of two or three capitalised ASCII words found in *full*.
    When none are found and the result is a LinkedIn profile (*is_li*), the
    leading segment of *title* (before " - " / " | ", with any parenthesised
    text removed) is used if it has at least two words.
    """
    names = NAME_RE.findall(full)
    if names or not is_li:
        return names
    candidate = title.split(" - ")[0].split(" | ")[0].strip()
    candidate = PAREN_RE.sub('', candidate).strip()
    return [candidate] if len(candidate.split()) >= 2 else []


def split_role_company(full):
    """Return ``(role, company)`` parsed from *full*; each may be ``""``.

    The first separator in ROLE_SEPARATORS that actually splits the text
    decides the outcome: three or more pieces yield role and company, exactly
    two pieces yield only a role. Both values are truncated to 100 characters.
    """
    for sep in ROLE_SEPARATORS:
        pieces = full.split(sep)
        if len(pieces) >= 3:
            role = pieces[1].strip()[:100]
            company = pieces[2].split(" - ")[0].split(" | ")[0].strip()[:100]
            return role, company
        if len(pieces) == 2:
            return pieces[1].strip()[:100], ""
    return "", ""


def split_full_name(name):
    """Split *name* into ``(first, last)``; return None if it is not usable.

    A name is rejected when it has fewer than two words or when everything
    after the first word is one of NAME_STOPWORDS.
    """
    parts = name.split()
    if len(parts) < 2:
        return None
    first, last = parts[0], " ".join(parts[1:])
    if last.lower() in NAME_STOPWORDS:
        return None
    return first, last


def contact_first_name(local_part):
    """Derive a title-cased first name from an e-mail local part, or None."""
    words = local_part.replace(".", " ").replace("_", " ").split()
    return words[0].title() if words else None


def parse_lead_result(result):
    """Extract lead fields from one SearXNG result dict.

    Returns a dict with keys ``names`` (list), ``email`` (str or None),
    ``company`` and ``role`` (str, possibly empty), ``country``,
    ``url``, ``is_linkedin`` (bool) and ``linkedin_url`` (str or None).
    """
    # `or ""` guards against keys that are present but null in the JSON.
    title = result.get("title") or ""
    url = result.get("url") or ""
    snippet = result.get("content") or ""
    full = f"{title} {snippet}"

    is_li = "linkedin.com/in/" in url
    emails = find_emails(full)
    role, company = split_role_company(full)
    return {
        "names": find_names(title, full, is_li),
        "email": emails[0] if emails else None,
        "company": company,
        "role": role,
        "country": extract_country(full),
        "url": url,
        "is_linkedin": is_li,
        "linkedin_url": url if is_li else None,
    }


def _row_exists(cur, sql, params):
    """Return True when the existence query *sql* yields a row."""
    cur.execute(sql, params)
    return cur.fetchone() is not None


def _run_insert(conn, cur, sql, params):
    """Execute one INSERT and commit it; return True on success.

    A database error rolls the transaction back, is reported on stderr and
    yields False, so a single bad row does not stop the run. Anything that is
    not a database error (e.g. KeyboardInterrupt) propagates.
    """
    # Local import: keeps the parsing helpers importable without the driver.
    import psycopg2

    try:
        cur.execute(sql, params)
        conn.commit()
    except psycopg2.Error as exc:
        conn.rollback()
        print(f"!DB insert failed: {str(exc).strip()}", file=sys.stderr)
        return False
    return True


def _store_lead_result(conn, cur, result, remaining):
    """Insert the leads found in one search result.

    At most *remaining* rows are added to admin.weval_leads. Returns
    ``(leads_added, linkedin_added)``.
    """
    lead = parse_lead_result(result)
    company, role, country = lead["company"], lead["role"], lead["country"]
    leads_added = linkedin_added = 0

    # Only the first two name candidates of a result are considered.
    for name in lead["names"][:2]:
        if leads_added >= remaining:
            break
        split = split_full_name(name)
        if split is None:
            continue
        first, last = split
        if _row_exists(cur, LEAD_EXISTS_SQL, (first.lower(), last.lower())):
            continue

        lead_params = (
            first, last, lead["email"], company or None, role or None,
            country, lead["linkedin_url"],
        )
        if _run_insert(conn, cur, LEAD_INSERT_SQL, lead_params):
            leads_added += 1
            print(f"+LEAD {first} {last} | {company} | {role} | {country}")

        # Also add to linkedin_leads if LinkedIn URL
        if not lead["is_linkedin"]:
            continue
        full_name = f"{first} {last}"
        if _row_exists(cur, LINKEDIN_EXISTS_SQL, (full_name,)):
            continue
        linkedin_params = (
            full_name, company or None, role or None, lead["url"], country,
        )
        if _run_insert(conn, cur, LINKEDIN_INSERT_SQL, linkedin_params):
            linkedin_added += 1
    return leads_added, linkedin_added


def _scrape_b2b(conn, cur, batch):
    """Run every B2B query; return ``(leads_added, linkedin_added)``."""
    leads = linkedin = 0
    for query in B2B_QUERIES:
        if leads >= batch:
            break
        for result in search(query):
            # Enforce the limit per result, not only once per query.
            if leads >= batch:
                break
            added, li_added = _store_lead_result(conn, cur, result, batch - leads)
            leads += added
            linkedin += li_added
        time.sleep(QUERY_DELAY_SECONDS)
    return leads, linkedin


def _store_contact_result(conn, cur, result, remaining):
    """Insert the e-mail contacts found in one search result.

    At most *remaining* rows are added. Returns the number of rows added.
    """
    text = f"{result.get('title') or ''} {result.get('content') or ''}"
    added = 0
    for email in find_contact_emails(text):
        if added >= remaining:
            break
        if _row_exists(cur, CONTACT_EXISTS_SQL, (email,)):
            continue
        local_part, domain = email.split("@")[0], email.split("@")[1]
        country = extract_country(domain + " " + text)
        params = (email, contact_first_name(local_part), domain, country)
        if _run_insert(conn, cur, CONTACT_INSERT_SQL, params):
            added += 1
            print(f"+CONTACT {email} ({country})")
    return added


def _scrape_contacts(conn, cur, batch):
    """Run every contact query; return the number of contacts added."""
    contacts = 0
    for query in CONTACT_QUERIES:
        if contacts >= batch:
            break
        for result in search(query):
            if contacts >= batch:
                break
            contacts += _store_contact_result(conn, cur, result, batch - contacts)
        time.sleep(QUERY_DELAY_SECONDS)
    return contacts


def main(argv=None):
    """Entry point: parse ``[batch] [mode]``, scrape, print a summary.

    *argv* defaults to ``sys.argv[1:]``. A non-integer batch raises
    ValueError, as ``int()`` does.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    batch = int(args[0]) if len(args) > 0 else DEFAULT_BATCH
    mode = args[1] if len(args) > 1 else "all"

    import psycopg2

    t_leads = t_linkedin = t_contacts = 0
    conn = psycopg2.connect(**DB)
    try:
        cur = conn.cursor()
        try:
            # ═══ B2B LEADS ═══
            if mode in ("all", "b2b"):
                t_leads, t_linkedin = _scrape_b2b(conn, cur, batch)
            # ═══ MAGHREB CONTACTS ═══
            if mode in ("all", "contacts"):
                t_contacts = _scrape_contacts(conn, cur, batch)
        finally:
            cur.close()
    finally:
        # Always release the connection, even when a query raises.
        conn.close()

    print(f"\nWEVAL_SCRAPER: +{t_leads} B2B leads, +{t_linkedin} LinkedIn, +{t_contacts} contacts")


if __name__ == "__main__":
    main()