63 lines
2.6 KiB
Python
63 lines
2.6 KiB
Python
#!/usr/bin/env python3
|
|
"""WEVAL Playwright Kompass Scraper — headless directory scraping"""
|
|
import re, sys, time, psycopg2
|
|
from playwright.sync_api import sync_playwright
|
|
|
|
# NOTE(security): database credentials are hard-coded in source. They are kept
# as-is for backward compatibility, but should be moved to environment
# variables or a secrets store.
DB = dict(host='10.1.0.3', dbname='adx_system', user='admin', password='admin123')

# Upper-case country code accepted on the command line -> Kompass URL segment.
COUNTRY_MAP = {'MA': 'ma', 'DZ': 'dz', 'TN': 'tn'}

# Search terms; each one is also stored as the lead's `industry`.
SECTORS = ['informatique', 'logiciel', 'erp', 'cloud', 'pharmaceutique',
           'banque', 'telecom', 'energie', 'logistique', 'chimie']

DEFAULT_COUNTRY = 'MA'
DEFAULT_MAX_PAGES = 20
BASE_URL = "https://www.kompass.com"


def parse_args(argv):
    """Return ``(country, max_pages)`` parsed from a ``sys.argv``-style list.

    ``argv[1]`` is the country code (default ``'MA'``) and ``argv[2]`` the
    maximum number of result pages per sector (default 20).

    The country code is upper-cased. A code missing from ``COUNTRY_MAP``
    falls back to ``'MA'`` with a warning on stderr, so the label stored with
    each lead always matches the directory that is actually scraped.

    Raises:
        ValueError: if ``argv[2]`` is present but is not an integer.
    """
    country = argv[1].strip().upper() if len(argv) > 1 else DEFAULT_COUNTRY
    if country not in COUNTRY_MAP:
        print(f"WARN unsupported country {country!r}, using {DEFAULT_COUNTRY}",
              file=sys.stderr)
        country = DEFAULT_COUNTRY
    max_pages = int(argv[2]) if len(argv) > 2 else DEFAULT_MAX_PAGES
    return country, max_pages


def clean_name(raw: str):
    """Return the trimmed name capped at 200 chars, or None if under 3 chars."""
    name = raw.strip()[:200]
    return name if len(name) >= 3 else None


def absolute_url(href: str) -> str:
    """Prefix site-relative links with the Kompass origin; leave others as-is."""
    return f"{BASE_URL}{href}" if href.startswith('/') else href


def save_lead(conn, cur, name, sector, country, website) -> bool:
    """Insert one lead unless (company_name, country) already exists.

    Returns True when a row was inserted and committed, False on a duplicate.
    Database errors propagate to the caller.
    """
    cur.execute(
        "SELECT 1 FROM admin.weval_leads WHERE company_name=%s AND country=%s",
        (name, country))
    if cur.fetchone():
        return False
    cur.execute(
        """INSERT INTO admin.weval_leads
           (company_name,industry,country,website,source,created_at)
           VALUES (%s,%s,%s,%s,'kompass_pw',NOW())""",
        (name, sector, country, website))
    conn.commit()
    return True


def scrape_sector(page, conn, cur, cc, country, sector, max_pages) -> int:
    """Walk the result pages of one sector and return the number of new leads.

    Paging stops at the first page with no matching elements, or on the first
    page-level error (which is logged).
    """
    added = 0
    for pg in range(1, max_pages + 1):
        url = f"{BASE_URL}/{cc}/searchCompanies?text={sector}&page={pg}"
        try:
            page.goto(url, timeout=15000)
            time.sleep(2)  # fixed pause after navigation before querying

            # NOTE(review): `[class*=company]` matches any element whose class
            # contains "company", which may include wrapper elements - confirm
            # against the live markup.
            cards = page.query_selector_all(
                '.company-card, .companyName, [class*=company]')
            if not cards:
                cards = page.query_selector_all('a[href*="/company/"]')
            if not cards:
                break

            for card in cards:
                try:
                    name = clean_name(card.inner_text())
                    if name is None:
                        continue
                    href = card.get_attribute('href') or ''
                    if save_lead(conn, cur, name, sector, country,
                                 absolute_url(href)):
                        added += 1
                        print(f"+KOMPASS {name} [{sector}] {country}")
                except Exception as exc:
                    # A failed statement leaves the psycopg2 transaction
                    # aborted; roll back so later cards can still be saved.
                    conn.rollback()
                    print(f"ERR {sector} p{pg} card: {exc}")

            print(f"[{sector}] page {pg}: {len(cards)} cards")
        except Exception as e:
            print(f"ERR {sector} p{pg}: {e}")
            break
        time.sleep(2)  # pause between result pages
    return added


def main(argv=None):
    """Scrape every sector for the requested country and print a summary.

    Uses ``sys.argv`` when ``argv`` is None. The browser, cursor and
    connection are released even if scraping raises.
    """
    country, max_pages = parse_args(sys.argv if argv is None else argv)
    cc = COUNTRY_MAP[country]
    added = 0

    conn = psycopg2.connect(**DB)
    try:
        with conn.cursor() as cur, sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                for sector in SECTORS:
                    added += scrape_sector(page, conn, cur, cc, country,
                                           sector, max_pages)
            finally:
                browser.close()
    finally:
        conn.close()

    print(f"KOMPASS_{country}: +{added} companies")


if __name__ == "__main__":
    main()