99 lines
4.5 KiB
Python
99 lines
4.5 KiB
Python
#!/usr/bin/env python3
|
|
"""Playwright B2B Scraper - Charika.ma company directory"""
|
|
import re, sys, time, json, psycopg2, asyncio
|
|
from playwright.async_api import async_playwright
|
|
|
|
# Keyword arguments passed verbatim to psycopg2.connect() by main().
# NOTE(review): the credentials are hard-coded in source; consider loading
# them from the environment or a secrets store instead.
DB = dict(host='10.1.0.3', port=5432, dbname='adx_system', user='admin', password='admin123')


def _parse_batch(argv, default=100):
    """Return the batch size given as the first command-line argument.

    Falls back to *default* when the argument is missing or is not a valid
    integer, so a mistyped argument cannot abort the script at import time.
    """
    if len(argv) < 2:
        return default
    try:
        return int(argv[1])
    except ValueError:
        print("WARN invalid batch size %r, using %d" % (argv[1], default), file=sys.stderr)
        return default


# main() stops scraping once at least this many new companies were inserted.
BATCH = _parse_batch(sys.argv)

# Search keywords; each one is used as the `q` query parameter of the search
# URL and stored as the `industry` of the leads found with it.
SECTORS = [
    'informatique', 'conseil', 'industrie', 'pharmaceutique', 'logistique',
    'banque', 'assurance', 'telecoms', 'energie', 'automobile',
    'agroalimentaire', 'chimie', 'textile', 'construction', 'immobilier',
]
|
|
|
|
# Executive title followed by an optional colon and the person's name.
# Accented spellings ("Gérant", "Président") are accepted, "Directeur Général"
# is consumed as a whole so that "Général" is not captured as part of the
# name, and the letter-only lookarounds keep "dg" from matching inside words.
_CONTACT_RE = re.compile(
    r"(?<![^\W\d_])"
    r"(?:direct(?:eur|rice)(?:\s+g[ée]n[ée]rale?)?|g[ée]rante?|pr[ée]sidente?|dg)"
    r"(?![^\W\d_])"
    r"\s*:?\s*(.+)",
    re.I,
)

# (stored city name, pattern) pairs in priority order. Patterns are bounded by
# non-letters so that e.g. "fes" does not match inside "professionnel", and
# they accept the accented spellings used in French text.
_CITY_PATTERNS = tuple(
    (name, re.compile(r"(?<![^\W\d_])%s(?![^\W\d_])" % pattern, re.I))
    for name, pattern in (
        ('Casablanca', 'casablanca'),
        ('Rabat', 'rabat'),
        ('Tanger', 'tanger'),
        ('Marrakech', 'marrakech'),
        ('Fes', 'f[eè]s'),
        ('Agadir', 'agadir'),
        ('Meknes', 'mekn[eè]s'),
        ('Oujda', 'oujda'),
        ('Kenitra', 'k[eé]nitra'),
    )
)


def _extract_contact(lines):
    """Return the name following the first executive title in *lines*, or ''."""
    for line in lines:
        match = _CONTACT_RE.search(line)
        if match:
            name = match.group(1).strip()
            if name:
                return name[:100]
    return ''


def _detect_city(text):
    """Return the first known city (in priority order) named in *text*, or ''."""
    for name, pattern in _CITY_PATTERNS:
        if pattern.search(text):
            return name
    return ''


async def _save_card(card, sector, conn, cur):
    """Parse one result card and insert it as a lead; return True if inserted."""
    text = await card.inner_text()
    href = await card.get_attribute('href') or ''

    lines = [ln.strip() for ln in text.split('\n') if ln.strip()]
    if not lines:
        return False

    # The first non-empty line of the card is taken as the company name.
    company_name = lines[0][:100]
    contact_name = _extract_contact(lines)
    city = _detect_city(text)

    # Dedup on company name against rows already stored.
    cur.execute("SELECT 1 FROM admin.weval_leads WHERE company_name=%s LIMIT 1", (company_name,))
    if cur.fetchone():
        return False

    # Only hrefs that contain the site's domain are kept as the website.
    website = href if 'charika.ma' in href else ''

    cur.execute(
        """INSERT INTO admin.weval_leads
        (company_name, contact_name, contact_title, industry, city, country, website, source, created_at)
        VALUES (%s,%s,%s,%s,%s,'MA',%s,'charika_scraper',NOW())""",
        (company_name, contact_name, 'Directeur General' if contact_name else '', sector, city, website),
    )
    conn.commit()
    print("+COMPANY %s | %s | %s" % (company_name, contact_name or '?', city or '?'))
    return True


async def main():
    """Scrape charika.ma search results and store new companies as leads.

    For each keyword in SECTORS, up to five result pages are loaded in a
    headless Chromium browser. Every result card whose company name is not
    yet present in admin.weval_leads is inserted and committed individually.
    Scraping stops as soon as BATCH new companies have been inserted.

    A failure on one card or one page is printed and skipped. The database
    connection and the browser are always closed, even when an unexpected
    error aborts the run.
    """
    conn = psycopg2.connect(**DB)
    cur = conn.cursor()
    total_new = 0

    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                page = await browser.new_page()
                await page.set_extra_http_headers({'Accept-Language': 'fr-FR,fr;q=0.9'})

                for sector in SECTORS:
                    if total_new >= BATCH:
                        break

                    for pg in range(1, 6):
                        # Enforce the batch limit per page, not only per sector.
                        if total_new >= BATCH:
                            break

                        url = "https://www.charika.ma/recherche?q=%s&page=%d" % (sector, pg)
                        try:
                            await page.goto(url, timeout=15000)
                            # Give client-side rendering time to finish.
                            await page.wait_for_timeout(2000)

                            # Extract company cards; fall back to bare company links.
                            cards = await page.query_selector_all('.company-card, .result-item, article')
                            if not cards:
                                cards = await page.query_selector_all('a[href*="/societe/"]')

                            for card in cards:
                                if total_new >= BATCH:
                                    break
                                try:
                                    if await _save_card(card, sector, conn, cur):
                                        total_new += 1
                                except Exception as e:
                                    # Discard the failed transaction and keep
                                    # going, but do not hide the error.
                                    conn.rollback()
                                    print("ERR card: %s" % e)

                            # Pause between result pages.
                            await page.wait_for_timeout(3000)
                        except Exception as e:
                            print("ERR page %s: %s" % (url[:50], e))
                            # Back off without relying on the (possibly broken) page.
                            await asyncio.sleep(5)
            finally:
                await browser.close()
    finally:
        cur.close()
        conn.close()

    print("\nCHARIKA: +%d companies" % total_new)
|
|
|
|
# Run the scraper only when executed as a script, so that importing this
# module does not launch a browser or open a database connection.
if __name__ == "__main__":
    asyncio.run(main())
|