Files
html/api/pw_b2b_charika.py
2026-04-12 22:57:03 +02:00

99 lines
4.5 KiB
Python

#!/usr/bin/env python3
"""Playwright B2B Scraper - Charika.ma company directory"""
# NOTE: `time` and `json` appear unused in this view of the file; kept in case
# a part of the file outside this chunk relies on them.
import re, sys, time, json, psycopg2, asyncio
from playwright.async_api import async_playwright
# NOTE(review): hard-coded DB credentials in source — move to environment
# variables or a secrets store before this leaves a trusted network.
DB = dict(host='10.1.0.3', port=5432, dbname='adx_system', user='admin', password='admin123')
# Maximum number of NEW companies to insert this run (first CLI arg, default 100).
BATCH = int(sys.argv[1]) if len(sys.argv) > 1 else 100
# Sector keywords fed one at a time into the charika.ma directory search.
SECTORS = [
'informatique', 'conseil', 'industrie', 'pharmaceutique', 'logistique',
'banque', 'assurance', 'telecoms', 'energie', 'automobile',
'agroalimentaire', 'chimie', 'textile', 'construction', 'immobilier',
]
async def main():
conn = psycopg2.connect(**DB)
cur = conn.cursor()
total_new = 0
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
page = await browser.new_page()
await page.set_extra_http_headers({'Accept-Language': 'fr-FR,fr;q=0.9'})
for sector in SECTORS:
if total_new >= BATCH:
break
for pg in range(1, 6):
url = "https://www.charika.ma/recherche?q=%s&page=%d" % (sector, pg)
try:
await page.goto(url, timeout=15000)
await page.wait_for_timeout(2000)
# Extract company cards
cards = await page.query_selector_all('.company-card, .result-item, article')
if not cards:
cards = await page.query_selector_all('a[href*="/societe/"]')
for card in cards:
try:
text = await card.inner_text()
href = await card.get_attribute('href') or ''
# Extract company name
lines = [l.strip() for l in text.split('\n') if l.strip()]
if not lines:
continue
company_name = lines[0][:100]
# Extract DG/Director name if visible
contact_name = ''
for line in lines:
if any(k in line.lower() for k in ['directeur', 'gerant', 'president', 'dg ']):
m = re.search(r'(?:directeur|gerant|president|dg)\s*:?\s*(.+)', line, re.I)
if m:
contact_name = m.group(1).strip()[:100]
break
# City extraction
city = ''
for ct in ['casablanca', 'rabat', 'tanger', 'marrakech', 'fes', 'agadir', 'meknes', 'oujda', 'kenitra']:
if ct in text.lower():
city = ct.capitalize()
break
# Dedup
cur.execute("SELECT 1 FROM admin.weval_leads WHERE company_name=%s LIMIT 1", (company_name,))
if cur.fetchone():
continue
website = ''
if 'charika.ma' in href:
website = href
cur.execute("""INSERT INTO admin.weval_leads
(company_name, contact_name, contact_title, industry, city, country, website, source, created_at)
VALUES (%s,%s,%s,%s,%s,'MA',%s,'charika_scraper',NOW())""",
(company_name, contact_name, 'Directeur General' if contact_name else '', sector, city, website))
conn.commit()
total_new += 1
print("+COMPANY %s | %s | %s" % (company_name, contact_name or '?', city or '?'))
except Exception as e:
conn.rollback()
await page.wait_for_timeout(3000)
except Exception as e:
print("ERR page %s: %s" % (url[:50], e))
await page.wait_for_timeout(5000)
await browser.close()
cur.close()
conn.close()
print("\nCHARIKA: +%d companies" % total_new)
asyncio.run(main())