Files
html/api/pw_gmaps.py
2026-04-12 22:57:03 +02:00

88 lines
4.0 KiB
Python

#!/usr/bin/env python3
import re, sys, time, psycopg2
from playwright.sync_api import sync_playwright
from urllib.parse import quote_plus

# Scrape Google search result pages for doctor names (plus any phone number /
# email found in the same result element) and insert the ones not already
# present into ethica.medecins_validated.
#
# Usage: pw_gmaps.py [country] [batch]
#   country  "ma" or "tn" (case-insensitive); defaults to "ma"
#   batch    maximum number of rows to insert in this run; defaults to 200

# NOTE(review): database credentials are hard-coded in source; consider
# loading them from the environment or a secrets store.
DB = dict(host="10.1.0.3", dbname="adx_system", user="admin", password="admin123")

MA_CITIES = ["casablanca","rabat","marrakech","fes","tanger","agadir","meknes","oujda","kenitra","tetouan","safi","nador","beni-mellal","sale","temara","mohammedia","el-jadida"]
TN_CITIES = ["tunis","sfax","sousse","bizerte","kairouan","gabes","monastir","nabeul","ariana","medenine","mahdia"]

# country code -> (country name used in the search query, cities to search)
COUNTRIES = {
    "ma": ("maroc", MA_CITIES),
    "tn": ("tunisie", TN_CITIES),
}

SPECS = ["medecin","cardiologue","dentiste","pharmacien","pediatre","gynecologue","ophtalmologue","dermatologue"]

# Emails containing any of these substrings are discarded.
EMAIL_BLACKLIST = ("google", "example", "facebook", "twitter")

USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36"

# At most this many extracted entries are considered per search page.
MAX_RESULTS_PER_PAGE = 5

TITLE_PREFIX_RE = re.compile(r'^(Dr|Pr|Docteur)\.?\s*')
PHONE_SEPARATORS_RE = re.compile(r'[\s.-]')

SELECT_EXISTING_SQL = (
    "SELECT 1 FROM ethica.medecins_validated "
    "WHERE LOWER(TRIM(nom))=LOWER(TRIM(%s)) AND LOWER(TRIM(prenom))=LOWER(TRIM(%s)) "
    "AND pays=%s LIMIT 1"
)
INSERT_SQL = (
    "INSERT INTO ethica.medecins_validated "
    "(nom,prenom,specialite,ville,pays,email,telephone,source,created_at) "
    "VALUES(%s,%s,%s,%s,%s,%s,%s,'playwright_gmaps',NOW())"
)

# Runs in the page. A raw string is used so the regex backslashes reach the
# browser unchanged without relying on Python's invalid-escape fallback.
EXTRACT_JS = r"""() => {
    const data = [];
    document.querySelectorAll('.g, [data-hveid]').forEach(el => {
        const text = el.innerText || '';
        // Find Dr/Pr names
        const names = text.match(/(?:Dr|Pr|Docteur)\.?\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2})/g) || [];
        const phones = text.match(/(?:\+212|\+216|0)[0-9 .-]{8,12}/g) || [];
        const emails = text.match(/[\w.+-]+@[\w.-]+\.[a-z]{2,}/gi) || [];
        names.forEach(n => data.push({name: n, phone: phones[0]||'', email: emails[0]||''}));
    });
    return data;
}"""


def parse_args(argv):
    """Return ``(country, batch)`` parsed from a ``sys.argv``-style list.

    The country code is lower-cased; an unknown code aborts with SystemExit
    instead of silently scraping Tunisian cities under the wrong country code.
    A non-numeric batch raises ValueError.
    """
    country = argv[1].strip().lower() if len(argv) > 1 else "ma"
    if country not in COUNTRIES:
        raise SystemExit(
            f"unknown country code {country!r}; expected one of: {', '.join(sorted(COUNTRIES))}"
        )
    batch = int(argv[2]) if len(argv) > 2 else 200
    return country, batch


def build_search_url(spec, city, pays_name):
    """Return the Google search URL for one specialty/city, with the query URL-encoded."""
    q = f"{spec} {city} {pays_name} telephone email"
    return f"https://www.google.com/search?q={quote_plus(q)}"


def split_name(raw):
    """Split a scraped "Dr First Last" string into ``(nom, prenom)``.

    The leading title is removed, the first word becomes the first name
    (title-cased) and the remainder the upper-cased surname. Returns None when
    nothing usable is left or the first word is shorter than two characters.
    """
    name = TITLE_PREFIX_RE.sub('', raw).strip()
    parts = name.split(None, 1)
    if not parts or len(parts[0]) < 2:
        return None
    nom = parts[-1].upper()
    prenom = parts[0].title() if len(parts) > 1 else ""
    return nom, prenom


def clean_contact(email, phone):
    """Return ``(email, phone)`` ready for insertion, with None for missing values.

    Blacklisted emails are dropped; whitespace, dots and hyphens are removed
    from the phone number.
    """
    email = email or ""
    if email and any(word in email.lower() for word in EMAIL_BLACKLIST):
        email = ""
    phone = PHONE_SEPARATORS_RE.sub('', phone or "")
    return email or None, phone or None


def fetch_results(ctx, spec, city, pays_name):
    """Open a page in ``ctx``, run the search and return the extracted entries.

    The page is closed whether or not navigation/extraction succeeds.
    """
    page = ctx.new_page()
    try:
        page.goto(build_search_url(spec, city, pays_name), timeout=10000)
        page.wait_for_timeout(2000)
        return page.evaluate(EXTRACT_JS)
    finally:
        page.close()


def store_doctor(conn, cur, result, spec, city, pays_code):
    """Insert one extracted entry unless it is unusable or already stored.

    Returns True when a row was inserted and committed, False otherwise.
    """
    parsed = split_name(result['name'])
    if parsed is None:
        return False
    nom, prenom = parsed
    cur.execute(SELECT_EXISTING_SQL, (nom, prenom, pays_code))
    if cur.fetchone():
        return False
    email, phone = clean_contact(result.get('email', ''), result.get('phone', ''))
    try:
        cur.execute(INSERT_SQL, (nom, prenom, spec, city.title(), pays_code, email, phone))
        conn.commit()
    except psycopg2.Error as exc:
        conn.rollback()
        print(f"insert failed for {nom} {prenom}: {exc}", file=sys.stderr)
        return False
    return True


def scrape(ctx, conn, cur, cities, pays_name, pays_code, batch):
    """Search every specialty/city pair until ``batch`` rows are inserted.

    Returns the number of rows inserted. A failure on one search is reported
    on stderr and the run continues with the next city.
    """
    total = 0
    for spec in SPECS:
        for city in cities:
            if total >= batch:
                break
            try:
                results = fetch_results(ctx, spec, city, pays_name)
                for result in results[:MAX_RESULTS_PER_PAGE]:
                    # Re-check per row so one page cannot overshoot the batch.
                    if total >= batch:
                        break
                    if store_doctor(conn, cur, result, spec, city, pays_code):
                        total += 1
                # Pause between searches.
                time.sleep(3)
            except Exception as exc:
                # Reset the transaction: after a failed statement PostgreSQL
                # rejects every further command until a rollback.
                conn.rollback()
                print(f"[{spec}/{city}] skipped: {exc}", file=sys.stderr)
        if total >= batch:
            break
        print(f"[{spec}] +{total} ({pays_code})")
    return total


def main():
    """Command-line entry point: parse arguments, scrape, print the summary line."""
    country, batch = parse_args(sys.argv)
    pays_code = country.upper()
    pays_name, cities = COUNTRIES[country]

    conn = psycopg2.connect(**DB)
    try:
        cur = conn.cursor()
        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(headless=True, args=["--no-sandbox","--disable-dev-shm-usage"])
                try:
                    ctx = browser.new_context(user_agent=USER_AGENT)
                    total = scrape(ctx, conn, cur, cities, pays_name, pays_code, batch)
                finally:
                    browser.close()
        finally:
            cur.close()
    finally:
        conn.close()
    print(f"GMAPS_{pays_code}: +{total}")


if __name__ == "__main__":
    main()