88 lines
4.0 KiB
Python
88 lines
4.0 KiB
Python
#!/usr/bin/env python3
|
|
import re
import sys
import time
from urllib.parse import urlencode

import psycopg2
from playwright.sync_api import sync_playwright

# NOTE(security): database credentials are hard-coded in source. The values
# are left unchanged so the script keeps working as before, but they should be
# moved to environment variables / a secrets store and rotated.
DB = dict(host="10.1.0.3", dbname="adx_system", user="admin", password="admin123")

MA_CITIES = ["casablanca","rabat","marrakech","fes","tanger","agadir","meknes","oujda","kenitra","tetouan","safi","nador","beni-mellal","sale","temara","mohammedia","el-jadida"]
TN_CITIES = ["tunis","sfax","sousse","bizerte","kairouan","gabes","monastir","nabeul","ariana","medenine","mahdia"]
specs = ["medecin","cardiologue","dentiste","pharmacien","pediatre","gynecologue","ophtalmologue","dermatologue"]

# Supported country codes -> (country name used in the search query, cities).
COUNTRIES = {
    "ma": ("maroc", MA_CITIES),
    "tn": ("tunisie", TN_CITIES),
}

DEFAULT_COUNTRY = "ma"
DEFAULT_BATCH = 200
MAX_RESULTS_PER_QUERY = 5   # only the first few extracted names are considered
QUERY_PAUSE_SECONDS = 3     # pause between two search queries
EMAIL_BLACKLIST = ("google", "example", "facebook", "twitter")
USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36"

TITLE_RE = re.compile(r'^(Dr|Pr|Docteur)\.?\s*')
PHONE_SEPARATOR_RE = re.compile(r'[\s.-]')

SELECT_EXISTING_SQL = "SELECT 1 FROM ethica.medecins_validated WHERE LOWER(TRIM(nom))=LOWER(TRIM(%s)) AND LOWER(TRIM(prenom))=LOWER(TRIM(%s)) AND pays=%s LIMIT 1"
INSERT_SQL = "INSERT INTO ethica.medecins_validated (nom,prenom,specialite,ville,pays,email,telephone,source,created_at) VALUES(%s,%s,%s,%s,%s,%s,%s,'playwright_gmaps',NOW())"

# JavaScript evaluated in the result page. For every result element it looks
# for "Dr/Pr/Docteur <Name>" matches and pairs each name with the first phone
# number and first e-mail address found in the same element.
# Raw string: the regex backslashes must reach the browser untouched.
EXTRACT_JS = r"""() => {
    const data = [];
    document.querySelectorAll('.g, [data-hveid]').forEach(el => {
        const text = el.innerText || '';
        // Find Dr/Pr names
        const names = text.match(/(?:Dr|Pr|Docteur)\.?\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2})/g) || [];
        const phones = text.match(/(?:\+212|\+216|0)[0-9 .-]{8,12}/g) || [];
        const emails = text.match(/[\w.+-]+@[\w.-]+\.[a-z]{2,}/gi) || [];
        names.forEach(n => data.push({name: n, phone: phones[0]||'', email: emails[0]||''}));
    });
    return data;
}"""


def parse_args(argv):
    """Return ``(country, batch)`` parsed from a ``sys.argv``-style list.

    ``argv[1]`` is the country code (default ``"ma"``), matched
    case-insensitively against ``COUNTRIES``; ``argv[2]`` is the maximum
    number of rows to insert (default 200).

    Raises:
        ValueError: if the country is not supported or the batch size is not
            an integer.
    """
    country = argv[1].strip().lower() if len(argv) > 1 else DEFAULT_COUNTRY
    if country not in COUNTRIES:
        supported = ", ".join(sorted(COUNTRIES))
        raise ValueError(f"unsupported country {argv[1]!r} (expected one of: {supported})")
    batch = int(argv[2]) if len(argv) > 2 else DEFAULT_BATCH
    return country, batch


def build_search_url(spec, city, pays_name):
    """Return the Google search URL for one specialty/city pair, URL-encoded."""
    query = f"{spec} {city} {pays_name} telephone email"
    return "https://www.google.com/search?" + urlencode({"q": query})


def split_name(raw):
    """Split a matched name such as ``"Dr. Ahmed Benali"`` into ``(nom, prenom)``.

    The leading title is removed; the first word becomes the first name and
    the remainder the upper-cased family name. A single-word name yields an
    empty first name. Returns ``None`` when nothing usable is left or the
    first word is shorter than two characters.
    """
    parts = TITLE_RE.sub('', raw).strip().split(None, 1)
    if not parts or len(parts[0]) < 2:
        return None
    # Matches can span line breaks in the page text; collapse inner whitespace.
    nom = " ".join(parts[-1].split()).upper()
    prenom = parts[0].title() if len(parts) > 1 else ""
    return nom, prenom


def clean_contact(email, phone):
    """Return ``(email, phone)`` normalised for storage, ``None`` when empty.

    Spaces, dots and dashes are stripped from the phone number; e-mail
    addresses containing a blacklisted word are discarded.
    """
    email = email or ""
    phone = PHONE_SEPARATOR_RE.sub('', phone or "")
    if email and any(word in email.lower() for word in EMAIL_BLACKLIST):
        email = ""
    return email or None, phone or None


def fetch_results(ctx, spec, city, pays_name):
    """Run one search in a fresh page and return the extracted records.

    The page is always closed, even when navigation or evaluation fails.
    """
    page = ctx.new_page()
    try:
        page.goto(build_search_url(spec, city, pays_name), timeout=10000)
        page.wait_for_timeout(2000)
        return page.evaluate(EXTRACT_JS)
    finally:
        page.close()


def insert_if_new(conn, cur, nom, prenom, spec, city, pays_code, email, phone):
    """Insert one doctor unless the same name already exists for the country.

    Returns ``True`` when a row was inserted and committed. On a database
    error the transaction is rolled back, a warning is printed to stderr and
    ``False`` is returned.
    """
    try:
        cur.execute(SELECT_EXISTING_SQL, (nom, prenom, pays_code))
        if cur.fetchone():
            return False
        cur.execute(INSERT_SQL, (nom, prenom, spec, city.title(), pays_code, email, phone))
        conn.commit()
        return True
    except psycopg2.Error as err:
        # Roll back so the connection is usable for the next record.
        conn.rollback()
        print(f"[warn] insert failed for {nom} {prenom}: {err}", file=sys.stderr)
        return False


def scrape_city(ctx, conn, cur, spec, city, pays_name, pays_code, limit):
    """Scrape one specialty/city query and return the number of rows inserted.

    At most ``limit`` rows are inserted so the overall batch size is never
    exceeded. Scraping failures are reported on stderr and count as zero.
    """
    try:
        results = fetch_results(ctx, spec, city, pays_name)
    except Exception as err:  # best-effort: one failed query must not stop the run
        print(f"[warn] {spec}/{city}: {err}", file=sys.stderr)
        return 0

    inserted = 0
    for record in results[:MAX_RESULTS_PER_QUERY]:
        if inserted >= limit:
            break
        parsed = split_name(record['name'])
        if parsed is None:
            continue
        nom, prenom = parsed
        email, phone = clean_contact(record.get('email', ''), record.get('phone', ''))
        if insert_if_new(conn, cur, nom, prenom, spec, city, pays_code, email, phone):
            inserted += 1
    return inserted


def main(argv=None):
    """Scrape doctor contacts from Google search results into the database.

    Usage: ``script [ma|tn] [batch]``. Iterates over every specialty/city
    pair of the selected country until ``batch`` rows have been inserted,
    printing cumulative progress per specialty and a final summary line.
    """
    argv = sys.argv if argv is None else argv
    try:
        country, batch = parse_args(argv)
    except ValueError as err:
        sys.exit(f"usage: {argv[0] if argv else 'script'} [ma|tn] [batch] -- {err}")

    pays_code = country.upper()
    pays_name, cities = COUNTRIES[country]
    total = 0

    conn = psycopg2.connect(**DB)
    try:
        with conn.cursor() as cur, sync_playwright() as p:
            browser = p.chromium.launch(headless=True, args=["--no-sandbox","--disable-dev-shm-usage"])
            try:
                ctx = browser.new_context(user_agent=USER_AGENT)
                for spec in specs:
                    for city in cities:
                        if total >= batch:
                            break
                        total += scrape_city(ctx, conn, cur, spec, city,
                                             pays_name, pays_code, batch - total)
                        # Always pause, including after a failed query, so
                        # errors do not turn into a rapid-fire retry loop.
                        time.sleep(QUERY_PAUSE_SECONDS)
                    print(f"[{spec}] +{total} ({pays_code})")
                    if total >= batch:
                        break
            finally:
                browser.close()
    finally:
        conn.close()

    print(f"GMAPS_{pays_code}: +{total}")


if __name__ == "__main__":
    main()