Files
weval-l99/ethica-scraper-v3.py
2026-04-15 01:38:46 +02:00

94 lines
4.1 KiB
Python

#!/usr/bin/env python3
"""ETHICA v3 - Improved regex, all cities, pagination, multi-keywords"""
import json, re, time, sys, urllib.request, urllib.parse
import psycopg2
# Postgres DSN. NOTE(review): credentials are hard-coded in source — move them
# to environment variables or a secrets store before this file is shared.
DB = "host=10.1.0.3 dbname=adx_system user=admin password=admin123"
# Local SearX metasearch endpoint queried by search().
SEARX = "http://localhost:8080/search"
# Number of specialties processed this run (first CLI argument, default 5).
BATCH = int(sys.argv[1]) if len(sys.argv) > 1 else 5
# Append-only log file; writes to it are best-effort (failures are ignored).
LOG = "/opt/weval-l99/logs/ethica-scraper-v3.log"
# Result pages fetched per query.
PAGES = 3
# Medical specialties (French, unaccented to match search-engine queries).
SPECS = ["generaliste","cardiologue","pediatre","dentiste","gastro-enterologue","pneumologue","gynecologue","orthopediste","dermatologue","ophtalmologue","neurologue","urologue","nephrologue","rhumatologue","endocrinologue","chirurgien","radiologue","anesthesiste"]
# Tunisian cities searched.
VTN = ["tunis","sfax","sousse","bizerte","gabes","ariana","monastir","kairouan","nabeul"]
# Moroccan cities searched.
VMA = ["casablanca","rabat","marrakech","fes","tanger","agadir","meknes","oujda"]
# Extra keywords appended to each query to surface contact details.
KW = ["telephone","cabinet medical"]
def log(msg):
    """Print *msg* with a timestamp and best-effort append it to LOG.

    Logging must never abort the scraper: stdout always receives the
    line, and file-system errors on the log file are ignored.
    """
    line = "[%s] V3: %s" % (time.strftime("%Y-%m-%d %H:%M:%S"), msg)
    print(line, flush=True)
    try:
        # Context manager guarantees the handle is closed even if write() fails
        # (the original leaked the handle on a write error).
        with open(LOG, "a") as fh:
            fh.write(line + "\n")
    except OSError:
        # Log dir may be missing or unwritable; narrow catch instead of a bare
        # `except:` so KeyboardInterrupt/SystemExit still propagate.
        pass
def search(query, page=1):
    """Query the local SearX instance and extract doctor leads.

    Parameters:
        query: free-text search string.
        page:  1-based SearX result page number.

    Returns a list of dicts with keys "nom", "source_url", "email",
    "telephone". Any failure (network, JSON, etc.) is logged and an
    empty list is returned.
    """
    try:
        url = "%s?q=%s&format=json&pageno=%d" % (SEARX, urllib.parse.quote(query), page)
        req = urllib.request.Request(url, headers={"User-Agent": "EthicaBot/3.0"})
        # `with` closes the HTTP response deterministically; the original
        # leaked the socket until garbage collection.
        with urllib.request.urlopen(req, timeout=10) as r:
            data = json.loads(r.read())
        results = []
        for res in data.get("results", []):
            title = res.get("title", "")
            content = res.get("content", "")
            src_url = res.get("url", "")
            full = title + " " + content
            nom = None
            # Mixed-case form: "Dr Jean Dupont" (up to 4 name tokens).
            m = re.search(r"(?:Dr|Docteur|Pr|Prof)\.?\s+([A-Z][a-zA-Z]+(?:\s+[A-Za-z]+){0,3})", full)
            if not m:
                # All-caps fallback: "DR JEAN DUPONT".
                m = re.search(r"(?:DR|DOCTEUR|PR)\.?\s+([A-Z]{2,}(?:\s+[A-Z]{2,}){0,3})", full)
            if m:
                nom = m.group(1).strip()[:100]
            emails = re.findall(r"[\w.+-]+@[\w-]+\.[\w.]+", content)
            # Tunisian (+216) / Moroccan (+212) / national (0...) numbers,
            # with optional space/dot/dash separators.
            phones = re.findall(r"(?:\+216|\+212|0)[\s.-]?[0-9]{2}[\s.-]?[0-9]{3}[\s.-]?[0-9]{3}", content)
            if nom and len(nom) > 3:
                phone_clean = re.sub(r"[\s.\-]", "", phones[0]) if phones else None
                results.append({"nom": nom, "source_url": src_url, "email": emails[0] if emails else None, "telephone": phone_clean})
        return results
    except Exception as e:
        log(" ERR: %s" % e)
        return []
def upsert(conn, nom, spec, pays, source, url, email, phone):
    """Insert one doctor row, merging contact info on (nom, pays) conflict.

    COALESCE keeps any email/telephone already stored, so existing contact
    data is never overwritten by NULLs from a later scrape.

    Returns True after a successful commit, False after rollback on any
    database error.
    """
    cur = conn.cursor()
    try:
        cur.execute("INSERT INTO ethica.medecins_real (nom,specialite,pays,source,source_url,email,telephone,created_at) VALUES (%s,%s,%s,%s,%s,%s,%s,NOW()) ON CONFLICT (nom,pays) DO UPDATE SET email=COALESCE(ethica.medecins_real.email,EXCLUDED.email), telephone=COALESCE(ethica.medecins_real.telephone,EXCLUDED.telephone)", (nom, spec, pays, source, url, email, phone))
        conn.commit()
        return True
    except Exception:
        # Narrowed from a bare `except:`, which also trapped
        # KeyboardInterrupt/SystemExit and made the loop unstoppable.
        conn.rollback()
        return False
    finally:
        # Original never closed the cursor; release it on both paths.
        cur.close()
def run_country(conn, specs, villes, pays, source, country_name):
    """Scrape every (specialty, city, keyword, page) combination for one country.

    Parameters:
        conn:         open psycopg2 connection.
        specs:        specialties to query.
        villes:       city names to query.
        pays:         2-letter country code stored in the DB.
        source:       provenance tag stored in the DB.
        country_name: country word appended to each query string.

    Returns (total_results_seen, total_rows_inserted).
    """
    total = 0
    inserted = 0
    for spec in specs:
        spec_found = 0
        spec_new = 0
        for ville in villes:
            for kw in KW:
                # Query text does not depend on the page number; build it once.
                q = "%s %s %s %s" % (spec, ville, country_name, kw)
                for pg in range(1, PAGES + 1):
                    results = search(q, pg)
                    for r in results:
                        if upsert(conn, r["nom"], spec, pays, source, r["source_url"], r["email"], r["telephone"]):
                            inserted += 1
                            spec_new += 1
                    spec_found += len(results)
                    total += len(results)
                    # Throttle so we don't hammer the SearX instance.
                    time.sleep(1.5)
        # BUG FIX: the original logged the cumulative `inserted` counter here,
        # so each specialty's "(N new)" figure was wrong; report per-spec.
        log(" %s %s: %d found (%d new)" % (pays, spec, spec_found, spec_new))
    return total, inserted
def main():
    """Entry point: scrape Tunisia, then Morocco, and log combined totals."""
    conn = psycopg2.connect(DB)
    log("START v3 batch=%d pages=%d cities=TN:%d+MA:%d" % (BATCH, PAGES, len(VTN), len(VMA)))
    tn_total, tn_new = run_country(conn, SPECS[:BATCH], VTN, "TN", "cnam_searx_v3", "tunisie")
    ma_total, ma_new = run_country(conn, SPECS[:BATCH], VMA, "MA", "cromc_searx_v3", "maroc")
    log("DONE TN=%d(%d new) MA=%d(%d new) total=%d inserted=%d" % (tn_total, tn_new, ma_total, ma_new, tn_total + ma_total, tn_new + ma_new))
    conn.close()


if __name__ == "__main__":
    main()