94 lines
4.1 KiB
Python
94 lines
4.1 KiB
Python
#!/usr/bin/env python3
|
|
"""ETHICA v3 - Improved regex, all cities, pagination, multi-keywords"""
|
|
import json, re, time, sys, urllib.request, urllib.parse
|
|
import psycopg2
|
|
|
|
# --- Configuration ----------------------------------------------------------
# NOTE(review): DB credentials are hard-coded in source (admin/admin123).
# Move them to environment variables or a secrets store — TODO confirm with ops.
DB = "host=10.1.0.3 dbname=adx_system user=admin password=admin123"

# Local SearXNG instance used as the search backend (JSON API).
SEARX = "http://localhost:8080/search"

# How many specialties from SPECS to process this run; optional first CLI arg.
BATCH = int(sys.argv[1]) if len(sys.argv) > 1 else 5

# Append-only log file; writes are best-effort (see log()).
LOG = "/opt/weval-l99/logs/ethica-scraper-v3.log"

# Result pages fetched per query.
PAGES = 3

# Medical specialties (French, unaccented) used verbatim in search queries.
SPECS = ["generaliste","cardiologue","pediatre","dentiste","gastro-enterologue","pneumologue","gynecologue","orthopediste","dermatologue","ophtalmologue","neurologue","urologue","nephrologue","rhumatologue","endocrinologue","chirurgien","radiologue","anesthesiste"]

# Cities searched per country: Tunisia (VTN) and Morocco (VMA).
VTN = ["tunis","sfax","sousse","bizerte","gabes","ariana","monastir","kairouan","nabeul"]

VMA = ["casablanca","rabat","marrakech","fes","tanger","agadir","meknes","oujda"]

# Extra keywords appended to each query to surface contact details.
KW = ["telephone","cabinet medical"]
def log(msg):
    """Print a timestamped message and best-effort append it to LOG.

    Logging must never interrupt a scrape run, so failures to write the
    log file (missing directory, permissions, disk full, ...) are ignored;
    stdout still carries the message.
    """
    line = "[%s] V3: %s" % (time.strftime("%Y-%m-%d %H:%M:%S"), msg)
    print(line, flush=True)
    try:
        # Context manager guarantees the handle is closed even if write fails
        # (the original leaked the handle on a write error).
        with open(LOG, "a") as fh:
            fh.write(line + "\n")
    except Exception:
        # Deliberate best-effort: narrowed from a bare `except:` so that
        # KeyboardInterrupt/SystemExit still propagate.
        pass
# Compiled once at import time; the original rebuilt each pattern per result.
# Name patterns: mixed-case ("Dr Foo Bar") and an all-caps fallback ("DR FOO").
_NAME_RE = re.compile(r"(?:Dr|Docteur|Pr|Prof)\.?\s+([A-Z][a-zA-Z]+(?:\s+[A-Za-z]+){0,3})")
_NAME_UPPER_RE = re.compile(r"(?:DR|DOCTEUR|PR)\.?\s+([A-Z]{2,}(?:\s+[A-Z]{2,}){0,3})")
_EMAIL_RE = re.compile(r"[\w.+-]+@[\w-]+\.[\w.]+")
# Tunisian (+216), Moroccan (+212) or national (0...) numbers, optional separators.
_PHONE_RE = re.compile(r"(?:\+216|\+212|0)[\s.-]?[0-9]{2}[\s.-]?[0-9]{3}[\s.-]?[0-9]{3}")


def _extract_contact(res):
    """Build a contact dict from one SearXNG result, or return None.

    A result is kept only when a doctor-style name (> 3 chars) is found in
    the title/content; email and phone are scanned in the content only,
    matching the original behavior.
    """
    title = res.get("title", "")
    content = res.get("content", "")
    full = title + " " + content

    m = _NAME_RE.search(full) or _NAME_UPPER_RE.search(full)
    if not m:
        return None
    nom = m.group(1).strip()[:100]
    if len(nom) <= 3:
        # Too short to be a plausible name — same threshold as before.
        return None

    emails = _EMAIL_RE.findall(content)
    phones = _PHONE_RE.findall(content)
    # Normalize the first phone by stripping separator characters.
    phone_clean = re.sub(r"[\s.\-]", "", phones[0]) if phones else None
    return {
        "nom": nom,
        "source_url": res.get("url", ""),
        "email": emails[0] if emails else None,
        "telephone": phone_clean,
    }


def search(query, page=1):
    """Query the local SearXNG instance and extract contact records.

    Returns a list of dicts with keys nom / source_url / email / telephone.
    Any network or parse error is logged and yields an empty list, so a
    single failed query never aborts the run.
    """
    try:
        url = "%s?q=%s&format=json&pageno=%d" % (SEARX, urllib.parse.quote(query), page)
        req = urllib.request.Request(url, headers={"User-Agent": "EthicaBot/3.0"})
        # `with` closes the HTTP response deterministically (original leaked it).
        with urllib.request.urlopen(req, timeout=10) as resp:
            data = json.loads(resp.read())
        results = []
        for res in data.get("results", []):
            rec = _extract_contact(res)
            if rec is not None:
                results.append(rec)
        return results
    except Exception as e:
        log(" ERR: %s" % e)
        return []
def upsert(conn, nom, spec, pays, source, url, email, phone):
    """Insert one doctor row, merging contact fields on (nom, pays) conflict.

    COALESCE keeps any existing email/telephone and only fills gaps with the
    newly scraped values. Returns True when committed, False after rollback.
    """
    cur = conn.cursor()
    try:
        # String pieces concatenate to exactly the original SQL statement.
        cur.execute(
            "INSERT INTO ethica.medecins_real "
            "(nom,specialite,pays,source,source_url,email,telephone,created_at) "
            "VALUES (%s,%s,%s,%s,%s,%s,%s,NOW()) "
            "ON CONFLICT (nom,pays) DO UPDATE SET "
            "email=COALESCE(ethica.medecins_real.email,EXCLUDED.email), "
            "telephone=COALESCE(ethica.medecins_real.telephone,EXCLUDED.telephone)",
            (nom, spec, pays, source, url, email, phone),
        )
        conn.commit()
        return True
    except Exception:
        # Any DB error (constraint, lost connection) aborts just this row;
        # narrowed from a bare `except:`.
        conn.rollback()
        return False
    finally:
        # The original never closed the cursor — release it on every path.
        cur.close()
def run_country(conn, specs, villes, pays, source, country_name):
    """Scrape one country: every spec x city x keyword x page combination.

    Each result is upserted immediately; a 1.5 s pause between queries keeps
    load on the search backend low. Returns (total_results, total_inserted).
    """
    total = 0
    inserted = 0
    for spec in specs:
        spec_found = 0
        # BUG FIX: the original logged the run-cumulative insert counter as the
        # per-spec "new" count; track this spec's inserts separately instead.
        spec_new = 0
        for ville in villes:
            for kw in KW:
                for pg in range(1, PAGES + 1):
                    q = "%s %s %s %s" % (spec, ville, country_name, kw)
                    results = search(q, pg)
                    for r in results:
                        if upsert(conn, r["nom"], spec, pays, source,
                                  r["source_url"], r["email"], r["telephone"]):
                            spec_new += 1
                    spec_found += len(results)
                    total += len(results)
                    time.sleep(1.5)  # be polite to the search backend
        inserted += spec_new
        log(" %s %s: %d found (%d new)" % (pays, spec, spec_found, spec_new))
    return total, inserted
def main():
    """Entry point: scrape the first BATCH specialties for TN, then MA."""
    conn = psycopg2.connect(DB)
    try:
        log("START v3 batch=%d pages=%d cities=TN:%d+MA:%d" % (BATCH, PAGES, len(VTN), len(VMA)))
        t1, i1 = run_country(conn, SPECS[:BATCH], VTN, "TN", "cnam_searx_v3", "tunisie")
        t2, i2 = run_country(conn, SPECS[:BATCH], VMA, "MA", "cromc_searx_v3", "maroc")
        log("DONE TN=%d(%d new) MA=%d(%d new) total=%d inserted=%d" % (t1, i1, t2, i2, t1 + t2, i1 + i2))
    finally:
        # The original leaked the connection if a scrape pass raised;
        # always release it.
        conn.close()
# Run the scraper only when executed as a script, not on import.
if __name__ == "__main__":

    main()