Files
html/api/pw_linkedin.py
2026-04-12 22:57:03 +02:00

66 lines
3.1 KiB
Python

#!/usr/bin/env python3
import requests,re,psycopg2,sys,time,json
# PostgreSQL connection settings, expanded as keyword arguments to
# psycopg2.connect().
# NOTE(review): the credentials are hard-coded in source — consider loading
# them from the environment or a secrets store instead.
DB = {
    "host": "10.1.0.3",
    "dbname": "adx_system",
    "user": "admin",
    "password": "admin123",
}

# HTTP search endpoint queried with format=json (presumably a local SearXNG
# instance, judging by the final "LINKEDIN_SEARXNG" status line — confirm).
SX = "http://127.0.0.1:8888/search"

# Optional first command-line argument: number of inserted profiles after
# which the script stops starting new searches. A non-integer argument raises
# ValueError at start-up. Without an argument the cap is effectively unlimited.
if len(sys.argv) > 1:
    batch = int(sys.argv[1])
else:
    batch = 99999
# Search phrases, processed in this order. Each one is sent to the search
# endpoint prefixed with "site:linkedin.com/in", and is stored with every
# profile it yields (source_search column).
QUERIES = [
    "directeur general pharma maroc",
    "DG laboratoire tunisie",
    "CEO pharma algerie",
    "directeur marketing pharma maroc",
    "directeur medical pharma tunisie",
    "medical director algerie",
    "directeur qualite pharma maroc",
    "regulatory affairs maghreb",
    "pharmacovigilance maroc",
    "clinical research tunisie",
    "market access algerie",
    "key account manager pharma maroc",
    "business development pharma tunisie",
    "country manager pharma maroc",
    "consultant SAP maroc",
    "SAP project manager tunisie",
    "SAP functional algerie",
    "SAP S4HANA maroc",
    "ERP manager maghreb",
    "CIO DSI maroc",
    "directeur informatique tunisie",
    "IT director algerie",
    "RSSI CISO maroc",
    "cybersecurity tunisie",
    "security architect algerie",
    "cloud architect maroc",
    "data engineer tunisie",
    "devops algerie",
    "supply chain director maroc",
    "logistics manager tunisie",
    "directeur achats algerie",
    "CFO directeur financier maroc",
    "finance director tunisie",
    "clinical operations maghreb",
    "MSL pharma maroc",
    "chef de produit pharma tunisie",
    "directeur commercial pharma tunisie",
    "sales director pharma maroc",
    "directeur usine maroc",
    "plant manager tunisie",
    "quality manager manufacturing maroc",
    "AI machine learning maroc",
    "data scientist algerie",
    "BI analyst tunisie",
]
# ---------------------------------------------------------------------------
# Scrape loop
#
# For every phrase in QUERIES, ask the search endpoint (SX) for results limited
# to linkedin.com/in, derive a profile row from each hit and insert it into
# admin.linkedin_profiles unless its URL is already stored. Progress lines and
# the final "LINKEDIN_SEARXNG:+<n>" status line go to stdout; diagnostics go to
# stderr so the stdout format is unchanged.
# ---------------------------------------------------------------------------

# Compiled once; the result text sometimes carries "Experience: X · Location: Y".
_EXPERIENCE_RE = re.compile(r'Experience:\s*([^·\n]+)')
_LOCATION_RE = re.compile(r'Location:\s*([^·\n]+)')

_SELECT_SQL = "SELECT 1 FROM admin.linkedin_profiles WHERE linkedin_url=%s LIMIT 1"
_INSERT_SQL = (
    "INSERT INTO admin.linkedin_profiles "
    "(linkedin_url,full_name,headline,company,location,source_search,scraped_at) "
    "VALUES(%s,%s,%s,%s,%s,%s,NOW())"
)


def _first_group(pattern, text):
    """Return the first capture group of *pattern* in *text* (stripped, max 100 chars), or ""."""
    match = pattern.search(text)
    return match.group(1).strip()[:100] if match else ""


def _profile_from_result(res):
    """Build ``(url, full_name, headline, company, location)`` from one search hit.

    Returns ``None`` when the hit is not a LinkedIn profile URL or when no
    usable name (at least 3 characters) can be derived from the URL slug.
    """
    url = res.get("url") or ""
    if "linkedin.com/in/" not in url:
        return None
    url = url.split("?")[0]
    # The marker may have been present only in the query string just removed.
    slug = url.split("/in/")[1].split("/")[0] if "/in/" in url else ""
    # The display name is guessed from the slug: dashes become spaces, digits are dropped.
    full_name = slug.replace("-", " ").title()[:100]
    full_name = re.sub(r'\d+', '', full_name).strip()
    if len(full_name) < 3:
        return None
    # "or ''" guards against JSON nulls, which would otherwise raise TypeError
    # on concatenation and abort the rest of this query's results.
    headline = ((res.get("title") or "") + " " + (res.get("content") or ""))[:300]
    company = _first_group(_EXPERIENCE_RE, headline)
    location = _first_group(_LOCATION_RE, headline)
    return (url, full_name, headline, company, location)


def _rollback_quietly(connection):
    """Roll back the current transaction, reporting (not raising) a failure to do so."""
    try:
        connection.rollback()
    except psycopg2.Error as exc:
        print(f"rollback failed: {exc}", file=sys.stderr)


conn = psycopg2.connect(**DB)
cur = conn.cursor()
total = 0
try:
    for query in QUERIES:
        if total >= batch:
            break
        sq = f"site:linkedin.com/in {query}"
        try:
            r = requests.get(SX, params={"q": sq, "format": "json"}, timeout=15)
            if r.status_code != 200:
                continue
            data = r.json()
            for res in data.get("results", []):
                # Enforce the cap per insert, not only between queries, so a
                # single query cannot overshoot the requested batch size.
                if total >= batch:
                    break
                profile = _profile_from_result(res)
                if profile is None:
                    continue
                # Dedup on the query-string-stripped URL.
                cur.execute(_SELECT_SQL, (profile[0],))
                if cur.fetchone():
                    continue
                try:
                    cur.execute(_INSERT_SQL, profile + (query,))
                    conn.commit()
                    total += 1
                except psycopg2.Error as exc:
                    # Skip this row but keep processing the remaining hits.
                    conn.rollback()
                    print(f"insert failed for {profile[0]}: {exc}", file=sys.stderr)
            # Pause between searches so the search endpoint is not hammered.
            time.sleep(2)
        except Exception as exc:
            # Best effort: one failing query (HTTP, JSON or database error) must
            # not stop the run. Unlike a bare "except:", this lets
            # KeyboardInterrupt/SystemExit through. The rollback clears an
            # aborted transaction (e.g. after a failed SELECT); without it every
            # later statement on this connection would fail as well.
            _rollback_quietly(conn)
            print(f"query {query!r} failed: {exc}", file=sys.stderr)
        if total % 10 == 0 and total > 0:
            print(f"+{total} profiles")
finally:
    # Release database resources even when the run is interrupted.
    cur.close()
    conn.close()
print(f"LINKEDIN_SEARXNG:+{total}")