66 lines
3.1 KiB
Python
66 lines
3.1 KiB
Python
#!/usr/bin/env python3
|
|
import json
import os
import re
import sys
import time
from urllib.parse import unquote

import psycopg2
import requests
|
|
# Batch size used when no (valid) limit is given on the command line.
DEFAULT_BATCH = 99999


def db_config(env=None):
    """Return the keyword arguments for ``psycopg2.connect``.

    Each value can be overridden through an environment variable
    (``ADX_DB_HOST``, ``ADX_DB_NAME``, ``ADX_DB_USER``, ``ADX_DB_PASSWORD``).
    When a variable is absent the value that used to be hard-coded in this
    script is used, so existing deployments keep working unchanged.

    Args:
        env: mapping to read the overrides from; defaults to ``os.environ``.

    Returns:
        dict with the keys ``host``, ``dbname``, ``user`` and ``password``.
    """
    if env is None:
        env = os.environ
    # NOTE(review): the fallback credentials are kept only for backward
    # compatibility; they should be removed from the source once every
    # deployment sets ADX_DB_PASSWORD.
    return dict(
        host=env.get("ADX_DB_HOST", "10.1.0.3"),
        dbname=env.get("ADX_DB_NAME", "adx_system"),
        user=env.get("ADX_DB_USER", "admin"),
        password=env.get("ADX_DB_PASSWORD", "admin123"),
    )


def parse_batch(argv, default=DEFAULT_BATCH):
    """Return the maximum number of profiles to insert during this run.

    Args:
        argv: command-line vector (``sys.argv``); the limit is ``argv[1]``.
        default: value used when the argument is missing or not an integer.

    Returns:
        The parsed integer, or *default*. A non-numeric argument is reported
        on stderr instead of aborting the script with a traceback.
    """
    if len(argv) < 2:
        return default
    try:
        return int(argv[1])
    except ValueError:
        print(
            f"invalid batch size {argv[1]!r}; using default {default}",
            file=sys.stderr,
        )
        return default


# PostgreSQL connection parameters, passed as psycopg2.connect(**DB).
DB = db_config()

# SearXNG search endpoint that is queried with format=json.
SX = os.environ.get("SEARXNG_URL", "http://127.0.0.1:8888/search")

# Upper bound on the number of profiles inserted in one run.
batch = parse_batch(sys.argv)
|
|
|
|
# Search phrases sent to SearXNG, one request per entry. Each phrase is
# prefixed with "site:linkedin.com/in " by the search loop, and the phrase
# itself is stored in the source_search column of every profile it yields.
QUERIES = [
    # Pharma: general management, marketing, medical
    "directeur general pharma maroc", "DG laboratoire tunisie", "CEO pharma algerie",
    "directeur marketing pharma maroc", "directeur medical pharma tunisie", "medical director algerie",
    # Pharma: quality, regulatory, clinical, market access, sales
    "directeur qualite pharma maroc", "regulatory affairs maghreb", "pharmacovigilance maroc",
    "clinical research tunisie", "market access algerie", "key account manager pharma maroc",
    "business development pharma tunisie", "country manager pharma maroc", "consultant SAP maroc",
    # SAP / ERP
    "SAP project manager tunisie", "SAP functional algerie", "SAP S4HANA maroc", "ERP manager maghreb",
    # IT management and security
    "CIO DSI maroc", "directeur informatique tunisie", "IT director algerie",
    "RSSI CISO maroc", "cybersecurity tunisie", "security architect algerie",
    # Cloud / data / devops
    "cloud architect maroc", "data engineer tunisie", "devops algerie",
    # Supply chain, purchasing, finance
    "supply chain director maroc", "logistics manager tunisie", "directeur achats algerie",
    "CFO directeur financier maroc", "finance director tunisie",
    # Pharma: clinical operations, product, commercial
    "clinical operations maghreb", "MSL pharma maroc", "chef de produit pharma tunisie",
    "directeur commercial pharma tunisie", "sales director pharma maroc",
    # Manufacturing
    "directeur usine maroc", "plant manager tunisie", "quality manager manufacturing maroc",
    # AI / analytics
    "AI machine learning maroc", "data scientist algerie", "BI analyst tunisie",
]
|
|
|
|
# Snippet fragments of the form "Experience: <company> · Location: <place>";
# each capture stops at the next "·" separator or at a newline.
COMPANY_RE = re.compile(r'Experience:\s*([^·\n]+)')
LOCATION_RE = re.compile(r'Location:\s*([^·\n]+)')

REQUEST_TIMEOUT = 15  # seconds allowed for one search request
REQUEST_PAUSE = 2     # seconds slept between two consecutive search requests
PROGRESS_STEP = 10    # a progress line is printed every time this many rows are added


def warn(message):
    """Report a non-fatal problem on stderr; stdout stays reserved for progress output."""
    print(f"[linkedin_searxng] {message}", file=sys.stderr)


def parse_profile(res):
    """Convert one search result into the column values of a profile row.

    Args:
        res: one entry of the ``results`` list of the search response; the
            keys ``url``, ``title`` and ``content`` are read, and missing or
            ``None`` values are treated as empty strings.

    Returns:
        ``(linkedin_url, full_name, headline, company, location)``, or
        ``None`` when the result is not a ``linkedin.com/in/`` URL or the
        name derived from the URL slug is shorter than three characters.
    """
    if not isinstance(res, dict):
        return None
    url = res.get("url") or ""
    if "linkedin.com/in/" not in url:
        return None
    url = url.split("?")[0]  # drop the query string so it does not defeat dedup

    # The name is derived from the URL slug ("jane-doe-123" -> "Jane Doe").
    # The slug is percent-decoded first: otherwise the hex digits of an escape
    # such as "%C3%A9" are removed by the digit stripping below and leave
    # garbage like "%C%A" in the name.
    slug = url.split("/in/")[1].split("/")[0]
    full_name = unquote(slug).replace("-", " ").title()[:100]
    full_name = re.sub(r'\d+', '', full_name).strip()
    if len(full_name) < 3:
        return None

    headline = ((res.get("title") or "") + " " + (res.get("content") or ""))[:300]
    found = COMPANY_RE.search(headline)
    company = found.group(1).strip()[:100] if found else ""
    found = LOCATION_RE.search(headline)
    location = found.group(1).strip()[:100] if found else ""
    return (url, full_name, headline, company, location)


def search_profiles(query):
    """Run one ``site:linkedin.com/in`` search and return its raw result list.

    Network errors, non-200 answers and undecodable bodies are reported on
    stderr and produce an empty list, so one failing query never stops the
    run.
    """
    sq = f"site:linkedin.com/in {query}"
    try:
        r = requests.get(SX, params={"q": sq, "format": "json"}, timeout=REQUEST_TIMEOUT)
        if r.status_code != 200:
            warn(f"search {query!r}: HTTP {r.status_code}")
            return []
        data = r.json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers a body that is not valid JSON.
        warn(f"search {query!r} failed: {exc}")
        return []
    results = data.get("results", []) if isinstance(data, dict) else []
    return results if isinstance(results, list) else []


def store_profile(conn, cur, row, query):
    """Insert *row* unless its URL is already stored.

    Args:
        conn: open psycopg2 connection (committed after each insert).
        cur: cursor created from *conn*.
        row: tuple returned by ``parse_profile``.
        query: search phrase that produced the row (``source_search`` column).

    Returns:
        ``True`` when a new row was committed, ``False`` for a duplicate or a
        database error. After an error the transaction is rolled back so the
        following statements are not rejected as part of an aborted
        transaction. If the rollback itself fails (for example because the
        connection is gone) that exception propagates and ends the run.
    """
    try:
        cur.execute("SELECT 1 FROM admin.linkedin_profiles WHERE linkedin_url=%s LIMIT 1", (row[0],))
        if cur.fetchone():
            return False
        cur.execute(
            "INSERT INTO admin.linkedin_profiles (linkedin_url,full_name,headline,company,location,source_search,scraped_at) VALUES(%s,%s,%s,%s,%s,%s,NOW())",
            row + (query,),
        )
        conn.commit()
        return True
    except psycopg2.Error as exc:
        conn.rollback()
        warn(f"database error for {row[0]}: {exc}")
        return False


def collect(conn, cur):
    """Run every search phrase in QUERIES and store the new profiles.

    Stops as soon as ``batch`` profiles have been inserted; the limit is
    checked before every insert, so it is never exceeded.

    Returns:
        Number of profiles inserted.
    """
    total = 0
    reported = 0
    for index, query in enumerate(QUERIES):
        if total >= batch:
            break
        if index:
            # NOTE(review): the pause's original position is ambiguous in the
            # extracted source; it is applied between consecutive requests,
            # including after a failed one.
            time.sleep(REQUEST_PAUSE)
        for res in search_profiles(query):
            if total >= batch:
                break
            row = parse_profile(res)
            if row is not None and store_profile(conn, cur, row, query):
                total += 1
        # Print once per PROGRESS_STEP boundary crossed, not once per query.
        if total // PROGRESS_STEP > reported // PROGRESS_STEP:
            print(f"+{total} profiles")
            reported = total
    return total


def main():
    """Connect to the database, scrape, and print the final summary line."""
    conn = psycopg2.connect(**DB)
    try:
        cur = conn.cursor()
        try:
            total = collect(conn, cur)
        finally:
            cur.close()
    finally:
        # Always release the connection, including on Ctrl-C or a crash.
        conn.close()
    print(f"LINKEDIN_SEARXNG:+{total}")


if __name__ == "__main__":
    main()
|