Files
html/api/ss.php
2026-04-12 22:57:03 +02:00

130 lines
5.1 KiB
PHP

<?php
// Admin endpoint: provisions the Scrapy project and spider files.
// Auth: shared-secret gate on the 'k' query parameter.
// NOTE(review): the key is hardcoded in source and travels in the URL
// (so it lands in access logs) — consider an env var + POST/header instead.
// hash_equals() is a constant-time comparison (no timing side channel);
// the is_string guard rejects array-style input (?k[]=) cleanly.
$key = $_GET['k'] ?? '';
if (!is_string($key) || !hash_equals('WEVADS2026', $key)) die('auth');
$r = [];
// Create project (idempotent: skipped when the directory already exists)
$dir = "/opt/weval-scrapy";
if (!is_dir($dir)) {
    // First attempt: let Scrapy generate the project skeleton itself.
    exec("cd /opt && python3 -m scrapy startproject weval_scrapy weval-scrapy 2>&1", $out);
    $r[] = "project: " . (is_dir($dir) ? "created" : "tried: " . implode("|", $out));
    if (!is_dir($dir)) {
        // Fallback: write the minimal project layout by hand.
        // Report failure instead of claiming success when mkdir fails
        // (the previous version always reported "manually created").
        if (!@mkdir("$dir/weval_scrapy/spiders", 0755, true) && !is_dir("$dir/weval_scrapy/spiders")) {
            $r[] = "project: mkdir failed";
        } else {
            file_put_contents("$dir/scrapy.cfg", "[settings]\ndefault = weval_scrapy.settings\n[deploy]\nproject = weval_scrapy\n");
            file_put_contents("$dir/weval_scrapy/settings.py", "BOT_NAME='weval_scrapy'\nSPIDER_MODULES=['weval_scrapy.spiders']\nROBOTSTXT_OBEY=True\nCONCURRENT_REQUESTS=4\nDOWNLOAD_DELAY=2\n");
            file_put_contents("$dir/weval_scrapy/__init__.py", "");
            file_put_contents("$dir/weval_scrapy/spiders/__init__.py", "");
            $r[] = "project: manually created";
        }
    }
} else {
    $r[] = "project: exists";
}
// --- HCP Spider ------------------------------------------------------------
// Crawls search results for healthcare-professional contacts across three
// Maghreb countries x five specialties.
// FIX: the previous version wrote the embedded Python flush-left (no
// indentation after class/def), so the generated file raised
// IndentationError and the spider could never load. The nowdoc below keeps
// the code free of PHP interpolation and properly indented; the statements
// themselves are unchanged.
// NOTE(review): google.com/robots.txt disallows /search, so with
// ROBOTSTXT_OBEY=True Scrapy will filter out every seed request — this
// spider likely yields nothing; confirm, or switch to a permitted API.
$sp = "$dir/weval_scrapy/spiders";
$hcpSpider = <<<'PY'
import scrapy, json
class HCPSpider(scrapy.Spider):
    name = 'hcp_enrichment'
    custom_settings = {'CONCURRENT_REQUESTS':4,'DOWNLOAD_DELAY':2,'ROBOTSTXT_OBEY':True,
        'USER_AGENT':'WEVAL-HCP-Bot/1.0 (+https://weval-consulting.com)'}

    def start_requests(self):
        # Seed: Google search for HCPs
        countries = ['maroc','tunisie','algerie']
        specialties = ['cardiologue','dermatologue','generaliste','pharmacien','pediatre']
        for c in countries:
            for s in specialties:
                yield scrapy.Request(f'https://www.google.com/search?q={s}+{c}+contact',
                                     callback=self.parse, meta={'country':c,'specialty':s})

    def parse(self, response):
        # One item per organic result block (selectors track Google's markup).
        for result in response.css('div.g'):
            yield {
                'name': result.css('h3::text').get(),
                'url': result.css('a::attr(href)').get(),
                'snippet': result.css('.VwiC3b::text').get(),
                'country': response.meta['country'],
                'specialty': response.meta['specialty'],
            }

PY;
file_put_contents("$sp/hcp_spider.py", $hcpSpider);
$r[] = "hcp_spider: created";
// --- B2B Spider --------------------------------------------------------------
// Crawls a Moroccan company directory (kerix.net) and follows 'next'
// pagination links.
// FIX: the previous version wrote the embedded Python flush-left (no
// indentation), producing an IndentationError on load. Nowdoc avoids PHP
// interpolation and carries correct indentation; statements are unchanged.
// NOTE(review): the broad '.company-item, .list-item, tr' selectors are
// guesses at the directory's markup — verify against the live page.
$b2bSpider = <<<'PY'
import scrapy
class B2BSpider(scrapy.Spider):
    name = 'b2b_leads'
    custom_settings = {'CONCURRENT_REQUESTS':2,'DOWNLOAD_DELAY':3,'ROBOTSTXT_OBEY':True}
    start_urls = ['https://www.kerix.net/fr/annuaire-entreprises']

    def parse(self, response):
        for company in response.css('.company-item, .list-item, tr'):
            yield {
                'name': company.css('a::text, td:first-child::text').get(),
                'url': company.css('a::attr(href)').get(),
                'sector': company.css('.sector::text, td:nth-child(2)::text').get(),
            }
        # Follow pagination until no 'next' link remains.
        next_page = response.css('a.next::attr(href)').get()
        if next_page:
            yield response.follow(next_page, self.parse)

PY;
file_put_contents("$dir/weval_scrapy/spiders/b2b_spider.py", $b2bSpider);
$r[] = "b2b_spider: created";
// --- Pharma Spider -----------------------------------------------------------
// Crawls doctoranytime.ma specialty listings (GP, cardiology, dermatology)
// and follows pagination.
// FIX: the previous version wrote the embedded Python flush-left (no
// indentation), producing an IndentationError on load. Nowdoc avoids PHP
// interpolation and carries correct indentation; statements are unchanged.
$pharmaSpider = <<<'PY'
import scrapy
class PharmaSpider(scrapy.Spider):
    name = 'pharma_directory'
    custom_settings = {'CONCURRENT_REQUESTS':2,'DOWNLOAD_DELAY':3,'ROBOTSTXT_OBEY':True}
    start_urls = [
        'https://www.doctoranytime.ma/specialite/medecin-generaliste',
        'https://www.doctoranytime.ma/specialite/cardiologue',
        'https://www.doctoranytime.ma/specialite/dermatologue',
    ]

    def parse(self, response):
        for doc in response.css('.doctor-card, .search-result'):
            yield {
                'name': doc.css('.doctor-name::text, h2::text').get(),
                'specialty': doc.css('.specialty::text').get(),
                'city': doc.css('.city::text, .location::text').get(),
                # urljoin handles relative hrefs; 'or' guards a missing link.
                'profile_url': response.urljoin(doc.css('a::attr(href)').get() or ''),
            }
        next_page = response.css('a.next::attr(href), .pagination a[rel=next]::attr(href)').get()
        if next_page:
            yield response.follow(next_page, self.parse)

PY;
file_put_contents("$dir/weval_scrapy/spiders/pharma_spider.py", $pharmaSpider);
$r[] = "pharma_spider: created";
// --- Site Monitor Spider -----------------------------------------------------
// Uptime/health probe: fetches a fixed list of company URLs and records
// status, body size, title, and download latency per page.
// FIX: the previous version wrote the embedded Python flush-left (no
// indentation), producing an IndentationError on load. Nowdoc avoids PHP
// interpolation and carries correct indentation; statements are unchanged.
$monitorSpider = <<<'PY'
import scrapy
class SiteMonitorSpider(scrapy.Spider):
    name = 'site_monitor'
    custom_settings = {'CONCURRENT_REQUESTS':8,'DOWNLOAD_DELAY':0.5}
    start_urls = [
        'https://weval-consulting.com/',
        'https://weval-consulting.com/wevia-ia/wevia.html',
        'https://weval-consulting.com/ai-benchmark.html',
        'https://weval-consulting.com/oss-discovery.html',
        'https://weval-consulting.com/realtime-monitor.html',
        'https://deerflow.weval-consulting.com/',
        'https://crm.weval-consulting.com/',
    ]

    def parse(self, response):
        yield {
            'url': response.url,
            'status': response.status,
            'size': len(response.body),
            'title': response.css('title::text').get(),
            # download_latency is set in meta by Scrapy's downloader.
            'load_ms': response.meta.get('download_latency',0)*1000,
        }

PY;
file_put_contents("$dir/weval_scrapy/spiders/site_monitor.py", $monitorSpider);
$r[] = "site_monitor: created";
// Inventory: list every spider module in the spiders package, excluding
// the package's __init__.py.
// FIX: glob() returns false on error; without the '?: []' fallback the
// subsequent array_map() would raise a fatal TypeError.
$files = glob("$sp/*.py") ?: [];
$names = array_map(fn($f) => basename($f, ".py"), $files);
$spiders = array_values(array_filter($names, fn($n) => $n !== "__init__"));
$r[] = "total_spiders: " . count($spiders) . " (" . implode(",", $spiders) . ")";
// Emit the accumulated run report as JSON.
echo json_encode(["ok" => true, "results" => $r]);