&1",$out); $r[]="project: ".(is_dir($dir)?"created":"tried: ".implode("|",$out));
// NOTE(review): the spider sources below were originally double-quoted strings with
// no "\n" escapes, so every generated .py would have been a single line of invalid
// Python (compare: the scrapy.cfg/settings.py strings DO use "\n"). They are now
// nowdoc heredocs with real newlines; nowdoc does not interpolate, so the Python
// f-string braces and selectors are written out verbatim.
if (!is_dir($dir)) {
    // scrapy startproject failed — build the minimal project layout by hand.
    @mkdir("$dir/weval_scrapy/spiders", 0755, true);
    file_put_contents("$dir/scrapy.cfg", "[settings]\ndefault = weval_scrapy.settings\n[deploy]\nproject = weval_scrapy\n");
    file_put_contents("$dir/weval_scrapy/settings.py", "BOT_NAME='weval_scrapy'\nSPIDER_MODULES=['weval_scrapy.spiders']\nROBOTSTXT_OBEY=True\nCONCURRENT_REQUESTS=4\nDOWNLOAD_DELAY=2\n");
    file_put_contents("$dir/weval_scrapy/__init__.py", "");
    file_put_contents("$dir/weval_scrapy/spiders/__init__.py", "");
    $r[] = "project: manually created";
}
} else {
    $r[] = "project: exists";
}

// HCP Spider — Google-search seeded crawl over country x specialty pairs.
$sp = "$dir/weval_scrapy/spiders";
file_put_contents("$sp/hcp_spider.py", <<<'PY'
import scrapy, json

class HCPSpider(scrapy.Spider):
    name = 'hcp_enrichment'
    custom_settings = {'CONCURRENT_REQUESTS': 4, 'DOWNLOAD_DELAY': 2, 'ROBOTSTXT_OBEY': True,
                       'USER_AGENT': 'WEVAL-HCP-Bot/1.0 (+https://weval-consulting.com)'}

    def start_requests(self):
        # Seed: Google search for HCPs
        countries = ['maroc', 'tunisie', 'algerie']
        specialties = ['cardiologue', 'dermatologue', 'generaliste', 'pharmacien', 'pediatre']
        for c in countries:
            for s in specialties:
                yield scrapy.Request(f'https://www.google.com/search?q={s}+{c}+contact',
                                     callback=self.parse, meta={'country': c, 'specialty': s})

    def parse(self, response):
        for result in response.css('div.g'):
            yield {
                'name': result.css('h3::text').get(),
                'url': result.css('a::attr(href)').get(),
                'snippet': result.css('.VwiC3b::text').get(),
                'country': response.meta['country'],
                'specialty': response.meta['specialty'],
            }
PY
);
$r[] = "hcp_spider: created";

// B2B Spider — paginated company directory crawl.
file_put_contents("$sp/b2b_spider.py", <<<'PY'
import scrapy

class B2BSpider(scrapy.Spider):
    name = 'b2b_leads'
    custom_settings = {'CONCURRENT_REQUESTS': 2, 'DOWNLOAD_DELAY': 3, 'ROBOTSTXT_OBEY': True}
    start_urls = ['https://www.kerix.net/fr/annuaire-entreprises']

    def parse(self, response):
        for company in response.css('.company-item, .list-item, tr'):
            yield {
                'name': company.css('a::text, td:first-child::text').get(),
                'url': company.css('a::attr(href)').get(),
                'sector': company.css('.sector::text, td:nth-child(2)::text').get(),
            }
        next_page = response.css('a.next::attr(href)').get()
        if next_page:
            yield response.follow(next_page, self.parse)
PY
);
$r[] = "b2b_spider: created";

// Pharma Spider (doctoranytime.ma, sante.gov.ma) — specialty listing pages with pagination.
file_put_contents("$sp/pharma_spider.py", <<<'PY'
import scrapy

class PharmaSpider(scrapy.Spider):
    name = 'pharma_directory'
    custom_settings = {'CONCURRENT_REQUESTS': 2, 'DOWNLOAD_DELAY': 3, 'ROBOTSTXT_OBEY': True}
    start_urls = [
        'https://www.doctoranytime.ma/specialite/medecin-generaliste',
        'https://www.doctoranytime.ma/specialite/cardiologue',
        'https://www.doctoranytime.ma/specialite/dermatologue',
    ]

    def parse(self, response):
        for doc in response.css('.doctor-card, .search-result'):
            yield {
                'name': doc.css('.doctor-name::text, h2::text').get(),
                'specialty': doc.css('.specialty::text').get(),
                'city': doc.css('.city::text, .location::text').get(),
                'profile_url': response.urljoin(doc.css('a::attr(href)').get() or ''),
            }
        next_page = response.css('a.next::attr(href), .pagination a[rel=next]::attr(href)').get()
        if next_page:
            yield response.follow(next_page, self.parse)
PY
);
$r[] = "pharma_spider: created";

// Site Monitor Spider — fetches a fixed URL list and records status/size/latency.
file_put_contents("$sp/site_monitor.py", <<<'PY'
import scrapy

class SiteMonitorSpider(scrapy.Spider):
    name = 'site_monitor'
    custom_settings = {'CONCURRENT_REQUESTS': 8, 'DOWNLOAD_DELAY': 0.5}
    start_urls = [
        'https://weval-consulting.com/',
        'https://weval-consulting.com/wevia-ia/wevia.html',
        'https://weval-consulting.com/ai-benchmark.html',
        'https://weval-consulting.com/oss-discovery.html',
        'https://weval-consulting.com/realtime-monitor.html',
        'https://deerflow.weval-consulting.com/',
        'https://crm.weval-consulting.com/',
    ]

    def parse(self, response):
        yield {
            'url': response.url,
            'status': response.status,
            'size': len(response.body),
            'title': response.css('title::text').get(),
            'load_ms': response.meta.get('download_latency', 0) * 1000,
        }
PY
);
$r[] = "site_monitor: created";

// Inventory of generated spider modules, excluding the package __init__.
$spiders = glob("$sp/*.py");
$spiders = array_values(array_filter(
    array_map(fn($f) => basename($f, ".py"), $spiders),
    fn($n) => $n !== "__init__"
));
$r[] = "total_spiders: ".count($spiders)." (".implode(",", $spiders).")";
echo json_encode(["ok" => true, "results" => $r]);