203 lines
7.4 KiB
PHP
203 lines
7.4 KiB
PHP
<?php
|
|
// Gatekeeper: the caller must pass the shared deployment key as ?k=...
// hash_equals() compares in constant time; the is_string() guard keeps
// array-style input (?k[]=x) from reaching it and raising a TypeError.
$providedKey = $_GET['k'] ?? '';
if (!is_string($providedKey) || !hash_equals('WEVADS2026', $providedKey)) {
    die('auth');
}

// Keep running if the HTTP client disconnects and allow up to 300 seconds,
// because the steps below shell out to package installers.
ignore_user_abort(true);
set_time_limit(300);

// One human-readable status line per step, appended as the script progresses.
$r = [];
|
|
|
|
// 1. PAPERCLIP: Install deps + start
//
// Installs the Node dependencies of /opt/pclip (pnpm when a usable binary is
// found, npm otherwise) and makes sure something answers on 127.0.0.1:3150.
if (is_dir("/opt/pclip")) {
    // Liveness probe for the server on port 3150.
    // - "timeout" bounds the wait instead of relying on default_socket_timeout.
    // - "ignore_errors" makes a 4xx/5xx answer return its body instead of false,
    //   so any HTTP response (even an empty one) counts as "running"; only a
    //   failed connection yields false.
    $paperclipUp = static function (): bool {
        $ctx = stream_context_create(['http' => ['timeout' => 5, 'ignore_errors' => true]]);
        // "@": a refused connection is the expected result while the server is down.
        return @file_get_contents("http://127.0.0.1:3150/", false, $ctx) !== false;
    };

    // Enable corepack for pnpm
    exec("corepack enable 2>&1");

    // shell_exec() returns null when the command prints nothing; cast before
    // trim() (passing null to trim() is deprecated since PHP 8.1).
    $pnpm = trim((string) shell_exec("which pnpm 2>/dev/null")) ?: "/usr/local/bin/pnpm";
    if (!file_exists($pnpm)) {
        exec("npm install -g pnpm 2>&1");
        // Ask the shell first: a global npm install normally exposes pnpm on PATH
        // as a symlink, which the "find ... -type f" fallback below cannot see.
        $pnpm = trim((string) shell_exec("which pnpm 2>/dev/null"));
        if ($pnpm === '' || !file_exists($pnpm)) {
            $pnpm = "/usr/lib/node_modules/.bin/pnpm";
        }
        if (!file_exists($pnpm)) {
            $pnpm = trim((string) shell_exec("find /usr -name pnpm -type f 2>/dev/null | head -1"));
        }
    }

    // Install with npm as fallback
    if (!is_dir("/opt/pclip/node_modules")) {
        $usePnpm = $pnpm !== '' && file_exists($pnpm);
        $tool    = $usePnpm ? "pnpm" : "npm";
        // The pnpm path may come from `find`, so quote it before handing it to the shell.
        $binary  = $usePnpm ? escapeshellarg($pnpm) : "npm";

        exec("cd /opt/pclip && $binary install 2>&1");
        // Success is judged by the presence of node_modules, not by the exit code.
        $r[] = "paperclip: $tool install " . (is_dir("/opt/pclip/node_modules") ? "OK" : "FAIL");
    } else {
        $r[] = "paperclip: already installed";
    }

    // Check if already running on 3150
    if ($paperclipUp()) {
        $r[] = "paperclip: already running on :3150";
    } else {
        // Output is redirected and the command is backgrounded so exec() returns immediately.
        exec("cd /opt/pclip && PORT=3150 nohup node dist/server.js > /tmp/paperclip.log 2>&1 &");
        sleep(3);
        $running = $paperclipUp();

        if (!$running) {
            // Try dev mode. Append (>>) so the log of the failed first attempt
            // is still there when someone follows the "check the log" hint below.
            exec("cd /opt/pclip && PORT=3150 nohup npx tsx src/server.ts >> /tmp/paperclip.log 2>&1 &");
            sleep(5);
            $running = $paperclipUp();
        }

        $r[] = "paperclip: start " . ($running ? "OK on :3150" : "pending (check /tmp/paperclip.log)");
    }
} else {
    $r[] = "paperclip: /opt/pclip not found";
}
|
|
|
|
// 2. SCRAPY: Install Python framework
//
// Locates the `scrapy` executable, installing it through pip3 when `which`
// cannot find it, then records the version string it reports.
// shell_exec() returns null when a command prints nothing, so every result is
// cast to string before trim() (passing null to trim() is deprecated since PHP 8.1).
$scrapy = trim((string) shell_exec("which scrapy 2>/dev/null"));

if ($scrapy === '') {
    exec("pip3 install scrapy --break-system-packages 2>&1");
    // Look again: the install counts as successful only if the binary is now on PATH.
    $scrapy = trim((string) shell_exec("which scrapy 2>/dev/null"));
    $r[] = "scrapy: install " . ($scrapy !== '' ? "OK ($scrapy)" : "FAIL");
} else {
    $r[] = "scrapy: already installed ($scrapy)";
}

// stderr is merged into stdout, so when scrapy is missing this line carries
// the shell's error text instead of a version.
$ver = trim((string) shell_exec("scrapy version 2>&1"));
$r[] = "scrapy version: $ver";
|
|
|
|
// 3. Create WEVAL Scrapy project
//
// Runs `scrapy startproject` from /opt unless the project directory is already
// present, and records whether the directory exists afterwards.
$project_dir = "/opt/weval-scrapy";

if (is_dir($project_dir)) {
    $r[] = "scrapy project: already exists";
} else {
    exec("cd /opt && scrapy startproject weval_scrapy weval-scrapy 2>&1");

    // The outcome is judged by the directory's presence, not by the exit code.
    if (is_dir($project_dir)) {
        $projectStatus = "created at $project_dir";
    } else {
        $projectStatus = "FAIL";
    }
    $r[] = "scrapy project: " . $projectStatus;
}
|
|
|
|
// 4. Create HCP Spider for Ethica
//
// Writes two Python spider modules into the project's spiders/ package. Nothing
// is written (and nothing is reported) when that directory does not exist.
// Each status line reflects the actual result of the write.
$spider_dir = "$project_dir/weval_scrapy/spiders";

if (is_dir($spider_dir)) {
    // Python source for the HCP spider. The text contains no "$" or backslash,
    // so PHP performs no interpolation inside this double-quoted literal.
    $spider_py = "import scrapy
import json

class HCPSpider(scrapy.Spider):
    name = 'hcp_spider'
    custom_settings = {
        'CONCURRENT_REQUESTS': 4,
        'DOWNLOAD_DELAY': 2,
        'ROBOTSTXT_OBEY': True,
        'USER_AGENT': 'WEVAL-HCP-Bot/1.0 (+https://weval-consulting.com)',
    }

    def start_requests(self):
        # Load HCP targets from Ethica DB via API
        api = 'http://10.1.0.3:5890/api/sentinel-brain.php'
        yield scrapy.Request(f'{api}?action=hcp_targets', callback=self.parse_targets)

    def parse_targets(self, response):
        data = json.loads(response.text)
        for hcp in data.get('targets', []):
            if hcp.get('google_url'):
                yield scrapy.Request(hcp['google_url'], callback=self.parse_hcp, meta={'hcp': hcp})

    def parse_hcp(self, response):
        hcp = response.meta['hcp']
        yield {
            'name': hcp.get('name'),
            'specialty': hcp.get('specialty'),
            'city': hcp.get('city'),
            'country': hcp.get('country'),
            'verified': True,
            'source_url': response.url,
            'title': response.css('title::text').get(),
        }
";
    // file_put_contents() returns false on failure; report that instead of
    // claiming success unconditionally.
    $hcpWritten = file_put_contents("$spider_dir/hcp_spider.py", $spider_py) !== false;
    $r[] = "hcp_spider: " . ($hcpWritten ? "created" : "FAIL");

    // B2B Lead spider
    $b2b_py = "import scrapy

class B2BLeadSpider(scrapy.Spider):
    name = 'b2b_leads'
    custom_settings = {
        'CONCURRENT_REQUESTS': 2,
        'DOWNLOAD_DELAY': 3,
        'ROBOTSTXT_OBEY': True,
    }
    start_urls = [
        'https://www.linkedin.com/company/weval-consulting/',
    ]

    def parse(self, response):
        yield {
            'company': response.css('h1::text').get(),
            'url': response.url,
            'employees': response.css('.employees::text').get(),
        }
";
    $b2bWritten = file_put_contents("$spider_dir/b2b_spider.py", $b2b_py) !== false;
    $r[] = "b2b_spider: " . ($b2bWritten ? "created" : "FAIL");
}
|
|
|
|
// 5. Create Scrapy API endpoint
//
// Generates /var/www/html/api/scrapy-api.php, a small JSON endpoint with three
// actions: "status" (list spider modules), "run" (crawl one spider) and
// "results" (return the last crawl output).
// The payload is a single-quoted literal, so none of its "$" variables are
// interpolated here; it must not contain a single quote.
// NOTE(review): "status" lists module file names, while `scrapy crawl` expects a
// spider's `name` attribute; the two can differ (b2b_spider.py declares
// name = 'b2b_leads').
$api_code = '<?php
header("Content-Type: application/json");

// Constant-time key check; non-string input (?k[]=x) is rejected up front.
$k = $_GET["k"] ?? "";
if (!is_string($k) || !hash_equals("WEVADS2026", $k)) die(json_encode(["error"=>"auth"]));

$action = $_GET["action"] ?? "status";
$project = "/opt/weval-scrapy";

switch ($action) {
    case "status":
        // glob() may return false on error; treat that as "no spiders".
        $spiders = glob("$project/weval_scrapy/spiders/*.py") ?: [];
        // array_filter() keeps the original keys; array_values() reindexes so the
        // result is encoded as a JSON list instead of an object.
        $spiders = array_values(array_map(
            fn($f) => basename($f, ".py"),
            array_filter($spiders, fn($f) => basename($f) !== "__init__.py")
        ));
        echo json_encode(["ok"=>true,"project"=>$project,"spiders"=>$spiders,"scrapy"=>trim((string) shell_exec("scrapy version 2>&1"))]);
        break;

    case "run":
        // The whitelist regex leaves only [a-z0-9_], which is safe to place in the command.
        $spider = preg_replace("/[^a-z0-9_]/", "", $_GET["spider"] ?? "");
        if ($spider) {
            // -O overwrites the feed file; -o would append and corrupt the JSON on a second run.
            $out = shell_exec("cd $project && scrapy crawl $spider -O /tmp/scrapy-$spider.json 2>&1 | tail -5");
            echo json_encode(["ok"=>true,"spider"=>$spider,"output"=>$out]);
        } else {
            echo json_encode(["error"=>"spider required"]);
        }
        break;

    case "results":
        $spider = preg_replace("/[^a-z0-9_]/", "", $_GET["spider"] ?? "");
        $file = "/tmp/scrapy-$spider.json";
        if (file_exists($file)) {
            echo file_get_contents($file);
        } else {
            echo json_encode(["error"=>"no results"]);
        }
        break;

    default:
        echo json_encode(["error"=>"unknown action"]);
        break;
}';

// file_put_contents() returns false on failure (e.g. missing directory or no
// write permission); only claim a deployment that actually happened.
$apiWritten = file_put_contents("/var/www/html/api/scrapy-api.php", $api_code) !== false;
$r[] = $apiWritten ? "scrapy-api: deployed at /api/scrapy-api.php" : "scrapy-api: FAIL";
|
|
|
|
// 6. Add to realtime monitor
//
// Patches realtime-status.php in place: inserts two service entries before the
// "// Data flows" marker and two flow rows right after "$flows=[".
// The patch is skipped when the file already mentions "scrapy", and the file is
// left untouched when it cannot be read or when neither marker is present.
$rt = "/var/www/html/api/realtime-status.php";
$rtc = is_file($rt) ? file_get_contents($rt) : false;

if ($rtc === false) {
    // A failed read must never be followed by a write back to the same file.
    $r[] = "monitor: $rt not readable, skipped";
} elseif (strpos($rtc, "scrapy") === false) {
    // Single-quoted literal: the "$" variables below belong to the patched file,
    // not to this script.
    $scrapy_block = '
    // Scrapy
    $scrapy_ver=trim((string)shell_exec("scrapy version 2>/dev/null"));
    $services["scrapy"]=["name"=>"Scrapy Engine","status"=>$scrapy_ver?"ACTIVE":"DOWN","ms"=>0,
        "metrics"=>["version"=>$scrapy_ver,"project"=>"/opt/weval-scrapy","spiders"=>2],
        "output_unit"=>"scrapes","type"=>"Scraping","last"=>date("H:i:s")];

    // Paperclip
    $pc_check=@file_get_contents("http://127.0.0.1:3150/");
    $services["paperclip"]=["name"=>"Paperclip Orchestrator","status"=>$pc_check?"WORKING":"DOWN","ms"=>0,
        "metrics"=>["port"=>3150,"agents"=>7,"companies"=>1],
        "output_unit"=>"tasks","type"=>"Orchestration","last"=>date("H:i:s")];';

    // The fourth argument of str_replace() receives the number of replacements,
    // which shows whether each marker was actually present.
    $rtc = str_replace('// Data flows', $scrapy_block . "\n\n    // Data flows", $rtc, $serviceHits);

    // Add flows
    $flow_block = '    ["from"=>"scrapy","to"=>"sentinel_s95","label"=>"HCP/B2B scraping","rate"=>"on-demand"],
    ["from"=>"paperclip","to"=>"wevia_brain","label"=>"Agent orchestration","rate"=>"7 agents"],';
    $rtc = str_replace('$flows=[', '$flows=[' . "\n" . $flow_block, $rtc, $flowHits);

    if ($serviceHits === 0 && $flowHits === 0) {
        $r[] = "monitor: markers not found, file left untouched";
    } elseif (file_put_contents($rt, $rtc) === false) {
        $r[] = "monitor: FAIL (could not write $rt)";
    } elseif ($serviceHits === 0 || $flowHits === 0) {
        $r[] = "monitor: partially added (services marker: $serviceHits, flows marker: $flowHits)";
    } else {
        $r[] = "monitor: scrapy+paperclip added";
    }
}
|
|
|
|
// Final report: the collected status lines as a JSON document.
// Declare the body as JSON so clients do not treat it as the default text/html;
// the headers_sent() guard avoids a warning if output has already started.
if (!headers_sent()) {
    header("Content-Type: application/json");
}
echo json_encode(["ok"=>true,"results"=>$r]);
|