Files
html/api/md.php
2026-04-12 22:57:03 +02:00

203 lines
7.4 KiB
PHP

<?php
// Deployment bootstrap: installs and starts the Paperclip orchestrator and
// the Scrapy stack, then reports each step as JSON. Gated by a shared key.
//
// SECURITY NOTE(review): a static secret in a GET parameter ends up in
// access logs and proxy caches; this should move to a POST body/header
// and be rotated. Flagged, not silently changed, since callers pass ?k=.
//
// hash_equals() is a constant-time comparison — the original `!==` leaks
// timing information about the secret.
if (!hash_equals('WEVADS2026', (string)($_GET['k'] ?? ''))) {
    die('auth');
}
// The script's only output is json_encode() at the end; say so up front.
header('Content-Type: application/json');
// Keep going even if the HTTP client disconnects — installs are slow.
ignore_user_abort(true);
set_time_limit(300);
$r = []; // human-readable status lines, emitted as JSON at the end
// 1. PAPERCLIP: install JS dependencies and start the server on :3150.
if (is_dir('/opt/pclip')) {
    // Enable corepack so the bundled pnpm shim becomes available.
    exec('corepack enable 2>&1');
    $pnpm = trim((string)shell_exec('which pnpm 2>/dev/null')) ?: '/usr/local/bin/pnpm';
    if (!$pnpm || !file_exists($pnpm)) {
        exec('npm install -g pnpm 2>&1');
        $pnpm = '/usr/lib/node_modules/.bin/pnpm';
        if (!file_exists($pnpm)) {
            $pnpm = trim((string)shell_exec('find /usr -name pnpm -type f 2>/dev/null | head -1'));
        }
    }
    // Install dependencies: pnpm preferred, npm as fallback.
    if (is_dir('/opt/pclip/node_modules')) {
        $r[] = 'paperclip: already installed';
    } else {
        $installOut = []; // fresh array each time: exec() APPENDS to an existing one
        if ($pnpm && file_exists($pnpm)) {
            exec("cd /opt/pclip && $pnpm install 2>&1", $installOut);
            $r[] = 'paperclip: pnpm install ' . (is_dir('/opt/pclip/node_modules') ? 'OK' : 'FAIL');
        } else {
            exec('cd /opt/pclip && npm install 2>&1', $installOut);
            $r[] = 'paperclip: npm install ' . (is_dir('/opt/pclip/node_modules') ? 'OK' : 'FAIL');
        }
    }
    // Probe the port. Compare against false explicitly: a successful but
    // EMPTY response body is falsy, and the original truthiness check
    // would then spawn a duplicate server on the same port.
    $running = @file_get_contents('http://127.0.0.1:3150/') !== false;
    if (!$running) {
        exec('cd /opt/pclip && PORT=3150 nohup node dist/server.js > /tmp/paperclip.log 2>&1 &');
        sleep(3);
        $running = @file_get_contents('http://127.0.0.1:3150/') !== false;
        if (!$running) {
            // Compiled server missing or broken — fall back to dev mode via tsx.
            exec('cd /opt/pclip && PORT=3150 nohup npx tsx src/server.ts > /tmp/paperclip.log 2>&1 &');
            sleep(5);
            $running = @file_get_contents('http://127.0.0.1:3150/') !== false;
        }
        $r[] = 'paperclip: start ' . ($running ? 'OK on :3150' : 'pending (check /tmp/paperclip.log)');
    } else {
        $r[] = 'paperclip: already running on :3150';
    }
} else {
    $r[] = 'paperclip: /opt/pclip not found';
}
// 2. SCRAPY: make sure the Python scraping framework is installed.
$scrapy = trim((string)shell_exec('which scrapy 2>/dev/null'));
if (!$scrapy) {
    // Fresh output array: the original reused $out from the npm/pnpm calls
    // above, and exec() appends to an existing array instead of replacing it.
    $pipOut = [];
    exec('pip3 install scrapy --break-system-packages 2>&1', $pipOut);
    $scrapy = trim((string)shell_exec('which scrapy 2>/dev/null'));
    $r[] = 'scrapy: install ' . ($scrapy ? "OK ($scrapy)" : 'FAIL');
} else {
    $r[] = "scrapy: already installed ($scrapy)";
}
// Record the version string (or the error text if scrapy is still absent).
$ver = trim((string)shell_exec('scrapy version 2>&1'));
$r[] = "scrapy version: $ver";
// 3. Create the WEVAL Scrapy project skeleton — idempotent: a second run
// leaves an existing project untouched.
$project_dir = '/opt/weval-scrapy';
if (is_dir($project_dir)) {
    $r[] = "scrapy project: already exists";
} else {
    exec("cd /opt && scrapy startproject weval_scrapy weval-scrapy 2>&1");
    $created = is_dir($project_dir);
    $r[] = "scrapy project: " . ($created ? "created at $project_dir" : "FAIL");
}
// 4. Write the spider definitions into the Scrapy project.
// (The original also assigned a stray, never-used `$spider` string here —
// dead code, removed.)
$spider_dir = "$project_dir/weval_scrapy/spiders";
if (is_dir($spider_dir)) {
    // HCP verification spider: loads its target list from the Ethica API,
    // then visits each target's public URL. Polite defaults: robots.txt
    // obeyed, 2s delay, identifying User-Agent.
    //
    // Written as a nowdoc so the Python needs no PHP escaping. The body is
    // properly indented — the original emitted it flush-left, which is an
    // IndentationError in the generated .py file.
    $hcp_py = <<<'PYTHON'
import scrapy
import json


class HCPSpider(scrapy.Spider):
    name = 'hcp_spider'
    custom_settings = {
        'CONCURRENT_REQUESTS': 4,
        'DOWNLOAD_DELAY': 2,
        'ROBOTSTXT_OBEY': True,
        'USER_AGENT': 'WEVAL-HCP-Bot/1.0 (+https://weval-consulting.com)',
    }

    def start_requests(self):
        # Load HCP targets from Ethica DB via API
        api = 'http://10.1.0.3:5890/api/sentinel-brain.php'
        yield scrapy.Request(f'{api}?action=hcp_targets', callback=self.parse_targets)

    def parse_targets(self, response):
        data = json.loads(response.text)
        for hcp in data.get('targets', []):
            if hcp.get('google_url'):
                yield scrapy.Request(hcp['google_url'], callback=self.parse_hcp, meta={'hcp': hcp})

    def parse_hcp(self, response):
        hcp = response.meta['hcp']
        yield {
            'name': hcp.get('name'),
            'specialty': hcp.get('specialty'),
            'city': hcp.get('city'),
            'country': hcp.get('country'),
            'verified': True,
            'source_url': response.url,
            'title': response.css('title::text').get(),
        }
PYTHON;
    // Report FAIL instead of silently claiming success when the write fails.
    $r[] = "hcp_spider: " . (file_put_contents("$spider_dir/hcp_spider.py", $hcp_py) !== false ? "created" : "FAIL");

    // B2B lead spider. NOTE(review): with ROBOTSTXT_OBEY enabled, LinkedIn's
    // robots.txt will normally block this crawl — confirm this is intended.
    $b2b_py = <<<'PYTHON'
import scrapy


class B2BLeadSpider(scrapy.Spider):
    name = 'b2b_leads'
    custom_settings = {
        'CONCURRENT_REQUESTS': 2,
        'DOWNLOAD_DELAY': 3,
        'ROBOTSTXT_OBEY': True,
    }
    start_urls = [
        'https://www.linkedin.com/company/weval-consulting/',
    ]

    def parse(self, response):
        yield {
            'company': response.css('h1::text').get(),
            'url': response.url,
            'employees': response.css('.employees::text').get(),
        }
PYTHON;
    $r[] = "b2b_spider: " . (file_put_contents("$spider_dir/b2b_spider.py", $b2b_py) !== false ? "created" : "FAIL");
}
// 5. Deploy a small JSON control endpoint for the spiders
// (actions: status / run / results). Nowdoc: no escaping of inner quotes.
$api_code = <<<'PHPAPI'
<?php
header("Content-Type: application/json");
// Constant-time key comparison (a plain !== leaks timing about the secret).
if (!hash_equals("WEVADS2026", (string)($_GET["k"] ?? ""))) die(json_encode(["error"=>"auth"]));
$action = $_GET["action"] ?? "status";
$project = "/opt/weval-scrapy";
switch ($action) {
    case "status":
        // glob() returns false on error; coalesce so array_filter never sees false.
        $spiders = glob("$project/weval_scrapy/spiders/*.py") ?: [];
        $spiders = array_map(fn($f) => basename($f, ".py"),
            array_filter($spiders, fn($f) => basename($f) !== "__init__.py"));
        echo json_encode(["ok"=>true,"project"=>$project,"spiders"=>$spiders,"scrapy"=>trim(shell_exec("scrapy version 2>&1"))]);
        break;
    case "run":
        // Whitelist [a-z0-9_] so the name is safe to splice into the shell command.
        $spider = preg_replace("/[^a-z0-9_]/", "", $_GET["spider"] ?? "");
        if ($spider) {
            $out = shell_exec("cd $project && scrapy crawl $spider -o /tmp/scrapy-$spider.json 2>&1 | tail -5");
            echo json_encode(["ok"=>true,"spider"=>$spider,"output"=>$out]);
        }
        break;
    case "results":
        $spider = preg_replace("/[^a-z0-9_]/", "", $_GET["spider"] ?? "");
        $file = "/tmp/scrapy-$spider.json";
        if (file_exists($file)) {
            echo file_get_contents($file);
        } else {
            echo json_encode(["error"=>"no results"]);
        }
        break;
}
PHPAPI;
// Only claim success when the write actually succeeded.
$r[] = file_put_contents("/var/www/html/api/scrapy-api.php", $api_code) !== false
    ? "scrapy-api: deployed at /api/scrapy-api.php"
    : "scrapy-api: deploy FAIL";
// 6. Register both services in the realtime monitor. Idempotent: skipped
// when a "scrapy" entry is already present in the monitor source.
$rt = '/var/www/html/api/realtime-status.php';
// Guard the read: the original fed a potential `false` straight into
// strpos(), which is a TypeError (fatal) on PHP 8 when the file is missing.
$rtc = is_file($rt) ? file_get_contents($rt) : false;
if ($rtc !== false && strpos($rtc, 'scrapy') === false) {
    // Service entries, spliced in just above the "// Data flows" marker.
    $scrapy_block = '
// Scrapy
$scrapy_ver=trim(shell_exec("scrapy version 2>/dev/null"));
$services["scrapy"]=["name"=>"Scrapy Engine","status"=>$scrapy_ver?"ACTIVE":"DOWN","ms"=>0,
"metrics"=>["version"=>$scrapy_ver,"project"=>"/opt/weval-scrapy","spiders"=>2],
"output_unit"=>"scrapes","type"=>"Scraping","last"=>date("H:i:s")];
// Paperclip
$pc_check=@file_get_contents("http://127.0.0.1:3150/");
$services["paperclip"]=["name"=>"Paperclip Orchestrator","status"=>$pc_check?"WORKING":"DOWN","ms"=>0,
"metrics"=>["port"=>3150,"agents"=>7,"companies"=>1],
"output_unit"=>"tasks","type"=>"Orchestration","last"=>date("H:i:s")];';
    $rtc = str_replace('// Data flows', $scrapy_block . "\n\n // Data flows", $rtc);
    // Matching data-flow edges, inserted at the head of the $flows array.
    $flow_block = ' ["from"=>"scrapy","to"=>"sentinel_s95","label"=>"HCP/B2B scraping","rate"=>"on-demand"],
["from"=>"paperclip","to"=>"wevia_brain","label"=>"Agent orchestration","rate"=>"7 agents"],';
    $rtc = str_replace('$flows=[', '$flows=[' . "\n" . $flow_block, $rtc);
    file_put_contents($rt, $rtc);
    $r[] = "monitor: scrapy+paperclip added";
}
// Final status report for the caller.
echo json_encode(["ok"=>true,"results"=>$r]);