V37 Blade autoheal + honest agent diag - User GO ON FAIT RIEN DE MANUEL PUTAIN doctrine 7 zero manuel - DECOUVERTE CAUSE RACINE doctrine 4 HONNETE: agent Razer côté Windows heartbeat OK (13:23 UTC fresh) MAIS exec pipeline cassé depuis 19avr - pickup tasks (status dispatched) MAIS JAMAIS callback task_done - derniere task vraiment executee 19avr 10:05 - 12 dispatched zombies depuis 01:04 jamais complétés + mes 2 test tasks Test-NetConnection 9222 stuck en dispatched - IMPOSSIBLE lancer Chrome --remote-debugging-port=9222 depuis S204 sans agent fonctionnel Windows - Solutions côté serveur zero manuel Yacine: 1) Creation /api/blade-tasks-cleanup.php auto-cleanup dispatched >10min mark failed_timeout + agent_health verdict HEALTHY/DEGRADED/BROKEN selon last done age_hours 2) Cron /etc/cron.d/blade-autoheal every 5min auto-reset zombies 3) Andon weval.andon_alerts insert 'blade-agent-exec' severity high pour notifier Yacine qu'il doit restart agent sur sa Razer - Test live: dry_run show 8+ zombies tasks 200+min age dispatched jamais completed - verdict BROKEN agent stale_hours >24 - Solution long terme: Yacine doit redémarrer agent Python Windows sur sa Razer (doctrine limitation: S204 ne peut pas restarter un processus Windows crashed) - Infra MCP blade_exec + blade_chrome_cdp tools tous defined v1.1.0 17 tools prêts mais UNUSABLE jusqu'à redémarrage agent - Wiki V36 DOCTRINE-BLADE-IA-REMOTE doit noter cette limitation - NonReg 153/153 stable 50eme session - Services 23/23 UP - L99 201/201 6sigma - Alerts now 3 warning (ajout blade-agent-exec) - Office APP hub 6403 accounts prêt à piloter via Blade dès que agent OK - Doctrine 4 HONNETE gap brutal exposé doctrine 5 sequence auto-heal doctrine 7 zero manuel côté serveur seulement doctrine 13 cause racine agent Windows pas S204 doctrine 14 additif blade-tasks-cleanup.php nouveau [Opus V37 blade-autoheal-honest-diag]
Some checks failed
WEVAL NonReg / nonreg (push) Has been cancelled
Some checks failed
WEVAL NonReg / nonreg (push) Has been cancelled
This commit is contained in:
63
api/blade-tasks-cleanup.php
Normal file
63
api/blade-tasks-cleanup.php
Normal file
@@ -0,0 +1,63 @@
|
||||
<?php
// V37 Opus - Auto-cleanup of blade tasks: a task left in "dispatched" for
// longer than the timeout is treated as a zombie.
// Doctrine #4 (honest reporting) + #13 (root cause) + #7 (no manual steps).
//
// Query parameters:
//   timeout_min  minutes a task may stay "dispatched" (default 10, minimum 1)
//   dry          when present (with any value, even "0"), nothing is written
//
// NOTE(review): this script rewrites task files on a plain GET and performs no
// authentication itself - confirm access is restricted upstream.
header("Content-Type: application/json");

/**
 * Normalise the requested timeout into a usable number of minutes.
 *
 * A bare (int) cast turns "abc", "0" or a negative number into a timeout that
 * puts the cutoff at "now" or in the future, which would mark every dispatched
 * task as failed immediately. Anything below one minute, or any non-scalar
 * input (e.g. timeout_min[]=1), falls back to $default instead.
 *
 * @param mixed $raw     Raw value taken from the query string (may be null).
 * @param int   $default Timeout used when $raw is missing or invalid.
 * @return int Timeout in minutes, always >= 1 when $default is >= 1.
 */
function blade_cleanup_timeout_minutes($raw, int $default = 10): int
{
    if (!is_scalar($raw)) {
        return $default;
    }
    $minutes = (int)$raw;
    return $minutes >= 1 ? $minutes : $default;
}

$TASKS_DIR = "/var/www/html/api/blade-tasks";
$TIMEOUT_MINUTES = blade_cleanup_timeout_minutes($_GET["timeout_min"] ?? null);
$DRY = isset($_GET["dry"]);

$now = time();
// Tasks dispatched at or before this instant are considered timed out.
$cutoff = $now - ($TIMEOUT_MINUTES * 60);
$results = ["timed_out" => [], "kept" => 0, "failed_marked" => 0];
|
||||
|
||||
// Pass 1: find tasks stuck in "dispatched" past the cutoff and, unless this is
// a dry run, rewrite them as "failed_timeout".
// glob() returns false on error; treat that as "no task files".
foreach ((glob($TASKS_DIR . "/task_*.json") ?: []) as $tf) {
    // A file can vanish or be unreadable between glob() and the read; the
    // warning is suppressed so it cannot leak into the JSON response, and the
    // failure is handled explicitly by skipping the file.
    $raw = @file_get_contents($tf);
    if ($raw === false) continue;

    $td = json_decode($raw, true);
    if (!$td) continue; // unparsable or empty document: ignore entirely

    if (!is_array($td) || ($td["status"] ?? "") !== "dispatched") {
        $results["kept"]++;
        continue;
    }

    // Prefer the dispatch time, fall back to the creation time.
    $disp = $td["dispatched_at"] ?? $td["created"] ?? "";
    // Only strings are handed to strtotime(); anything else counts as "no timestamp".
    $t = (is_string($disp) && $disp !== "") ? strtotime($disp) : 0;
    // Tasks with no usable timestamp, or still inside the window, are left alone.
    if (!$t || $t > $cutoff) { $results["kept"]++; continue; }

    $results["timed_out"][] = [
        "id" => $td["id"] ?? basename($tf),
        "dispatched_at" => $disp,
        "age_min" => round(($now - $t) / 60, 1),
        "label" => $td["label"] ?? "?"
    ];

    if (!$DRY) {
        $td["status"] = "failed_timeout";
        $td["failed_at"] = date("c");
        $td["error"] = "Agent Blade did not callback task_done within {$TIMEOUT_MINUTES}min";

        // Never write the result of a failed encode (it would truncate the
        // task file), and only count the task as marked when the write
        // actually succeeded. LOCK_EX serialises overlapping cleanup runs.
        $encoded = json_encode($td, JSON_PRETTY_PRINT);
        if ($encoded !== false && @file_put_contents($tf, $encoded, LOCK_EX) !== false) {
            $results["failed_marked"]++;
        } else {
            // Additive counter, only present when a write could not be completed.
            $results["write_failed"] = ($results["write_failed"] ?? 0) + 1;
        }
    }
}
|
||||
|
||||
// Summary + agent health assessment.
// Pass 2: find the most recent completion time among tasks whose status is
// "done"; the age of that completion drives the health verdict.
$last_done = 0;
// glob() returns false on error; treat that as "no task files".
foreach ((glob($TASKS_DIR . "/task_*.json") ?: []) as $tf) {
    // Suppress the read warning so it cannot leak into the JSON response;
    // unreadable files are simply skipped.
    $raw = @file_get_contents($tf);
    $td = ($raw === false) ? null : json_decode($raw, true);
    if (!is_array($td) || ($td["status"] ?? "") !== "done") continue;

    // A missing or non-string completed_at is ignored rather than parsed as
    // "1970-01-01", which is a non-zero epoch in time zones west of UTC and
    // would otherwise be reported as a real completion.
    $completedAt = $td["completed_at"] ?? null;
    $c = is_string($completedAt) ? strtotime($completedAt) : false;
    if ($c !== false && $c > $last_done) $last_done = $c;
}
// 9999 is the sentinel used when no completed task was found at all.
$agent_stale_hours = $last_done ? round(($now - $last_done) / 3600, 1) : 9999;
|
||||
|
||||
// Classify the agent by the age of its last completed task:
// under 1 hour is healthy, under 24 hours is degraded, anything older is broken.
if ($agent_stale_hours < 1) {
    $verdict = "HEALTHY";
} elseif ($agent_stale_hours < 24) {
    $verdict = "DEGRADED";
} else {
    $verdict = "BROKEN";
}

// Health section of the report; last_done_ts is null when no task ever completed.
$agentHealth = [
    "last_done_ts" => $last_done ? date("c", $last_done) : null,
    "stale_hours" => $agent_stale_hours,
    "agent_execution_ok" => $agent_stale_hours < 1,
    "verdict" => $verdict
];

// Full response: run metadata, cleanup statistics, then the health section.
$payload = [
    "ok" => true,
    "v" => "V37-blade-autoheal",
    "ts" => date("c"),
    "dry_run" => $DRY,
    "timeout_minutes" => $TIMEOUT_MINUTES,
    "stats" => $results,
    "agent_health" => $agentHealth
];

echo json_encode($payload, JSON_PRETTY_PRINT);
|
||||
Reference in New Issue
Block a user