142 lines
4.5 KiB
Python
Executable File
142 lines
4.5 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Blade self-heal automation — doctrine 64
|
|
Monitors Blade heartbeat + queue + auto-wakes if needed
|
|
Runs every 5 min via cron
|
|
"""
|
|
import json, sys, time, datetime, urllib.request, urllib.parse, subprocess
|
|
from pathlib import Path
|
|
|
|
# Where log() appends its timestamped lines.
LOG = "/var/log/weval/blade-selfheal.log"
# Local status endpoint queried by fetch_status().
STATUS_URL = "http://127.0.0.1/api/blade-status.php?k=BLADE2026"
# Placeholder value disables Wake-on-LAN in attempt_wol(); update if known.
BLADE_MAC = "XX:XX:XX:XX:XX:XX"
# JSON file holding the state read/written by load_state()/save_state().
STATE_FILE = "/var/lib/weval/blade-selfheal-state.json"

# Create the parent directories up front, but never let a filesystem problem
# (e.g. running unprivileged) abort the script before the health check runs:
# log() and save_state() already treat their files as best-effort.
for _target in (LOG, STATE_FILE):
    try:
        Path(_target).parent.mkdir(parents=True, exist_ok=True)
    except OSError as _exc:
        print(f"WARN cannot create {Path(_target).parent}: {_exc}", file=sys.stderr)
|
|
|
|
def log(msg):
    """Print a timestamped message and append it to the log file.

    Args:
        msg: Text to record. It is prefixed with the current local time in
            ISO-8601 form and terminated with a newline.

    The line is always printed to stdout. Appending to ``LOG`` is
    best-effort: a filesystem error is ignored so that a missing or
    read-only log location never stops the health check.
    """
    line = f"[{datetime.datetime.now().isoformat()}] {msg}\n"
    print(line, end="")
    try:
        # Explicit encoding so exception text with non-ASCII characters is
        # written the same way regardless of the locale cron runs under.
        with open(LOG, "a", encoding="utf-8") as f:
            f.write(line)
    except OSError:
        # Only I/O failures are tolerated; KeyboardInterrupt/SystemExit and
        # programming errors are no longer swallowed.
        pass
|
|
|
|
def load_state():
    """Load the persisted self-heal state.

    Returns:
        dict: The JSON object stored in ``STATE_FILE``. When the file is
        missing, unreadable, not valid JSON, or does not contain a JSON
        object, a fresh default state
        ``{"last_offline": None, "wake_attempts": 0}`` is returned instead.
    """
    default = {"last_offline": None, "wake_attempts": 0}
    try:
        # Context manager closes the handle even if parsing fails.
        with open(STATE_FILE, encoding="utf-8") as f:
            state = json.load(f)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return default
    # Callers index the state by key; anything but an object is unusable.
    if not isinstance(state, dict):
        return default
    return state
|
|
|
|
def save_state(s):
    """Persist the self-heal state to ``STATE_FILE`` (best-effort).

    Args:
        s: JSON-serializable state object.

    Failures are reported through ``log()`` instead of being raised, so a
    state-file problem never aborts the health check.
    """
    # Serialize before opening the file: opening with "w" truncates it, so a
    # serialization error must not be allowed to wipe the previous state.
    try:
        payload = json.dumps(s)
    except (TypeError, ValueError) as exc:
        log(f"STATE_SAVE_ERR {exc}")
        return
    try:
        with open(STATE_FILE, "w", encoding="utf-8") as f:
            f.write(payload)
    except OSError as exc:
        log(f"STATE_SAVE_ERR {exc}")
|
|
|
|
def fetch_status():
    """Fetch and decode the status document from ``STATUS_URL``.

    Returns:
        dict | None: The decoded JSON object, or ``None`` when the request
        fails, the body is not valid JSON, or the JSON value is not an
        object. Every failure is logged with a ``FETCH_ERR`` prefix.
    """
    try:
        # The context manager releases the connection even on decode errors.
        with urllib.request.urlopen(STATUS_URL, timeout=5) as resp:
            payload = json.loads(resp.read().decode())
    except Exception as e:
        # Boundary with the network: report any failure and signal "no data".
        log(f"FETCH_ERR {e}")
        return None
    # The caller uses .get() on the result, so reject lists/strings/numbers.
    if not isinstance(payload, dict):
        log(f"FETCH_ERR unexpected payload type {type(payload).__name__}")
        return None
    return payload
|
|
|
|
def push_heal_task():
    """Queue a high-priority PowerShell self-heal task via the local API.

    The queued script starts the sentinel agent when no running PowerShell
    process has 'sentinel-agent' in its command line, then moves task JSON
    files older than three days into the ``archived`` sub-folder.

    Returns:
        bool: ``True`` when the POST succeeded and the reply was decoded as
        JSON, ``False`` on any error (logged with a ``PUSH_HEAL_ERR`` prefix).
    """
    # The process lookup uses Get-CimInstance Win32_Process rather than
    # Get-Process: Process objects returned by Get-Process in Windows
    # PowerShell 5.1 have no CommandLine property, so the original filter
    # never matched and a new agent was started on every heal.
    # NOTE(review): Move-Item assumes the archived folder already exists;
    # errors are silenced by -ErrorAction SilentlyContinue — confirm.
    ps = """
# Blade self-heal
Write-Host "Self-heal triggered $(Get-Date)"
$agentProc = Get-CimInstance Win32_Process -Filter "Name = 'powershell.exe'" | Where-Object { $_.CommandLine -match 'sentinel-agent' }
if (!$agentProc) {
    Write-Host "Agent not running, starting..."
    Start-Process powershell -ArgumentList "-ExecutionPolicy","Bypass","-File","C:\\ProgramData\\WEVAL\\sentinel-agent.ps1" -WindowStyle Hidden
}
# Clear stale tasks > 3 days locally
$cutoff = (Get-Date).AddDays(-3)
Get-ChildItem "C:\\ProgramData\\WEVAL\\tasks\\*.json" -ErrorAction SilentlyContinue | Where-Object { $_.LastWriteTime -lt $cutoff } | Move-Item -Destination "C:\\ProgramData\\WEVAL\\tasks\\archived\\" -Force -ErrorAction SilentlyContinue
Write-Host "Self-heal complete"
"""
    try:
        data = urllib.parse.urlencode({
            "k": "BLADE2026",
            "name": f"Blade self-heal {datetime.datetime.now().strftime('%H:%M')}",
            "cmd": ps,
            "type": "powershell",
            "priority": "high",
        }).encode()
        req = urllib.request.Request(
            "http://127.0.0.1/api/blade-task-queue.php?k=BLADE2026&action=add",
            data=data,
            method="POST",
        )
        # Context manager closes the response even if decoding fails.
        with urllib.request.urlopen(req, timeout=10) as resp:
            d = json.loads(resp.read().decode())
        # A non-object reply still means the request went through; only the
        # task id is unknown.
        task_id = d.get("task_id", "?") if isinstance(d, dict) else "?"
        log(f"HEAL_TASK_QUEUED {task_id}")
        return True
    except Exception as e:
        # Boundary with the network: report and let the caller carry on.
        log(f"PUSH_HEAL_ERR {e}")
        return False
|
|
|
|
def attempt_wol():
    """Send a Wake-on-LAN packet to the Blade using the ``wakeonlan`` tool.

    Returns:
        bool: ``True`` only when ``wakeonlan`` ran and exited with status 0.
        ``False`` when ``BLADE_MAC`` is still the placeholder, the tool is
        missing or times out, or it exits with a non-zero status.
    """
    if BLADE_MAC == "XX:XX:XX:XX:XX:XX":
        log("WOL_SKIP no MAC configured")
        return False
    try:
        r = subprocess.run(
            ["wakeonlan", BLADE_MAC],
            capture_output=True,
            text=True,
            timeout=5,
        )
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # OSError: tool not installed / not executable;
        # SubprocessError: timeout; ValueError: undecodable output.
        log(f"WOL_ERR {e}")
        return False
    # subprocess.run does not raise on a non-zero exit unless check=True, so
    # the status must be inspected before claiming the packet was sent.
    if r.returncode != 0:
        log(f"WOL_ERR exit={r.returncode} {r.stderr.strip()}")
        return False
    log(f"WOL_SENT {r.stdout.strip()}")
    return True
|
|
|
|
# Heartbeat older than this many seconds counts as stale.
_STALE_AFTER_SEC = 600
# More pending tasks than this counts as a backlog.
_MAX_PENDING = 50
# Age reported when the heartbeat timestamp is missing or unusable.
_UNKNOWN_AGE_SEC = 999999


def _as_int(value, default=0):
    """Return *value* converted to int, or *default* if it cannot be."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return default


def _heartbeat_age_seconds(hb_ts):
    """Return seconds elapsed since the ISO-8601 timestamp *hb_ts*.

    Returns ``_UNKNOWN_AGE_SEC`` when the value is not a string, cannot be
    parsed, or carries no UTC offset (it cannot be compared with the
    timezone-aware current time).
    """
    try:
        # A trailing "Z" is rewritten to an explicit offset before parsing.
        hb_time = datetime.datetime.fromisoformat(hb_ts.replace("Z", "+00:00"))
        now = datetime.datetime.now(datetime.timezone.utc)
        return (now - hb_time).total_seconds()
    except (AttributeError, TypeError, ValueError):
        return _UNKNOWN_AGE_SEC


def main():
    """Run one health check of the Blade and trigger self-heal if needed.

    A heal task is queued when the Blade reports offline, its heartbeat is
    older than 600 seconds, or more than 50 tasks are pending. Wake-on-LAN
    is additionally attempted when the heartbeat is stale.

    Returns:
        int: ``1`` when no usable status could be obtained, otherwise ``0``.
    """
    state = load_state()
    if not isinstance(state, dict):
        state = {}
    status = fetch_status()

    blade = status.get("blade") if isinstance(status, dict) else None
    if not blade or not isinstance(blade, dict):
        log("STATUS_UNAVAILABLE")
        return 1

    online = blade.get("online", False)
    # JSON null (or any non-object) for these fields must not crash the run.
    stats = blade.get("stats")
    if not isinstance(stats, dict):
        stats = {}
    pending = _as_int(stats.get("pending", 0))
    heartbeat = blade.get("heartbeat")
    hb_ts = heartbeat.get("ts", "") if isinstance(heartbeat, dict) else ""

    age_sec = _heartbeat_age_seconds(hb_ts)

    log(f"CHECK online={online} pending={pending} age_sec={int(age_sec)}")

    # Decision logic: collect every reason that calls for a heal.
    reasons = []
    if not online or age_sec > _STALE_AFTER_SEC:
        reasons.append(f"offline_or_stale (age={int(age_sec)}s)")
    if pending > _MAX_PENDING:
        reasons.append(f"queue_backlog ({pending} tasks)")

    if reasons:
        log(f"HEAL_NEEDED reasons={reasons}")
        if push_heal_task():
            state["wake_attempts"] = _as_int(state.get("wake_attempts", 0)) + 1
            save_state(state)
        # WoL if really offline
        if age_sec > _STALE_AFTER_SEC:
            attempt_wol()
    else:
        log("BLADE_OK no action needed")
        state["wake_attempts"] = 0
        save_state(state)

    return 0
|
|
|
|
# Script entry point: run one check and hand its result to the shell as the
# process exit status.
if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)
|