'auth'])); }

// Requested API action; defaults to the read-only report.
$action = $_GET['action'] ?? 'report';

// JSON file used as the persistent benchmark store.
$DB = '/opt/wevads/vault/ai-benchmark.json';

// Empty store layout: seeds a missing file and backfills keys missing from a
// damaged or older store.
$DB_DEFAULTS = [
    'benchmarks' => [],
    'leaderboard' => [],
    'improvements' => [],
    'last_run' => null,
    'total_runs' => 0
];

if (!file_exists($DB)) {
    file_put_contents($DB, json_encode($DB_DEFAULTS, JSON_PRETTY_PRINT));
}

// A corrupt or unreadable store must not break every action: fall back to the
// default layout instead of operating on null.
$raw = file_get_contents($DB);
$db = is_string($raw) ? json_decode($raw, true) : null;
if (!is_array($db)) {
    $db = [];
}
$db += $DB_DEFAULTS;
foreach (['benchmarks', 'leaderboard'] as $listKey) {
    if (!is_array($db[$listKey])) {
        $db[$listKey] = [];
    }
}

// AI Configurations
$AIS = [
    'wevia_fast' => ['name'=>'WEVIA PUBLIC','endpoint'=>'/api/weval-ia','mode'=>'fast','type'=>'cloud','icon'=>'⚡'],
    'wevcode' => ['name'=>'WEVCODE','endpoint'=>'/api/weval-ia','mode'=>'code','type'=>'cloud','icon'=>'💻'],
    'manager' => ['name'=>'MANAGER','endpoint'=>'/api/weval-ia','mode'=>'deep','type'=>'cloud','icon'=>'🧠'],
    'ollama_qwen3' => ['name'=>'Ollama Qwen3:4b','model'=>'qwen3:4b','type'=>'sovereign','icon'=>'🏠'],
    'ollama_08b' => ['name'=>'Ollama Qwen3.5:0.8b','model'=>'qwen3.5:0.8b','type'=>'sovereign','icon'=>'🪶'],
    'ollama_mistral' => ['name'=>'Ollama Mistral','model'=>'mistral:latest','type'=>'sovereign','icon'=>'🇫🇷'],
    'opus' => ['name'=>'Claude Opus','type'=>'reference','icon'=>'👑'],
];

// Test Topics
$TOPICS = [
    'strategy' => ['prompt'=>'Propose une stratégie digitale pour une PME marocaine de 200 employés','criteria'=>['structure','actionable','maroc_context','length>500']],
    'code' => ['prompt'=>'Ecris en Python une classe CsvAnalyzer avec methodes load et describe','criteria'=>['has_class','has_def','has_docstring','runnable']],
    'pharma' => ['prompt'=>'Quelles sont les étapes de la pharmacovigilance pour un nouveau médicament','criteria'=>['pharmacovigilance','steps','regulatory','africa_context']],
    'security' => ['prompt'=>'Liste les 5 vulnérabilités web OWASP Top 10 avec remediations','criteria'=>['owasp','injection','xss','remediation']],
    'erp' => ['prompt'=>'Compare SAP vs Oracle ERP pour une entreprise de 500 employés','criteria'=>['sap','oracle','comparison','recommendation']],
];

/**
 * Key under which a sovereign (Ollama) model's results are stored in a run.
 *
 * ':' becomes '_' and '.' is dropped, e.g. 'qwen3.5:0.8b' => 'ollama_qwen35_08b'.
 * These keys differ from the $AIS configuration keys, so readers that need
 * display metadata must map them back through the 'model' field.
 *
 * @param string $model Ollama model tag
 * @return string Result key
 */
function benchmark_ollama_key($model) {
    return 'ollama_' . str_replace([':', '.'], ['_', ''], $model);
}

/**
 * Heuristically score an AI response against a topic's criteria.
 *
 * Adds 10 points per satisfied criterion, then length, speed and formatting
 * bonuses, capped at 100. An empty response scores 0 so that an unavailable
 * AI is never rewarded for failing quickly.
 *
 * @param string $resp     Raw response text
 * @param array  $criteria Criterion names, or 'length>N' thresholds
 * @param int    $latency  Round-trip latency in milliseconds
 * @return array ['score' => int, 'details' => string[], 'length' => int, 'latency' => int]
 */
function score_response($resp, $criteria, $latency) {
    // Criterion => lowercase substrings; any single hit satisfies the criterion.
    // Static so the table is built once rather than on every criterion.
    static $keywords = [
        'structure' => ['###','**','1.','2.','3.'],
        'actionable' => ['étape','action','recommand','implément','déploy'],
        'maroc_context' => ['maroc','marocain','casablanca','rabat','pme'],
        'has_class' => ['class '],
        'has_def' => ['def '],
        'has_docstring' => ['"""','\'\'\''],
        'runnable' => ['import ','return '],
        'pharmacovigilance' => ['pharmacovigilance','effet indésirable','signal'],
        'steps' => ['étape','phase','1)','1.','première'],
        'regulatory' => ['amm','autorisation','réglementaire','anpp','ansm'],
        'africa_context' => ['algérie','maroc','tunisie','afrique','maghreb'],
        'owasp' => ['owasp','top 10'],
        'injection' => ['injection','sql injection'],
        'xss' => ['xss','cross-site','script'],
        'remediation' => ['remédiation','correction','protéger','prévenir','mitigation'],
        'sap' => ['sap','s/4hana','s4hana'],
        'oracle' => ['oracle','erp cloud','jd edwards'],
        'comparison' => ['avantage','inconvénient','vs','comparaison','différence'],
        'recommendation' => ['recommand','conseil','préférable','optimal'],
    ];

    // Byte length, as reported in the 'length' field and used by 'length>N'.
    $len = strlen($resp);

    if (trim($resp) === '') {
        return ['score' => 0, 'details' => ['empty'], 'length' => $len, 'latency' => $latency];
    }

    $score = 0;
    $details = [];
    // Multibyte-aware lowercasing so accented capitals ('Étape') match the
    // lowercase keywords; byte-wise strtolower/stripos only fold ASCII.
    $lower = mb_strtolower($resp, 'UTF-8');

    // Content quality (0-40)
    foreach ($criteria as $c) {
        if (strpos($c, '>') !== false) {
            list($key, $val) = explode('>', $c);
            if ($key === 'length' && $len > intval($val)) {
                $score += 10;
                $details[] = "$c:OK";
            }
            continue;
        }
        foreach ($keywords[$c] ?? [] as $kw) {
            if (strpos($lower, $kw) !== false) {
                $score += 10;
                $details[] = "$c:OK";
                break;
            }
        }
    }

    // Length bonus (0-15)
    if ($len > 3000) {
        $score += 15;
    } elseif ($len > 1500) {
        $score += 10;
    } elseif ($len > 500) {
        $score += 5;
    }

    // Speed bonus (0-15); a non-positive latency means "not measured" and
    // earns nothing.
    if ($latency > 0) {
        if ($latency < 1000) {
            $score += 15;
        } elseif ($latency < 2000) {
            $score += 10;
        } elseif ($latency < 4000) {
            $score += 5;
        }
    }

    // Formatting bonus (0-10)
    if (preg_match('/```/', $resp)) $score += 3;   // Code blocks
    if (preg_match('/\*\*/', $resp)) $score += 2;  // Bold
    if (preg_match('/###/', $resp)) $score += 2;   // Headers
    if (preg_match('/\d+\./', $resp)) $score += 3; // Numbered lists

    return ['score' => min($score, 100), 'details' => $details, 'length' => $len, 'latency' => $latency];
}

/**
 * Send a prompt to the local WEVIA endpoint in the given mode.
 *
 * Best effort: a transport failure yields an empty response rather than an error.
 *
 * @param string $prompt Prompt text
 * @param string $mode   Mode sent to the endpoint (the benchmark uses 'fast', 'code', 'deep')
 * @return array ['response' => string, 'provider' => string, 'latency' => int]
 */
function call_wevia($prompt, $mode) {
    $t0 = microtime(true);
    $ctx = stream_context_create(['http' => [
        'method' => 'POST',
        'header' => 'Content-Type: application/json',
        'content' => json_encode(['message' => $prompt, 'mode' => $mode]),
        'timeout' => 15
    ]]);
    // Warnings are suppressed on purpose: failure is reported via the empty response.
    $r = @file_get_contents('http://127.0.0.1/api/weval-ia', false, $ctx);
    $lat = intval((microtime(true) - $t0) * 1000);
    if (!$r) {
        return ['response' => '', 'provider' => '?', 'latency' => $lat];
    }
    $d = json_decode($r, true);
    if (!is_array($d)) {
        $d = [];
    }
    // Prefer the latency reported by the endpoint, else the measured round trip.
    return [
        'response' => $d['response'] ?? '',
        'provider' => $d['provider'] ?? '?',
        'latency' => $d['latency_ms'] ?? $lat
    ];
}

/**
 * Send a prompt to the local Ollama generate API for the given model.
 *
 * Best effort: a transport failure yields an empty response and 0 tokens/s.
 *
 * @param string $prompt Prompt text
 * @param string $model  Ollama model tag
 * @return array ['response' => string, 'provider' => string, 'latency' => int, 'tps' => float]
 */
function call_ollama($prompt, $model) {
    $t0 = microtime(true);
    $ctx = stream_context_create(['http' => [
        'method' => 'POST',
        'header' => 'Content-Type: application/json',
        'content' => json_encode([
            'model' => $model,
            'prompt' => $prompt,
            'stream' => false,
            'options' => ['num_predict' => 200, 'num_ctx' => 512]
        ]),
        'timeout' => 30
    ]]);
    // Warnings are suppressed on purpose: failure is reported via the empty response.
    $r = @file_get_contents('http://127.0.0.1:11434/api/generate', false, $ctx);
    $lat = intval((microtime(true) - $t0) * 1000);
    if (!$r) {
        return ['response' => '', 'provider' => $model, 'latency' => $lat, 'tps' => 0];
    }
    $d = json_decode($r, true);
    if (!is_array($d)) {
        $d = [];
    }
    // eval_duration is divided by 1e9 to get seconds; the 0.01 floor avoids
    // division by (near) zero.
    $tps = ($d['eval_count'] ?? 0) / max(($d['eval_duration'] ?? 1) / 1e9, 0.01);
    return ['response' => $d['response'] ?? '', 'provider' => $model, 'latency' => $lat, 'tps' => round($tps, 1)];
}

// NOTE(review): there is no default branch, so an unknown action produces no
// output here — confirm whether that is intended.
switch ($action) {
    case 'benchmark':
        $topic = $_GET['topic'] ?? 'strategy';
        if (!isset($TOPICS[$topic])) {
            echo json_encode(['error' => 'invalid topic']);
            break;
        }
        $t = $TOPICS[$topic];
        $results = [];

        // Test cloud AIs
        $cloudModes = ['fast' => 'wevia_fast', 'code' => 'wevcode', 'deep' => 'manager'];
        foreach ($cloudModes as $mode => $ai_name) {
            $r = call_wevia($t['prompt'], $mode);
            $s = score_response($r['response'], $t['criteria'], $r['latency']);
            $results[$ai_name] = array_merge($s, [
                'provider' => $r['provider'],
                'response_preview' => mb_substr($r['response'], 0, 200)
            ]);
            usleep(500000); // pause between consecutive cloud calls
        }

        // Test sovereign AIs (with timeout protection)
        foreach (['qwen3.5:0.8b'] as $model) {
            $key = benchmark_ollama_key($model);
            $r = call_ollama($t['prompt'], $model);
            if (!empty($r['response'])) {
                $s = score_response($r['response'], $t['criteria'], $r['latency']);
                $results[$key] = array_merge($s, [
                    'provider' => $model,
                    'tps' => $r['tps'],
                    'response_preview' => mb_substr($r['response'], 0, 200)
                ]);
            } else {
                $results[$key] = [
                    'score' => 0,
                    'details' => ['timeout'],
                    'length' => 0,
                    'latency' => $r['latency'],
                    'provider' => $model
                ];
            }
        }

        // Save
        $run = ['topic' => $topic, 'timestamp' => date('c'), 'results' => $results];
        $db['benchmarks'][] = $run;
        $db['last_run'] = date('c');
        $db['total_runs']++;

        // Update leaderboard.
        // NOTE(review): this stores the scores of this run only; the cumulative
        // ranking is computed by action=leaderboard. Confirm that is intended.
        $lb = [];
        foreach ($results as $ai => $r) {
            $lb[$ai] = $r['score'] ?? 0;
        }
        arsort($lb);
        $db['leaderboard'] = $lb;

        if (file_put_contents($DB, json_encode($db, JSON_PRETTY_PRINT)) === false) {
            error_log('ai-benchmark: failed to write ' . $DB);
        }
        echo json_encode(['ok' => true, 'topic' => $topic, 'results' => $results, 'leaderboard' => $lb]);
        break;

    case 'report':
        $last = end($db['benchmarks']) ?: null;
        echo json_encode([
            'ok' => true,
            'total_runs' => $db['total_runs'],
            'last_run' => $db['last_run'],
            'last_benchmark' => $last,
            'leaderboard' => $db['leaderboard'],
            'topics' => array_keys($TOPICS),
            'ais' => array_map(fn($a) => $a['name'], $AIS)
        ]);
        break;

    case 'history':
        // Clamp the limit: array_slice($a, -0) returns the whole array and a
        // negative limit becomes a positive offset that skips the oldest runs.
        $limit = max(0, intval($_GET['limit'] ?? 10));
        $benchmarks = $limit > 0 ? array_slice($db['benchmarks'], -$limit) : [];
        echo json_encode(['ok' => true, 'benchmarks' => $benchmarks, 'total' => count($db['benchmarks'])]);
        break;

    case 'leaderboard':
        // Compute cumulative scores from all benchmarks
        $scores = [];
        $counts = [];
        foreach ($db['benchmarks'] as $b) {
            foreach (($b['results'] ?? []) as $ai => $r) {
                $scores[$ai] = ($scores[$ai] ?? 0) + ($r['score'] ?? 0);
                $counts[$ai] = ($counts[$ai] ?? 0) + 1;
            }
        }

        // Sovereign results are stored under model-derived keys that differ
        // from the $AIS keys; map them back so name/icon/type resolve.
        $aliases = [];
        foreach ($AIS as $cfgKey => $cfg) {
            if (isset($cfg['model'])) {
                $aliases[benchmark_ollama_key($cfg['model'])] = $cfgKey;
            }
        }

        $lb = [];
        foreach ($scores as $ai => $total) {
            $meta = $AIS[$ai] ?? $AIS[$aliases[$ai] ?? ''] ?? [];
            $lb[] = [
                'ai' => $ai,
                'name' => $meta['name'] ?? $ai,
                'icon' => $meta['icon'] ?? '?',
                'type' => $meta['type'] ?? '?',
                'total_score' => $total,
                'avg_score' => $counts[$ai] > 0 ? round($total / $counts[$ai], 1) : 0,
                'runs' => $counts[$ai]
            ];
        }
        // Highest cumulative score first.
        usort($lb, fn($a, $b) => $b['total_score'] <=> $a['total_score']);
        echo json_encode(['ok' => true, 'leaderboard' => $lb, 'total_benchmarks' => count($db['benchmarks'])]);
        break;

    case 'improve':
        // Analyze weaknesses and suggest improvements
        $weaknesses = [];
        $strengths = [];
        foreach ($db['benchmarks'] as $b) {
            $runTopic = $b['topic'] ?? null;
            foreach (($b['results'] ?? []) as $ai => $r) {
                // Normalise once so a result without a score counts as 0 everywhere.
                $runScore = $r['score'] ?? 0;
                if ($runScore < 40) {
                    $weaknesses[$ai][] = ['topic' => $runTopic, 'score' => $runScore, 'details' => $r['details'] ?? []];
                }
                if ($runScore >= 70) {
                    $strengths[$ai][] = ['topic' => $runTopic, 'score' => $runScore];
                }
            }
        }

        $suggestions = [];
        foreach ($weaknesses as $ai => $issues) {
            foreach ($issues as $issue) {
                if ($issue['score'] == 0) {
                    $advice = 'AI timeout/unavailable - check connectivity';
                } elseif ($issue['score'] < 20) {
                    $advice = 'Response too short or intercepted by ToolFK - adjust routing';
                } else {
                    $advice = 'Improve prompt engineering for this topic';
                }
                $suggestions[] = [
                    'ai' => $ai,
                    'topic' => $issue['topic'],
                    'current_score' => $issue['score'],
                    'suggestion' => $advice
                ];
            }
        }
        echo json_encode(['ok' => true, 'weaknesses' => $weaknesses, 'strengths' => $strengths, 'suggestions' => $suggestions]);
        break;
}