Files
html/api/ai-benchmark.php
2026-04-16 02:28:32 +02:00

262 lines
11 KiB
PHP

<?php
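/**
* WEVAL AI Benchmark — Compare & Improve (continuous AI improvement)
* Every request must carry the shared key in the `k` parameter (GET or POST).
* ?action=benchmark — Run a comparative benchmark on one topic (optional &topic=..., default "strategy")
* ?action=report — Get latest benchmark results (default when no action is given)
* ?action=history — Historical benchmark data (optional &limit=N, default 10)
* ?action=leaderboard — Current AI leaderboard
* ?action=improve — Get improvement suggestions
*/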
// Every response produced by this script, including errors, is JSON.
header('Content-Type: application/json');

// Shared secret expected in the `k` parameter of the query string or POST body.
$KEY = 'WEVADS2026';
$providedKey = $_GET['k'] ?? $_POST['k'] ?? '';
// `k[]=x` arrives as an array, and hash_equals() only accepts strings, so reject
// non-strings first. hash_equals() compares in constant time, which avoids
// leaking how many leading characters of the key were correct.
if (!is_string($providedKey) || !hash_equals($KEY, $providedKey)) {
    http_response_code(403);
    die(json_encode(['error' => 'auth']));
}

$action = $_GET['action'] ?? 'report';

// JSON file used as the persistent benchmark store.
$DB = '/opt/wevads/vault/ai-benchmark.json';
$dbDefaults = [
    'benchmarks' => [], 'leaderboard' => [], 'improvements' => [],
    'last_run' => null, 'total_runs' => 0,
];
if (!file_exists($DB)) {
    // First use: seed the store with an empty structure.
    file_put_contents($DB, json_encode($dbDefaults, JSON_PRETTY_PRINT));
}

// Load the store defensively: an unreadable, truncated or hand-edited file must
// not leave $db as null or missing keys, because every action below indexes it.
$rawDb = is_readable($DB) ? file_get_contents($DB) : false;
$db = is_string($rawDb) ? json_decode($rawDb, true) : null;
if (!is_array($db)) {
    $db = [];
}
// Array union keeps whatever was stored and only fills in absent keys.
$db += $dbDefaults;
foreach (['benchmarks', 'leaderboard', 'improvements'] as $listKey) {
    if (!is_array($db[$listKey])) {
        $db[$listKey] = [];
    }
}
$db['total_runs'] = is_numeric($db['total_runs']) ? (int) $db['total_runs'] : 0;
// AI registry, keyed by an internal identifier. Each entry carries display
// metadata (name, type, icon) plus either the cloud endpoint/mode or the local
// model tag of the assistant.
$AIS = [
    'wevia_fast' => [
        'name' => 'WEVIA PUBLIC',
        'endpoint' => '/api/weval-ia',
        'mode' => 'fast',
        'type' => 'cloud',
        'icon' => '⚡',
    ],
    'wevcode' => [
        'name' => 'WEVCODE',
        'endpoint' => '/api/weval-ia',
        'mode' => 'code',
        'type' => 'cloud',
        'icon' => '💻',
    ],
    'manager' => [
        'name' => 'MANAGER',
        'endpoint' => '/api/weval-ia',
        'mode' => 'deep',
        'type' => 'cloud',
        'icon' => '🧠',
    ],
    'ollama_qwen3' => [
        'name' => 'Ollama Qwen3:4b',
        'model' => 'qwen3:4b',
        'type' => 'sovereign',
        'icon' => '🏠',
    ],
    'ollama_08b' => [
        'name' => 'Ollama Qwen3.5:0.8b',
        'model' => 'qwen3.5:0.8b',
        'type' => 'sovereign',
        'icon' => '🪶',
    ],
    'ollama_mistral' => [
        'name' => 'Ollama Mistral',
        'model' => 'mistral:latest',
        'type' => 'sovereign',
        'icon' => '🇫🇷',
    ],
    'opus' => [
        'name' => 'Claude Opus',
        'type' => 'reference',
        'icon' => '👑',
    ],
];

// Benchmark topics: the prompt sent to each AI and the criteria names that
// score_response() checks in the answer.
$TOPICS = [
    'strategy' => [
        'prompt' => 'Propose une stratégie digitale pour une PME marocaine de 200 employés',
        'criteria' => ['structure', 'actionable', 'maroc_context', 'length>500'],
    ],
    'code' => [
        'prompt' => 'Ecris en Python une classe CsvAnalyzer avec methodes load et describe',
        'criteria' => ['has_class', 'has_def', 'has_docstring', 'runnable'],
    ],
    'pharma' => [
        'prompt' => 'Quelles sont les étapes de la pharmacovigilance pour un nouveau médicament',
        'criteria' => ['pharmacovigilance', 'steps', 'regulatory', 'africa_context'],
    ],
    'security' => [
        'prompt' => 'Liste les 5 vulnérabilités web OWASP Top 10 avec remediations',
        'criteria' => ['owasp', 'injection', 'xss', 'remediation'],
    ],
    'erp' => [
        'prompt' => 'Compare SAP vs Oracle ERP pour une entreprise de 500 employés',
        'criteria' => ['sap', 'oracle', 'comparison', 'recommendation'],
    ],
];
/**
 * Grade one AI answer with simple text heuristics.
 *
 * Points come from four sources and the sum is capped at 100:
 *  - criteria: 10 points per criterion satisfied (keyword found, or "length>N" met);
 *  - length bonus: 5 / 10 / 15 points above 500 / 1500 / 3000 bytes;
 *  - speed bonus: 15 / 10 / 5 points below 1000 / 2000 / 4000 ms;
 *  - formatting bonus: up to 10 points for code fences, bold, headers, numbered lists.
 *
 * An empty answer always scores 0, and a latency of 0 or less is treated as
 * "not measured" and earns no speed bonus.
 *
 * @param string   $resp     Answer text (expected to be UTF-8).
 * @param string[] $criteria Criterion names from the keyword table, or "length>N" rules.
 * @param int      $latency  Response time in milliseconds; values <= 0 mean unknown.
 * @return array{score:int, details:string[], length:int, latency:mixed}
 */
function score_response($resp, $criteria, $latency) {
    // Lowercase keyword lists per criterion; one hit is enough to satisfy it.
    // Declared static so the table is built once instead of once per criterion.
    static $keywords = [
        'structure' => ['###','**','1.','2.','3.'],
        'actionable' => ['étape','action','recommand','implément','déploy'],
        'maroc_context' => ['maroc','marocain','casablanca','rabat','pme'],
        'has_class' => ['class '],
        'has_def' => ['def '],
        'has_docstring' => ['"""','\'\'\''],
        'runnable' => ['import ','return '],
        'pharmacovigilance' => ['pharmacovigilance','effet indésirable','signal'],
        'steps' => ['étape','phase','1)','1.','première'],
        'regulatory' => ['amm','autorisation','réglementaire','anpp','ansm'],
        'africa_context' => ['algérie','maroc','tunisie','afrique','maghreb'],
        'owasp' => ['owasp','top 10'],
        'injection' => ['injection','sql injection'],
        'xss' => ['xss','cross-site','script'],
        'remediation' => ['remédiation','correction','protéger','prévenir','mitigation'],
        'sap' => ['sap','s/4hana','s4hana'],
        'oracle' => ['oracle','erp cloud','jd edwards'],
        'comparison' => ['avantage','inconvénient','vs','comparaison','différence'],
        'recommendation' => ['recommand','conseil','préférable','optimal'],
    ];

    // Length is measured in bytes, as it always has been for stored results.
    $len = strlen($resp);

    // No answer means no credit: without this a fast failure would still
    // collect the speed bonus and look like a (bad) answer rather than an outage.
    if ($len === 0) {
        return ['score' => 0, 'details' => [], 'length' => 0, 'latency' => $latency];
    }

    $score = 0;
    $details = [];

    // Multibyte-aware lowercasing so accented capitals ("Étape", "RÉGLEMENTAIRE")
    // match the lowercase keywords; byte-wise strtolower leaves them untouched.
    $lower = mb_strtolower($resp, 'UTF-8');

    // Content quality (0-40 with the four criteria each topic declares)
    foreach ($criteria as $c) {
        if (strpos($c, '>') !== false) {
            // Threshold rule such as "length>500".
            list($key, $val) = explode('>', $c);
            if ($key === 'length' && $len > intval($val)) {
                $score += 10;
                $details[] = "$c:OK";
            }
            continue;
        }
        foreach ($keywords[$c] ?? [] as $kw) {
            // Both sides are lowercase already, so a plain byte search is enough.
            if (strpos($lower, $kw) !== false) {
                $score += 10;
                $details[] = "$c:OK";
                break;
            }
        }
    }

    // Length bonus (0-15)
    if ($len > 3000) {
        $score += 15;
    } elseif ($len > 1500) {
        $score += 10;
    } elseif ($len > 500) {
        $score += 5;
    }

    // Speed bonus (0-15). The positivity guard covers every tier: previously a
    // latency of 0 skipped the first tier but fell through to the 10-point one.
    if ($latency > 0) {
        if ($latency < 1000) {
            $score += 15;
        } elseif ($latency < 2000) {
            $score += 10;
        } elseif ($latency < 4000) {
            $score += 5;
        }
    }

    // Formatting bonus (0-10)
    if (strpos($resp, '```') !== false) $score += 3; // Code blocks
    if (strpos($resp, '**') !== false) $score += 2;  // Bold
    if (strpos($resp, '###') !== false) $score += 2; // Headers
    if (preg_match('/\d+\./', $resp)) $score += 3;   // Numbered lists

    return ['score' => min($score, 100), 'details' => $details, 'length' => $len, 'latency' => $latency];
}
/**
 * Send a prompt to the local WEVIA chat endpoint and time the round trip.
 *
 * POSTs {"message": $prompt, "mode": $mode} as JSON to
 * http://127.0.0.1/api/weval-ia with a 15-second timeout.
 *
 * @param string $prompt Text sent as the `message` field.
 * @param string $mode   Value sent as the `mode` field (the benchmark uses fast, code, deep).
 * @return array{response:string, provider:string, latency:int|float}
 *         `response` is '' and `provider` is '?' when the call fails or the body
 *         is not a JSON object. `latency` is the `latency_ms` reported by the
 *         endpoint when numeric, otherwise the locally measured milliseconds.
 */
function call_wevia($prompt, $mode) {
    $t0 = microtime(true);
    $ctx = stream_context_create(['http' => [
        'method' => 'POST',
        'header' => 'Content-Type: application/json',
        'content' => json_encode(['message' => $prompt, 'mode' => $mode]),
        'timeout' => 15
    ]]);
    // Warnings are suppressed on purpose: a PHP warning printed here would
    // corrupt the JSON body this script returns. Failure is detected below.
    $r = @file_get_contents('http://127.0.0.1/api/weval-ia', false, $ctx);
    $lat = intval((microtime(true) - $t0) * 1000);

    $failed = ['response' => '', 'provider' => '?', 'latency' => $lat];
    if (!$r) {
        return $failed;
    }

    $d = json_decode($r, true);
    if (!is_array($d)) {
        // Not a JSON object/array: nothing usable in the payload.
        return $failed;
    }

    // The answer is passed to string functions by the caller, so never let an
    // array or object through; scalars are stringified, anything else is dropped.
    $response = $d['response'] ?? '';
    $response = is_scalar($response) ? (string) $response : '';

    $provider = $d['provider'] ?? '?';
    $provider = is_scalar($provider) ? (string) $provider : '?';

    // Prefer the latency reported by the endpoint, but only when it is a number.
    $reported = $d['latency_ms'] ?? null;
    $latency = is_numeric($reported) ? $reported + 0 : $lat;

    return ['response' => $response, 'provider' => $provider, 'latency' => $latency];
}
/**
 * Send a prompt to the local Ollama generate API and time the round trip.
 *
 * POSTs a non-streaming request to http://127.0.0.1:11434/api/generate with
 * `num_predict` 200 and `num_ctx` 512, using a 30-second timeout.
 *
 * @param string $prompt Text sent as the `prompt` field.
 * @param string $model  Ollama model tag, e.g. "qwen3.5:0.8b".
 * @return array{response:string, provider:string, latency:int, tps:int|float}
 *         `response` is '' and `tps` is 0 when the call fails or the body is not
 *         a JSON object. `tps` is eval_count divided by eval_duration, where the
 *         duration is scaled by 1e9 and floored at 0.01, rounded to one decimal.
 */
function call_ollama($prompt, $model) {
    $t0 = microtime(true);
    $ctx = stream_context_create(['http' => [
        'method' => 'POST',
        'header' => 'Content-Type: application/json',
        'content' => json_encode([
            'model' => $model,
            'prompt' => $prompt,
            'stream' => false,
            'options' => ['num_predict' => 200, 'num_ctx' => 512],
        ]),
        'timeout' => 30
    ]]);
    // Warnings are suppressed on purpose: a PHP warning printed here would
    // corrupt the JSON body this script returns. Failure is detected below.
    $r = @file_get_contents('http://127.0.0.1:11434/api/generate', false, $ctx);
    $lat = intval((microtime(true) - $t0) * 1000);

    $failed = ['response' => '', 'provider' => $model, 'latency' => $lat, 'tps' => 0];
    if (!$r) {
        return $failed;
    }

    $d = json_decode($r, true);
    if (!is_array($d)) {
        return $failed;
    }

    // The answer is passed to string functions by the caller, so never let an
    // array or object through; scalars are stringified, anything else is dropped.
    $response = $d['response'] ?? '';
    $response = is_scalar($response) ? (string) $response : '';

    // Only do arithmetic on numeric counters; a malformed payload would
    // otherwise raise a TypeError in the division.
    $evalCount = is_numeric($d['eval_count'] ?? null) ? $d['eval_count'] + 0 : 0;
    $evalDuration = is_numeric($d['eval_duration'] ?? null) ? $d['eval_duration'] + 0 : 1;
    // Floor the divisor at 0.01 so a missing or tiny duration cannot divide by ~0.
    $tps = $evalCount / max($evalDuration / 1e9, 0.01);

    return ['response' => $response, 'provider' => $model, 'latency' => $lat, 'tps' => round($tps, 1)];
}
// Dispatch the request to the handler for the requested action. Every branch
// prints exactly one JSON document.
switch ($action) {
    case 'benchmark':
        // Run one topic against the wired-up AIs, persist the run and refresh
        // the stored leaderboard.
        $topic = $_GET['topic'] ?? 'strategy';
        // `topic[]=x` arrives as an array, which is not a valid array offset.
        if (!is_string($topic) || !isset($TOPICS[$topic])) {
            echo json_encode(['error' => 'invalid topic']);
            break;
        }
        $t = $TOPICS[$topic];
        $results = [];

        // Turn one call result into a stored result row. An empty answer is
        // recorded as a zero-score timeout for every AI, so the `improve`
        // action can recognise outages by their score of 0.
        $grade = static function (array $call, array $criteria, array $extra = []) {
            if ($call['response'] === '') {
                return [
                    'score' => 0, 'details' => ['timeout'], 'length' => 0,
                    'latency' => $call['latency'], 'provider' => $call['provider'],
                ];
            }
            return array_merge(
                score_response($call['response'], $criteria, $call['latency']),
                ['provider' => $call['provider']],
                $extra,
                ['response_preview' => mb_substr($call['response'], 0, 200)]
            );
        };

        // Test cloud AIs: endpoint mode => result key.
        $cloudModes = ['fast' => 'wevia_fast', 'code' => 'wevcode', 'deep' => 'manager'];
        foreach ($cloudModes as $mode => $aiKey) {
            $results[$aiKey] = $grade(call_wevia($t['prompt'], $mode), $t['criteria']);
            usleep(500000); // half-second pause between cloud calls
        }

        // Test sovereign AIs (call_ollama enforces its own timeout)
        foreach (['qwen3.5:0.8b'] as $model) {
            $key = 'ollama_' . str_replace([':', '.'], ['_', ''], $model);
            $call = call_ollama($t['prompt'], $model);
            $results[$key] = $grade($call, $t['criteria'], ['tps' => $call['tps']]);
        }

        // Save
        $now = date('c');
        $db['benchmarks'][] = ['topic' => $topic, 'timestamp' => $now, 'results' => $results];
        $db['last_run'] = $now;
        $db['total_runs'] = ($db['total_runs'] ?? 0) + 1;

        // Update leaderboard: cumulative score per AI over every stored run,
        // this one included. It used to start from an empty array on each run,
        // so the "accumulated" value was only ever the latest run's score.
        $lb = [];
        foreach ($db['benchmarks'] as $b) {
            foreach (($b['results'] ?? []) as $ai => $r) {
                $lb[$ai] = ($lb[$ai] ?? 0) + ($r['score'] ?? 0);
            }
        }
        arsort($lb);
        $db['leaderboard'] = $lb;

        file_put_contents($DB, json_encode($db, JSON_PRETTY_PRINT));
        echo json_encode(['ok' => true, 'topic' => $topic, 'results' => $results, 'leaderboard' => $lb]);
        break;

    case 'report':
        // Summary of the store plus the catalogue of topics and AIs.
        $allRuns = $db['benchmarks'] ?? [];
        $last = $allRuns ? end($allRuns) : null;
        echo json_encode([
            'ok' => true,
            'total_runs' => $db['total_runs'] ?? 0,
            'last_run' => $db['last_run'] ?? null,
            'last_benchmark' => $last,
            'leaderboard' => $db['leaderboard'] ?? [],
            'topics' => array_keys($TOPICS),
            'ais' => array_map(fn($a) => $a['name'], $AIS)
        ]);
        break;

    case 'history':
        // Most recent N runs, oldest first.
        $allRuns = $db['benchmarks'] ?? [];
        $limit = intval($_GET['limit'] ?? 10);
        // array_slice(-0) returns everything and a negative limit would skip the
        // oldest runs instead of limiting, so only slice for a positive limit.
        $benchmarks = $limit > 0 ? array_slice($allRuns, -$limit) : [];
        echo json_encode(['ok' => true, 'benchmarks' => $benchmarks, 'total' => count($allRuns)]);
        break;

    case 'leaderboard':
        // Compute cumulative scores from all benchmarks
        $scores = [];
        $counts = [];
        foreach (($db['benchmarks'] ?? []) as $b) {
            foreach (($b['results'] ?? []) as $ai => $r) {
                $scores[$ai] = ($scores[$ai] ?? 0) + ($r['score'] ?? 0);
                $counts[$ai] = ($counts[$ai] ?? 0) + 1;
            }
        }

        // Benchmark runs key sovereign models as 'ollama_' + sanitised model tag
        // (e.g. 'ollama_qwen35_08b'), which differs from the $AIS key
        // ('ollama_08b'). Map the generated key back so name, icon and type
        // resolve for those rows.
        $aliases = [];
        foreach ($AIS as $aiKey => $cfg) {
            if (isset($cfg['model'])) {
                $aliases['ollama_' . str_replace([':', '.'], ['_', ''], $cfg['model'])] = $aiKey;
            }
        }

        $lb = [];
        foreach ($scores as $ai => $total) {
            $meta = $AIS[$ai] ?? $AIS[$aliases[$ai] ?? ''] ?? [];
            $lb[] = [
                'ai' => $ai,
                'name' => $meta['name'] ?? $ai,
                'icon' => $meta['icon'] ?? '?',
                'type' => $meta['type'] ?? '?',
                'total_score' => $total,
                'avg_score' => $counts[$ai] > 0 ? round($total / $counts[$ai], 1) : 0,
                'runs' => $counts[$ai]
            ];
        }
        // Highest total first; <=> stays a valid comparator even for float totals.
        usort($lb, fn($a, $b) => $b['total_score'] <=> $a['total_score']);
        echo json_encode(['ok' => true, 'leaderboard' => $lb, 'total_benchmarks' => count($db['benchmarks'] ?? [])]);
        break;

    case 'improve':
        // Analyze weaknesses and suggest improvements: below 40 is a weakness,
        // 70 and above is a strength.
        $weaknesses = [];
        $strengths = [];
        foreach (($db['benchmarks'] ?? []) as $b) {
            $runTopic = $b['topic'] ?? null;
            foreach (($b['results'] ?? []) as $ai => $r) {
                // Read the score once with a default so a row without one is
                // treated as 0 instead of raising an undefined-index warning.
                $score = $r['score'] ?? 0;
                if ($score < 40) {
                    $weaknesses[$ai][] = ['topic' => $runTopic, 'score' => $score, 'details' => $r['details'] ?? []];
                }
                if ($score >= 70) {
                    $strengths[$ai][] = ['topic' => $runTopic, 'score' => $score];
                }
            }
        }

        $suggestions = [];
        foreach ($weaknesses as $ai => $issues) {
            foreach ($issues as $issue) {
                if ($issue['score'] == 0) {
                    $advice = 'AI timeout/unavailable - check connectivity';
                } elseif ($issue['score'] < 20) {
                    $advice = 'Response too short or intercepted by ToolFK - adjust routing';
                } else {
                    $advice = 'Improve prompt engineering for this topic';
                }
                $suggestions[] = [
                    'ai' => $ai,
                    'topic' => $issue['topic'],
                    'current_score' => $issue['score'],
                    'suggestion' => $advice
                ];
            }
        }
        echo json_encode(['ok' => true, 'weaknesses' => $weaknesses, 'strengths' => $strengths, 'suggestions' => $suggestions]);
        break;

    default:
        // An unknown action used to produce an empty body under a JSON
        // content type; answer with an explicit error instead.
        http_response_code(400);
        echo json_encode(['error' => 'invalid action']);
        break;
}