262 lines
11 KiB
PHP
262 lines
11 KiB
PHP
<?php
|
|
/**
 * WEVAL AI Benchmark — Compare & Improve IA Continue
 *
 * Every request must carry the access key in the `k` parameter (GET or POST).
 *
 *   ?action=benchmark   — Run a comparative benchmark for one topic (`topic` parameter)
 *   ?action=report      — Get the latest benchmark results (default action)
 *   ?action=history     — Historical benchmark data (`limit` parameter)
 *   ?action=leaderboard — Current AI leaderboard
 *   ?action=improve     — Get improvement suggestions
 */
|
|
// Every response produced by this script is JSON.
header('Content-Type: application/json');

// Shared access key expected in the `k` request parameter.
// NOTE(review): the key is hard-coded in source — consider loading it from configuration.
$KEY = 'WEVADS2026';

// Reject the request unless the caller supplied the key. The value must be a
// string (a `k[]=...` array is refused) and is compared in constant time so
// the comparison does not leak how many leading characters matched.
$providedKey = $_GET['k'] ?? $_POST['k'] ?? '';
if (!is_string($providedKey) || !hash_equals($KEY, $providedKey)) {
    http_response_code(403);
    die(json_encode(['error' => 'auth']));
}

// Requested operation; defaults to the read-only report.
$action = $_GET['action'] ?? 'report';

// Path of the JSON file used as the benchmark store.
$DB = '/opt/wevads/vault/ai-benchmark.json';

// Shape of an empty store; also used to fill in keys missing from a loaded one.
$emptyStore = [
    'benchmarks' => [], 'leaderboard' => [], 'improvements' => [],
    'last_run' => null, 'total_runs' => 0,
];

// Create the store on first use.
if (!file_exists($DB)) {
    file_put_contents($DB, json_encode($emptyStore, JSON_PRETTY_PRINT), LOCK_EX);
}

// Load the store. An unreadable, empty or corrupt file degrades to the empty
// store instead of leaving $db as null/false for the action handlers below.
$rawStore = is_readable($DB) ? file_get_contents($DB) : false;
$db = is_string($rawStore) ? json_decode($rawStore, true) : null;
if (!is_array($db)) {
    $db = [];
}
$db = $db + $emptyStore; // keep stored values, add any missing default keys
foreach (['benchmarks', 'leaderboard', 'improvements'] as $listKey) {
    if (!is_array($db[$listKey])) {
        $db[$listKey] = [];
    }
}
|
|
|
|
// Registry of the AIs known to the benchmark, keyed by internal identifier.
// Every entry has a display name, a category ("type") and an icon. Cloud
// entries additionally name an endpoint and a mode; sovereign entries name
// the model they run.
$AIS = [
    'wevia_fast' => [
        'name' => 'WEVIA PUBLIC',
        'endpoint' => '/api/weval-ia',
        'mode' => 'fast',
        'type' => 'cloud',
        'icon' => '⚡',
    ],
    'wevcode' => [
        'name' => 'WEVCODE',
        'endpoint' => '/api/weval-ia',
        'mode' => 'code',
        'type' => 'cloud',
        'icon' => '💻',
    ],
    'manager' => [
        'name' => 'MANAGER',
        'endpoint' => '/api/weval-ia',
        'mode' => 'deep',
        'type' => 'cloud',
        'icon' => '🧠',
    ],
    'ollama_qwen3' => [
        'name' => 'Ollama Qwen3:4b',
        'model' => 'qwen3:4b',
        'type' => 'sovereign',
        'icon' => '🏠',
    ],
    'ollama_08b' => [
        'name' => 'Ollama Qwen3.5:0.8b',
        'model' => 'qwen3.5:0.8b',
        'type' => 'sovereign',
        'icon' => '🪶',
    ],
    'ollama_mistral' => [
        'name' => 'Ollama Mistral',
        'model' => 'mistral:latest',
        'type' => 'sovereign',
        'icon' => '🇫🇷',
    ],
    'opus' => [
        'name' => 'Claude Opus',
        'type' => 'reference',
        'icon' => '👑',
    ],
];
|
|
|
|
// Benchmark topics, keyed by topic identifier. Each topic pairs the prompt
// sent to the AIs with the list of criteria the answer is scored against.
// A criterion is either a keyword-group name or a "length>N" threshold.
$TOPICS = [
    'strategy' => [
        'prompt' => 'Propose une stratégie digitale pour une PME marocaine de 200 employés',
        'criteria' => ['structure', 'actionable', 'maroc_context', 'length>500'],
    ],
    'code' => [
        'prompt' => 'Ecris en Python une classe CsvAnalyzer avec methodes load et describe',
        'criteria' => ['has_class', 'has_def', 'has_docstring', 'runnable'],
    ],
    'pharma' => [
        'prompt' => 'Quelles sont les étapes de la pharmacovigilance pour un nouveau médicament',
        'criteria' => ['pharmacovigilance', 'steps', 'regulatory', 'africa_context'],
    ],
    'security' => [
        'prompt' => 'Liste les 5 vulnérabilités web OWASP Top 10 avec remediations',
        'criteria' => ['owasp', 'injection', 'xss', 'remediation'],
    ],
    'erp' => [
        'prompt' => 'Compare SAP vs Oracle ERP pour une entreprise de 500 employés',
        'criteria' => ['sap', 'oracle', 'comparison', 'recommendation'],
    ],
];
|
|
|
|
/**
 * Score an AI response against a topic's criteria.
 *
 * Points are awarded in four groups and the total is capped at 100:
 *  - content:    10 per satisfied criterion. A criterion is either the name
 *                of a keyword group (satisfied when any keyword of the group
 *                occurs in the response, case-insensitively) or "length>N"
 *                (satisfied when the response is longer than N bytes);
 *  - length:     5 / 10 / 15 for more than 500 / 1500 / 3000 bytes;
 *  - speed:      15 / 10 / 5 for a latency below 1000 / 2000 / 4000, awarded
 *                only when the latency is a positive measurement;
 *  - formatting: +3 code fence, +2 bold, +2 "###" header, +3 numbered list.
 *
 * An empty (or whitespace-only) response always scores 0, so that a call that
 * failed quickly is not rewarded with speed points.
 *
 * @param string $resp     Response text, expected to be UTF-8.
 * @param array  $criteria List of criterion names (see above).
 * @param mixed  $latency  Latency of the call; callers in this file pass milliseconds.
 * @return array{score:int, details:array, length:int, latency:mixed}
 *               `details` lists "<criterion>:OK" for each satisfied criterion.
 */
function score_response($resp, $criteria, $latency) {
    // Keyword groups, all lowercase. Built once instead of once per criterion.
    static $keywords = [
        'structure' => ['###', '**', '1.', '2.', '3.'],
        'actionable' => ['étape', 'action', 'recommand', 'implément', 'déploy'],
        'maroc_context' => ['maroc', 'marocain', 'casablanca', 'rabat', 'pme'],
        'has_class' => ['class '],
        'has_def' => ['def '],
        'has_docstring' => ['"""', '\'\'\''],
        'runnable' => ['import ', 'return '],
        'pharmacovigilance' => ['pharmacovigilance', 'effet indésirable', 'signal'],
        'steps' => ['étape', 'phase', '1)', '1.', 'première'],
        'regulatory' => ['amm', 'autorisation', 'réglementaire', 'anpp', 'ansm'],
        'africa_context' => ['algérie', 'maroc', 'tunisie', 'afrique', 'maghreb'],
        'owasp' => ['owasp', 'top 10'],
        'injection' => ['injection', 'sql injection'],
        'xss' => ['xss', 'cross-site', 'script'],
        'remediation' => ['remédiation', 'correction', 'protéger', 'prévenir', 'mitigation'],
        'sap' => ['sap', 's/4hana', 's4hana'],
        'oracle' => ['oracle', 'erp cloud', 'jd edwards'],
        'comparison' => ['avantage', 'inconvénient', 'vs', 'comparaison', 'différence'],
        'recommendation' => ['recommand', 'conseil', 'préférable', 'optimal'],
    ];

    // Tolerate a non-string response (e.g. null) instead of failing in the string functions.
    $resp = is_scalar($resp) ? (string) $resp : '';
    $len = strlen($resp);
    $details = [];

    // Nothing to evaluate: no content, no bonus of any kind.
    if (trim($resp) === '') {
        return ['score' => 0, 'details' => $details, 'length' => $len, 'latency' => $latency];
    }

    $score = 0;

    // Unicode-aware lowercasing: strtolower()/stripos() only fold ASCII, so
    // accented capitals ("ÉTAPE") would never match the accented keywords.
    $lower = mb_strtolower($resp, 'UTF-8');

    // Content quality: 10 points per satisfied criterion.
    foreach ($criteria as $c) {
        if (strpos($c, '>') !== false) {
            // Threshold criterion of the form "<key>>N"; only "length" is supported.
            list($key, $val) = explode('>', $c, 2);
            if ($key === 'length' && $len > intval($val)) {
                $score += 10;
                $details[] = "$c:OK";
            }
            continue;
        }
        if (!isset($keywords[$c])) {
            continue; // unknown criterion: contributes nothing
        }
        foreach ($keywords[$c] as $kw) {
            // Both sides are lowercase UTF-8, so a byte-wise search is exact.
            if (strpos($lower, $kw) !== false) {
                $score += 10;
                $details[] = "$c:OK";
                break; // one hit satisfies the criterion
            }
        }
    }

    // Length bonus (0-15)
    if ($len > 3000) {
        $score += 15;
    } elseif ($len > 1500) {
        $score += 10;
    } elseif ($len > 500) {
        $score += 5;
    }

    // Speed bonus (0-15). A latency of zero or less is treated as "not
    // measured" and earns nothing (previously it fell through to 10 points).
    if ($latency > 0) {
        if ($latency < 1000) {
            $score += 15;
        } elseif ($latency < 2000) {
            $score += 10;
        } elseif ($latency < 4000) {
            $score += 5;
        }
    }

    // Formatting bonus (0-10)
    if (strpos($resp, '```') !== false) $score += 3;  // code blocks
    if (strpos($resp, '**') !== false) $score += 2;   // bold
    if (strpos($resp, '###') !== false) $score += 2;  // headers
    if (preg_match('/\d+\./', $resp)) $score += 3;    // numbered lists

    return ['score' => min($score, 100), 'details' => $details, 'length' => $len, 'latency' => $latency];
}
|
|
|
|
/**
 * Send a prompt to the local WEVIA endpoint and return its answer.
 *
 * POSTs {"message": $prompt, "mode": $mode} as JSON to
 * http://127.0.0.1/api/weval-ia with a 15 second timeout. Failures are
 * reported through the return value, never thrown.
 *
 * @param string $prompt Prompt text.
 * @param string $mode   Mode forwarded to the endpoint (this file uses fast, code, deep).
 * @return array{response:string, provider:string, latency:int|float}
 *               `response` is '' and `provider` is '?' when the call fails or
 *               the body is not the expected JSON object. `latency` is the
 *               endpoint's own `latency_ms` when it reports a numeric one,
 *               otherwise the wall-clock time measured here, in milliseconds.
 */
function call_wevia($prompt, $mode) {
    $t0 = microtime(true);
    $ctx = stream_context_create(['http' => [
        'method' => 'POST',
        'header' => 'Content-Type: application/json',
        'content' => json_encode(['message' => $prompt, 'mode' => $mode]),
        'timeout' => 15,
    ]]);

    // Warnings are suppressed on purpose: a failed call is a normal benchmark
    // outcome and must not leak into the JSON output of this script.
    $raw = @file_get_contents('http://127.0.0.1/api/weval-ia', false, $ctx);
    $lat = intval((microtime(true) - $t0) * 1000);

    $failure = ['response' => '', 'provider' => '?', 'latency' => $lat];
    if ($raw === false || $raw === '') {
        return $failure;
    }

    $d = json_decode($raw, true);
    if (!is_array($d)) {
        return $failure; // invalid JSON or a bare scalar
    }

    // Only accept fields of the expected type, so callers can safely hand the
    // response to string functions.
    $reported = $d['latency_ms'] ?? null;
    return [
        'response' => is_string($d['response'] ?? null) ? $d['response'] : '',
        'provider' => is_string($d['provider'] ?? null) ? $d['provider'] : '?',
        'latency' => is_numeric($reported) ? $reported + 0 : $lat,
    ];
}
|
|
|
|
/**
 * Send a prompt to the local Ollama server and return its answer.
 *
 * POSTs a non-streaming generate request to
 * http://127.0.0.1:11434/api/generate with a 30 second timeout. Generation is
 * limited by the options sent here (num_predict 200, num_ctx 512). Failures
 * are reported through the return value, never thrown.
 *
 * @param string $prompt Prompt text.
 * @param string $model  Ollama model name, e.g. "qwen3.5:0.8b".
 * @return array{response:string, provider:string, latency:int, tps:int|float}
 *               `response` is '' and `tps` is 0 when the call fails or the
 *               body is not the expected JSON object. `latency` is the
 *               wall-clock time in milliseconds. `tps` is eval_count divided
 *               by eval_duration (the division by 1e9 treats eval_duration as
 *               nanoseconds), rounded to one decimal.
 */
function call_ollama($prompt, $model) {
    $t0 = microtime(true);
    $ctx = stream_context_create(['http' => [
        'method' => 'POST',
        'header' => 'Content-Type: application/json',
        'content' => json_encode([
            'model' => $model,
            'prompt' => $prompt,
            'stream' => false,
            'options' => ['num_predict' => 200, 'num_ctx' => 512],
        ]),
        'timeout' => 30,
    ]]);

    // Warnings are suppressed on purpose: an unavailable model is a normal
    // benchmark outcome and must not leak into the JSON output of this script.
    $raw = @file_get_contents('http://127.0.0.1:11434/api/generate', false, $ctx);
    $lat = intval((microtime(true) - $t0) * 1000);

    $failure = ['response' => '', 'provider' => $model, 'latency' => $lat, 'tps' => 0];
    if ($raw === false || $raw === '') {
        return $failure;
    }

    $d = json_decode($raw, true);
    if (!is_array($d)) {
        return $failure; // invalid JSON or a bare scalar
    }

    // Use the counters only when they are numeric; otherwise fall back to the
    // neutral values the arithmetic below was written for.
    $evalCount = is_numeric($d['eval_count'] ?? null) ? $d['eval_count'] + 0 : 0;
    $evalDuration = is_numeric($d['eval_duration'] ?? null) ? $d['eval_duration'] + 0 : 1;

    // The duration is floored at 0.01 s to avoid dividing by (almost) zero.
    $tps = $evalCount / max($evalDuration / 1e9, 0.01);

    return [
        'response' => is_string($d['response'] ?? null) ? $d['response'] : '',
        'provider' => $model,
        'latency' => $lat,
        'tps' => round($tps, 1),
    ];
}
|
|
|
|
switch ($action) {
|
|
|
|
case 'benchmark':
    // Run one topic against the cloud modes and the local model, score every
    // answer, append the run to the store and refresh the stored leaderboard.
    $topic = $_GET['topic'] ?? 'strategy';
    // A non-string topic (e.g. topic[]=x) cannot be a valid array key.
    if (!is_string($topic) || !isset($TOPICS[$topic])) {
        echo json_encode(['error' => 'invalid topic']);
        break;
    }

    $t = $TOPICS[$topic];
    $results = [];

    // Cloud AIs: endpoint mode => result key.
    $cloudModes = ['fast' => 'wevia_fast', 'code' => 'wevcode', 'deep' => 'manager'];
    foreach ($cloudModes as $mode => $ai_name) {
        $r = call_wevia($t['prompt'], $mode);
        $s = score_response($r['response'], $t['criteria'], $r['latency']);
        $results[$ai_name] = array_merge($s, [
            'provider' => $r['provider'],
            'response_preview' => mb_substr($r['response'], 0, 200),
        ]);
        usleep(500000); // pause 0.5 s between consecutive calls
    }

    // Sovereign AIs. An empty answer is recorded as a zero-score "timeout"
    // entry instead of being scored.
    foreach (['qwen3.5:0.8b'] as $model) {
        // Result key derived from the model name, e.g. "ollama_qwen35_08b".
        $key = 'ollama_' . str_replace([':', '.'], ['_', ''], $model);
        $r = call_ollama($t['prompt'], $model);
        if (!empty($r['response'])) {
            $s = score_response($r['response'], $t['criteria'], $r['latency']);
            $results[$key] = array_merge($s, [
                'provider' => $model,
                'tps' => $r['tps'],
                'response_preview' => mb_substr($r['response'], 0, 200),
            ]);
        } else {
            $results[$key] = ['score' => 0, 'details' => ['timeout'], 'length' => 0, 'latency' => $r['latency'], 'provider' => $model];
        }
    }

    // Record the run.
    $now = date('c');
    $db['benchmarks'][] = ['topic' => $topic, 'timestamp' => $now, 'results' => $results];
    $db['last_run'] = $now;
    $db['total_runs'] = ($db['total_runs'] ?? 0) + 1;

    // Refresh the leaderboard as the cumulative score per AI over every
    // stored run (including this one). Previously it was reset on each run
    // and only ever reflected the latest run.
    $lb = [];
    foreach ($db['benchmarks'] as $storedRun) {
        foreach (($storedRun['results'] ?? []) as $ai => $storedResult) {
            $lb[$ai] = ($lb[$ai] ?? 0) + ($storedResult['score'] ?? 0);
        }
    }
    arsort($lb);
    $db['leaderboard'] = $lb;

    // Persist. Invalid UTF-8 in a model answer is substituted rather than
    // making json_encode() return false, and a failed encode is never written
    // (writing false would truncate the store to an empty file).
    $encoded = json_encode($db, JSON_PRETTY_PRINT | JSON_INVALID_UTF8_SUBSTITUTE);
    $saved = $encoded !== false && file_put_contents($DB, $encoded, LOCK_EX) !== false;

    echo json_encode(
        ['ok' => true, 'topic' => $topic, 'results' => $results, 'leaderboard' => $lb, 'saved' => $saved],
        JSON_INVALID_UTF8_SUBSTITUTE
    );
    break;
|
|
|
|
case 'report':
    // Summary of the store: counters, the most recent run, the stored
    // leaderboard, and the known topic and AI names.

    // Most recent run, or null when there is none.
    $last = end($db['benchmarks']);
    if (!$last) {
        $last = null;
    }

    // AI identifier => display name.
    $aiNames = [];
    foreach ($AIS as $aiKey => $aiConfig) {
        $aiNames[$aiKey] = $aiConfig['name'];
    }

    $report = [
        'ok' => true,
        'total_runs' => $db['total_runs'],
        'last_run' => $db['last_run'],
        'last_benchmark' => $last,
        'leaderboard' => $db['leaderboard'],
        'topics' => array_keys($TOPICS),
        'ais' => $aiNames,
    ];
    echo json_encode($report);
    break;
|
|
|
|
case 'history':
    // Return the most recent runs, oldest first, plus the total run count.
    // `limit` (default 10) is the maximum number of runs returned.
    $allRuns = is_array($db['benchmarks'] ?? null) ? $db['benchmarks'] : [];

    // Negative limits are clamped to 0. A limit of 0 must be handled
    // explicitly: array_slice($a, -0) is array_slice($a, 0) and would return
    // the entire history instead of nothing.
    $limit = max(0, intval($_GET['limit'] ?? 10));
    $benchmarks = $limit > 0 ? array_slice($allRuns, -$limit) : [];

    echo json_encode(['ok' => true, 'benchmarks' => $benchmarks, 'total' => count($allRuns)]);
    break;
|
|
|
|
case 'leaderboard':
    // Build the leaderboard from every stored run: total score, average
    // score and number of runs per AI, sorted by total score (highest first).

    // Index of $AIS by model name. Results of local models are stored under a
    // key derived from the model name (e.g. "ollama_qwen35_08b"), which is not
    // necessarily the key used in $AIS (e.g. "ollama_08b"); their stored
    // `provider` is the model name, which lets the display data be resolved.
    $aiKeyByModel = [];
    foreach ($AIS as $aiKey => $aiConfig) {
        if (isset($aiConfig['model'])) {
            $aiKeyByModel[$aiConfig['model']] = $aiKey;
        }
    }

    $scores = [];
    $counts = [];
    $meta = []; // result key => matching $AIS entry, when one can be found
    foreach (($db['benchmarks'] ?? []) as $b) {
        foreach (($b['results'] ?? []) as $ai => $r) {
            $scores[$ai] = ($scores[$ai] ?? 0) + ($r['score'] ?? 0);
            $counts[$ai] = ($counts[$ai] ?? 0) + 1;

            if (!isset($meta[$ai])) {
                $provider = $r['provider'] ?? null;
                if (isset($AIS[$ai])) {
                    $meta[$ai] = $AIS[$ai];
                } elseif (is_string($provider) && isset($aiKeyByModel[$provider])) {
                    $meta[$ai] = $AIS[$aiKeyByModel[$provider]];
                }
            }
        }
    }

    $lb = [];
    foreach ($scores as $ai => $total) {
        $lb[] = [
            'ai' => $ai,
            'name' => $meta[$ai]['name'] ?? $ai,
            'icon' => $meta[$ai]['icon'] ?? '?',
            'type' => $meta[$ai]['type'] ?? '?',
            'total_score' => $total,
            'avg_score' => $counts[$ai] > 0 ? round($total / $counts[$ai], 1) : 0,
            'runs' => $counts[$ai],
        ];
    }

    // Highest total first. The spaceship operator always yields an int,
    // whatever numeric type the stored scores have.
    usort($lb, fn($a, $b) => $b['total_score'] <=> $a['total_score']);

    echo json_encode(['ok' => true, 'leaderboard' => $lb, 'total_benchmarks' => count($db['benchmarks'] ?? [])]);
    break;
|
|
|
|
case 'improve':
    // Go through every stored result: scores below 40 are collected as
    // weaknesses, scores of 70 or more as strengths, both grouped by AI.
    $weaknesses = [];
    $strengths = [];
    foreach ($db['benchmarks'] as $run) {
        foreach ($run['results'] as $aiKey => $result) {
            $points = $result['score'] ?? 0;
            if ($points < 40) {
                $weaknesses[$aiKey][] = [
                    'topic' => $run['topic'],
                    'score' => $result['score'],
                    'details' => $result['details'] ?? [],
                ];
            }
            if ($points >= 70) {
                $strengths[$aiKey][] = [
                    'topic' => $run['topic'],
                    'score' => $result['score'],
                ];
            }
        }
    }

    // One suggestion per weak result; the advice depends on how low the score is.
    $suggestions = [];
    foreach ($weaknesses as $aiKey => $issues) {
        foreach ($issues as $issue) {
            if ($issue['score'] == 0) {
                $advice = 'AI timeout/unavailable - check connectivity';
            } elseif ($issue['score'] < 20) {
                $advice = 'Response too short or intercepted by ToolFK - adjust routing';
            } else {
                $advice = 'Improve prompt engineering for this topic';
            }

            $suggestions[] = [
                'ai' => $aiKey,
                'topic' => $issue['topic'],
                'current_score' => $issue['score'],
                'suggestion' => $advice,
            ];
        }
    }

    $payload = [
        'ok' => true,
        'weaknesses' => $weaknesses,
        'strengths' => $strengths,
        'suggestions' => $suggestions,
    ];
    echo json_encode($payload);
    break;
|
|
}
|