html/api/ai-benchmark.php

<?php
/**
 * WEVAL AI Benchmark — Compare & Improve IA Continue
 * ?action=benchmark     — Run comparative benchmark across all AIs
 * ?action=report        — Get latest benchmark results
 * ?action=history       — Historical benchmark data
 * ?action=leaderboard   — Current AI leaderboard
 * ?action=improve       — Get improvement suggestions
 */
header('Content-Type: application/json');

// NOTE(review): the shared secret is hard-coded in source; consider loading it
// from configuration that lives outside the web root.
$KEY = 'WEVADS2026';

// The key may come from the query string or, failing that, the POST body.
// hash_equals() compares in constant time but only accepts strings, so a
// non-string value (e.g. ?k[]=x) is rejected before the comparison.
$suppliedKey = $_GET['k'] ?? $_POST['k'] ?? '';
if (!is_string($suppliedKey) || !hash_equals($KEY, $suppliedKey)) {
    http_response_code(403);
    die(json_encode(['error' => 'auth']));
}

$action = $_GET['action'] ?? 'report';
$DB = '/opt/wevads/vault/ai-benchmark.json';

// Shape of an empty benchmark store; also used to fill in anything missing
// from the file on disk.
$emptyDb = [
    'benchmarks' => [], 'leaderboard' => [], 'improvements' => [],
    'last_run' => null, 'total_runs' => 0
];

if (!file_exists($DB)) {
    file_put_contents($DB, json_encode($emptyDb, JSON_PRETTY_PRINT));
}

// An unreadable, empty or corrupt store must not leave $db as null: every
// action below indexes into it. Fall back to the empty shape instead.
$rawDb = is_readable($DB) ? file_get_contents($DB) : false;
$decodedDb = is_string($rawDb) ? json_decode($rawDb, true) : null;
$db = array_merge($emptyDb, is_array($decodedDb) ? $decodedDb : []);
foreach (['benchmarks', 'leaderboard', 'improvements'] as $listKey) {
    if (!is_array($db[$listKey])) {
        $db[$listKey] = [];
    }
}
$db['total_runs'] = is_numeric($db['total_runs']) ? (int) $db['total_runs'] : 0;

// Registry of the AIs known to the benchmark, keyed by the identifier used in
// stored results. Cloud entries carry an endpoint and a mode, sovereign
// entries carry a local model name, and the reference entry carries neither.
$AIS = [
    'wevia_fast' => [
        'name' => 'WEVIA PUBLIC',
        'endpoint' => '/api/weval-ia',
        'mode' => 'fast',
        'type' => 'cloud',
        'icon' => '⚡',
    ],
    'wevcode' => [
        'name' => 'WEVCODE',
        'endpoint' => '/api/weval-ia',
        'mode' => 'code',
        'type' => 'cloud',
        'icon' => '💻',
    ],
    'manager' => [
        'name' => 'MANAGER',
        'endpoint' => '/api/weval-ia',
        'mode' => 'deep',
        'type' => 'cloud',
        'icon' => '🧠',
    ],
    'ollama_qwen3' => [
        'name' => 'Ollama Qwen3:4b',
        'model' => 'qwen3:4b',
        'type' => 'sovereign',
        'icon' => '🏠',
    ],
    'ollama_08b' => [
        'name' => 'Ollama Qwen3.5:0.8b',
        'model' => 'qwen3.5:0.8b',
        'type' => 'sovereign',
        'icon' => '🪶',
    ],
    'ollama_mistral' => [
        'name' => 'Ollama Mistral',
        'model' => 'mistral:latest',
        'type' => 'sovereign',
        'icon' => '🇫🇷',
    ],
    'opus' => [
        'name' => 'Claude Opus',
        'type' => 'reference',
        'icon' => '👑',
    ],
];

// Benchmark topics: the prompt sent to each AI plus the criteria names that
// the scorer checks the answer against. A criterion written as "length>N" is a
// minimum-length check; every other name selects a keyword family.
$TOPICS = [
    'strategy' => [
        'prompt' => 'Propose une stratégie digitale pour une PME marocaine de 200 employés',
        'criteria' => ['structure', 'actionable', 'maroc_context', 'length>500'],
    ],
    'code' => [
        'prompt' => 'Ecris en Python une classe CsvAnalyzer avec methodes load et describe',
        'criteria' => ['has_class', 'has_def', 'has_docstring', 'runnable'],
    ],
    'pharma' => [
        'prompt' => 'Quelles sont les étapes de la pharmacovigilance pour un nouveau médicament',
        'criteria' => ['pharmacovigilance', 'steps', 'regulatory', 'africa_context'],
    ],
    'security' => [
        'prompt' => 'Liste les 5 vulnérabilités web OWASP Top 10 avec remediations',
        'criteria' => ['owasp', 'injection', 'xss', 'remediation'],
    ],
    'erp' => [
        'prompt' => 'Compare SAP vs Oracle ERP pour une entreprise de 500 employés',
        'criteria' => ['sap', 'oracle', 'comparison', 'recommendation'],
    ],
];

/**
 * Score one AI response against a topic's criteria.
 *
 * The score is the sum of four parts, capped at 100:
 *  - content: 10 points per satisfied criterion;
 *  - length bonus: 5 / 10 / 15 points above 500 / 1500 / 3000 bytes;
 *  - speed bonus: 15 / 10 / 5 points below 1000 / 2000 / 4000 ms, awarded only
 *    for a positive latency (zero or negative is treated as "not measured");
 *  - formatting bonus: up to 10 points for code fences, bold markers, "###"
 *    headers and numbered lists.
 *
 * An empty, whitespace-only or non-string response always scores 0, so a
 * call that failed quickly cannot collect a speed bonus.
 *
 * @param string $resp     Raw response text.
 * @param array  $criteria Criterion names: either a keyword-family name or
 *                         "length>N" for a minimum byte length.
 * @param mixed  $latency  Latency in milliseconds; echoed back unchanged.
 * @return array{score:int, details:array, length:int, latency:mixed}
 */
function score_response($resp, $criteria, $latency) {
    // Keyword families, all lowercase. Built once instead of once per criterion.
    static $keywords = [
        'structure' => ['###','**','1.','2.','3.'],
        'actionable' => ['étape','action','recommand','implément','déploy'],
        'maroc_context' => ['maroc','marocain','casablanca','rabat','pme'],
        'has_class' => ['class '],
        'has_def' => ['def '],
        'has_docstring' => ['"""','\'\'\''],
        'runnable' => ['import ','return '],
        'pharmacovigilance' => ['pharmacovigilance','effet indésirable','signal'],
        'steps' => ['étape','phase','1)','1.','première'],
        'regulatory' => ['amm','autorisation','réglementaire','anpp','ansm'],
        'africa_context' => ['algérie','maroc','tunisie','afrique','maghreb'],
        'owasp' => ['owasp','top 10'],
        'injection' => ['injection','sql injection'],
        'xss' => ['xss','cross-site','script'],
        'remediation' => ['remédiation','correction','protéger','prévenir','mitigation'],
        'sap' => ['sap','s/4hana','s4hana'],
        'oracle' => ['oracle','erp cloud','jd edwards'],
        'comparison' => ['avantage','inconvénient','vs','comparaison','différence'],
        'recommendation' => ['recommand','conseil','préférable','optimal'],
    ];

    if (!is_string($resp)) {
        $resp = '';
    }
    $len = strlen($resp);

    // Nothing to evaluate: no content, length, speed or formatting points.
    if (trim($resp) === '') {
        return ['score' => 0, 'details' => [], 'length' => $len, 'latency' => $latency];
    }

    $score = 0; $details = [];

    // Several keywords carry accents, so lower-case with the multibyte
    // function when it is available; byte-wise strtolower() leaves "É" as is.
    $lower = function_exists('mb_strtolower') ? mb_strtolower($resp, 'UTF-8') : strtolower($resp);

    // Content quality: 10 points per satisfied criterion
    foreach ($criteria as $c) {
        if (strpos($c, '>') !== false) {
            // "length>N" style threshold; other keys before '>' are ignored.
            list($key, $val) = explode('>', $c, 2);
            if ($key === 'length' && $len > intval($val)) { $score += 10; $details[] = "$c:OK"; }
            continue;
        }
        // One hit from the family is enough; unknown criteria score nothing.
        foreach ($keywords[$c] ?? [] as $kw) {
            if (strpos($lower, $kw) !== false) {
                $score += 10;
                $details[] = "$c:OK";
                break;
            }
        }
    }

    // Length bonus (0-15)
    if ($len > 3000) $score += 15;
    elseif ($len > 1500) $score += 10;
    elseif ($len > 500) $score += 5;

    // Speed bonus (0-15). The positive-latency guard wraps the whole ladder so
    // an unmeasured latency cannot fall through to a lower tier.
    if ($latency > 0) {
        if ($latency < 1000) $score += 15;
        elseif ($latency < 2000) $score += 10;
        elseif ($latency < 4000) $score += 5;
    }

    // Formatting bonus (0-10)
    if (preg_match('/```/', $resp)) $score += 3; // Code blocks
    if (preg_match('/\*\*/', $resp)) $score += 2; // Bold
    if (preg_match('/###/', $resp)) $score += 2; // Headers
    if (preg_match('/\d+\./', $resp)) $score += 3; // Numbered lists

    return ['score' => min($score, 100), 'details' => $details, 'length' => $len, 'latency' => $latency];
}

/**
 * POST a prompt to the local WEVIA endpoint and summarise the reply.
 *
 * The request is a JSON body with `message` and `mode`, sent with a 15 second
 * timeout. Transport warnings are suppressed; a failed or empty reply yields
 * an empty response with provider '?'.
 *
 * @param string $prompt Text sent as the `message` field.
 * @param string $mode   Value sent as the `mode` field.
 * @return array{response:mixed, provider:mixed, latency:mixed} `latency` is the
 *         reply's `latency_ms` when present, otherwise the locally measured
 *         wall-clock time in milliseconds.
 */
function call_wevia($prompt, $mode) {
    $startedAt = microtime(true);

    $httpOptions = [
        'method' => 'POST',
        'header' => 'Content-Type: application/json',
        'content' => json_encode(['message' => $prompt, 'mode' => $mode]),
        'timeout' => 15,
    ];
    $context = stream_context_create(['http' => $httpOptions]);
    $raw = @file_get_contents('http://127.0.0.1/api/weval-ia', false, $context);

    // Measured around the whole round trip, including request construction.
    $elapsedMs = intval((microtime(true) - $startedAt) * 1000);

    if (!$raw) {
        return ['response' => '', 'provider' => '?', 'latency' => $elapsedMs];
    }

    $decoded = json_decode($raw, true);

    return [
        'response' => $decoded['response'] ?? '',
        'provider' => $decoded['provider'] ?? '?',
        'latency' => $decoded['latency_ms'] ?? $elapsedMs,
    ];
}

/**
 * POST a prompt to the local Ollama generate endpoint and summarise the reply.
 *
 * The request is non-streaming, limited to 200 predicted tokens and a 512
 * token context, with a 30 second timeout. Transport warnings are suppressed;
 * a failed or empty reply yields an empty response and a throughput of 0.
 *
 * @param string $prompt Text sent as the `prompt` field.
 * @param string $model  Model name; also returned as `provider`.
 * @return array{response:mixed, provider:mixed, latency:int, tps:int|float}
 *         `latency` is the locally measured wall-clock time in milliseconds.
 */
function call_ollama($prompt, $model) {
    $startedAt = microtime(true);

    $body = json_encode([
        'model' => $model,
        'prompt' => $prompt,
        'stream' => false,
        'options' => ['num_predict' => 200, 'num_ctx' => 512],
    ]);
    $context = stream_context_create(['http' => [
        'method' => 'POST',
        'header' => 'Content-Type: application/json',
        'content' => $body,
        'timeout' => 30,
    ]]);
    $raw = @file_get_contents('http://127.0.0.1:11434/api/generate', false, $context);

    $elapsedMs = intval((microtime(true) - $startedAt) * 1000);

    if (!$raw) {
        return ['response' => '', 'provider' => $model, 'latency' => $elapsedMs, 'tps' => 0];
    }

    $decoded = json_decode($raw, true);

    // eval_duration is divided by 1e9 (presumably nanoseconds -> seconds; confirm
    // against the Ollama API) and floored at 0.01 so the division stays bounded.
    $evalCount = $decoded['eval_count'] ?? 0;
    $evalSeconds = max(($decoded['eval_duration'] ?? 1) / 1e9, 0.01);
    $tokensPerSecond = $evalCount / $evalSeconds;

    return [
        'response' => $decoded['response'] ?? '',
        'provider' => $model,
        'latency' => $elapsedMs,
        'tps' => round($tokensPerSecond, 1),
    ];
}

switch ($action) {

case 'benchmark':
    // Runs one topic against the three cloud modes and the local model, appends
    // the run to the store, and refreshes the persisted leaderboard.
    $topic = $_GET['topic'] ?? 'strategy';
    // A non-string topic (e.g. ?topic[]=x) is not a usable array offset.
    if (!is_string($topic) || !isset($TOPICS[$topic])) { echo json_encode(['error' => 'invalid topic']); break; }

    $t = $TOPICS[$topic];
    $results = [];

    // Test cloud AIs (endpoint mode => result key)
    foreach (['fast' => 'wevia_fast', 'code' => 'wevcode', 'deep' => 'manager'] as $mode => $ai_name) {
        $r = call_wevia($t['prompt'], $mode);
        if (is_string($r['response']) && !empty($r['response'])) {
            $s = score_response($r['response'], $t['criteria'], $r['latency']);
            $results[$ai_name] = array_merge($s, ['provider' => $r['provider'], 'response_preview' => mb_substr($r['response'], 0, 200)]);
        } else {
            // Same treatment as the sovereign branch below: a call that returned
            // nothing scores 0 rather than being scored on its latency alone.
            $results[$ai_name] = ['score' => 0, 'details' => ['timeout'], 'length' => 0, 'latency' => $r['latency'], 'provider' => $r['provider'], 'response_preview' => ''];
        }
        usleep(500000); // pause between consecutive cloud calls
    }

    // Test sovereign AIs (with timeout protection)
    foreach (['qwen3.5:0.8b'] as $model) {
        $key = 'ollama_' . str_replace([':', '.'], ['_', ''], $model);
        $r = call_ollama($t['prompt'], $model);
        if (is_string($r['response']) && !empty($r['response'])) {
            $s = score_response($r['response'], $t['criteria'], $r['latency']);
            $results[$key] = array_merge($s, ['provider' => $model, 'tps' => $r['tps'], 'response_preview' => mb_substr($r['response'], 0, 200)]);
        } else {
            $results[$key] = ['score' => 0, 'details' => ['timeout'], 'length' => 0, 'latency' => $r['latency'], 'provider' => $model];
        }
    }

    // Save
    $now = date('c');
    $run = ['topic' => $topic, 'timestamp' => $now, 'results' => $results];
    $db['benchmarks'][] = $run;
    $db['last_run'] = $now;
    $db['total_runs'] = ($db['total_runs'] ?? 0) + 1;

    // Update leaderboard: cumulative score per AI over every stored run,
    // including the one just appended, instead of keeping only the latest run.
    $lb = [];
    foreach ($db['benchmarks'] as $storedRun) {
        foreach (($storedRun['results'] ?? []) as $ai => $stored) {
            $lb[$ai] = ($lb[$ai] ?? 0) + ($stored['score'] ?? 0);
        }
    }
    arsort($lb);
    $db['leaderboard'] = $lb;

    // Model output can contain invalid UTF-8, which makes json_encode() return
    // false; writing false would truncate the store. Substitute bad bytes and
    // only write when encoding succeeded.
    $encodedDb = json_encode($db, JSON_PRETTY_PRINT | JSON_INVALID_UTF8_SUBSTITUTE);
    $saved = $encodedDb !== false && file_put_contents($DB, $encodedDb) !== false;

    echo json_encode(
        ['ok' => true, 'topic' => $topic, 'results' => $results, 'leaderboard' => $lb, 'saved' => $saved],
        JSON_INVALID_UTF8_SUBSTITUTE
    );
    break;

case 'report':
    // Summary view: run counters, the most recent stored run (null when there
    // is none), the stored leaderboard, and the configured topics and AI names.
    $last = end($db['benchmarks']);
    if (!$last) {
        $last = null;
    }

    // Display name of every configured AI, keyed by its identifier.
    $aiNames = [];
    foreach ($AIS as $aiKey => $aiConfig) {
        $aiNames[$aiKey] = $aiConfig['name'];
    }

    $report = [
        'ok' => true,
        'total_runs' => $db['total_runs'],
        'last_run' => $db['last_run'],
        'last_benchmark' => $last,
        'leaderboard' => $db['leaderboard'],
        'topics' => array_keys($TOPICS),
        'ais' => $aiNames,
    ];
    echo json_encode($report);
    break;

case 'history':
    // Returns the most recent `limit` runs (default 10) in stored order, plus
    // the total number of stored runs.
    $limit = max(0, intval($_GET['limit'] ?? 10));
    $allRuns = is_array($db['benchmarks'] ?? null) ? $db['benchmarks'] : [];
    // array_slice($x, -0) returns the whole array and a negative limit would
    // skip from the front, so anything below 1 yields an empty list instead.
    $benchmarks = $limit > 0 ? array_slice($allRuns, -$limit) : [];
    echo json_encode(['ok' => true, 'benchmarks' => $benchmarks, 'total' => count($allRuns)]);
    break;

case 'leaderboard':
    // Compute cumulative scores from all benchmarks
    $scores = []; $counts = [];
    foreach (($db['benchmarks'] ?? []) as $b) {
        // Runs without a results map contribute nothing.
        foreach (($b['results'] ?? []) as $ai => $r) {
            $scores[$ai] = ($scores[$ai] ?? 0) + ($r['score'] ?? 0);
            $counts[$ai] = ($counts[$ai] ?? 0) + 1;
        }
    }

    // Sovereign results are stored under a key derived from the model name
    // ('ollama_' + model with ':' -> '_' and '.' removed), which differs from
    // the $AIS key. Map the derived key back so name/icon/type resolve.
    $modelKeyToAi = [];
    foreach ($AIS as $aiKey => $aiConfig) {
        if (isset($aiConfig['model'])) {
            $modelKeyToAi['ollama_' . str_replace([':', '.'], ['_', ''], $aiConfig['model'])] = $aiKey;
        }
    }

    $lb = [];
    foreach ($scores as $ai => $total) {
        $configKey = isset($AIS[$ai]) ? $ai : ($modelKeyToAi[$ai] ?? null);
        $meta = $configKey !== null ? $AIS[$configKey] : [];
        $lb[] = [
            'ai' => $ai,
            'name' => $meta['name'] ?? $ai,
            'icon' => $meta['icon'] ?? '?',
            'type' => $meta['type'] ?? '?',
            'total_score' => $total,
            'avg_score' => $counts[$ai] > 0 ? round($total / $counts[$ai], 1) : 0,
            'runs' => $counts[$ai]
        ];
    }
    // Highest total first; <=> returns an int even when totals are fractional.
    usort($lb, fn($a, $b) => $b['total_score'] <=> $a['total_score']);
    echo json_encode(['ok' => true, 'leaderboard' => $lb, 'total_benchmarks' => count($db['benchmarks'] ?? [])]);
    break;

case 'improve':
    // Analyze weaknesses and suggest improvements.
    // A result below 40 is recorded as a weakness, one at 70 or above as a
    // strength; every weakness produces one suggestion.
    $weaknesses = []; $strengths = [];
    foreach (($db['benchmarks'] ?? []) as $b) {
        $runTopic = $b['topic'] ?? null;
        foreach (($b['results'] ?? []) as $ai => $r) {
            // Default once, so the comparison and the stored value agree and a
            // result without a score does not raise an undefined-index warning.
            $resultScore = $r['score'] ?? 0;
            if ($resultScore < 40) {
                $weaknesses[$ai][] = ['topic' => $runTopic, 'score' => $resultScore, 'details' => $r['details'] ?? []];
            }
            if ($resultScore >= 70) {
                $strengths[$ai][] = ['topic' => $runTopic, 'score' => $resultScore];
            }
        }
    }

    $suggestions = [];
    foreach ($weaknesses as $ai => $issues) {
        foreach ($issues as $issue) {
            // Advice tiers: exactly 0, below 20, and the rest of the weak range.
            if ($issue['score'] == 0) {
                $advice = 'AI timeout/unavailable - check connectivity';
            } elseif ($issue['score'] < 20) {
                $advice = 'Response too short or intercepted by ToolFK - adjust routing';
            } else {
                $advice = 'Improve prompt engineering for this topic';
            }
            $suggestions[] = [
                'ai' => $ai,
                'topic' => $issue['topic'],
                'current_score' => $issue['score'],
                'suggestion' => $advice
            ];
        }
    }

    echo json_encode(['ok' => true, 'weaknesses' => $weaknesses, 'strengths' => $strengths, 'suggestions' => $suggestions]);
    break;
}