<?php
/**
 * ╔═══════════════════════════════════════════════════════════════════════════╗
 * ║           WEVIA GPU ROTATION + CROSS-VERIFICATION ENGINE                  ║
 * ║           Local Ollama (S88) → Verify with Cerebras + Groq                ║
 * ║           2026-03-03                                                      ║
 * ╚═══════════════════════════════════════════════════════════════════════════╝
 *
 * ARCHITECTURE:
 *   1. Primary:    Ollama local GPU (0ms network, sovereign)
 *   2. Verify:     Cerebras (ultrafast) + Groq (fast) cross-check
 *   3. Synthesize: Best answer wins, merge insights
 *
 * GPU TIERS (RTX 4000 Ada 20GB):
 *   VRAM-fit (fast):   qwen2.5:14b, deepseek-v2:16b, granite3.1-dense:8b
 *   CPU+RAM (quality): deepseek-r1:70b, qwen2.5:72b, nemotron:70b
 *   Code-fast:         qwen2.5-coder:7b
 *   Vision:            llama3.2-vision:11b
 */
|
|
|
|
define('OLLAMA_URL', 'http://127.0.0.1:11434');
|
|
define('GPU_TIMEOUT_FAST', 15);
|
|
define('GPU_TIMEOUT_QUALITY', 45);
|
|
define('VERIFY_TIMEOUT', 10);
// ═══════════════════════════════════════════════════════════════════════════
// GPU MODEL ROTATION — Intent-based optimal model selection
// ═══════════════════════════════════════════════════════════════════════════
/**
|
|
* Select best Ollama model per intent + complexity
|
|
* Returns array: [primary_model, fallback_model, timeout]
|
|
*/
|
|
function gpuSelectModel($intent, $complexity = 'moderate', $msgLen = 0) {
|
|
// VRAM-fit models (fast, <20GB) — primary choices
|
|
$fastModels = [
|
|
'qwen2.5:14b', // 9GB — excellent general + reasoning
|
|
'deepseek-v2:16b', // 9GB — strong general
|
|
'granite3.1-dense:8b', // 5GB — fast, good quality
|
|
'qwen2.5-coder:7b', // 5GB — code specialist
|
|
'orca2:13b', // 7GB — instruction following
|
|
];
|
|
|
|
// CPU+RAM models (slower but higher quality, need 64GB RAM)
|
|
$qualityModels = [
|
|
'deepseek-r1:70b', // 42GB — deep reasoning champion
|
|
'qwen2.5:72b', // 47GB — strong multi-domain
|
|
'nemotron:70b', // 42GB — NVIDIA, balanced
|
|
'aya:35b', // 20GB — multilingual (FR/AR/EN)
|
|
'vicuna:33b', // 18GB — conversational
|
|
];
|
|
|
|
// Expert models per domain
|
|
$codeModels = ['qwen2.5-coder:7b', 'deepseek-v2:16b', 'granite3.1-dense:8b'];
|
|
$reasonModels = ['deepseek-r1:70b', 'qwen2.5:72b', 'qwen2.5:14b'];
|
|
$creativeModels = ['wizardlm2:8x22b', 'dolphin-mixtral:8x22b', 'nous-hermes2:34b'];
|
|
$multilingModels = ['aya:35b', 'qwen2.5:14b', 'command-r-plus:104b'];
|
|
|
|
$primary = $intent; // normalize
|
|
if (is_array($intent)) $primary = $intent['primary'] ?? 'general';
|
|
|
|
// Route by intent
|
|
switch ($primary) {
|
|
case 'code':
|
|
case 'technical':
|
|
case 'operational':
|
|
if ($complexity === 'complex') return [$codeModels[0], $reasonModels[2], GPU_TIMEOUT_FAST];
|
|
return [$codeModels[0], $codeModels[1], GPU_TIMEOUT_FAST];
|
|
|
|
case 'analytical':
|
|
case 'mathematical':
|
|
case 'causal':
|
|
if ($complexity === 'complex') return [$reasonModels[0], $reasonModels[1], GPU_TIMEOUT_QUALITY];
|
|
return [$reasonModels[2], $fastModels[0], GPU_TIMEOUT_FAST];
|
|
|
|
case 'creative':
|
|
return [$fastModels[0], $fastModels[1], GPU_TIMEOUT_FAST];
|
|
|
|
case 'strategic':
|
|
case 'consulting':
|
|
case 'compliance':
|
|
if ($complexity === 'complex') return [$reasonModels[0], $reasonModels[1], GPU_TIMEOUT_QUALITY];
|
|
return [$fastModels[0], $fastModels[1], GPU_TIMEOUT_FAST];
|
|
|
|
case 'teaching':
|
|
case 'social_intelligence':
|
|
return [$multilingModels[1], $fastModels[0], GPU_TIMEOUT_FAST];
|
|
|
|
case 'greeting':
|
|
case 'conversational':
|
|
return [$fastModels[2], $fastModels[0], GPU_TIMEOUT_FAST]; // Fastest
|
|
|
|
default:
|
|
return [$fastModels[0], $fastModels[1], GPU_TIMEOUT_FAST];
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Rotate model on retry — avoid reusing failed model
|
|
*/
|
|
function gpuRotateModel($failedModel, $intent) {
|
|
$allModels = [
|
|
'qwen2.5:14b', 'deepseek-v2:16b', 'granite3.1-dense:8b',
|
|
'qwen2.5-coder:7b', 'orca2:13b', 'aya:35b',
|
|
];
|
|
foreach ($allModels as $m) {
|
|
if ($m !== $failedModel) return $m;
|
|
}
|
|
return 'granite3.1-dense:8b'; // Ultimate fallback (small, fast)
|
|
}
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
// GPU CALL — Local Ollama API
// ═══════════════════════════════════════════════════════════════════════════
/**
|
|
* Call Ollama local model
|
|
*/
|
|
function gpuCallOllama($model, $systemPrompt, $userMessage, $history = [], $timeout = 15, $temperature = 0.4, $maxTokens = 2048) {
|
|
$messages = [];
|
|
if ($systemPrompt) $messages[] = ['role' => 'system', 'content' => $systemPrompt];
|
|
|
|
// Add history (last 6 messages max)
|
|
if (!empty($history)) {
|
|
$recent = array_slice($history, -6);
|
|
foreach ($recent as $h) {
|
|
if (isset($h['role'], $h['content'])) {
|
|
$messages[] = ['role' => $h['role'], 'content' => mb_substr($h['content'], 0, 1000)];
|
|
}
|
|
}
|
|
}
|
|
|
|
$messages[] = ['role' => 'user', 'content' => $userMessage];
|
|
|
|
$payload = json_encode([
|
|
'model' => $model,
|
|
'messages' => $messages,
|
|
'stream' => false,
|
|
'options' => [
|
|
'temperature' => $temperature,
|
|
'num_predict' => $maxTokens,
|
|
'top_p' => 0.9,
|
|
'repeat_penalty' => 1.1,
|
|
]
|
|
], JSON_UNESCAPED_UNICODE);
|
|
|
|
$ch = curl_init(OLLAMA_URL . '/api/chat');
|
|
curl_setopt_array($ch, [
|
|
CURLOPT_POST => true,
|
|
CURLOPT_POSTFIELDS => $payload,
|
|
CURLOPT_HTTPHEADER => ['Content-Type: application/json'],
|
|
CURLOPT_RETURNTRANSFER => true,
|
|
CURLOPT_TIMEOUT => $timeout,
|
|
CURLOPT_CONNECTTIMEOUT => 3,
|
|
]);
|
|
|
|
$start = microtime(true);
|
|
$result = curl_exec($ch);
|
|
$latency = round((microtime(true) - $start) * 1000);
|
|
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
|
|
$error = curl_error($ch);
|
|
curl_close($ch);
|
|
|
|
if ($httpCode !== 200 || !$result) {
|
|
error_log("WEVIA_GPU: FAIL model=$model http=$httpCode err=$error lat={$latency}ms");
|
|
return null;
|
|
}
|
|
|
|
$data = json_decode($result, true);
|
|
$content = $data['message']['content'] ?? '';
|
|
|
|
if (empty(trim($content))) {
|
|
error_log("WEVIA_GPU: EMPTY model=$model lat={$latency}ms");
|
|
return null;
|
|
}
|
|
|
|
error_log("WEVIA_GPU: OK model=$model lat={$latency}ms len=" . mb_strlen($content));
|
|
|
|
return [
|
|
'content' => $content,
|
|
'model' => $model,
|
|
'latency_ms' => $latency,
|
|
'tokens_est' => (int)(mb_strlen($content) / 3.5),
|
|
'source' => 'gpu_local',
|
|
];
|
|
}
|
|
|
|
/**
|
|
* GPU call with rotation fallback
|
|
*/
|
|
function gpuCallWithFallback($intent, $systemPrompt, $userMessage, $history = [], $complexity = 'moderate') {
|
|
[$primary, $fallback, $timeout] = gpuSelectModel($intent, $complexity, mb_strlen($userMessage));
|
|
|
|
// Try primary
|
|
$result = gpuCallOllama($primary, $systemPrompt, $userMessage, $history, $timeout);
|
|
if ($result) return $result;
|
|
|
|
// Try fallback
|
|
error_log("WEVIA_GPU_ROTATE: primary=$primary failed, trying fallback=$fallback");
|
|
$result = gpuCallOllama($fallback, $systemPrompt, $userMessage, $history, GPU_TIMEOUT_FAST);
|
|
if ($result) return $result;
|
|
|
|
// Try rotation
|
|
$rotated = gpuRotateModel($fallback, $intent);
|
|
error_log("WEVIA_GPU_ROTATE: fallback=$fallback failed, trying rotation=$rotated");
|
|
return gpuCallOllama($rotated, $systemPrompt, $userMessage, $history, GPU_TIMEOUT_FAST);
|
|
}
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
// CROSS-VERIFICATION ENGINE — Cerebras + Groq check GPU response
// ═══════════════════════════════════════════════════════════════════════════
/**
|
|
* Build verification prompt for cross-checking
|
|
*/
|
|
function buildVerificationPrompt($originalQuestion, $gpuResponse) {
|
|
$excerpt = mb_substr($gpuResponse, 0, 1500);
|
|
return "Vérifie cette réponse et corrige UNIQUEMENT les erreurs factuelles ou techniques. Ne reformule PAS si c'est correct. Réponds en 2-3 phrases max.
|
|
|
|
QUESTION: {$originalQuestion}
|
|
|
|
RÉPONSE À VÉRIFIER:
|
|
{$excerpt}
|
|
|
|
Si la réponse est correcte, dis simplement 'VÉRIFIÉ_OK'. Si elle contient des erreurs, liste-les brièvement.";
|
|
}
|
|
|
|
/**
|
|
* Call cloud provider for verification (lightweight, fast)
|
|
*/
|
|
function cloudVerifyCall($provider, $key, $model, $url, $prompt, $timeout = 10) {
|
|
$payload = json_encode([
|
|
'model' => $model,
|
|
'messages' => [
|
|
['role' => 'system', 'content' => 'Tu es un vérificateur technique. Vérifie et corrige uniquement les erreurs factuelles. Sois bref.'],
|
|
['role' => 'user', 'content' => $prompt]
|
|
],
|
|
'max_tokens' => 300,
|
|
'temperature' => 0.1,
|
|
], JSON_UNESCAPED_UNICODE);
|
|
|
|
$ch = curl_init($url);
|
|
curl_setopt_array($ch, [
|
|
CURLOPT_POST => true,
|
|
CURLOPT_POSTFIELDS => $payload,
|
|
CURLOPT_HTTPHEADER => [
|
|
'Content-Type: application/json',
|
|
'Authorization: Bearer ' . $key,
|
|
],
|
|
CURLOPT_RETURNTRANSFER => true,
|
|
CURLOPT_TIMEOUT => $timeout,
|
|
CURLOPT_CONNECTTIMEOUT => 3,
|
|
]);
|
|
|
|
$start = microtime(true);
|
|
$result = curl_exec($ch);
|
|
$latency = round((microtime(true) - $start) * 1000);
|
|
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
|
|
curl_close($ch);
|
|
|
|
if ($httpCode !== 200 || !$result) {
|
|
error_log("WEVIA_VERIFY: FAIL provider=$provider http=$httpCode lat={$latency}ms");
|
|
return null;
|
|
}
|
|
|
|
$data = json_decode($result, true);
|
|
$content = $data['choices'][0]['message']['content'] ?? '';
|
|
error_log("WEVIA_VERIFY: OK provider=$provider lat={$latency}ms verdict=" . mb_substr($content, 0, 50));
|
|
|
|
return [
|
|
'content' => $content,
|
|
'provider' => $provider,
|
|
'latency_ms' => $latency,
|
|
];
|
|
}
|
|
|
|
/**
|
|
* Cross-verify GPU response with Cerebras AND Groq in parallel (via multi_curl)
|
|
*/
|
|
function crossVerifyResponse($originalQuestion, $gpuResponse, $providers = []) {
|
|
if (mb_strlen($gpuResponse) < 100) return ['verified' => true, 'corrections' => [], 'method' => 'skip_short'];
|
|
|
|
$prompt = buildVerificationPrompt($originalQuestion, $gpuResponse);
|
|
$verifications = [];
|
|
|
|
// Get providers from global if not passed
|
|
if (empty($providers)) {
|
|
global $PROVIDERS;
|
|
$providers = $PROVIDERS ?? [];
|
|
}
|
|
|
|
// Cerebras verification
|
|
if (isset($providers['cerebras'])) {
|
|
$v = cloudVerifyCall(
|
|
'cerebras',
|
|
$providers['cerebras']['key'],
|
|
$providers['cerebras']['model'],
|
|
$providers['cerebras']['url'],
|
|
$prompt,
|
|
VERIFY_TIMEOUT
|
|
);
|
|
if ($v) $verifications[] = $v;
|
|
}
|
|
|
|
// Groq verification
|
|
if (isset($providers['groq'])) {
|
|
$v = cloudVerifyCall(
|
|
'groq',
|
|
$providers['groq']['key'],
|
|
$providers['groq']['model'],
|
|
$providers['groq']['url'],
|
|
$prompt,
|
|
VERIFY_TIMEOUT
|
|
);
|
|
if ($v) $verifications[] = $v;
|
|
}
|
|
|
|
// Analyze verifications
|
|
$corrections = [];
|
|
$allVerified = true;
|
|
|
|
foreach ($verifications as $v) {
|
|
$content = $v['content'] ?? '';
|
|
if (stripos($content, 'VÉRIFIÉ_OK') !== false || stripos($content, 'VERIFIE_OK') !== false || stripos($content, 'correcte') !== false) {
|
|
continue; // This verifier says OK
|
|
}
|
|
$allVerified = false;
|
|
$corrections[] = [
|
|
'provider' => $v['provider'],
|
|
'correction' => mb_substr($content, 0, 500),
|
|
'latency_ms' => $v['latency_ms'],
|
|
];
|
|
}
|
|
|
|
return [
|
|
'verified' => $allVerified,
|
|
'corrections' => $corrections,
|
|
'verifiers_count' => count($verifications),
|
|
'method' => 'cerebras+groq',
|
|
];
|
|
}
|
|
|
|
/**
|
|
* Apply corrections from verification to the GPU response
|
|
*/
|
|
function applyVerificationCorrections($gpuResponse, $verificationResult) {
|
|
if ($verificationResult['verified'] || empty($verificationResult['corrections'])) {
|
|
return $gpuResponse; // No corrections needed
|
|
}
|
|
|
|
// Append correction notes
|
|
$correctionNotes = [];
|
|
foreach ($verificationResult['corrections'] as $c) {
|
|
$correctionNotes[] = $c['correction'];
|
|
}
|
|
|
|
// If corrections found, append a refined note
|
|
$correctionText = implode("\n", $correctionNotes);
|
|
|
|
// Don't append if corrections are too vague
|
|
if (mb_strlen($correctionText) < 20) return $gpuResponse;
|
|
|
|
// Smart merge: only add if corrections contain specific fixes
|
|
if (preg_match('/(erreur|incorrect|faux|attention|devrait|plutôt|correction|en fait)/i', $correctionText)) {
|
|
$gpuResponse .= "\n\n---\n📋 **Vérification croisée (Cerebras/Groq):** " . mb_substr($correctionText, 0, 300);
|
|
}
|
|
|
|
return $gpuResponse;
|
|
}
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
// MASTER PIPELINE — GPU → Verify → Synthesize
// ═══════════════════════════════════════════════════════════════════════════
/**
|
|
* Full sovereign pipeline:
|
|
* 1. GPU local (Ollama) generates primary response
|
|
* 2. Cerebras + Groq verify in parallel
|
|
* 3. Apply corrections if needed
|
|
* 4. Return enriched response
|
|
*/
|
|
function sovereignGPUPipeline($intent, $systemPrompt, $userMessage, $history = [], $providers = []) {
|
|
$start = microtime(true);
|
|
$complexity = function_exists('calculateComplexity') ? calculateComplexity($userMessage) : 'moderate';
|
|
|
|
// Step 1: GPU primary response
|
|
$gpuResult = gpuCallWithFallback($intent, $systemPrompt, $userMessage, $history, $complexity);
|
|
|
|
if (!$gpuResult) {
|
|
error_log("WEVIA_SOVEREIGN: GPU pipeline failed — all models exhausted");
|
|
return null; // Let cloud fallback handle it
|
|
}
|
|
|
|
$response = $gpuResult['content'];
|
|
|
|
// Step 2: Cross-verify (skip for greetings, very short queries, or widget mode)
|
|
$shouldVerify = (
|
|
$intent !== 'greeting' &&
|
|
$intent !== 'conversational' &&
|
|
mb_strlen($userMessage) >= 30 &&
|
|
mb_strlen($response) >= 100 &&
|
|
$complexity !== 'simple'
|
|
);
|
|
|
|
$verification = ['verified' => true, 'corrections' => [], 'method' => 'skip'];
|
|
|
|
if ($shouldVerify) {
|
|
$verification = crossVerifyResponse($userMessage, $response, $providers);
|
|
|
|
// Step 3: Apply corrections
|
|
if (!$verification['verified']) {
|
|
$response = applyVerificationCorrections($response, $verification);
|
|
}
|
|
}
|
|
|
|
$totalLatency = round((microtime(true) - $start) * 1000);
|
|
|
|
error_log(sprintf(
|
|
"WEVIA_SOVEREIGN: model=%s gpu_lat=%dms verified=%s corrections=%d total=%dms complexity=%s",
|
|
$gpuResult['model'],
|
|
$gpuResult['latency_ms'],
|
|
$verification['verified'] ? 'YES' : 'NO',
|
|
count($verification['corrections']),
|
|
$totalLatency,
|
|
$complexity
|
|
));
|
|
|
|
return [
|
|
'content' => $response,
|
|
'model' => $gpuResult['model'],
|
|
'source' => 'sovereign_gpu',
|
|
'gpu_latency_ms' => $gpuResult['latency_ms'],
|
|
'verification' => $verification,
|
|
'total_latency_ms' => $totalLatency,
|
|
'complexity' => $complexity,
|
|
];
|
|
}
|
|
|
|
/**
|
|
* Determine if sovereign GPU should be used (vs cloud-first)
|
|
*/
|
|
function shouldUseSovereignGPU($intent, $msg, $mode = 'full') {
|
|
// Widget mode = always cloud (faster)
|
|
if ($mode === 'widget' || $mode === 'fast') return false;
|
|
|
|
// Greetings = cloud (faster)
|
|
if ($intent === 'greeting') return false;
|
|
|
|
// Very short messages = cloud
|
|
if (mb_strlen(trim($msg)) < 20) return false;
|
|
|
|
// Complex queries benefit from GPU reasoning models
|
|
if (function_exists('calculateComplexity')) {
|
|
$complexity = calculateComplexity($msg);
|
|
if ($complexity === 'complex') return true;
|
|
}
|
|
|
|
// Code, analysis, reasoning = GPU preferred
|
|
if (in_array($intent, ['code', 'technical', 'analytical', 'mathematical', 'causal', 'strategic', 'compliance'])) {
|
|
return true;
|
|
}
|
|
|
|
// Default: alternate GPU/Cloud for load distribution
|
|
return (time() % 3 !== 0); // ~66% GPU, ~33% cloud
|
|
}
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
// GPU HEALTH & MONITORING
// ═══════════════════════════════════════════════════════════════════════════
function gpuHealthCheck() {
|
|
$ch = curl_init(OLLAMA_URL . '/api/tags');
|
|
curl_setopt_array($ch, [
|
|
CURLOPT_RETURNTRANSFER => true,
|
|
CURLOPT_TIMEOUT => 3,
|
|
]);
|
|
$result = curl_exec($ch);
|
|
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
|
|
curl_close($ch);
|
|
|
|
if ($httpCode !== 200) return ['status' => 'down', 'models' => 0];
|
|
|
|
$data = json_decode($result, true);
|
|
$models = $data['models'] ?? [];
|
|
|
|
return [
|
|
'status' => 'up',
|
|
'models' => count($models),
|
|
'fast_models' => count(array_filter($models, fn($m) => ($m['size'] ?? 0) < 20e9)),
|
|
'quality_models' => count(array_filter($models, fn($m) => ($m['size'] ?? 0) >= 20e9)),
|
|
];
|
|
}
|
|
|
|
function gpuGetLoadedModel() {
|
|
$ch = curl_init(OLLAMA_URL . '/api/ps');
|
|
curl_setopt_array($ch, [
|
|
CURLOPT_RETURNTRANSFER => true,
|
|
CURLOPT_TIMEOUT => 3,
|
|
]);
|
|
$result = curl_exec($ch);
|
|
curl_close($ch);
|
|
|
|
$data = json_decode($result, true);
|
|
return $data['models'][0]['name'] ?? null;
|
|
}
|
|
|
|
/**
|
|
* Prefer currently loaded model to avoid swap latency
|
|
*/
|
|
function gpuPreferLoaded($selected, $intent) {
|
|
$loaded = gpuGetLoadedModel();
|
|
if (!$loaded) return $selected;
|
|
|
|
// If loaded model is suitable for this intent, prefer it
|
|
$fastModels = ['qwen2.5:14b', 'deepseek-v2:16b', 'granite3.1-dense:8b', 'qwen2.5-coder:7b', 'orca2:13b'];
|
|
if (in_array($loaded, $fastModels)) {
|
|
// Code intent needs code model
|
|
if (in_array($intent, ['code', 'technical']) && $loaded === 'qwen2.5-coder:7b') return $loaded;
|
|
// If loaded is a fast general model, reuse it to avoid swap
|
|
if (!in_array($intent, ['code', 'technical'])) return $loaded;
|
|
}
|
|
|
|
return $selected;
|
|
}
|