'system', 'content' => $systemPrompt];
// NOTE(review): gpuCallOllama()'s signature sits above this chunk; the visible part
// builds the /api/chat payload, executes it against the local Ollama daemon, and
// normalizes the result. $temperature / $maxTokens are presumably parameters or
// locals defined in the unseen prefix — confirm against the full file.

// Append conversation history (last 6 messages max, each capped at 1000 chars
// to bound prompt size), then the current user message.
if (!empty($history)) {
    $recent = array_slice($history, -6);
    foreach ($recent as $h) {
        if (isset($h['role'], $h['content'])) {
            $messages[] = ['role' => $h['role'], 'content' => mb_substr($h['content'], 0, 1000)];
        }
    }
}
$messages[] = ['role' => 'user', 'content' => $userMessage];

$payload = json_encode([
    'model'    => $model,
    'messages' => $messages,
    'stream'   => false,          // single blocking response; no SSE chunks
    'options'  => [
        'temperature'    => $temperature,
        'num_predict'    => $maxTokens,
        'top_p'          => 0.9,
        'repeat_penalty' => 1.1,
    ]
], JSON_UNESCAPED_UNICODE);

$ch = curl_init(OLLAMA_URL . '/api/chat');
curl_setopt_array($ch, [
    CURLOPT_POST           => true,
    CURLOPT_POSTFIELDS     => $payload,
    CURLOPT_HTTPHEADER     => ['Content-Type: application/json'],
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_TIMEOUT        => $timeout,
    CURLOPT_CONNECTTIMEOUT => 3,
]);

$start    = microtime(true);
$result   = curl_exec($ch);
$latency  = round((microtime(true) - $start) * 1000); // wall-clock ms for logs
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
$error    = curl_error($ch);
curl_close($ch);

// Transport failure or non-200 → null so the caller can rotate models.
if ($httpCode !== 200 || !$result) {
    error_log("WEVIA_GPU: FAIL model=$model http=$httpCode err=$error lat={$latency}ms");
    return null;
}

$data    = json_decode($result, true);
$content = $data['message']['content'] ?? '';
// An empty/whitespace-only completion is treated as a failure too.
if (trim($content) === '') {
    error_log("WEVIA_GPU: EMPTY model=$model lat={$latency}ms");
    return null;
}

error_log("WEVIA_GPU: OK model=$model lat={$latency}ms len=" . mb_strlen($content));
return [
    'content'    => $content,
    'model'      => $model,
    'latency_ms' => $latency,
    'tokens_est' => (int)(mb_strlen($content) / 3.5), // rough chars-per-token heuristic
    'source'     => 'gpu_local',
];
}

/**
 * GPU call with rotation fallback.
 *
 * Tries the intent-selected primary model, then the configured fallback, then a
 * rotated alternative — both retries use the short GPU_TIMEOUT_FAST budget.
 * Returns the first successful gpuCallOllama() result array, or null when every
 * model fails.
 */
function gpuCallWithFallback($intent, $systemPrompt, $userMessage, $history = [], $complexity = 'moderate') {
    [$primary, $fallback, $timeout] = gpuSelectModel($intent, $complexity, mb_strlen($userMessage));

    // Attempt 1: primary model with its full timeout budget.
    $result = gpuCallOllama($primary, $systemPrompt, $userMessage, $history, $timeout);
    if ($result) return $result;

    // Attempt 2: fallback model, fast timeout.
    error_log("WEVIA_GPU_ROTATE: primary=$primary failed, trying fallback=$fallback");
    $result = gpuCallOllama($fallback, $systemPrompt, $userMessage, $history, GPU_TIMEOUT_FAST);
    if ($result) return $result;

    // Attempt 3: rotate to yet another model for this intent.
    $rotated = gpuRotateModel($fallback, $intent);
    error_log("WEVIA_GPU_ROTATE: fallback=$fallback failed, trying rotation=$rotated");
    return gpuCallOllama($rotated, $systemPrompt, $userMessage, $history, GPU_TIMEOUT_FAST);
}

// ═══════════════════════════════════════════════════════════════════════════
// CROSS-VERIFICATION ENGINE — Cerebras + Groq check GPU response
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Build the French verification prompt used to cross-check a GPU answer.
 * Only the first 1500 chars of the GPU response are included to keep the
 * verification call cheap.
 */
function buildVerificationPrompt($originalQuestion, $gpuResponse) {
    $excerpt = mb_substr($gpuResponse, 0, 1500);
    return "Vérifie cette réponse et corrige UNIQUEMENT les erreurs factuelles ou techniques. Ne reformule PAS si c'est correct. Réponds en 2-3 phrases max. QUESTION: {$originalQuestion} RÉPONSE À VÉRIFIER: {$excerpt} Si la réponse est correcte, dis simplement 'VÉRIFIÉ_OK'. 
Si elle contient des erreurs, liste-les brièvement.";
}

/**
 * Call one cloud provider (OpenAI-compatible chat endpoint) for verification.
 * Lightweight: low temperature, 300-token cap, short timeout.
 *
 * @return array{content:string,provider:string,latency_ms:float}|null null on HTTP/transport failure
 */
function cloudVerifyCall($provider, $key, $model, $url, $prompt, $timeout = 10) {
    $payload = json_encode([
        'model'    => $model,
        'messages' => [
            ['role' => 'system', 'content' => 'Tu es un vérificateur technique. Vérifie et corrige uniquement les erreurs factuelles. Sois bref.'],
            ['role' => 'user', 'content' => $prompt]
        ],
        'max_tokens'  => 300,
        'temperature' => 0.1,
    ], JSON_UNESCAPED_UNICODE);

    $ch = curl_init($url);
    curl_setopt_array($ch, [
        CURLOPT_POST           => true,
        CURLOPT_POSTFIELDS     => $payload,
        CURLOPT_HTTPHEADER     => [
            'Content-Type: application/json',
            'Authorization: Bearer ' . $key,
        ],
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT        => $timeout,
        CURLOPT_CONNECTTIMEOUT => 3,
    ]);

    $start    = microtime(true);
    $result   = curl_exec($ch);
    $latency  = round((microtime(true) - $start) * 1000);
    $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    if ($httpCode !== 200 || !$result) {
        error_log("WEVIA_VERIFY: FAIL provider=$provider http=$httpCode lat={$latency}ms");
        return null;
    }

    $data    = json_decode($result, true);
    $content = $data['choices'][0]['message']['content'] ?? '';
    error_log("WEVIA_VERIFY: OK provider=$provider lat={$latency}ms verdict=" . mb_substr($content, 0, 50));
    return [
        'content'    => $content,
        'provider'   => $provider,
        'latency_ms' => $latency,
    ];
}

/**
 * Cross-verify a GPU response with Cerebras and Groq (called sequentially —
 * despite the original comment, this does NOT use curl_multi; each provider
 * is queried one after the other).
 *
 * Returns ['verified' => bool, 'corrections' => list, 'verifiers_count' => int, 'method' => string].
 * Responses under 100 chars are skipped outright.
 */
function crossVerifyResponse($originalQuestion, $gpuResponse, $providers = []) {
    if (mb_strlen($gpuResponse) < 100) {
        return ['verified' => true, 'corrections' => [], 'method' => 'skip_short'];
    }

    $prompt        = buildVerificationPrompt($originalQuestion, $gpuResponse);
    $verifications = [];

    // Fall back to the globally-configured provider table when none is passed.
    if (empty($providers)) {
        global $PROVIDERS;
        $providers = $PROVIDERS ?? [];
    }

    foreach (['cerebras', 'groq'] as $name) {
        if (!isset($providers[$name])) continue;
        $v = cloudVerifyCall(
            $name,
            $providers[$name]['key'],
            $providers[$name]['model'],
            $providers[$name]['url'],
            $prompt,
            VERIFY_TIMEOUT
        );
        if ($v) $verifications[] = $v;
    }

    // Tally verdicts. A verifier counts as "OK" when it emits the sentinel or
    // says the answer is correct.
    $corrections = [];
    $allVerified = true;
    foreach ($verifications as $v) {
        $content = $v['content'] ?? '';
        // BUGFIX: a bare 'correcte' substring also matches 'incorrecte', which
        // previously made explicitly NEGATIVE verdicts pass as verified. We now
        // require 'correcte' without an 'incorrecte' occurrence.
        $saysOk = stripos($content, 'VÉRIFIÉ_OK') !== false
            || stripos($content, 'VERIFIE_OK') !== false
            || (stripos($content, 'correcte') !== false && stripos($content, 'incorrecte') === false);
        if ($saysOk) {
            continue; // This verifier approves the response.
        }
        $allVerified   = false;
        $corrections[] = [
            'provider'   => $v['provider'],
            'correction' => mb_substr($content, 0, 500),
            'latency_ms' => $v['latency_ms'],
        ];
    }

    return [
        'verified'        => $allVerified,
        'corrections'     => $corrections,
        'verifiers_count' => count($verifications),
        'method'          => 'cerebras+groq',
    ];
}

/**
 * Append cross-verification correction notes to the GPU response when the
 * verifiers flagged concrete issues. Returns the response unchanged when it
 * was verified, when corrections are too short/vague, or when no correction
 * contains a concrete-fix keyword.
 */
function applyVerificationCorrections($gpuResponse, $verificationResult) {
    if ($verificationResult['verified'] || empty($verificationResult['corrections'])) {
        return $gpuResponse; // Nothing to apply.
    }

    $correctionNotes = [];
    foreach ($verificationResult['corrections'] as $c) {
        $correctionNotes[] = $c['correction'];
    }
    $correctionText = implode("\n", $correctionNotes);

    // Too short to be a meaningful correction — ignore.
    if (mb_strlen($correctionText) < 20) return $gpuResponse;

    // Only append when the text actually names a specific problem/fix.
    // FIX: added the 'u' modifier so the case-insensitive match handles the
    // accented 'plutôt' (and any UTF-8 input) correctly.
    if (preg_match('/(erreur|incorrect|faux|attention|devrait|plutôt|correction|en fait)/iu', $correctionText)) {
        $gpuResponse .= "\n\n---\n📋 **Vérification croisée (Cerebras/Groq):** " . mb_substr($correctionText, 0, 300);
    }
    return $gpuResponse;
}

// ═══════════════════════════════════════════════════════════════════════════
// MASTER PIPELINE — GPU → Verify → Synthesize
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Full sovereign pipeline:
 *  1. Local GPU (Ollama) generates the primary response.
 *  2. Cerebras + Groq cross-verify it (skipped for greetings / short / simple).
 *  3. Corrections are appended when verifiers disagree.
 *  4. Returns the enriched response array, or null so the caller can fall back
 *     to a cloud-only path.
 */
function sovereignGPUPipeline($intent, $systemPrompt, $userMessage, $history = [], $providers = []) {
    $start      = microtime(true);
    $complexity = function_exists('calculateComplexity') ? calculateComplexity($userMessage) : 'moderate';

    // Step 1: GPU primary response (with model rotation on failure).
    $gpuResult = gpuCallWithFallback($intent, $systemPrompt, $userMessage, $history, $complexity);
    if (!$gpuResult) {
        error_log("WEVIA_SOVEREIGN: GPU pipeline failed — all models exhausted");
        return null; // Let cloud fallback handle it.
    }
    $response = $gpuResult['content'];

    // Step 2: cross-verify — but not for greetings, chit-chat, very short
    // queries/answers, or queries classified as simple.
    $shouldVerify = (
        $intent !== 'greeting' &&
        $intent !== 'conversational' &&
        mb_strlen($userMessage) >= 30 &&
        mb_strlen($response) >= 100 &&
        $complexity !== 'simple'
    );

    $verification = ['verified' => true, 'corrections' => [], 'method' => 'skip'];
    if ($shouldVerify) {
        $verification = crossVerifyResponse($userMessage, $response, $providers);
        // Step 3: apply corrections when any verifier dissented.
        if (!$verification['verified']) {
            $response = applyVerificationCorrections($response, $verification);
        }
    }

    $totalLatency = round((microtime(true) - $start) * 1000);
    error_log(sprintf(
        "WEVIA_SOVEREIGN: model=%s gpu_lat=%dms verified=%s corrections=%d total=%dms complexity=%s",
        $gpuResult['model'],
        $gpuResult['latency_ms'],
        $verification['verified'] ? 'YES' : 'NO',
        count($verification['corrections']),
        $totalLatency,
        $complexity
    ));

    return [
        'content'          => $response,
        'model'            => $gpuResult['model'],
        'source'           => 'sovereign_gpu',
        'gpu_latency_ms'   => $gpuResult['latency_ms'],
        'verification'     => $verification,
        'total_latency_ms' => $totalLatency,
        'complexity'       => $complexity,
    ];
}

/**
 * Route decision: should this request go through the sovereign GPU pipeline
 * (vs cloud-first)? Cheap/latency-sensitive traffic goes to the cloud;
 * complex/technical work prefers the GPU; the remainder is load-balanced
 * ~66% GPU / ~33% cloud on a time-based rotation.
 */
function shouldUseSovereignGPU($intent, $msg, $mode = 'full') {
    // Widget/fast modes and greetings = always cloud (lower latency).
    if ($mode === 'widget' || $mode === 'fast') return false;
    if ($intent === 'greeting') return false;

    // Very short messages = cloud.
    if (mb_strlen(trim($msg)) < 20) return false;

    // Complex queries benefit from GPU reasoning models.
    if (function_exists('calculateComplexity')) {
        $complexity = calculateComplexity($msg);
        if ($complexity === 'complex') return true;
    }

    // Code, analysis, reasoning = GPU preferred. (strict in_array: all strings)
    if (in_array($intent, ['code', 'technical', 'analytical', 'mathematical', 'causal', 'strategic', 'compliance'], true)) {
        return true;
    }

    // Default: alternate GPU/Cloud for load distribution (~66% GPU).
    return (time() % 3 !== 0);
}

// ═══════════════════════════════════════════════════════════════════════════
// GPU HEALTH & MONITORING
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Ping Ollama's /api/tags endpoint and summarize availability.
 * Models under 20 GB count as "fast", 20 GB and over as "quality".
 */
function gpuHealthCheck() {
    $ch = curl_init(OLLAMA_URL . '/api/tags');
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT        => 3,
    ]);
    $result   = curl_exec($ch);
    $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    if ($httpCode !== 200) return ['status' => 'down', 'models' => 0];

    $data   = json_decode($result, true);
    $models = $data['models'] ?? [];
    return [
        'status'         => 'up',
        'models'         => count($models),
        'fast_models'    => count(array_filter($models, fn($m) => ($m['size'] ?? 0) < 20e9)),
        'quality_models' => count(array_filter($models, fn($m) => ($m['size'] ?? 0) >= 20e9)),
    ];
}

/**
 * Return the name of the model currently loaded in GPU memory (via /api/ps),
 * or null when none is loaded or the daemon is unreachable.
 */
function gpuGetLoadedModel() {
    $ch = curl_init(OLLAMA_URL . '/api/ps');
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT        => 3,
    ]);
    $result = curl_exec($ch);
    curl_close($ch);

    // FIX: curl_exec() returns false on failure; don't feed that to json_decode().
    if (!is_string($result) || $result === '') return null;

    $data = json_decode($result, true);
    return $data['models'][0]['name'] ?? null;
}

/**
 * Prefer the currently loaded model over the freshly selected one when it is
 * suitable for this intent, to avoid the latency of swapping models in VRAM.
 * Falls back to $selected when nothing is loaded or the loaded model doesn't fit.
 */
function gpuPreferLoaded($selected, $intent) {
    $loaded = gpuGetLoadedModel();
    if (!$loaded) return $selected;

    // Known fast general/code models that are cheap to keep resident.
    $fastModels = ['qwen2.5:14b', 'deepseek-v2:16b', 'granite3.1-dense:8b', 'qwen2.5-coder:7b', 'orca2:13b'];
    if (in_array($loaded, $fastModels, true)) {
        // Code intent needs the code model specifically.
        if (in_array($intent, ['code', 'technical'], true) && $loaded === 'qwen2.5-coder:7b') return $loaded;
        // Non-code intent: reuse any resident fast model to avoid a swap.
        if (!in_array($intent, ['code', 'technical'], true)) return $loaded;
    }
    return $selected;
}