Files
wevia-brain/cognitive-gpu-rotation.php
2026-04-12 23:01:36 +02:00

525 lines
20 KiB
PHP
Executable File

<?php
/**
* ╔═══════════════════════════════════════════════════════════════════════════╗
* ║ WEVIA GPU ROTATION + CROSS-VERIFICATION ENGINE ║
* ║ Local Ollama (S88) → Verify with Cerebras + Groq ║
* ║ 2026-03-03 ║
* ╚═══════════════════════════════════════════════════════════════════════════╝
*
* ARCHITECTURE:
* 1. Primary: Ollama local GPU (0ms network, sovereign)
* 2. Verify: Cerebras (ultrafast) + Groq (fast) cross-check
* 3. Synthesize: Best answer wins, merge insights
*
* GPU TIERS (RTX 4000 Ada 20GB):
* VRAM-fit (fast): qwen2.5:14b, deepseek-v2:16b, granite3.1-dense:8b
* CPU+RAM (quality): deepseek-r1:70b, qwen2.5:72b, nemotron:70b
* Code-fast: qwen2.5-coder:7b
* Vision: llama3.2-vision:11b
*/
// Base URL of the local Ollama daemon (sovereign local inference, no network hop).
define('OLLAMA_URL', 'http://127.0.0.1:11434');
// Request timeout (seconds) for VRAM-resident "fast" models.
define('GPU_TIMEOUT_FAST', 15);
// Request timeout (seconds) for large CPU+RAM "quality" models.
define('GPU_TIMEOUT_QUALITY', 45);
// Timeout (seconds) for each cloud cross-verification call.
define('VERIFY_TIMEOUT', 10);
// ═══════════════════════════════════════════════════════════════════════════
// GPU MODEL ROTATION — Intent-based optimal model selection
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Choose the optimal local Ollama model for an intent/complexity pair.
 *
 * Routing summary (RTX 4000 Ada, 20 GB VRAM):
 *   - code/technical/operational → code specialist (VRAM-fit, fast)
 *   - analytical/mathematical/causal, strategic/consulting/compliance
 *     → 70B reasoning model when complex (CPU+RAM, slow), else 14B generalist
 *   - greeting/conversational → smallest model for lowest latency
 *   - everything else → 14B generalist
 *
 * @param string|array $intent     Intent label, or an array carrying a 'primary' key.
 * @param string       $complexity 'simple' | 'moderate' | 'complex'
 * @param int          $msgLen     Message length (reserved; not used in routing yet).
 * @return array [primary_model, fallback_model, timeout_seconds]
 */
function gpuSelectModel($intent, $complexity = 'moderate', $msgLen = 0) {
    // Normalize: the classifier may hand us an array instead of a plain label.
    $route = is_array($intent) ? ($intent['primary'] ?? 'general') : $intent;
    $complex = ($complexity === 'complex');

    // Canonical model names, grouped by role.
    $coder     = 'qwen2.5-coder:7b';    // 5GB  — code specialist
    $general14 = 'qwen2.5:14b';         // 9GB  — excellent general + reasoning
    $general16 = 'deepseek-v2:16b';     // 9GB  — strong general
    $small8    = 'granite3.1-dense:8b'; // 5GB  — fast, good quality
    $reason70  = 'deepseek-r1:70b';     // 42GB — deep reasoning champion (CPU+RAM)
    $reason72  = 'qwen2.5:72b';         // 47GB — strong multi-domain (CPU+RAM)

    if (in_array($route, ['code', 'technical', 'operational'], true)) {
        return $complex
            ? [$coder, $general14, GPU_TIMEOUT_FAST]
            : [$coder, $general16, GPU_TIMEOUT_FAST];
    }
    if (in_array($route, ['analytical', 'mathematical', 'causal'], true)) {
        return $complex
            ? [$reason70, $reason72, GPU_TIMEOUT_QUALITY]
            : [$general14, $general14, GPU_TIMEOUT_FAST];
    }
    if ($route === 'creative') {
        return [$general14, $general16, GPU_TIMEOUT_FAST];
    }
    if (in_array($route, ['strategic', 'consulting', 'compliance'], true)) {
        return $complex
            ? [$reason70, $reason72, GPU_TIMEOUT_QUALITY]
            : [$general14, $general16, GPU_TIMEOUT_FAST];
    }
    if (in_array($route, ['teaching', 'social_intelligence'], true)) {
        return [$general14, $general14, GPU_TIMEOUT_FAST];
    }
    if (in_array($route, ['greeting', 'conversational'], true)) {
        return [$small8, $general14, GPU_TIMEOUT_FAST]; // Smallest = fastest.
    }
    return [$general14, $general16, GPU_TIMEOUT_FAST];
}
/**
 * Pick a replacement model after a failure, rotating through the pool.
 *
 * Bug fix: the previous implementation returned the FIRST pool entry that
 * was not the failed model — effectively always 'qwen2.5:14b' — which could
 * re-select a model that had already failed earlier in the same fallback
 * chain (e.g. primary qwen2.5:14b fails, fallback deepseek-v2:16b fails,
 * "rotation" hands back qwen2.5:14b again). True rotation advances to the
 * entry AFTER the failed one, wrapping around, which spreads retries.
 *
 * @param string       $failedModel Model tag that just failed.
 * @param string|array $intent      Request intent (reserved for future routing).
 * @return string Next model tag to try.
 */
function gpuRotateModel($failedModel, $intent) {
    $pool = [
        'qwen2.5:14b', 'deepseek-v2:16b', 'granite3.1-dense:8b',
        'qwen2.5-coder:7b', 'orca2:13b', 'aya:35b',
    ];
    $idx = array_search($failedModel, $pool, true);
    if ($idx === false) {
        // Unknown model: start from the head of the pool.
        return $pool[0];
    }
    // Advance one slot, wrapping at the end of the pool.
    return $pool[($idx + 1) % count($pool)];
}
// ═══════════════════════════════════════════════════════════════════════════
// GPU CALL — Local Ollama API
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Send a chat-completion request to the local Ollama daemon.
 *
 * @param string $model        Ollama model tag (e.g. 'qwen2.5:14b').
 * @param string $systemPrompt System prompt; skipped when falsy.
 * @param string $userMessage  User message, appended as the final turn.
 * @param array  $history      Prior turns; only the last 6 are forwarded,
 *                             each truncated to 1000 characters.
 * @param int    $timeout      Request timeout in seconds.
 * @param float  $temperature  Sampling temperature.
 * @param int    $maxTokens    num_predict cap for the generated reply.
 * @return array|null ['content','model','latency_ms','tokens_est','source']
 *                    on success; null on transport failure or empty reply.
 */
function gpuCallOllama($model, $systemPrompt, $userMessage, $history = [], $timeout = 15, $temperature = 0.4, $maxTokens = 2048) {
    $chat = [];
    if ($systemPrompt) {
        $chat[] = ['role' => 'system', 'content' => $systemPrompt];
    }
    // Keep the conversation window small: last 6 turns, 1000 chars each.
    foreach (array_slice($history, -6) as $turn) {
        if (isset($turn['role'], $turn['content'])) {
            $chat[] = ['role' => $turn['role'], 'content' => mb_substr($turn['content'], 0, 1000)];
        }
    }
    $chat[] = ['role' => 'user', 'content' => $userMessage];

    $body = json_encode([
        'model' => $model,
        'messages' => $chat,
        'stream' => false,
        'options' => [
            'temperature' => $temperature,
            'num_predict' => $maxTokens,
            'top_p' => 0.9,
            'repeat_penalty' => 1.1,
        ],
    ], JSON_UNESCAPED_UNICODE);

    $handle = curl_init(OLLAMA_URL . '/api/chat');
    curl_setopt_array($handle, [
        CURLOPT_POST => true,
        CURLOPT_POSTFIELDS => $body,
        CURLOPT_HTTPHEADER => ['Content-Type: application/json'],
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT => $timeout,
        CURLOPT_CONNECTTIMEOUT => 3,
    ]);

    $t0 = microtime(true);
    $raw = curl_exec($handle);
    $latency = round((microtime(true) - $t0) * 1000);
    $httpCode = curl_getinfo($handle, CURLINFO_HTTP_CODE);
    $error = curl_error($handle); // Must be read before curl_close().
    curl_close($handle);

    if ($httpCode !== 200 || !$raw) {
        error_log("WEVIA_GPU: FAIL model=$model http=$httpCode err=$error lat={$latency}ms");
        return null;
    }

    $decoded = json_decode($raw, true);
    $content = $decoded['message']['content'] ?? '';
    if (empty(trim($content))) {
        error_log("WEVIA_GPU: EMPTY model=$model lat={$latency}ms");
        return null;
    }

    error_log("WEVIA_GPU: OK model=$model lat={$latency}ms len=" . mb_strlen($content));
    return [
        'content' => $content,
        'model' => $model,
        'latency_ms' => $latency,
        // Rough token estimate: ~3.5 characters per token.
        'tokens_est' => (int)(mb_strlen($content) / 3.5),
        'source' => 'gpu_local',
    ];
}
/**
 * Run an Ollama request with up to three attempts: the intent-selected
 * primary model, its designated fallback, then a rotated replacement.
 * Returns the first successful result array, or null when all three fail.
 */
function gpuCallWithFallback($intent, $systemPrompt, $userMessage, $history = [], $complexity = 'moderate') {
    [$primary, $fallback, $timeout] = gpuSelectModel($intent, $complexity, mb_strlen($userMessage));

    // Attempt 1: intent-optimal model with its own timeout budget.
    $attempt = gpuCallOllama($primary, $systemPrompt, $userMessage, $history, $timeout);
    if ($attempt) {
        return $attempt;
    }

    // Attempt 2: designated fallback, fast timeout.
    error_log("WEVIA_GPU_ROTATE: primary=$primary failed, trying fallback=$fallback");
    $attempt = gpuCallOllama($fallback, $systemPrompt, $userMessage, $history, GPU_TIMEOUT_FAST);
    if ($attempt) {
        return $attempt;
    }

    // Attempt 3: rotate away from the failed fallback.
    $rotated = gpuRotateModel($fallback, $intent);
    error_log("WEVIA_GPU_ROTATE: fallback=$fallback failed, trying rotation=$rotated");
    return gpuCallOllama($rotated, $systemPrompt, $userMessage, $history, GPU_TIMEOUT_FAST);
}
// ═══════════════════════════════════════════════════════════════════════════
// CROSS-VERIFICATION ENGINE — Cerebras + Groq check GPU response
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Build the French cross-check prompt sent to the cloud verifiers.
 * The GPU answer is truncated to 1500 characters; verifiers must either
 * reply 'VÉRIFIÉ_OK' or briefly list factual/technical errors.
 */
function buildVerificationPrompt($originalQuestion, $gpuResponse) {
    $excerpt = mb_substr($gpuResponse, 0, 1500);
    return <<<PROMPT
Vérifie cette réponse et corrige UNIQUEMENT les erreurs factuelles ou techniques. Ne reformule PAS si c'est correct. Réponds en 2-3 phrases max.
QUESTION: {$originalQuestion}
RÉPONSE À VÉRIFIER:
{$excerpt}
Si la réponse est correcte, dis simplement 'VÉRIFIÉ_OK'. Si elle contient des erreurs, liste-les brièvement.
PROMPT;
}
/**
 * Fire a single lightweight verification request at an OpenAI-compatible
 * cloud chat endpoint (Cerebras or Groq).
 *
 * @param string $provider Label used for logging and result tagging only.
 * @param string $key      Bearer API key.
 * @param string $model    Provider model name.
 * @param string $url      Chat-completions endpoint URL.
 * @param string $prompt   Verification prompt (see buildVerificationPrompt()).
 * @param int    $timeout  Request timeout in seconds.
 * @return array|null ['content','provider','latency_ms'], or null on failure.
 */
function cloudVerifyCall($provider, $key, $model, $url, $prompt, $timeout = 10) {
    $requestBody = json_encode([
        'model' => $model,
        'messages' => [
            ['role' => 'system', 'content' => 'Tu es un vérificateur technique. Vérifie et corrige uniquement les erreurs factuelles. Sois bref.'],
            ['role' => 'user', 'content' => $prompt],
        ],
        'max_tokens' => 300,      // Verdicts are short by design.
        'temperature' => 0.1,     // Near-deterministic for fact checking.
    ], JSON_UNESCAPED_UNICODE);

    $handle = curl_init($url);
    curl_setopt_array($handle, [
        CURLOPT_POST => true,
        CURLOPT_POSTFIELDS => $requestBody,
        CURLOPT_HTTPHEADER => [
            'Content-Type: application/json',
            'Authorization: Bearer ' . $key,
        ],
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT => $timeout,
        CURLOPT_CONNECTTIMEOUT => 3,
    ]);

    $t0 = microtime(true);
    $raw = curl_exec($handle);
    $latency = round((microtime(true) - $t0) * 1000);
    $httpCode = curl_getinfo($handle, CURLINFO_HTTP_CODE);
    curl_close($handle);

    if ($httpCode !== 200 || !$raw) {
        error_log("WEVIA_VERIFY: FAIL provider=$provider http=$httpCode lat={$latency}ms");
        return null;
    }

    $parsed = json_decode($raw, true);
    $verdict = $parsed['choices'][0]['message']['content'] ?? '';
    error_log("WEVIA_VERIFY: OK provider=$provider lat={$latency}ms verdict=" . mb_substr($verdict, 0, 50));
    return [
        'content' => $verdict,
        'provider' => $provider,
        'latency_ms' => $latency,
    ];
}
/**
 * Cross-check a GPU response with the Cerebras and Groq cloud verifiers.
 *
 * Note: despite the historical "parallel via multi_curl" note, the two
 * verifiers are called sequentially, one after the other.
 *
 * Bug fix: OK-detection previously matched 'correcte' as a bare substring,
 * which also matches 'incorrecte' — a verifier reporting "réponse
 * incorrecte" was counted as a PASS. 'correcte' now only counts as a pass
 * when 'incorrecte' is absent (heuristic, but strictly better than before).
 *
 * @param string $originalQuestion The user question.
 * @param string $gpuResponse      The local GPU answer to verify.
 * @param array  $providers        Provider configs keyed 'cerebras'/'groq'
 *                                 (each with 'key','model','url'); falls back
 *                                 to global $PROVIDERS when empty.
 * @return array ['verified'=>bool, 'corrections'=>array, 'method'=>string, ...]
 */
function crossVerifyResponse($originalQuestion, $gpuResponse, $providers = []) {
    // Short answers are cheap to regenerate — skip the verification round-trips.
    if (mb_strlen($gpuResponse) < 100) return ['verified' => true, 'corrections' => [], 'method' => 'skip_short'];
    $prompt = buildVerificationPrompt($originalQuestion, $gpuResponse);
    // Get providers from global if not passed.
    if (empty($providers)) {
        global $PROVIDERS;
        $providers = $PROVIDERS ?? [];
    }
    // Query each configured verifier sequentially (cerebras first, then groq).
    $verifications = [];
    foreach (['cerebras', 'groq'] as $name) {
        if (!isset($providers[$name])) continue;
        $v = cloudVerifyCall(
            $name,
            $providers[$name]['key'],
            $providers[$name]['model'],
            $providers[$name]['url'],
            $prompt,
            VERIFY_TIMEOUT
        );
        if ($v) $verifications[] = $v;
    }
    // Analyze verdicts: any verifier that neither says OK nor "correcte"
    // (without "incorrecte") contributes a correction entry.
    $corrections = [];
    $allVerified = true;
    foreach ($verifications as $v) {
        $content = $v['content'] ?? '';
        $saysOk = stripos($content, 'VÉRIFIÉ_OK') !== false
            || stripos($content, 'VERIFIE_OK') !== false
            || (stripos($content, 'correcte') !== false && stripos($content, 'incorrecte') === false);
        if ($saysOk) continue; // This verifier approves the answer.
        $allVerified = false;
        $corrections[] = [
            'provider' => $v['provider'],
            'correction' => mb_substr($content, 0, 500),
            'latency_ms' => $v['latency_ms'],
        ];
    }
    return [
        'verified' => $allVerified,
        'corrections' => $corrections,
        'verifiers_count' => count($verifications),
        'method' => 'cerebras+groq',
    ];
}
/**
 * Append cross-verification notes to a GPU response when verifiers flagged
 * concrete issues.
 *
 * The response is returned untouched when: it was verified, no corrections
 * exist, the combined correction text is too short to be meaningful
 * (< 20 chars), or it contains none of the French "this is wrong" markers.
 */
function applyVerificationCorrections($gpuResponse, $verificationResult) {
    if ($verificationResult['verified'] || empty($verificationResult['corrections'])) {
        return $gpuResponse; // Nothing to merge.
    }

    // Collect the raw correction texts from each verifier.
    $notes = array_map(
        static function ($c) { return $c['correction']; },
        $verificationResult['corrections']
    );
    $combined = implode("\n", $notes);

    // Too vague to be worth surfacing to the user.
    if (mb_strlen($combined) < 20) {
        return $gpuResponse;
    }

    // Only append when the text contains a concrete "something is wrong" marker.
    if (preg_match('/(erreur|incorrect|faux|attention|devrait|plutôt|correction|en fait)/i', $combined)) {
        $gpuResponse .= "\n\n---\n📋 **Vérification croisée (Cerebras/Groq):** " . mb_substr($combined, 0, 300);
    }
    return $gpuResponse;
}
// ═══════════════════════════════════════════════════════════════════════════
// MASTER PIPELINE — GPU → Verify → Synthesize
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Sovereign pipeline: local GPU generates, cloud providers cross-check.
 *
 * Steps:
 *   1. Ollama (with fallback rotation) produces the primary answer.
 *   2. Cerebras + Groq verify it — unless the query is trivial
 *      (greeting/chit-chat, short message, short answer, simple complexity).
 *   3. Flagged corrections are appended to the answer.
 *
 * Returns the enriched result array, or null when every GPU model failed
 * so the caller can fall back to a cloud-first path.
 */
function sovereignGPUPipeline($intent, $systemPrompt, $userMessage, $history = [], $providers = []) {
    $t0 = microtime(true);
    $complexity = function_exists('calculateComplexity') ? calculateComplexity($userMessage) : 'moderate';

    // Step 1: primary answer from the local GPU.
    $gpuResult = gpuCallWithFallback($intent, $systemPrompt, $userMessage, $history, $complexity);
    if (!$gpuResult) {
        error_log("WEVIA_SOVEREIGN: GPU pipeline failed — all models exhausted");
        return null; // Let cloud fallback handle it
    }
    $answer = $gpuResult['content'];

    // Steps 2+3: cross-verify unless the query is trivial.
    $verification = ['verified' => true, 'corrections' => [], 'method' => 'skip'];
    $trivial = $intent === 'greeting'
        || $intent === 'conversational'
        || mb_strlen($userMessage) < 30
        || mb_strlen($answer) < 100
        || $complexity === 'simple';
    if (!$trivial) {
        $verification = crossVerifyResponse($userMessage, $answer, $providers);
        if (!$verification['verified']) {
            $answer = applyVerificationCorrections($answer, $verification);
        }
    }

    $totalLatency = round((microtime(true) - $t0) * 1000);
    error_log(sprintf(
        "WEVIA_SOVEREIGN: model=%s gpu_lat=%dms verified=%s corrections=%d total=%dms complexity=%s",
        $gpuResult['model'],
        $gpuResult['latency_ms'],
        $verification['verified'] ? 'YES' : 'NO',
        count($verification['corrections']),
        $totalLatency,
        $complexity
    ));

    return [
        'content' => $answer,
        'model' => $gpuResult['model'],
        'source' => 'sovereign_gpu',
        'gpu_latency_ms' => $gpuResult['latency_ms'],
        'verification' => $verification,
        'total_latency_ms' => $totalLatency,
        'complexity' => $complexity,
    ];
}
/**
 * Decide whether a request should go to the local sovereign GPU first
 * instead of a cloud provider.
 *
 * Cloud wins for widget/fast modes, greetings, and very short messages.
 * GPU wins for complex queries (when the complexity scorer exists) and for
 * code/analysis/reasoning intents. Everything else alternates roughly
 * 2:1 GPU:cloud based on the wall clock, for load distribution.
 */
function shouldUseSovereignGPU($intent, $msg, $mode = 'full') {
    // Latency-sensitive modes always go to the cloud.
    if ($mode === 'widget' || $mode === 'fast') return false;
    // Greetings and tiny messages are not worth a GPU round-trip.
    if ($intent === 'greeting') return false;
    if (mb_strlen(trim($msg)) < 20) return false;

    // Complex queries benefit from the local reasoning models.
    if (function_exists('calculateComplexity') && calculateComplexity($msg) === 'complex') {
        return true;
    }

    // Code, analysis, and reasoning intents prefer the GPU.
    $gpuIntents = ['code', 'technical', 'analytical', 'mathematical', 'causal', 'strategic', 'compliance'];
    if (in_array($intent, $gpuIntents)) {
        return true;
    }

    // Default: ~66% GPU, ~33% cloud.
    return (time() % 3 !== 0);
}
// ═══════════════════════════════════════════════════════════════════════════
// GPU HEALTH & MONITORING
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Ping the local Ollama daemon and summarise its model inventory.
 * Returns ['status' => 'down', 'models' => 0] when unreachable; otherwise
 * counts installed models below/above the 20 GB VRAM-fit threshold.
 */
function gpuHealthCheck() {
    $handle = curl_init(OLLAMA_URL . '/api/tags');
    curl_setopt_array($handle, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT => 3,
    ]);
    $body = curl_exec($handle);
    $status = curl_getinfo($handle, CURLINFO_HTTP_CODE);
    curl_close($handle);

    if ($status !== 200) {
        return ['status' => 'down', 'models' => 0];
    }

    $models = json_decode($body, true)['models'] ?? [];
    // Models under 20 GB fit in VRAM ("fast"); the rest run CPU+RAM ("quality").
    $vramFit = array_filter($models, static fn ($m) => ($m['size'] ?? 0) < 20e9);
    return [
        'status' => 'up',
        'models' => count($models),
        'fast_models' => count($vramFit),
        'quality_models' => count($models) - count($vramFit),
    ];
}
/**
 * Return the name of the model currently loaded into Ollama (via /api/ps),
 * or null when the daemon is unreachable or nothing is loaded.
 *
 * Robustness fix: curl_exec() returns false on failure, which was previously
 * fed straight into json_decode(); guard and return null explicitly instead.
 */
function gpuGetLoadedModel() {
    $ch = curl_init(OLLAMA_URL . '/api/ps');
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT => 3,
    ]);
    $result = curl_exec($ch);
    curl_close($ch);
    if (!is_string($result) || $result === '') {
        return null; // Daemon down or empty reply.
    }
    $data = json_decode($result, true);
    return $data['models'][0]['name'] ?? null;
}
/**
 * Prefer the model already resident on the GPU over the freshly selected
 * one, when it suits the intent, to avoid model-swap latency.
 *
 * Rules: code/technical intents only reuse the loaded model when it is the
 * code specialist; any other intent reuses any loaded VRAM-fit general model.
 */
function gpuPreferLoaded($selected, $intent) {
    $loaded = gpuGetLoadedModel();
    if (!$loaded) {
        return $selected; // Nothing resident — no swap to avoid.
    }

    $vramFit = ['qwen2.5:14b', 'deepseek-v2:16b', 'granite3.1-dense:8b', 'qwen2.5-coder:7b', 'orca2:13b'];
    if (!in_array($loaded, $vramFit)) {
        return $selected; // Loaded model is not a fast VRAM-fit one.
    }

    if (in_array($intent, ['code', 'technical'])) {
        // Code work only reuses the resident model if it is the code specialist.
        return $loaded === 'qwen2.5-coder:7b' ? $loaded : $selected;
    }
    // General intents: reuse whatever fast model is resident.
    return $loaded;
}