<?php
/**
 * ╔═══════════════════════════════════════════════════════════════════════════╗
 * ║           WEVIA GPU ROTATION + CROSS-VERIFICATION ENGINE                  ║
 * ║           Local Ollama (S88) → Verify with Cerebras + Groq                ║
 * ║           2026-03-03                                                      ║
 * ╚═══════════════════════════════════════════════════════════════════════════╝
 *
 * ARCHITECTURE:
 *   1. Primary:    Ollama local GPU (0ms network, sovereign)
 *   2. Verify:     Cerebras (ultrafast) + Groq (fast) cross-check
 *   3. Synthesize: Best answer wins, merge insights
 *
 * GPU TIERS (RTX 4000 Ada 20GB):
 *   VRAM-fit (fast):   qwen2.5:14b, deepseek-v2:16b, granite3.1-dense:8b
 *   CPU+RAM (quality): deepseek-r1:70b, qwen2.5:72b, nemotron:70b
 *   Code-fast:         qwen2.5-coder:7b
 *   Vision:            llama3.2-vision:11b
 */
|
|
|
|
define('OLLAMA_URL', 'http://127.0.0.1:11434');
|
|
define('GPU_TIMEOUT_FAST', 15);
|
|
define('GPU_TIMEOUT_QUALITY', 45);
|
|
define('VERIFY_TIMEOUT', 10);
// ═══════════════════════════════════════════════════════════════════════════
// GPU MODEL ROTATION — Intent-based optimal model selection
// ═══════════════════════════════════════════════════════════════════════════
/**
|
|
* Select best Ollama model per intent + complexity
|
|
* Returns array: [primary_model, fallback_model, timeout]
|
|
*/
|
|
function gpuSelectModel($intent, $complexity = 'moderate', $msgLen = 0) {
|
|
// VRAM-fit models (fast, <20GB) — primary choices
|
|
$fastModels = [
|
|
'qwen2.5:14b', // 9GB — excellent general + reasoning
|
|
'deepseek-v2:16b', // 9GB — strong general
|
|
'granite3.1-dense:8b', // 5GB — fast, good quality
|
|
'qwen2.5-coder:7b', // 5GB — code specialist
|
|
'orca2:13b', // 7GB — instruction following
|
|
];
|
|
|
|
// CPU+RAM models (slower but higher quality, need 64GB RAM)
|
|
$qualityModels = [
|
|
'deepseek-r1:70b', // 42GB — deep reasoning champion
|
|
'qwen2.5:72b', // 47GB — strong multi-domain
|
|
'nemotron:70b', // 42GB — NVIDIA, balanced
|
|
'aya:35b', // 20GB — multilingual (FR/AR/EN)
|
|
'vicuna:33b', // 18GB — conversational
|
|
];
|
|
|
|
// Expert models per domain
|
|
$codeModels = ['qwen2.5-coder:7b', 'deepseek-v2:16b', 'granite3.1-dense:8b'];
|
|
$reasonModels = ['deepseek-r1:70b', 'qwen2.5:72b', 'qwen2.5:14b'];
|
|
$creativeModels = ['wizardlm2:8x22b', 'dolphin-mixtral:8x22b', 'nous-hermes2:34b'];
|
|
$multilingModels = ['aya:35b', 'qwen2.5:14b', 'command-r-plus:104b'];
|
|
|
|
$primary = $intent; // normalize
|
|
if (is_array($intent)) $primary = $intent['primary'] ?? 'general';
|
|
|
|
// Route by intent
|
|
switch ($primary) {
|
|
case 'code':
|
|
case 'technical':
|
|
case 'operational':
|
|
if ($complexity === 'complex') return [$codeModels[0], $reasonModels[2], GPU_TIMEOUT_FAST];
|
|
return [$codeModels[0], $codeModels[1], GPU_TIMEOUT_FAST];
|
|
|
|
case 'analytical':
|
|
case 'mathematical':
|
|
case 'causal':
|
|
if ($complexity === 'complex') return [$reasonModels[0], $reasonModels[1], GPU_TIMEOUT_QUALITY];
|
|
return [$reasonModels[2], $fastModels[0], GPU_TIMEOUT_FAST];
|
|
|
|
case 'creative':
|
|
return [$fastModels[0], $fastModels[1], GPU_TIMEOUT_FAST];
|
|
|
|
case 'strategic':
|
|
case 'consulting':
|
|
case 'compliance':
|
|
if ($complexity === 'complex') return [$reasonModels[0], $reasonModels[1], GPU_TIMEOUT_QUALITY];
|
|
return [$fastModels[0], $fastModels[1], GPU_TIMEOUT_FAST];
|
|
|
|
case 'teaching':
|
|
case 'social_intelligence':
|
|
return [$multilingModels[1], $fastModels[0], GPU_TIMEOUT_FAST];
|
|
|
|
case 'greeting':
|
|
case 'conversational':
|
|
return [$fastModels[2], $fastModels[0], GPU_TIMEOUT_FAST]; // Fastest
|
|
|
|
default:
|
|
return [$fastModels[0], $fastModels[1], GPU_TIMEOUT_FAST];
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Rotate model on retry — avoid reusing failed model
|
|
*/
|
|
function gpuRotateModel($failedModel, $intent) {
|
|
$allModels = [
|
|
'qwen2.5:14b', 'deepseek-v2:16b', 'granite3.1-dense:8b',
|
|
'qwen2.5-coder:7b', 'orca2:13b', 'aya:35b',
|
|
];
|
|
foreach ($allModels as $m) {
|
|
if ($m !== $failedModel) return $m;
|
|
}
|
|
return 'granite3.1-dense:8b'; // Ultimate fallback (small, fast)
|
|
}
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
// GPU CALL — Local Ollama API
// ═══════════════════════════════════════════════════════════════════════════
/**
|
|
* Call Ollama local model
|
|
*/
|
|
function gpuCallOllama($model, $systemPrompt, $userMessage, $history = [], $timeout = 15, $temperature = 0.4, $maxTokens = 2048) {
|
|
$messages = [];
|
|
if ($systemPrompt) $messages[] = ['role' => 'system', 'content' => $systemPrompt];
|
|
|
|
// Add history (last 6 messages max)
|
|
if (!empty($history)) {
|
|
$recent = array_slice($history, -6);
|
|
foreach ($recent as $h) {
|
|
if (isset($h['role'], $h['content'])) {
|
|
$messages[] = ['role' => $h['role'], 'content' => mb_substr($h['content'], 0, 1000)];
|
|
}
|
|
}
|
|
}
|
|
|
|
$messages[] = ['role' => 'user', 'content' => $userMessage];
|
|
|
|
$payload = json_encode([
|
|
'model' => $model,
|
|
'messages' => $messages,
|
|
'stream' => false,
|
|
'options' => [
|
|
'temperature' => $temperature,
|
|
'num_predict' => $maxTokens,
|
|
'top_p' => 0.9,
|
|
'repeat_penalty' => 1.1,
|
|
]
|
|
], JSON_UNESCAPED_UNICODE);
|
|
|
|
$ch = curl_init(OLLAMA_URL . '/api/chat');
|
|
curl_setopt_array($ch, [
|
|
CURLOPT_POST => true,
|
|
CURLOPT_POSTFIELDS => $payload,
|
|
CURLOPT_HTTPHEADER => ['Content-Type: application/json'],
|
|
CURLOPT_RETURNTRANSFER => true,
|
|
CURLOPT_TIMEOUT => $timeout,
|
|
CURLOPT_CONNECTTIMEOUT => 3,
|
|
]);
|
|
|
|
$start = microtime(true);
|
|
$result = curl_exec($ch);
|
|
$latency = round((microtime(true) - $start) * 1000);
|
|
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
|
|
$error = curl_error($ch);
|
|
curl_close($ch);
|
|
|
|
if ($httpCode !== 200 || !$result) {
|
|
error_log("WEVIA_GPU: FAIL model=$model http=$httpCode err=$error lat={$latency}ms");
|
|
return null;
|
|
}
|
|
|
|
$data = json_decode($result, true);
|
|
$content = $data['message']['content'] ?? '';
|
|
|
|
if (empty(trim($content))) {
|
|
error_log("WEVIA_GPU: EMPTY model=$model lat={$latency}ms");
|
|
return null;
|
|
}
|
|
|
|
error_log("WEVIA_GPU: OK model=$model lat={$latency}ms len=" . mb_strlen($content));
|
|
|
|
return [
|
|
'content' => $content,
|
|
'model' => $model,
|
|
'latency_ms' => $latency,
|
|
'tokens_est' => (int)(mb_strlen($content) / 3.5),
|
|
'source' => 'gpu_local',
|
|
];
|
|
}
|
|
|
|
/**
|
|
* GPU call with rotation fallback
|
|
*/
|
|
function gpuCallWithFallback($intent, $systemPrompt, $userMessage, $history = [], $complexity = 'moderate') {
|
|
[$primary, $fallback, $timeout] = gpuSelectModel($intent, $complexity, mb_strlen($userMessage));
|
|
|
|
// Try primary
|
|
$result = gpuCallOllama($primary, $systemPrompt, $userMessage, $history, $timeout);
|
|
if ($result) return $result;
|
|
|
|
// Try fallback
|
|
error_log("WEVIA_GPU_ROTATE: primary=$primary failed, trying fallback=$fallback");
|
|
$result = gpuCallOllama($fallback, $systemPrompt, $userMessage, $history, GPU_TIMEOUT_FAST);
|
|
if ($result) return $result;
|
|
|
|
// Try rotation
|
|
$rotated = gpuRotateModel($fallback, $intent);
|
|
error_log("WEVIA_GPU_ROTATE: fallback=$fallback failed, trying rotation=$rotated");
|
|
return gpuCallOllama($rotated, $systemPrompt, $userMessage, $history, GPU_TIMEOUT_FAST);
|
|
}
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
// CROSS-VERIFICATION ENGINE — Cerebras + Groq check GPU response
// ═══════════════════════════════════════════════════════════════════════════
/**
|
|
* Build verification prompt for cross-checking
|
|
*/
|
|
function buildVerificationPrompt($originalQuestion, $gpuResponse) {
|
|
$excerpt = mb_substr($gpuResponse, 0, 1500);
|
|
return "Vérifie cette réponse et corrige UNIQUEMENT les erreurs factuelles ou techniques. Ne reformule PAS si c'est correct. Réponds en 2-3 phrases max.
|
|
|
|
QUESTION: {$originalQuestion}
|
|
|
|
RÉPONSE À VÉRIFIER:
|
|
{$excerpt}
|
|
|
|
Si la réponse est correcte, dis simplement 'VÉRIFIÉ_OK'. Si elle contient des erreurs, liste-les brièvement.";
|
|
}
|
|
|
|
/**
|
|
* Call cloud provider for verification (lightweight, fast)
|
|
*/
|
|
function cloudVerifyCall($provider, $key, $model, $url, $prompt, $timeout = 10) {
|
|
$payload = json_encode([
|
|
'model' => $model,
|
|
'messages' => [
|
|
['role' => 'system', 'content' => 'Tu es un vérificateur technique. Vérifie et corrige uniquement les erreurs factuelles. Sois bref.'],
|
|
['role' => 'user', 'content' => $prompt]
|
|
],
|
|
'max_tokens' => 300,
|
|
'temperature' => 0.1,
|
|
], JSON_UNESCAPED_UNICODE);
|
|
|
|
$ch = curl_init($url);
|
|
curl_setopt_array($ch, [
|
|
CURLOPT_POST => true,
|
|
CURLOPT_POSTFIELDS => $payload,
|
|
CURLOPT_HTTPHEADER => [
|
|
'Content-Type: application/json',
|
|
'Authorization: Bearer ' . $key,
|
|
],
|
|
CURLOPT_RETURNTRANSFER => true,
|
|
CURLOPT_TIMEOUT => $timeout,
|
|
CURLOPT_CONNECTTIMEOUT => 3,
|
|
]);
|
|
|
|
$start = microtime(true);
|
|
$result = curl_exec($ch);
|
|
$latency = round((microtime(true) - $start) * 1000);
|
|
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
|
|
curl_close($ch);
|
|
|
|
if ($httpCode !== 200 || !$result) {
|
|
error_log("WEVIA_VERIFY: FAIL provider=$provider http=$httpCode lat={$latency}ms");
|
|
return null;
|
|
}
|
|
|
|
$data = json_decode($result, true);
|
|
$content = $data['choices'][0]['message']['content'] ?? '';
|
|
error_log("WEVIA_VERIFY: OK provider=$provider lat={$latency}ms verdict=" . mb_substr($content, 0, 50));
|
|
|
|
return [
|
|
'content' => $content,
|
|
'provider' => $provider,
|
|
'latency_ms' => $latency,
|
|
];
|
|
}
|
|
|
|
/**
|
|
* Cross-verify GPU response with Cerebras AND Groq in parallel (via multi_curl)
|
|
*/
|
|
function crossVerifyResponse($originalQuestion, $gpuResponse, $providers = []) {
|
|
if (mb_strlen($gpuResponse) < 100) return ['verified' => true, 'corrections' => [], 'method' => 'skip_short'];
|
|
|
|
$prompt = buildVerificationPrompt($originalQuestion, $gpuResponse);
|
|
$verifications = [];
|
|
|
|
// Get providers from global if not passed
|
|
if (empty($providers)) {
|
|
global $PROVIDERS;
|
|
$providers = $PROVIDERS ?? [];
|
|
}
|
|
|
|
// Cerebras verification
|
|
if (isset($providers['cerebras'])) {
|
|
$v = cloudVerifyCall(
|
|
'cerebras',
|
|
$providers['cerebras']['key'],
|
|
$providers['cerebras']['model'],
|
|
$providers['cerebras']['url'],
|
|
$prompt,
|
|
VERIFY_TIMEOUT
|
|
);
|
|
if ($v) $verifications[] = $v;
|
|
}
|
|
|
|
// Groq verification
|
|
if (isset($providers['groq'])) {
|
|
$v = cloudVerifyCall(
|
|
'groq',
|
|
$providers['groq']['key'],
|
|
$providers['groq']['model'],
|
|
$providers['groq']['url'],
|
|
$prompt,
|
|
VERIFY_TIMEOUT
|
|
);
|
|
if ($v) $verifications[] = $v;
|
|
}
|
|
|
|
// Analyze verifications
|
|
$corrections = [];
|
|
$allVerified = true;
|
|
|
|
foreach ($verifications as $v) {
|
|
$content = $v['content'] ?? '';
|
|
if (stripos($content, 'VÉRIFIÉ_OK') !== false || stripos($content, 'VERIFIE_OK') !== false || stripos($content, 'correcte') !== false) {
|
|
continue; // This verifier says OK
|
|
}
|
|
$allVerified = false;
|
|
$corrections[] = [
|
|
'provider' => $v['provider'],
|
|
'correction' => mb_substr($content, 0, 500),
|
|
'latency_ms' => $v['latency_ms'],
|
|
];
|
|
}
|
|
|
|
return [
|
|
'verified' => $allVerified,
|
|
'corrections' => $corrections,
|
|
'verifiers_count' => count($verifications),
|
|
'method' => 'cerebras+groq',
|
|
];
|
|
}
|
|
|
|
/**
|
|
* Apply corrections from verification to the GPU response
|
|
*/
|
|
function applyVerificationCorrections($gpuResponse, $verificationResult) {
|
|
if ($verificationResult['verified'] || empty($verificationResult['corrections'])) {
|
|
return $gpuResponse; // No corrections needed
|
|
}
|
|
|
|
// Append correction notes
|
|
$correctionNotes = [];
|
|
foreach ($verificationResult['corrections'] as $c) {
|
|
$correctionNotes[] = $c['correction'];
|
|
}
|
|
|
|
// If corrections found, append a refined note
|
|
$correctionText = implode("\n", $correctionNotes);
|
|
|
|
// Don't append if corrections are too vague
|
|
if (mb_strlen($correctionText) < 20) return $gpuResponse;
|
|
|
|
// Smart merge: only add if corrections contain specific fixes
|
|
if (preg_match('/(erreur|incorrect|faux|attention|devrait|plutôt|correction|en fait)/i', $correctionText)) {
|
|
$gpuResponse .= "\n\n---\n📋 **Vérification croisée (Cerebras/Groq):** " . mb_substr($correctionText, 0, 300);
|
|
}
|
|
|
|
return $gpuResponse;
|
|
}
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
// MASTER PIPELINE — GPU → Verify → Synthesize
// ═══════════════════════════════════════════════════════════════════════════
/**
|
|
* Full sovereign pipeline:
|
|
* 1. GPU local (Ollama) generates primary response
|
|
* 2. Cerebras + Groq verify in parallel
|
|
* 3. Apply corrections if needed
|
|
* 4. Return enriched response
|
|
*/
|
|
function sovereignGPUPipeline($intent, $systemPrompt, $userMessage, $history = [], $providers = []) {
|
|
$start = microtime(true);
|
|
$complexity = function_exists('calculateComplexity') ? calculateComplexity($userMessage) : 'moderate';
|
|
|
|
// Step 1: GPU primary response
|
|
$gpuResult = gpuCallWithFallback($intent, $systemPrompt, $userMessage, $history, $complexity);
|
|
|
|
if (!$gpuResult) {
|
|
error_log("WEVIA_SOVEREIGN: GPU pipeline failed — all models exhausted");
|
|
return null; // Let cloud fallback handle it
|
|
}
|
|
|
|
$response = $gpuResult['content'];
|
|
|
|
// Step 2: Cross-verify (skip for greetings, very short queries, or widget mode)
|
|
$shouldVerify = (
|
|
$intent !== 'greeting' &&
|
|
$intent !== 'conversational' &&
|
|
mb_strlen($userMessage) >= 30 &&
|
|
mb_strlen($response) >= 100 &&
|
|
$complexity !== 'simple'
|
|
);
|
|
|
|
$verification = ['verified' => true, 'corrections' => [], 'method' => 'skip'];
|
|
|
|
if ($shouldVerify) {
|
|
$verification = crossVerifyResponse($userMessage, $response, $providers);
|
|
|
|
// Step 3: Apply corrections
|
|
if (!$verification['verified']) {
|
|
$response = applyVerificationCorrections($response, $verification);
|
|
}
|
|
}
|
|
|
|
$totalLatency = round((microtime(true) - $start) * 1000);
|
|
|
|
error_log(sprintf(
|
|
"WEVIA_SOVEREIGN: model=%s gpu_lat=%dms verified=%s corrections=%d total=%dms complexity=%s",
|
|
$gpuResult['model'],
|
|
$gpuResult['latency_ms'],
|
|
$verification['verified'] ? 'YES' : 'NO',
|
|
count($verification['corrections']),
|
|
$totalLatency,
|
|
$complexity
|
|
));
|
|
|
|
return [
|
|
'content' => $response,
|
|
'model' => $gpuResult['model'],
|
|
'source' => 'sovereign_gpu',
|
|
'gpu_latency_ms' => $gpuResult['latency_ms'],
|
|
'verification' => $verification,
|
|
'total_latency_ms' => $totalLatency,
|
|
'complexity' => $complexity,
|
|
];
|
|
}
|
|
|
|
/**
|
|
* Determine if sovereign GPU should be used (vs cloud-first)
|
|
*/
|
|
function shouldUseSovereignGPU($intent, $msg, $mode = 'full') {
|
|
// Widget mode = always cloud (faster)
|
|
if ($mode === 'widget' || $mode === 'fast') return false;
|
|
|
|
// Greetings = cloud (faster)
|
|
if ($intent === 'greeting') return false;
|
|
|
|
// Very short messages = cloud
|
|
if (mb_strlen(trim($msg)) < 20) return false;
|
|
|
|
// Complex queries benefit from GPU reasoning models
|
|
if (function_exists('calculateComplexity')) {
|
|
$complexity = calculateComplexity($msg);
|
|
if ($complexity === 'complex') return true;
|
|
}
|
|
|
|
// Code, analysis, reasoning = GPU preferred
|
|
if (in_array($intent, ['code', 'technical', 'analytical', 'mathematical', 'causal', 'strategic', 'compliance'])) {
|
|
return true;
|
|
}
|
|
|
|
// Default: alternate GPU/Cloud for load distribution
|
|
return (time() % 3 !== 0); // ~66% GPU, ~33% cloud
|
|
}
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
// GPU HEALTH & MONITORING
// ═══════════════════════════════════════════════════════════════════════════
function gpuHealthCheck() {
|
|
$ch = curl_init(OLLAMA_URL . '/api/tags');
|
|
curl_setopt_array($ch, [
|
|
CURLOPT_RETURNTRANSFER => true,
|
|
CURLOPT_TIMEOUT => 3,
|
|
]);
|
|
$result = curl_exec($ch);
|
|
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
|
|
curl_close($ch);
|
|
|
|
if ($httpCode !== 200) return ['status' => 'down', 'models' => 0];
|
|
|
|
$data = json_decode($result, true);
|
|
$models = $data['models'] ?? [];
|
|
|
|
return [
|
|
'status' => 'up',
|
|
'models' => count($models),
|
|
'fast_models' => count(array_filter($models, fn($m) => ($m['size'] ?? 0) < 20e9)),
|
|
'quality_models' => count(array_filter($models, fn($m) => ($m['size'] ?? 0) >= 20e9)),
|
|
];
|
|
}
|
|
|
|
function gpuGetLoadedModel() {
|
|
$ch = curl_init(OLLAMA_URL . '/api/ps');
|
|
curl_setopt_array($ch, [
|
|
CURLOPT_RETURNTRANSFER => true,
|
|
CURLOPT_TIMEOUT => 3,
|
|
]);
|
|
$result = curl_exec($ch);
|
|
curl_close($ch);
|
|
|
|
$data = json_decode($result, true);
|
|
return $data['models'][0]['name'] ?? null;
|
|
}
|
|
|
|
/**
|
|
* Prefer currently loaded model to avoid swap latency
|
|
*/
|
|
function gpuPreferLoaded($selected, $intent) {
|
|
$loaded = gpuGetLoadedModel();
|
|
if (!$loaded) return $selected;
|
|
|
|
// If loaded model is suitable for this intent, prefer it
|
|
$fastModels = ['qwen2.5:14b', 'deepseek-v2:16b', 'granite3.1-dense:8b', 'qwen2.5-coder:7b', 'orca2:13b'];
|
|
if (in_array($loaded, $fastModels)) {
|
|
// Code intent needs code model
|
|
if (in_array($intent, ['code', 'technical']) && $loaded === 'qwen2.5-coder:7b') return $loaded;
|
|
// If loaded is a fast general model, reuse it to avoid swap
|
|
if (!in_array($intent, ['code', 'technical'])) return $loaded;
|
|
}
|
|
|
|
return $selected;
|
|
}
|