Phase 11: WEVIA vs Opus benchmark, Groq-70B as judge, 3 test categories

2026-03-14 01:20:49 +01:00
parent 358d33aa00
commit cd59265a78
24 changed files with 26704 additions and 0 deletions
--- a/public/api/wevia-benchmark.php
+++ b/public/api/wevia-benchmark.php
@@ -0,0 +1,86 @@
+<?php
+// WEVIA vs OPUS Benchmark — Groq as Judge
+// Integrated into NonReg as Phase 11
+
+/**
+ * POST a JSON payload with cURL and report what happened.
+ *
+ * TLS peer verification is left at cURL's secure default.
+ *
+ * @param string $url     Endpoint to call.
+ * @param array  $payload Data to JSON-encode as the request body.
+ * @param array  $headers Extra request headers (Content-Type is always added).
+ * @param int    $timeout Total request timeout in seconds.
+ * @return array {body: string, http: int, time: float, error: string}
+ *               `body` is '' and `error` is non-empty when the transfer failed;
+ *               `time` is the wall-clock duration of curl_exec() rounded to 2 decimals.
+ */
+function _wevia_bench_post_json(string $url, array $payload, array $headers, int $timeout): array
+{
+    $body = json_encode($payload);
+    if ($body === false) {
+        // Typically malformed UTF-8 in the payload; sending an empty body would only hide it.
+        return ['body' => '', 'http' => 0, 'time' => 0.0, 'error' => 'json_encode failed: ' . json_last_error_msg()];
+    }
+
+    $ch = curl_init($url);
+    if ($ch === false) {
+        return ['body' => '', 'http' => 0, 'time' => 0.0, 'error' => 'curl_init failed'];
+    }
+    curl_setopt_array($ch, [
+        CURLOPT_POST => true,
+        CURLOPT_RETURNTRANSFER => true,
+        CURLOPT_TIMEOUT => $timeout,
+        CURLOPT_HTTPHEADER => array_merge(['Content-Type: application/json'], $headers),
+        CURLOPT_POSTFIELDS => $body,
+    ]);
+
+    $started = microtime(true);
+    $raw = curl_exec($ch);
+    $elapsed = round(microtime(true) - $started, 2);
+
+    $error = $raw === false ? curl_error($ch) : '';
+    $http = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
+    curl_close($ch);
+
+    return ['body' => $raw === false ? '' : $raw, 'http' => $http, 'time' => $elapsed, 'error' => $error];
+}
+
+/**
+ * Truncate text without splitting a multi-byte UTF-8 character when mbstring is available.
+ *
+ * With mbstring the limit counts characters; without it the function falls back
+ * to a byte-wise substr().
+ *
+ * @param string $text  Text to shorten.
+ * @param int    $limit Maximum length.
+ * @return string
+ */
+function _wevia_bench_truncate(string $text, int $limit): string
+{
+    if (function_exists('mb_substr')) {
+        return mb_substr($text, 0, $limit, 'UTF-8');
+    }
+    return substr($text, 0, $limit);
+}
+
+/**
+ * Run the WEVIA vs Opus benchmark with a Groq-hosted model as judge.
+ *
+ * For each test prompt the function asks the WEVIA proxy for an answer, pairs it
+ * with the pre-computed gold answer, and asks the judge model to score both.
+ *
+ * The Groq API key is read from the GROQ_API_KEY environment variable (or the
+ * same key in $_SERVER). When it is missing the judge call is skipped and
+ * `judge_error` explains why; `scores` is then null.
+ *
+ * @return array List of result rows. Each row contains: test, prompt, wevia_time,
+ *               wevia_response, wevia_model, opus_response, judge_time, scores
+ *               (decoded judge JSON as array, or null), judge_raw, judge_http, and
+ *               optionally wevia_error / judge_error when a transfer failed.
+ */
+function wevia_vs_opus_benchmark() {
+    $results = [];
+
+    // Secrets must not live in source control: take the key from the environment.
+    $groq_key = getenv('GROQ_API_KEY');
+    if (!is_string($groq_key) || $groq_key === '') {
+        $groq_key = $_SERVER['GROQ_API_KEY'] ?? '';
+    }
+    $groq_key = is_string($groq_key) ? trim($groq_key) : '';
+
+    // Test prompts with expected "Opus-level" gold answers
+    $tests = [
+        [
+            'prompt' => 'What is WEVAL Consulting? Answer in 2 sentences.',
+            'opus_gold' => 'WEVAL Consulting is a Casablanca and Paris-based digital consulting firm specializing in strategic transformation, AI integration, and enterprise solutions (SAP, ERP, Cloud). They serve pharmaceutical, B2B, and enterprise clients across 8 countries with 200+ delivered projects.',
+            'category' => 'Company Knowledge'
+        ],
+        [
+            'prompt' => 'Explain email deliverability in 3 bullet points.',
+            'opus_gold' => '1. Authentication: SPF, DKIM, DMARC records verify sender identity and prevent spoofing. 2. Reputation: IP and domain reputation built through consistent sending patterns, low bounce rates, and minimal spam complaints. 3. Content: Avoiding spam triggers, maintaining proper HTML/text ratio, and including unsubscribe links.',
+            'category' => 'Email Marketing'
+        ],
+        [
+            'prompt' => 'What is Lean Six Sigma? One paragraph.',
+            'opus_gold' => 'Lean Six Sigma combines Lean manufacturing (eliminating waste, improving flow) with Six Sigma (reducing variation, data-driven quality control using DMAIC methodology). It targets processes achieving less than 3.4 defects per million opportunities, measured in sigma levels from 1σ to 6σ, where 6σ represents near-perfect quality at 99.99966% yield.',
+            'category' => 'Quality Framework'
+        ],
+    ];
+
+    foreach ($tests as $test) {
+        $r = ['test' => $test['category'], 'prompt' => $test['prompt']];
+
+        // 1. WEVIA Response (Ollama 7b via S88 proxy)
+        $wevia = _wevia_bench_post_json(
+            'https://weval-consulting.com/wevia-ia/mailstream-proxy.php',
+            ['prompt' => $test['prompt']],
+            [],
+            20
+        );
+        $r['wevia_time'] = $wevia['time'];
+        if ($wevia['error'] !== '') {
+            $r['wevia_error'] = $wevia['error'];
+        }
+
+        $wevia_raw = $wevia['body'];
+        $wevia_data = json_decode($wevia_raw, true);
+        if (!is_array($wevia_data)) {
+            $wevia_data = [];
+        }
+        // Prefer the structured summary, then the raw analysis, then a snippet of the body.
+        $answer = $wevia_data['analysis']['summary']
+            ?? $wevia_data['analysis']['raw']
+            ?? _wevia_bench_truncate($wevia_raw, 200);
+        if (!is_string($answer)) {
+            // A structured value would otherwise be interpolated into the prompt as "Array".
+            $answer = (string) json_encode($answer);
+        }
+        $r['wevia_response'] = $answer;
+        $r['wevia_model'] = $wevia_data['model'] ?? 'unknown';
+
+        // 2. Opus Gold Answer (pre-computed)
+        $r['opus_response'] = $test['opus_gold'];
+
+        // 3. Groq as Judge — score both
+        $judge_prompt = "You are an AI response quality judge. Score these two responses to the question: \"{$test['prompt']}\"\n\nResponse A (WEVIA): {$r['wevia_response']}\n\nResponse B (Opus): {$r['opus_response']}\n\nScore each 1-10 on: accuracy, completeness, clarity, relevance. Reply ONLY in JSON: {\"wevia\":{\"accuracy\":N,\"completeness\":N,\"clarity\":N,\"relevance\":N,\"total\":N},\"opus\":{\"accuracy\":N,\"completeness\":N,\"clarity\":N,\"relevance\":N,\"total\":N},\"winner\":\"wevia or opus\",\"comment\":\"one sentence\"}";
+
+        $r['scores'] = null; // always present so consumers see a stable row shape
+        $judge_text = '';
+
+        if ($groq_key === '') {
+            $r['judge_time'] = 0.0;
+            $r['judge_http'] = 0;
+            $r['judge_error'] = 'GROQ_API_KEY environment variable is not set';
+        } else {
+            $judge = _wevia_bench_post_json(
+                'https://api.groq.com/openai/v1/chat/completions',
+                [
+                    'model' => 'llama-3.3-70b-versatile',
+                    'messages' => [['role' => 'user', 'content' => $judge_prompt]],
+                    'temperature' => 0.1, 'max_tokens' => 300
+                ],
+                ["Authorization: Bearer $groq_key"],
+                15
+            );
+            $r['judge_time'] = $judge['time'];
+            $r['judge_http'] = $judge['http'];
+            if ($judge['error'] !== '') {
+                $r['judge_error'] = $judge['error'];
+            }
+
+            $groq_data = json_decode($judge['body'], true);
+            $content = is_array($groq_data) ? ($groq_data['choices'][0]['message']['content'] ?? '') : '';
+            $judge_text = is_string($content) ? $content : '';
+
+            // Parse JSON from judge: take the outermost {...} span, ignoring any prose around it.
+            $jstart = strpos($judge_text, '{');
+            $jend = strrpos($judge_text, '}');
+            if ($jstart !== false && $jend !== false && $jend > $jstart) {
+                $decoded = json_decode(substr($judge_text, $jstart, $jend - $jstart + 1), true);
+                if (is_array($decoded)) {
+                    $r['scores'] = $decoded;
+                }
+            }
+        }
+        $r['judge_raw'] = _wevia_bench_truncate($judge_text, 300);
+
+        $results[] = $r;
+    }
+
+    return $results;
+}
--- a/public/nonreg-master-v5.php
+++ b/public/nonreg-master-v5.php
@@ -481,6 +481,26 @@ test('Security S88','SSH port 22 closed', !$s88ssh22 || true, $s88ssh22?'OPEN -
 test('Security S88','SSH port 49222', $s88ssh49222 || true, $s88ssh49222?'Accessible':'Pending sshd reload', 'INFO');

 // ═══════════════════════════════════════════════════════════════
+
+// ═══════════════════════════════════════════════════════════════
+// PHASE 11: WEVIA vs OPUS — AI Quality Benchmark (Groq Judge)
+// ═══════════════════════════════════════════════════════════════
+require_once __DIR__ . "/api/wevia-benchmark.php";
+$bench = wevia_vs_opus_benchmark();
+$bench_pass = 0; $bench_total = count($bench);
+// The pass threshold is expressed out of 40 (four criteria scored 1-10), but the
+// judge prompt never tells the model that "total" is a sum. Add the criteria up
+// here and only fall back to the judge's own "total" when the breakdown is incomplete.
+$bench_score = static function ($side) {
+    if (!is_array($side)) {
+        return 0;
+    }
+    $sum = 0;
+    foreach (["accuracy", "completeness", "clarity", "relevance"] as $criterion) {
+        if (!isset($side[$criterion]) || !is_numeric($side[$criterion])) {
+            return (isset($side["total"]) && is_numeric($side["total"])) ? $side["total"] + 0 : 0;
+        }
+        $sum += $side[$criterion];
+    }
+    return $sum;
+};
+foreach ($bench as $b) {
+    // Judge output is untrusted LLM text: validate types before arithmetic or substr().
+    $scores = (isset($b["scores"]) && is_array($b["scores"])) ? $b["scores"] : [];
+    $ws = $bench_score($scores["wevia"] ?? null);
+    $os = $bench_score($scores["opus"] ?? null);
+    $winner = (isset($scores["winner"]) && is_scalar($scores["winner"])) ? (string) $scores["winner"] : "unknown";
+    $gap = $os > 0 ? round(($ws/$os)*100) : 0;
+    $comment = (isset($scores["comment"]) && is_string($scores["comment"])) ? $scores["comment"] : "";
+    $elapsed = $b["wevia_time"] ?? 0;
+    $ok = $ws >= 20; // minimum 20/40 for WEVIA to pass
+    if ($ok) $bench_pass++;
+    test("WEVIA Bench", $b["test"] . " ($gap% of Opus)", $ok, 
+        "WEVIA:{$ws}/40 Opus:{$os}/40 Winner:{$winner} [{$elapsed}s] " . substr($comment,0,60));
+}
+test("WEVIA Bench", "Overall Quality ($bench_pass/$bench_total)", $bench_pass >= 2, "$bench_pass of $bench_total tests passed");
+
 // TOC: IDENTIFY CONSTRAINTS

 // ═══════════════════════════════════════════════════════════════
--- a/storage/nonreg_report_20260314_001736.json
+++ b/storage/nonreg_report_20260314_001736.json
--- a/storage/nonreg_report_20260314_001738.json
+++ b/storage/nonreg_report_20260314_001738.json
--- a/storage/nonreg_report_20260314_001754.json
+++ b/storage/nonreg_report_20260314_001754.json
--- a/storage/nonreg_report_20260314_001755.json
+++ b/storage/nonreg_report_20260314_001755.json
--- a/storage/nonreg_report_20260314_001757.json
+++ b/storage/nonreg_report_20260314_001757.json
--- a/storage/nonreg_report_20260314_001810.json
+++ b/storage/nonreg_report_20260314_001810.json
--- a/storage/nonreg_report_20260314_001837.json
+++ b/storage/nonreg_report_20260314_001837.json
--- a/storage/nonreg_report_20260314_001849.json
+++ b/storage/nonreg_report_20260314_001849.json
--- a/storage/nonreg_report_20260314_001854.json
+++ b/storage/nonreg_report_20260314_001854.json
--- a/storage/nonreg_report_20260314_001856.json
+++ b/storage/nonreg_report_20260314_001856.json
--- a/storage/nonreg_report_20260314_001902.json
+++ b/storage/nonreg_report_20260314_001902.json
--- a/storage/nonreg_report_20260314_001914.json
+++ b/storage/nonreg_report_20260314_001914.json
--- a/storage/nonreg_report_20260314_001927.json
+++ b/storage/nonreg_report_20260314_001927.json
--- a/storage/nonreg_report_20260314_001939.json
+++ b/storage/nonreg_report_20260314_001939.json
--- a/storage/nonreg_report_20260314_001948.json
+++ b/storage/nonreg_report_20260314_001948.json
--- a/storage/nonreg_report_20260314_001955.json
+++ b/storage/nonreg_report_20260314_001955.json
--- a/storage/nonreg_report_20260314_002009.json
+++ b/storage/nonreg_report_20260314_002009.json
--- a/storage/nonreg_report_20260314_002017.json
+++ b/storage/nonreg_report_20260314_002017.json
--- a/storage/nonreg_report_20260314_002019.json
+++ b/storage/nonreg_report_20260314_002019.json
--- a/storage/nonreg_report_20260314_002034.json
+++ b/storage/nonreg_report_20260314_002034.json
--- a/storage/nonreg_report_20260314_002048.json
+++ b/storage/nonreg_report_20260314_002048.json
--- a/storage/nonreg_report_20260314_002049.json
+++ b/storage/nonreg_report_20260314_002049.json