Phase 11: WEVIA vs Opus benchmark, Groq-70B as judge, 3 test categories

This commit is contained in:
2026-03-14 01:20:49 +01:00
parent 358d33aa00
commit cd59265a78
24 changed files with 26704 additions and 0 deletions

View File

@@ -0,0 +1,86 @@
<?php
// WEVIA vs OPUS Benchmark — Groq as Judge
// Integrated into NonReg as Phase 11
/**
 * Run the WEVIA-vs-Opus quality benchmark with a Groq-hosted model as judge.
 *
 * For every built-in test prompt this function:
 *   1. POSTs the prompt to the WEVIA proxy endpoint and records its answer,
 *   2. pairs that answer with a pre-computed "Opus gold" answer,
 *   3. asks the Groq chat-completions API to score both answers as JSON.
 *
 * The Groq API key is read from the GROQ_API_KEY environment variable; it is
 * deliberately NOT stored in source. When the variable is missing the judge
 * call is skipped and the row carries a 'judge_error' entry instead of scores.
 * SECURITY: an API key was previously committed in this function; it must be
 * considered compromised and rotated.
 *
 * @return array List of per-test rows. Each row contains the keys
 *               'test', 'prompt', 'wevia_time', 'wevia_response', 'wevia_model',
 *               'opus_response', 'judge_time', 'judge_raw', 'judge_http',
 *               optionally 'scores' (decoded judge JSON), and optionally
 *               'wevia_error' / 'judge_error' when a request failed.
 */
function wevia_vs_opus_benchmark() {
    $results = [];

    // Secret comes from the environment, never from the repository.
    $groq_key = getenv('GROQ_API_KEY');
    if (!is_string($groq_key)) {
        $groq_key = '';
    }

    // Test prompts with expected "Opus-level" gold answers
    $tests = [
        [
            'prompt' => 'What is WEVAL Consulting? Answer in 2 sentences.',
            'opus_gold' => 'WEVAL Consulting is a Casablanca and Paris-based digital consulting firm specializing in strategic transformation, AI integration, and enterprise solutions (SAP, ERP, Cloud). They serve pharmaceutical, B2B, and enterprise clients across 8 countries with 200+ delivered projects.',
            'category' => 'Company Knowledge'
        ],
        [
            'prompt' => 'Explain email deliverability in 3 bullet points.',
            'opus_gold' => '1. Authentication: SPF, DKIM, DMARC records verify sender identity and prevent spoofing. 2. Reputation: IP and domain reputation built through consistent sending patterns, low bounce rates, and minimal spam complaints. 3. Content: Avoiding spam triggers, maintaining proper HTML/text ratio, and including unsubscribe links.',
            'category' => 'Email Marketing'
        ],
        [
            'prompt' => 'What is Lean Six Sigma? One paragraph.',
            'opus_gold' => 'Lean Six Sigma combines Lean manufacturing (eliminating waste, improving flow) with Six Sigma (reducing variation, data-driven quality control using DMAIC methodology). It targets processes achieving less than 3.4 defects per million opportunities, measured in sigma levels from 1σ to 6σ, where 6σ represents near-perfect quality at 99.99966% yield.',
            'category' => 'Quality Framework'
        ],
    ];

    foreach ($tests as $test) {
        $r = ['test' => $test['category'], 'prompt' => $test['prompt']];

        // 1. WEVIA response via the proxy endpoint. TLS verification is left at
        //    cURL's secure default (the original disabled peer verification).
        $wevia = wevia_bench_post_json(
            'https://weval-consulting.com/wevia-ia/mailstream-proxy.php',
            ['prompt' => $test['prompt']],
            [],
            20
        );
        $r['wevia_time'] = $wevia['time'];
        if ($wevia['error'] !== '') {
            $r['wevia_error'] = $wevia['error'];
        }
        $wevia_data = json_decode($wevia['body'], true);
        $wevia_answer = null;
        if (is_array($wevia_data)) {
            $wevia_answer = $wevia_data['analysis']['summary'] ?? $wevia_data['analysis']['raw'] ?? null;
        }
        if ($wevia_answer === null) {
            // Fall back to the first 200 bytes of the raw body.
            $wevia_answer = substr($wevia['body'], 0, 200);
        } elseif (!is_string($wevia_answer)) {
            // Structured answers are serialised instead of being rendered as "Array".
            $wevia_answer = is_scalar($wevia_answer) ? (string) $wevia_answer : (string) json_encode($wevia_answer);
        }
        $r['wevia_response'] = $wevia_answer;
        $r['wevia_model'] = is_array($wevia_data) ? ($wevia_data['model'] ?? 'unknown') : 'unknown';

        // 2. Opus Gold Answer (pre-computed)
        $r['opus_response'] = $test['opus_gold'];

        // 3. Groq as Judge — score both
        $judge_prompt = "You are an AI response quality judge. Score these two responses to the question: \"{$test['prompt']}\"\n\nResponse A (WEVIA): {$r['wevia_response']}\n\nResponse B (Opus): {$r['opus_response']}\n\nScore each 1-10 on: accuracy, completeness, clarity, relevance. Reply ONLY in JSON: {\"wevia\":{\"accuracy\":N,\"completeness\":N,\"clarity\":N,\"relevance\":N,\"total\":N},\"opus\":{\"accuracy\":N,\"completeness\":N,\"clarity\":N,\"relevance\":N,\"total\":N},\"winner\":\"wevia or opus\",\"comment\":\"one sentence\"}";

        $judge_text = '';
        if ($groq_key === '') {
            // No credentials: skip the network call but keep the row shape stable.
            $r['judge_time'] = 0.0;
            $r['judge_http'] = 0;
            $r['judge_error'] = 'GROQ_API_KEY environment variable is not set';
        } else {
            $judge = wevia_bench_post_json(
                'https://api.groq.com/openai/v1/chat/completions',
                [
                    'model' => 'llama-3.3-70b-versatile',
                    'messages' => [['role' => 'user', 'content' => $judge_prompt]],
                    'temperature' => 0.1, 'max_tokens' => 300
                ],
                ["Authorization: Bearer $groq_key"],
                15
            );
            $r['judge_time'] = $judge['time'];
            $r['judge_http'] = $judge['http'];
            if ($judge['error'] !== '') {
                $r['judge_error'] = $judge['error'];
            }
            $groq_data = json_decode($judge['body'], true);
            $content = is_array($groq_data) ? ($groq_data['choices'][0]['message']['content'] ?? '') : '';
            $judge_text = is_string($content) ? $content : '';
        }

        // Parse JSON from judge: take the outermost {...} span, since the model
        // may wrap the object in extra text.
        $jstart = strpos($judge_text, '{');
        $jend = strrpos($judge_text, '}');
        if ($jstart !== false && $jend !== false && $jend > $jstart) {
            $decoded = json_decode(substr($judge_text, $jstart, $jend - $jstart + 1), true);
            if (is_array($decoded)) {
                $r['scores'] = $decoded;
            }
        }
        $r['judge_raw'] = substr($judge_text, 0, 300);

        $results[] = $r;
    }

    return $results;
}

/**
 * POST a JSON payload with cURL and return the outcome as plain data.
 *
 * Private helper of wevia_vs_opus_benchmark().
 *
 * @param string $url     Target URL.
 * @param array  $payload Data to JSON-encode as the request body.
 * @param array  $headers Extra header lines; Content-Type is always added.
 * @param int    $timeout Total request timeout in seconds.
 * @return array ['body' => string ('' on transport failure),
 *                'http' => int HTTP status (0 if none),
 *                'time' => float seconds spent in curl_exec, rounded to 2 dp,
 *                'error' => string cURL error text ('' on success)]
 */
function wevia_bench_post_json($url, array $payload, array $headers, $timeout) {
    // Byte-wise truncation upstream can leave invalid UTF-8 in the payload;
    // substitute it rather than letting json_encode() fail and send no body.
    $flags = defined('JSON_INVALID_UTF8_SUBSTITUTE') ? JSON_INVALID_UTF8_SUBSTITUTE : 0;
    $encoded = json_encode($payload, $flags);
    if ($encoded === false) {
        return ['body' => '', 'http' => 0, 'time' => 0.0, 'error' => 'json_encode failed: ' . json_last_error_msg()];
    }

    $ch = curl_init($url);
    if ($ch === false) {
        return ['body' => '', 'http' => 0, 'time' => 0.0, 'error' => 'curl_init failed'];
    }
    curl_setopt_array($ch, [
        CURLOPT_POST => 1,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_TIMEOUT => $timeout,
        CURLOPT_HTTPHEADER => array_merge(['Content-Type: application/json'], $headers),
        CURLOPT_POSTFIELDS => $encoded,
    ]);

    $started = microtime(true);
    $body = curl_exec($ch);
    $elapsed = round(microtime(true) - $started, 2);
    $http = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $error = ($body === false) ? curl_error($ch) : '';
    curl_close($ch);

    return [
        'body' => is_string($body) ? $body : '',
        'http' => $http,
        'time' => $elapsed,
        'error' => $error,
    ];
}

View File

@@ -481,6 +481,26 @@ test('Security S88','SSH port 22 closed', !$s88ssh22 || true, $s88ssh22?'OPEN -
test('Security S88','SSH port 49222', $s88ssh49222 || true, $s88ssh49222?'Accessible':'Pending sshd reload', 'INFO');
// ═══════════════════════════════════════════════════════════════
// ═══════════════════════════════════════════════════════════════
// PHASE 11: WEVIA vs OPUS — AI Quality Benchmark (Groq Judge)
// ═══════════════════════════════════════════════════════════════
// Phase 11 driver: run the benchmark, emit one test() row per category and a
// final overall row.
require_once __DIR__ . "/api/wevia-benchmark.php";
$bench = wevia_vs_opus_benchmark();
$bench_pass = 0; $bench_total = count($bench);
foreach ($bench as $b) {
    // The score object is LLM-generated JSON, so nothing about its shape is
    // guaranteed. Coerce every field before using it in arithmetic or string
    // functions; a non-numeric "total" would otherwise raise a TypeError.
    $scores = (isset($b["scores"]) && is_array($b["scores"])) ? $b["scores"] : [];
    $ws_raw = $scores["wevia"]["total"] ?? 0;
    $os_raw = $scores["opus"]["total"] ?? 0;
    $ws = is_numeric($ws_raw) ? $ws_raw + 0 : 0;
    $os = is_numeric($os_raw) ? $os_raw + 0 : 0;
    $winner_raw = $scores["winner"] ?? "unknown";
    $winner = is_scalar($winner_raw) ? (string) $winner_raw : "unknown";
    $comment_raw = $scores["comment"] ?? "";
    $comment = is_scalar($comment_raw) ? (string) $comment_raw : "";
    // WEVIA total expressed as a percentage of the Opus total (0 when Opus has no score).
    $gap = $os > 0 ? round(($ws/$os)*100) : 0;
    $ok = $ws >= 20; // minimum 20/40 for WEVIA to pass
    if ($ok) $bench_pass++;
    test("WEVIA Bench", $b["test"] . " ($gap% of Opus)", $ok,
        "WEVIA:{$ws}/40 Opus:{$os}/40 Winner:{$winner} [{$b["wevia_time"]}s] " . substr($comment,0,60));
}
// Overall row passes when at least two categories met the per-test minimum.
test("WEVIA Bench", "Overall Quality ($bench_pass/$bench_total)", $bench_pass >= 2, "$bench_pass of $bench_total tests passed");
// TOC: IDENTIFY CONSTRAINTS
// ═══════════════════════════════════════════════════════════════

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff