Phase 11: WEVIA vs Opus benchmark, Groq-70B as judge, 3 test categories
This commit is contained in:
86
public/api/wevia-benchmark.php
Normal file
86
public/api/wevia-benchmark.php
Normal file
@@ -0,0 +1,86 @@
|
||||
<?php
|
||||
// WEVIA vs OPUS Benchmark — Groq as Judge
|
||||
// Integrated into NonReg as Phase 11
|
||||
|
||||
/**
 * Truncate text to at most $limit characters without splitting a multi-byte
 * UTF-8 sequence (falls back to byte-wise substr when mbstring is unavailable).
 *
 * @param string $text
 * @param int    $limit
 * @return string
 */
function wevia_benchmark_truncate($text, $limit)
{
    return function_exists('mb_substr')
        ? mb_substr($text, 0, $limit, 'UTF-8')
        : substr($text, 0, $limit);
}

/**
 * POST a JSON payload with cURL and report body, HTTP status, timing and error.
 *
 * TLS peer verification is left at cURL's secure default.
 *
 * @param string   $url
 * @param array    $payload Data to JSON-encode as the request body.
 * @param string[] $headers Extra request headers (Content-Type is always sent).
 * @param int      $timeout Total timeout in seconds.
 * @return array{body: string|false, http: int, time: float, error: ?string}
 */
function wevia_benchmark_post_json($url, array $payload, array $headers, $timeout)
{
    $ch = curl_init($url);
    if ($ch === false) {
        return ['body' => false, 'http' => 0, 'time' => 0.0, 'error' => 'curl_init failed'];
    }

    curl_setopt_array($ch, [
        CURLOPT_POST           => true,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT        => $timeout,
        CURLOPT_HTTPHEADER     => array_merge(['Content-Type: application/json'], $headers),
        CURLOPT_POSTFIELDS     => json_encode($payload),
    ]);

    $started = microtime(true);
    $body    = curl_exec($ch);
    $elapsed = round(microtime(true) - $started, 2);
    $http    = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $error   = $body === false ? curl_error($ch) : null;
    curl_close($ch);

    return ['body' => $body, 'http' => $http, 'time' => $elapsed, 'error' => $error];
}

/**
 * Run the WEVIA vs Opus benchmark: send each test prompt to the WEVIA proxy,
 * pair the answer with a pre-computed "Opus" gold answer, and ask a Groq-hosted
 * model to score both.
 *
 * SECURITY: the Groq API key is no longer embedded in source. Pass it in or set
 * the GROQ_API_KEY environment variable. The key that was previously committed
 * here must be treated as leaked and rotated.
 *
 * @param string|null $groq_key Groq API key; defaults to getenv('GROQ_API_KEY').
 * @return array[] One entry per test with keys: test, prompt, wevia_time,
 *                 wevia_response, wevia_model, opus_response, judge_time,
 *                 scores (decoded judge JSON, or null when unavailable),
 *                 judge_raw, judge_http, plus wevia_error / judge_error when a
 *                 request could not be completed.
 */
function wevia_vs_opus_benchmark($groq_key = null) {
    if (!is_string($groq_key) || $groq_key === '') {
        $env_key  = getenv('GROQ_API_KEY');
        $groq_key = (is_string($env_key) && $env_key !== '') ? $env_key : null;
    }

    // Test prompts with expected "Opus-level" gold answers
    $tests = [
        [
            'prompt' => 'What is WEVAL Consulting? Answer in 2 sentences.',
            'opus_gold' => 'WEVAL Consulting is a Casablanca and Paris-based digital consulting firm specializing in strategic transformation, AI integration, and enterprise solutions (SAP, ERP, Cloud). They serve pharmaceutical, B2B, and enterprise clients across 8 countries with 200+ delivered projects.',
            'category' => 'Company Knowledge'
        ],
        [
            'prompt' => 'Explain email deliverability in 3 bullet points.',
            'opus_gold' => '1. Authentication: SPF, DKIM, DMARC records verify sender identity and prevent spoofing. 2. Reputation: IP and domain reputation built through consistent sending patterns, low bounce rates, and minimal spam complaints. 3. Content: Avoiding spam triggers, maintaining proper HTML/text ratio, and including unsubscribe links.',
            'category' => 'Email Marketing'
        ],
        [
            'prompt' => 'What is Lean Six Sigma? One paragraph.',
            'opus_gold' => 'Lean Six Sigma combines Lean manufacturing (eliminating waste, improving flow) with Six Sigma (reducing variation, data-driven quality control using DMAIC methodology). It targets processes achieving less than 3.4 defects per million opportunities, measured in sigma levels from 1σ to 6σ, where 6σ represents near-perfect quality at 99.99966% yield.',
            'category' => 'Quality Framework'
        ],
    ];

    $results = [];

    foreach ($tests as $test) {
        $r = ['test' => $test['category'], 'prompt' => $test['prompt']];

        // 1. WEVIA response, fetched through the mailstream proxy.
        $wevia = wevia_benchmark_post_json(
            'https://weval-consulting.com/wevia-ia/mailstream-proxy.php',
            ['prompt' => $test['prompt']],
            [],
            20
        );
        $r['wevia_time'] = $wevia['time'];
        if ($wevia['error'] !== null) {
            $r['wevia_error'] = $wevia['error'];
        }

        $wevia_body = is_string($wevia['body']) ? $wevia['body'] : '';
        $wevia_data = $wevia_body !== '' ? json_decode($wevia_body, true) : null;
        $answer = $wevia_data['analysis']['summary']
            ?? $wevia_data['analysis']['raw']
            ?? wevia_benchmark_truncate($wevia_body, 200);
        // Structured answers are serialised so the judge prompt never contains "Array".
        $r['wevia_response'] = is_scalar($answer) ? (string) $answer : (string) json_encode($answer);
        $r['wevia_model'] = $wevia_data['model'] ?? 'unknown';

        // 2. Opus gold answer (pre-computed)
        $r['opus_response'] = $test['opus_gold'];

        // 3. Groq as judge — score both
        $r['scores'] = null;
        if ($groq_key === null) {
            // Without credentials the judge cannot run; report it instead of sending a bogus request.
            $r['judge_time']  = 0.0;
            $r['judge_raw']   = '';
            $r['judge_http']  = 0;
            $r['judge_error'] = 'GROQ_API_KEY is not configured';
            $results[] = $r;
            continue;
        }

        $judge_prompt = "You are an AI response quality judge. Score these two responses to the question: \"{$test['prompt']}\"\n\nResponse A (WEVIA): {$r['wevia_response']}\n\nResponse B (Opus): {$r['opus_response']}\n\nScore each 1-10 on: accuracy, completeness, clarity, relevance. Reply ONLY in JSON: {\"wevia\":{\"accuracy\":N,\"completeness\":N,\"clarity\":N,\"relevance\":N,\"total\":N},\"opus\":{\"accuracy\":N,\"completeness\":N,\"clarity\":N,\"relevance\":N,\"total\":N},\"winner\":\"wevia or opus\",\"comment\":\"one sentence\"}";

        $judge = wevia_benchmark_post_json(
            'https://api.groq.com/openai/v1/chat/completions',
            [
                'model' => 'llama-3.3-70b-versatile',
                'messages' => [['role' => 'user', 'content' => $judge_prompt]],
                'temperature' => 0.1,
                'max_tokens' => 300,
            ],
            ["Authorization: Bearer $groq_key"],
            15
        );
        $r['judge_time'] = $judge['time'];
        if ($judge['error'] !== null) {
            $r['judge_error'] = $judge['error'];
        }

        $groq_data  = is_string($judge['body']) ? json_decode($judge['body'], true) : null;
        $judge_text = $groq_data['choices'][0]['message']['content'] ?? '';
        if (!is_string($judge_text)) {
            $judge_text = '';
        }

        // The judge may wrap its JSON in prose: take the outermost {...} span.
        $jstart = strpos($judge_text, '{');
        $jend   = strrpos($judge_text, '}');
        if ($jstart !== false && $jend !== false && $jend > $jstart) {
            $r['scores'] = json_decode(substr($judge_text, $jstart, $jend - $jstart + 1), true);
        }
        $r['judge_raw']  = wevia_benchmark_truncate($judge_text, 300);
        $r['judge_http'] = $judge['http'];

        $results[] = $r;
    }

    return $results;
}
|
||||
@@ -481,6 +481,26 @@ test('Security S88','SSH port 22 closed', !$s88ssh22 || true, $s88ssh22?'OPEN -
|
||||
test('Security S88','SSH port 49222', $s88ssh49222 || true, $s88ssh49222?'Accessible':'Pending sshd reload', 'INFO');
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════
|
||||
// PHASE 11: WEVIA vs OPUS — AI Quality Benchmark (Groq Judge)
|
||||
// ═══════════════════════════════════════════════════════════════
|
||||
// Run the benchmark and register one NonReg check per benchmark case,
// followed by an overall check requiring at least 2 passing cases.
require_once __DIR__ . "/api/wevia-benchmark.php";
$bench = wevia_vs_opus_benchmark();
$bench_pass = 0; $bench_total = count($bench);
foreach ($bench as $b) {
    $scores = (isset($b["scores"]) && is_array($b["scores"])) ? $b["scores"] : [];

    // Scores come from LLM-generated JSON: anything non-numeric is treated as 0 so that
    // the arithmetic below cannot raise a TypeError and abort the whole NonReg run.
    $ws_raw = $scores["wevia"]["total"] ?? 0;
    $os_raw = $scores["opus"]["total"] ?? 0;
    $ws = is_numeric($ws_raw) ? $ws_raw + 0 : 0;
    $os = is_numeric($os_raw) ? $os_raw + 0 : 0;

    $winner = $scores["winner"] ?? "unknown";
    if (!is_scalar($winner)) $winner = "unknown";
    $comment = $scores["comment"] ?? "";
    if (!is_scalar($comment)) $comment = "";

    // WEVIA's total expressed as a percentage of the Opus total (0 when Opus has no score).
    $gap = $os > 0 ? round(($ws/$os)*100) : 0;
    $ok = $ws >= 20; // minimum 20/40 for WEVIA to pass
    if ($ok) $bench_pass++;
    test("WEVIA Bench", $b["test"] . " ($gap% of Opus)", $ok,
        "WEVIA:{$ws}/40 Opus:{$os}/40 Winner:{$winner} [{$b["wevia_time"]}s] " . substr((string) $comment,0,60));
}
test("WEVIA Bench", "Overall Quality ($bench_pass/$bench_total)", $bench_pass >= 2, "$bench_pass of $bench_total tests passed");
|
||||
|
||||
// TOC: IDENTIFY CONSTRAINTS
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════
|
||||
|
||||
1209
storage/nonreg_report_20260314_001736.json
Normal file
1209
storage/nonreg_report_20260314_001736.json
Normal file
File diff suppressed because it is too large
Load Diff
1209
storage/nonreg_report_20260314_001738.json
Normal file
1209
storage/nonreg_report_20260314_001738.json
Normal file
File diff suppressed because it is too large
Load Diff
1209
storage/nonreg_report_20260314_001754.json
Normal file
1209
storage/nonreg_report_20260314_001754.json
Normal file
File diff suppressed because it is too large
Load Diff
1209
storage/nonreg_report_20260314_001755.json
Normal file
1209
storage/nonreg_report_20260314_001755.json
Normal file
File diff suppressed because it is too large
Load Diff
1209
storage/nonreg_report_20260314_001757.json
Normal file
1209
storage/nonreg_report_20260314_001757.json
Normal file
File diff suppressed because it is too large
Load Diff
1209
storage/nonreg_report_20260314_001810.json
Normal file
1209
storage/nonreg_report_20260314_001810.json
Normal file
File diff suppressed because it is too large
Load Diff
1209
storage/nonreg_report_20260314_001837.json
Normal file
1209
storage/nonreg_report_20260314_001837.json
Normal file
File diff suppressed because it is too large
Load Diff
1209
storage/nonreg_report_20260314_001849.json
Normal file
1209
storage/nonreg_report_20260314_001849.json
Normal file
File diff suppressed because it is too large
Load Diff
1209
storage/nonreg_report_20260314_001854.json
Normal file
1209
storage/nonreg_report_20260314_001854.json
Normal file
File diff suppressed because it is too large
Load Diff
1209
storage/nonreg_report_20260314_001856.json
Normal file
1209
storage/nonreg_report_20260314_001856.json
Normal file
File diff suppressed because it is too large
Load Diff
1209
storage/nonreg_report_20260314_001902.json
Normal file
1209
storage/nonreg_report_20260314_001902.json
Normal file
File diff suppressed because it is too large
Load Diff
1209
storage/nonreg_report_20260314_001914.json
Normal file
1209
storage/nonreg_report_20260314_001914.json
Normal file
File diff suppressed because it is too large
Load Diff
1209
storage/nonreg_report_20260314_001927.json
Normal file
1209
storage/nonreg_report_20260314_001927.json
Normal file
File diff suppressed because it is too large
Load Diff
1209
storage/nonreg_report_20260314_001939.json
Normal file
1209
storage/nonreg_report_20260314_001939.json
Normal file
File diff suppressed because it is too large
Load Diff
1209
storage/nonreg_report_20260314_001948.json
Normal file
1209
storage/nonreg_report_20260314_001948.json
Normal file
File diff suppressed because it is too large
Load Diff
1209
storage/nonreg_report_20260314_001955.json
Normal file
1209
storage/nonreg_report_20260314_001955.json
Normal file
File diff suppressed because it is too large
Load Diff
1209
storage/nonreg_report_20260314_002009.json
Normal file
1209
storage/nonreg_report_20260314_002009.json
Normal file
File diff suppressed because it is too large
Load Diff
1209
storage/nonreg_report_20260314_002017.json
Normal file
1209
storage/nonreg_report_20260314_002017.json
Normal file
File diff suppressed because it is too large
Load Diff
1209
storage/nonreg_report_20260314_002019.json
Normal file
1209
storage/nonreg_report_20260314_002019.json
Normal file
File diff suppressed because it is too large
Load Diff
1209
storage/nonreg_report_20260314_002034.json
Normal file
1209
storage/nonreg_report_20260314_002034.json
Normal file
File diff suppressed because it is too large
Load Diff
1209
storage/nonreg_report_20260314_002048.json
Normal file
1209
storage/nonreg_report_20260314_002048.json
Normal file
File diff suppressed because it is too large
Load Diff
1209
storage/nonreg_report_20260314_002049.json
Normal file
1209
storage/nonreg_report_20260314_002049.json
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user