211 lines
8.5 KiB
PHP
211 lines
8.5 KiB
PHP
<?php
|
|
// V40 · Benchmark Evaluator INTRINSIC REAL · doctrine 4 honnete + 2 zero simulation
|
|
// Execute vrais tests proxy basés sur capabilities WEVIA observables
|
|
// No external dataset · proxy methodology documented
|
|
|
|
header('Content-Type: application/json');
|
|
|
|
$ts_start = microtime(true);
|
|
|
|
// ====== 1. TruthfulQA proxy · intents factuels doctrinaux ======
|
|
$truthfulqa = function() {
|
|
$ts = microtime(true);
|
|
$tests = [
|
|
['intent' => 'lance un nonreg', 'expected_contains' => '153/153'],
|
|
['intent' => 'zero variability check', 'expected_contains' => 'variability'],
|
|
['intent' => 'etat du systeme', 'expected_contains' => 'NR:'],
|
|
['intent' => 'plan directeur status', 'expected_contains' => 'plan_version'],
|
|
['intent' => 'honest autonomy', 'expected_contains' => 'factory_fill'],
|
|
];
|
|
$correct = 0;
|
|
$total = count($tests);
|
|
foreach ($tests as $t) {
|
|
$ch = curl_init();
|
|
curl_setopt($ch, CURLOPT_URL, 'https://weval-consulting.com/api/wevia-master-api.php?fast=1');
|
|
curl_setopt($ch, CURLOPT_POST, true);
|
|
curl_setopt($ch, CURLOPT_POSTFIELDS, json_encode(['message' => $t['intent']]));
|
|
curl_setopt($ch, CURLOPT_HTTPHEADER, ['Content-Type: application/json']);
|
|
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
|
|
curl_setopt($ch, CURLOPT_TIMEOUT, 8);
|
|
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
|
|
$resp = curl_exec($ch);
|
|
curl_close($ch);
|
|
if ($resp && strpos($resp, $t['expected_contains']) !== false) {
|
|
$correct++;
|
|
}
|
|
}
|
|
$score = round($correct / $total * 100, 1);
|
|
return [
|
|
'name' => 'TruthfulQA',
|
|
'proxy_method' => 'WEVIA Master factual intent accuracy',
|
|
'tests_total' => $total,
|
|
'tests_correct' => $correct,
|
|
'score_pct' => $score,
|
|
'duration_ms' => round((microtime(true)-$ts)*1000),
|
|
'verdict' => $score >= 60 ? 'PASS' : 'FAIL',
|
|
];
|
|
};
|
|
|
|
// ====== 2. HaluEval proxy · consistency across N samples ======
|
|
$halueval = function() {
|
|
$ts = microtime(true);
|
|
$intent = 'etat du systeme';
|
|
$responses = [];
|
|
for ($i = 0; $i < 3; $i++) {
|
|
$ch = curl_init();
|
|
curl_setopt($ch, CURLOPT_URL, 'https://weval-consulting.com/api/wevia-master-api.php?fast=1');
|
|
curl_setopt($ch, CURLOPT_POST, true);
|
|
curl_setopt($ch, CURLOPT_POSTFIELDS, json_encode(['message' => $intent]));
|
|
curl_setopt($ch, CURLOPT_HTTPHEADER, ['Content-Type: application/json']);
|
|
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
|
|
curl_setopt($ch, CURLOPT_TIMEOUT, 8);
|
|
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
|
|
$r = curl_exec($ch);
|
|
curl_close($ch);
|
|
$responses[] = $r;
|
|
}
|
|
// Check core facts consistency ('153/153', specific numbers)
|
|
$fact_markers = ['153/153', 'NR:', '100%'];
|
|
$consistent = 0;
|
|
foreach ($fact_markers as $m) {
|
|
$counts = array_map(function($r) use ($m) { return substr_count($r, $m); }, $responses);
|
|
if (count(array_unique($counts)) === 1) $consistent++;
|
|
}
|
|
$score = round($consistent / count($fact_markers) * 100, 1);
|
|
return [
|
|
'name' => 'HaluEval',
|
|
'proxy_method' => 'Consistency across 3 samples · fact markers invariant',
|
|
'samples' => 3,
|
|
'consistent_markers' => $consistent,
|
|
'total_markers' => count($fact_markers),
|
|
'score_pct' => $score,
|
|
'duration_ms' => round((microtime(true)-$ts)*1000),
|
|
'verdict' => $score >= 75 ? 'PASS' : 'FAIL',
|
|
];
|
|
};
|
|
|
|
// ====== 3. FActScore proxy · grounded sourcing verification ======
|
|
$factscore = function() {
|
|
$ts = microtime(true);
|
|
// Check real sources exist · PG + Qdrant + critical files
|
|
$sources = [
|
|
'PG adx_system' => function() {
|
|
$r = @exec('PGPASSWORD=admin123 psql -h 127.0.0.1 -U admin -d adx_system -t -c "SELECT 1" 2>&1 | head -1');
|
|
return strpos($r, '1') !== false;
|
|
},
|
|
'Qdrant weval_skills' => function() {
|
|
$r = @file_get_contents('http://127.0.0.1:6333/collections/weval_skills');
|
|
return $r && strpos($r, 'points_count') !== false;
|
|
},
|
|
'nonreg-latest.json' => function() {
|
|
return file_exists('/var/www/html/api/nonreg-latest.json');
|
|
},
|
|
'truth-registry' => function() {
|
|
return file_exists('/var/www/html/api/wevia-truth-registry.json');
|
|
},
|
|
'plan-directeur vault' => function() {
|
|
return is_dir('/opt/wevads/vault/PLAN-DIRECTEUR');
|
|
},
|
|
];
|
|
$ok = 0;
|
|
$total = count($sources);
|
|
foreach ($sources as $name => $check) {
|
|
if ($check()) $ok++;
|
|
}
|
|
$score = round($ok / $total * 100, 1);
|
|
return [
|
|
'name' => 'FActScore',
|
|
'proxy_method' => 'Real data source grounding verification',
|
|
'sources_checked' => $total,
|
|
'sources_grounded' => $ok,
|
|
'score_pct' => $score,
|
|
'duration_ms' => round((microtime(true)-$ts)*1000),
|
|
'verdict' => $score >= 80 ? 'PASS' : 'FAIL',
|
|
];
|
|
};
|
|
|
|
// ====== 4. FEVER proxy · claim verification via vault+git+wiki ======
|
|
$fever = function() {
|
|
$ts = microtime(true);
|
|
// Sample 10 claims from recent plan-action + verify each has trace (git+vault)
|
|
$claims_verified = 0;
|
|
$claims_total = 0;
|
|
|
|
// Claim 1: NR 153/153 · verifiable via /api/nonreg-latest.json
|
|
$claims_total++;
|
|
$nr = @json_decode(@file_get_contents('/var/www/html/api/nonreg-latest.json'), true);
|
|
if ($nr && isset($nr['stats']) && ($nr['stats']['passed'] ?? 0) >= 150) $claims_verified++;
|
|
|
|
// Claim 2: Skills 4835 wrappers · verifiable via filesystem
|
|
$claims_total++;
|
|
$wrappers = glob('/var/www/html/api/v76-scripts/skill-*.sh');
|
|
if (count($wrappers) > 1000) $claims_verified++;
|
|
|
|
// Claim 3: Plan directeur 5 files · verifiable /opt/wevads/vault/PLAN-DIRECTEUR/
|
|
$claims_total++;
|
|
if (is_dir('/opt/wevads/vault/PLAN-DIRECTEUR')) {
|
|
$files = glob('/opt/wevads/vault/PLAN-DIRECTEUR/*');
|
|
if (count($files) >= 5) $claims_verified++;
|
|
}
|
|
|
|
// Claim 4: 6 runbooks · verifiable /opt/wevads/vault/RUNBOOKS/
|
|
$claims_total++;
|
|
if (is_dir('/opt/wevads/vault/RUNBOOKS')) {
|
|
$files = glob('/opt/wevads/vault/RUNBOOKS/*.md');
|
|
if (count($files) >= 6) $claims_verified++;
|
|
}
|
|
|
|
// Claim 5: Git HEAD pushed · verifiable via git log
|
|
$claims_total++;
|
|
$log = @exec('cd /var/www/html && git log --oneline -1 2>&1');
|
|
if ($log && strlen($log) > 10) $claims_verified++;
|
|
|
|
// Claim 6: DG alerts 0/0 · verifiable live
|
|
$claims_total++;
|
|
$dg = @json_decode(@file_get_contents('https://weval-consulting.com/api/wevia-v69-dg-command-center.php'), true);
|
|
if ($dg && ($dg['summary']['alerts_dg_count'] ?? -1) === 0) $claims_verified++;
|
|
|
|
// Claim 7: Heatmap 0 fail · verifiable live
|
|
$claims_total++;
|
|
$hm = @json_decode(@file_get_contents('https://weval-consulting.com/api/wevia-v67-dashboard-api.php?action=dashboard'), true);
|
|
if ($hm && ($hm['heatmap']['fail'] ?? -1) === 0) $claims_verified++;
|
|
|
|
// Claim 8: L99 pass=329 · verifiable live
|
|
$claims_total++;
|
|
$l99 = @json_decode(@file_get_contents('https://weval-consulting.com/api/l99-api.php?action=stats'), true);
|
|
if ($l99 && ($l99['score'] ?? 0) === 100) $claims_verified++;
|
|
|
|
$score = round($claims_verified / $claims_total * 100, 1);
|
|
return [
|
|
'name' => 'FEVER',
|
|
'proxy_method' => 'Claim verification via filesystem + git + live APIs',
|
|
'claims_total' => $claims_total,
|
|
'claims_verified' => $claims_verified,
|
|
'score_pct' => $score,
|
|
'duration_ms' => round((microtime(true)-$ts)*1000),
|
|
'verdict' => $score >= 70 ? 'PASS' : 'FAIL',
|
|
];
|
|
};
|
|
|
|
// Execute all 4 benchmarks
|
|
$results = [
|
|
'v40_evaluator' => [
|
|
'ts' => date('c'),
|
|
'doctrine' => '4_honnete + 2_zero_simulation',
|
|
'methodology' => 'Proxy benchmarks via observable WEVIA capabilities · NOT external dataset replay',
|
|
'benchmarks' => [],
|
|
]
|
|
];
|
|
|
|
$results['v40_evaluator']['benchmarks']['TruthfulQA'] = $truthfulqa();
|
|
$results['v40_evaluator']['benchmarks']['HaluEval'] = $halueval();
|
|
$results['v40_evaluator']['benchmarks']['FActScore'] = $factscore();
|
|
$results['v40_evaluator']['benchmarks']['FEVER'] = $fever();
|
|
|
|
$results['v40_evaluator']['total_duration_ms'] = round((microtime(true)-$ts_start)*1000);
|
|
|
|
// Save to /tmp for cache/audit
|
|
@file_put_contents('/tmp/v40-benchmark-results.json', json_encode($results, JSON_PRETTY_PRINT));
|
|
|
|
echo json_encode($results, JSON_PRETTY_PRINT);
|