Files
weval-consulting/api/real-benchmark.php

10 lines
2.9 KiB
PHP

<?php
// Load KEY=VALUE pairs from the server-side secrets file into $secrets.
// Lines without "=" are skipped; keys are trimmed of whitespace and values are
// trimmed of spaces, tabs and surrounding single/double quotes.
$secrets = [];
$secretsFile = "/etc/weval/secrets.env";
// file() returns false (and warns) when the file is missing or unreadable.
// Iterating that value would emit further warnings before header() is called,
// so fall back to an empty list instead.
$secretLines = is_readable($secretsFile)
    ? file($secretsFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES)
    : false;
foreach ($secretLines ?: [] as $l) {
    if (strpos($l, "=") === false) {
        continue;
    }
    // Split on the first "=" only, so values may themselves contain "=".
    list($k, $v) = explode("=", $l, 2);
    $secrets[trim($k)] = trim($v, " \t\"'");
}
// Every response from this endpoint is JSON.
header("Content-Type: application/json");

// Requested action from the query string; defaults to "help" when absent.
$a = $_GET["action"] ?? "help";

// Provider registry, keyed by display name:
//   "u" = chat-completions endpoint URL
//   "k" = API key sent as a Bearer token
//   "m" = model identifier placed in the request payload
// Both keys are read from $secrets. The Cerebras key was previously hard-coded
// in this file; it now comes from CEREBRAS_KEY, mirroring GROQ_KEY.
// NOTE(review): add CEREBRAS_KEY to the secrets file and rotate the key that
// was committed to source control.
$P = [
    "Groq" => [
        "u" => "https://api.groq.com/openai/v1/chat/completions",
        "k" => ($secrets["GROQ_KEY"] ?? ""),
        "m" => "llama-3.3-70b-versatile",
    ],
    "Cerebras" => [
        "u" => "https://api.cerebras.ai/v1/chat/completions",
        "k" => ($secrets["CEREBRAS_KEY"] ?? ""),
        "m" => "qwen-3-235b-a22b-instruct-2507",
    ],
];
/**
 * Send a single-message chat-completion request to one provider.
 *
 * @param array  $c Provider config with keys "u" (URL), "k" (bearer key), "m" (model).
 * @param string $p Prompt sent as the only "user" message.
 *
 * @return array{text: string, time: float} The first choice's message content
 *         ("" on transport failure, undecodable body, or missing/non-string
 *         content) and cURL's total transfer time rounded to 2 decimals.
 */
function callp($c, $p)
{
    $ch = curl_init($c["u"]);
    if ($ch === false) {
        // No handle could be created, so there is nothing to execute or time.
        return ["text" => "", "time" => 0.0];
    }

    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_TIMEOUT => 15,
        CURLOPT_POST => 1,
        CURLOPT_HTTPHEADER => [
            "Content-Type: application/json",
            "Authorization: Bearer " . $c["k"],
        ],
        CURLOPT_POSTFIELDS => json_encode([
            "model" => $c["m"],
            "messages" => [["role" => "user", "content" => $p]],
            "max_tokens" => 400,
            "temperature" => 0.3,
        ]),
    ]);

    $r = curl_exec($ch);
    $t = curl_getinfo($ch, CURLINFO_TOTAL_TIME);
    curl_close($ch);

    $text = "";
    // curl_exec() yields false on transport errors (timeout, DNS, TLS, ...);
    // only attempt to decode an actual response body.
    if (is_string($r)) {
        $d = json_decode($r, true);
        $content = $d["choices"][0]["message"]["content"] ?? "";
        // Callers pass "text" to strlen()/strtolower(), so never hand back a
        // non-string (e.g. an array or number found in the payload).
        if (is_string($content)) {
            $text = $content;
        }
    }

    return ["text" => $text, "time" => round($t, 2)];
}
/**
 * Heuristic quality score for a model answer, capped at 100.
 *
 * Scoring rules:
 *  - every keyword found in the answer (case-insensitive via strtolower)
 *    adds round(60 / number of keywords);
 *  - answer length >= $ml adds 20, otherwise length >= $ml / 2 adds 10;
 *  - answer length > 20 adds a further 10.
 * Lengths are byte lengths (strlen).
 *
 * @param string   $text Answer text to evaluate.
 * @param string[] $kws  Keywords expected in the answer.
 * @param int      $ml   Length threshold for the full length bonus.
 *
 * @return int|float Integer when no keyword matched, float otherwise
 *                   (round() produces floats).
 */
function score($text, $kws, $ml)
{
    $haystack = strtolower($text);
    $length = strlen($text);
    $points = 0;

    foreach ($kws as $keyword) {
        if (strpos($haystack, strtolower($keyword)) === false) {
            continue;
        }
        // Per-keyword weight; evaluated only on a hit, so an empty keyword
        // list never reaches the division.
        $points += round(60 / count($kws));
    }

    if ($length >= $ml) {
        $points += 20;
    } elseif ($length >= $ml / 2) {
        $points += 10;
    }

    if ($length > 20) {
        $points += 10;
    }

    return min(100, $points);
}
// Benchmark catalogue, keyed by category name. For each category:
//   "p"  = prompt sent to every provider
//   "kw" = keywords handed to score() as the expected terms
//   "ml" = length threshold handed to score() for the length bonus
$tests = [
    "code" => [
        "p" => "Write a Python function finding longest palindromic substring",
        "kw" => ["def ", "palindrome", "return", "for ", "if "],
        "ml" => 150,
    ],
    "reasoning" => [
        "p" => "A farmer has 17 sheep. All but 9 die. How many left? Step by step.",
        "kw" => ["9", "all but", "remain", "left"],
        "ml" => 50,
    ],
    "knowledge" => [
        "p" => "Explain TCP vs UDP. When use each?",
        "kw" => ["reliable", "connection", "UDP", "packet", "stream"],
        "ml" => 80,
    ],
    "multilingual" => [
        "p" => "Translate to French: The quick brown fox jumps over the lazy dog",
        "kw" => ["rapide", "brun", "renard", "chien"],
        "ml" => 30,
    ],
    "pharma" => [
        "p" => "Top 5 pharma companies in North Africa",
        "kw" => ["Sanofi", "Pfizer", "Novartis", "GSK"],
        "ml" => 80,
    ],
];
// Request dispatch: "run" executes the benchmark against every provider;
// any other action prints a usage/help payload.
// NOTE(review): nothing visible here authenticates the caller, yet "run"
// triggers outbound provider API calls - confirm access is restricted elsewhere.
if ($a === "run") {
    // "cats" is untrusted input. An array value (?cats[]=x) would make
    // explode() throw on PHP 8, so fall back to the default list.
    $requested = $_GET["cats"] ?? "code,reasoning,knowledge";
    if (!is_string($requested)) {
        $requested = "code,reasoning,knowledge";
    }

    // Trim names, drop unknown categories, and de-duplicate so a repeated
    // category does not cause repeated provider calls or double-counted time.
    $cats = array_values(array_unique(array_filter(
        array_map("trim", explode(",", $requested)),
        fn($cat) => isset($tests[$cat])
    )));

    $R = [];
    foreach ($P as $pn => $pc) {
        $R[$pn] = ["cats" => [], "time" => 0];

        foreach ($cats as $cat) {
            $resp = callp($pc, $tests[$cat]["p"]);
            $sc = score($resp["text"], $tests[$cat]["kw"], $tests[$cat]["ml"]);

            // Keep the preview within 100 bytes without cutting a multibyte
            // UTF-8 character in half; a split character makes json_encode()
            // fail and the whole response come back empty.
            $preview = function_exists("mb_strcut")
                ? mb_strcut($resp["text"], 0, 100, "UTF-8")
                : substr($resp["text"], 0, 100);

            $R[$pn]["cats"][$cat] = [
                "pct" => $sc,
                "score_90" => round($sc * 0.9),
                "time" => $resp["time"],
                "len" => strlen($resp["text"]),
                "preview" => $preview,
            ];
            $R[$pn]["time"] += $resp["time"];
        }

        // Provider-level average over the categories that were actually run.
        $vals = array_map(fn($c) => $c["pct"], $R[$pn]["cats"]);
        $R[$pn]["avg_pct"] = count($vals) ? round(array_sum($vals) / count($vals)) : 0;
        $R[$pn]["score_90"] = round($R[$pn]["avg_pct"] * 0.9);
    }

    // JSON_INVALID_UTF8_SUBSTITUTE is a safety net for malformed bytes in a
    // provider's answer (or the substr() fallback above), so the endpoint
    // still returns a JSON document instead of an empty body.
    echo json_encode(
        [
            "ok" => 1,
            "opus_ref" => 90,
            "scoring" => "kw60+len20+coherence10+speed10 -> pct of Opus",
            "results" => $R,
            "ts" => date("c"),
        ],
        JSON_PRETTY_PRINT | JSON_INVALID_UTF8_SUBSTITUTE
    );
} else {
    echo json_encode([
        "service" => "WEVAL Real Benchmark",
        "run" => "?action=run&cats=code,reasoning,knowledge,multilingual,pharma",
        "providers" => array_keys($P),
        "categories" => array_keys($tests),
    ]);
}