Files
html/api/ai-benchmark-runner.py
2026-04-12 22:57:03 +02:00

116 lines
5.1 KiB
Python

#!/usr/bin/env python3
"""WEVAL AI Benchmark Runner"""
import json, time, urllib.request, ssl, sys
# Path of the JSON "database" file that accumulates benchmark runs.
DB = '/opt/wevads/vault/ai-benchmark.json'
# Remote WEVAL-IA chat endpoint (accepts POST JSON: {message, mode}).
WEVIA = 'https://weval-consulting.com/api/weval-ia'
# Benchmark topics: each entry pairs a (French) prompt with the list of
# scoring criteria — keys into KW below — its answer is checked against.
TOPICS = {
'strategy': {'prompt': 'Propose une strategie digitale pour une PME marocaine de 200 employes dans le textile', 'criteria': ['structure','actionable','maroc_context']},
'code': {'prompt': 'Ecris en Python une classe CsvAnalyzer avec methodes load et describe', 'criteria': ['has_class','has_def','docstring']},
'pharma': {'prompt': 'Quelles sont les etapes de la pharmacovigilance pour un nouveau medicament', 'criteria': ['pharmacovigilance','steps','regulatory']},
'security': {'prompt': 'Liste les 5 vulnerabilites OWASP Top 10 avec remediations', 'criteria': ['owasp','injection','xss','remediation']},
'erp': {'prompt': 'Compare SAP vs Oracle ERP pour une entreprise de 500 employes', 'criteria': ['sap','oracle','comparison','recommendation']},
}
# Keyword lists per criterion: a criterion counts as satisfied when ANY of
# its keywords appears (case-insensitively) in the response text — see score().
KW = {
'structure': ['###','**','1.','2.'], 'actionable': ['etape','action','recommand','deploy'],
'maroc_context': ['maroc','marocain','pme'], 'has_class': ['class '], 'has_def': ['def '],
'docstring': ['Args','param','Return','description'], 'pharmacovigilance': ['pharmacovigilance','effet'],
'steps': ['etape','phase','1)','1.'], 'regulatory': ['amm','autorisation','reglementaire'],
'owasp': ['owasp','top 10'], 'injection': ['injection','sql injection'],
'xss': ['xss','cross-site'], 'remediation': ['remediation','correction','proteger','mitigation'],
'sap': ['sap','s/4hana'], 'oracle': ['oracle','erp'],
'comparison': ['avantage','inconvenient','vs','comparaison'], 'recommendation': ['recommand','conseil'],
}
def score(resp, criteria, latency):
    """Score an AI response on a 0-100 scale.

    Points come from keyword coverage of the topic criteria (10 each),
    response length, latency buckets, and markdown-formatting bonuses.

    Args:
        resp: response text (may be a truncated error string on failure).
        criteria: criterion names; keys into the module-level KW table.
        latency: round-trip time in milliseconds; 0 signals a failed call
            (both call_* helpers return latency 0 from their except paths).

    Returns:
        Integer score, capped at 100.
    """
    s = 0
    lower = resp.lower()
    ln = len(resp)
    # +10 per criterion if any of its keywords appears in the response.
    for c in criteria:
        for kw in KW.get(c, []):
            if kw in lower:
                s += 10
                break
    # Length bonus: longer answers are treated as more thorough.
    if ln > 3000:
        s += 15
    elif ln > 1500:
        s += 10
    elif ln > 500:
        s += 5
    # Latency bonus. BUG FIX: latency == 0 marks a failed call and must
    # earn no points; the original `elif latency < 2000` awarded +10 to
    # failures because 0 < 2000.
    if 0 < latency < 1000:
        s += 15
    elif 0 < latency < 2000:
        s += 10
    elif 0 < latency < 4000:
        s += 5
    # Small bonuses for code fences and markdown structure.
    if '```' in resp:
        s += 3
    if '**' in resp:
        s += 2
    if '###' in resp:
        s += 2
    return min(s, 100)
def call_wevia(prompt, mode):
    """POST a prompt to the WEVAL-IA endpoint.

    Args:
        prompt: user message to send.
        mode: endpoint mode string ('fast', 'code', 'deep', ...).

    Returns:
        (response_text, provider, latency_ms). On any failure returns
        (truncated error message, '?', 0) so the caller can still score
        the run; latency 0 is the failure sentinel.
    """
    try:
        payload = json.dumps({'message': prompt, 'mode': mode}).encode()
        req = urllib.request.Request(
            WEVIA, data=payload,
            headers={'Content-Type': 'application/json'}, method='POST')
        # SECURITY NOTE: certificate verification is deliberately disabled
        # below. Tolerable only because this hits our own known host; do
        # not reuse this context for arbitrary URLs.
        ctx = ssl.create_default_context()
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
        t0 = time.time()
        # `with` guarantees the HTTP response is closed (the original
        # leaked the connection object).
        with urllib.request.urlopen(req, timeout=20, context=ctx) as resp:
            d = json.loads(resp.read())
        lat = int((time.time() - t0) * 1000)
        # Prefer the server-reported latency when present.
        return d.get('response', ''), d.get('provider', '?'), d.get('latency_ms', lat)
    except Exception as e:
        # Best-effort: return the error text instead of aborting the benchmark.
        return str(e)[:100], '?', 0
def call_ollama(prompt, model):
    """Query a local Ollama server for a completion.

    Args:
        prompt: prompt text.
        model: Ollama model tag (e.g. 'qwen3.5:0.8b').

    Returns:
        (response_text, model, latency_ms, tokens_per_second). On failure
        returns (truncated error message, model, 0, 0); latency 0 is the
        failure sentinel.
    """
    try:
        payload = json.dumps({
            'model': model, 'prompt': prompt, 'stream': False,
            # Keep generations small so benchmark runs stay quick.
            'options': {'num_predict': 200, 'num_ctx': 512},
        }).encode()
        req = urllib.request.Request(
            'http://127.0.0.1:11435/api/generate', data=payload,
            headers={'Content-Type': 'application/json'})
        t0 = time.time()
        # `with` guarantees the HTTP response is closed (the original
        # leaked the connection object).
        with urllib.request.urlopen(req, timeout=60) as resp:
            d = json.loads(resp.read())
        lat = int((time.time() - t0) * 1000)
        # eval_duration is reported in nanoseconds; floor the divisor to
        # avoid division by zero on empty generations.
        tps = d.get('eval_count', 0) / max(d.get('eval_duration', 1) / 1e9, 0.01)
        return d.get('response', ''), model, lat, round(tps, 1)
    except Exception as e:
        return str(e)[:100], model, 0, 0
# ---- main script: run one benchmark topic, persist history, print table ----
topic = sys.argv[1] if len(sys.argv) > 1 else 'strategy'
if topic not in TOPICS:
    print(json.dumps({'error': f'invalid topic: {topic}'}))
    sys.exit(1)
t = TOPICS[topic]
results = {}
# Three WEVAL-IA modes, then the local Ollama model.
for mode, name in [('fast', 'wevia_fast'), ('code', 'wevcode'), ('deep', 'manager')]:
    resp, prov, lat = call_wevia(t['prompt'], mode)
    sc = score(resp, t['criteria'], lat)
    results[name] = {'score': sc, 'provider': prov, 'latency': lat,
                     'length': len(resp), 'preview': resp[:200]}
    time.sleep(1)  # pace requests to the remote API
resp, mdl, lat, tps = call_ollama(t['prompt'], 'qwen3.5:0.8b')
sc = score(resp, t['criteria'], lat)
results['ollama_08b'] = {'score': sc, 'provider': mdl, 'latency': lat,
                         'length': len(resp), 'tps': tps, 'preview': resp[:200]}
# Load history; start fresh if the DB file is missing or corrupt.
# Narrowed from a bare `except:` which also swallowed KeyboardInterrupt;
# json.JSONDecodeError is a ValueError subclass. The `with` blocks close
# the file handles the original left open.
try:
    with open(DB) as fh:
        db = json.load(fh)
except (OSError, ValueError):
    db = {'benchmarks': [], 'leaderboard': {}, 'total_runs': 0}
run = {'topic': topic, 'timestamp': time.strftime('%Y-%m-%dT%H:%M:%S'), 'results': results}
db['benchmarks'].append(run)
db['total_runs'] = len(db['benchmarks'])
db['last_run'] = run['timestamp']
# Rebuild leaderboard aggregates from the full stored history.
scores_agg = {}
counts = {}
for b in db['benchmarks']:
    for ai, r in b['results'].items():
        scores_agg[ai] = scores_agg.get(ai, 0) + r.get('score', 0)
        counts[ai] = counts.get(ai, 0) + 1
db['leaderboard'] = {
    ai: {'total': scores_agg[ai],
         'avg': round(scores_agg[ai] / counts[ai], 1),
         'runs': counts[ai]}
    for ai in scores_agg
}
with open(DB, 'w') as fh:
    json.dump(db, fh, indent=2, ensure_ascii=False)
# Print a compact results table, best score first.
for ai, r in sorted(results.items(), key=lambda x: -x[1]['score']):
    print(f'{r["score"]:3d} | {ai:<16s} | {r["provider"]:<12s} | {r["latency"]:5d}ms | {r["length"]:5d}c')
print(f'DONE:{topic}')