116 lines
5.1 KiB
Python
116 lines
5.1 KiB
Python
#!/usr/bin/env python3
|
|
"""WEVAL AI Benchmark Runner"""
|
|
import json, time, urllib.request, ssl, sys
|
|
|
|
# Path of the persistent JSON results database (read and rewritten each run).
DB = '/opt/wevads/vault/ai-benchmark.json'

# Endpoint of the WEVAL-IA chat API queried by call_wevia().
WEVIA = 'https://weval-consulting.com/api/weval-ia'

# Benchmark topics: each maps a topic name to the prompt sent to every model
# and the list of scoring criteria (keys into KW) that score() checks.
TOPICS = {
'strategy': {'prompt': 'Propose une strategie digitale pour une PME marocaine de 200 employes dans le textile', 'criteria': ['structure','actionable','maroc_context']},
'code': {'prompt': 'Ecris en Python une classe CsvAnalyzer avec methodes load et describe', 'criteria': ['has_class','has_def','docstring']},
'pharma': {'prompt': 'Quelles sont les etapes de la pharmacovigilance pour un nouveau medicament', 'criteria': ['pharmacovigilance','steps','regulatory']},
'security': {'prompt': 'Liste les 5 vulnerabilites OWASP Top 10 avec remediations', 'criteria': ['owasp','injection','xss','remediation']},
'erp': {'prompt': 'Compare SAP vs Oracle ERP pour une entreprise de 500 employes', 'criteria': ['sap','oracle','comparison','recommendation']},
}

# Keyword lists per criterion: score() awards 10 points for a criterion when
# any of its keywords appears (substring match) in the lowercased response.
# NOTE(review): 'Args' and 'Return' contain uppercase letters, so with the
# current `kw in resp.lower()` test they can never match — presumably meant
# to be case-insensitive; verify against score().
KW = {
'structure': ['###','**','1.','2.'], 'actionable': ['etape','action','recommand','deploy'],
'maroc_context': ['maroc','marocain','pme'], 'has_class': ['class '], 'has_def': ['def '],
'docstring': ['Args','param','Return','description'], 'pharmacovigilance': ['pharmacovigilance','effet'],
'steps': ['etape','phase','1)','1.'], 'regulatory': ['amm','autorisation','reglementaire'],
'owasp': ['owasp','top 10'], 'injection': ['injection','sql injection'],
'xss': ['xss','cross-site'], 'remediation': ['remediation','correction','proteger','mitigation'],
'sap': ['sap','s/4hana'], 'oracle': ['oracle','erp'],
'comparison': ['avantage','inconvenient','vs','comparaison'], 'recommendation': ['recommand','conseil'],
}
|
|
|
|
def score(resp, criteria, latency):
    """Heuristically score a model response on a 0-100 scale.

    Points come from four sources:
      * 10 per criterion whose KW keyword list matches the response,
      * a length bonus (longer answers earn more, up to +15),
      * a latency bonus (faster answers earn more; latency <= 0 marks a
        failed call and earns nothing),
      * small bonuses for markdown structure (code fences, bold, headings).

    Args:
        resp: Model response text (or an error string on failure).
        criteria: Iterable of criterion names, keys into the KW table.
        latency: Round-trip time in milliseconds; 0 signals a failed call
            (both call_* helpers return 0 on exception).

    Returns:
        Integer score capped at 100.
    """
    s = 0
    lower = resp.lower()
    ln = len(resp)
    for c in criteria:
        # Each criterion counts at most once: first matching keyword wins.
        # Lowercase the keyword too, so mixed-case entries such as 'Args'
        # and 'Return' can actually match the lowercased response
        # (the original `kw in lower` made them dead keywords).
        for kw in KW.get(c, []):
            if kw.lower() in lower:
                s += 10
                break
    # Length bonus: reward substantive answers.
    if ln > 3000:
        s += 15
    elif ln > 1500:
        s += 10
    elif ln > 500:
        s += 5
    # Latency bonus: every branch requires latency > 0, because 0 means the
    # call failed. The original `elif latency < 2000` wrongly awarded +10
    # to failed calls (latency == 0 fell through the first guard).
    if 0 < latency < 1000:
        s += 15
    elif 0 < latency < 2000:
        s += 10
    elif 0 < latency < 4000:
        s += 5
    # Formatting bonuses for markdown structure.
    if '```' in resp:
        s += 3
    if '**' in resp:
        s += 2
    if '###' in resp:
        s += 2
    return min(s, 100)
|
|
|
|
def call_wevia(prompt, mode):
    """POST *prompt* to the WEVAL-IA API and collect the reply.

    Args:
        prompt: User message to send.
        mode: API mode selector ('fast', 'code', 'deep', ...).

    Returns:
        Tuple ``(response_text, provider_name, latency_ms)``. On any
        failure, returns ``(truncated error message, '?', 0)`` — the zero
        latency is how score() recognizes a failed call.
    """
    try:
        data = json.dumps({'message': prompt, 'mode': mode}).encode()
        req = urllib.request.Request(
            WEVIA,
            data=data,
            headers={'Content-Type': 'application/json'},
            method='POST',
        )
        # SECURITY NOTE(review): TLS certificate verification is disabled
        # here — acceptable only if the endpoint is fully trusted; confirm
        # this is intentional rather than a workaround.
        ctx = ssl.create_default_context()
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
        t0 = time.time()
        # `with` closes the HTTP response; the original leaked the handle.
        with urllib.request.urlopen(req, timeout=20, context=ctx) as resp:
            d = json.loads(resp.read())
        lat = int((time.time() - t0) * 1000)
        # Prefer the server-reported latency; fall back to our measurement.
        return d.get('response', ''), d.get('provider', '?'), d.get('latency_ms', lat)
    except Exception as e:
        # Degrade to an error tuple so one bad call doesn't abort the run.
        return str(e)[:100], '?', 0
|
|
|
|
def call_ollama(prompt, model):
    """Query a local Ollama server (port 11435) with *prompt*.

    Args:
        prompt: Prompt text for the model.
        model: Ollama model tag, e.g. 'qwen3.5:0.8b'.

    Returns:
        Tuple ``(response_text, model, latency_ms, tokens_per_second)``.
        On any failure, returns ``(truncated error message, model, 0, 0)``;
        the zero latency marks the failure for score().
    """
    try:
        data = json.dumps({
            'model': model,
            'prompt': prompt,
            'stream': False,
            # Keep generations short and cheap for benchmarking.
            'options': {'num_predict': 200, 'num_ctx': 512},
        }).encode()
        req = urllib.request.Request(
            'http://127.0.0.1:11435/api/generate',
            data=data,
            headers={'Content-Type': 'application/json'},
        )
        t0 = time.time()
        # `with` closes the HTTP response; the original leaked the handle.
        with urllib.request.urlopen(req, timeout=60) as resp:
            d = json.loads(resp.read())
        lat = int((time.time() - t0) * 1000)
        # Tokens/second from Ollama's eval stats; eval_duration is in ns.
        # The max() guard avoids division by ~0 when stats are missing.
        tps = d.get('eval_count', 0) / max(d.get('eval_duration', 1) / 1e9, 0.01)
        return d.get('response', ''), model, lat, round(tps, 1)
    except Exception as e:
        return str(e)[:100], model, 0, 0
|
|
|
|
# ---- Script entry: run one benchmark topic against every backend. ----
topic = sys.argv[1] if len(sys.argv) > 1 else 'strategy'
if topic not in TOPICS:
    print(json.dumps({'error': f'invalid topic: {topic}'}))
    sys.exit(1)

t = TOPICS[topic]
results = {}

# Query the three WEVAL-IA modes.
for mode, name in [('fast', 'wevia_fast'), ('code', 'wevcode'), ('deep', 'manager')]:
    resp, prov, lat = call_wevia(t['prompt'], mode)
    sc = score(resp, t['criteria'], lat)
    results[name] = {'score': sc, 'provider': prov, 'latency': lat, 'length': len(resp), 'preview': resp[:200]}
    time.sleep(1)  # small pause between API calls

# Query the local Ollama model.
resp, mdl, lat, tps = call_ollama(t['prompt'], 'qwen3.5:0.8b')
sc = score(resp, t['criteria'], lat)
results['ollama_08b'] = {'score': sc, 'provider': mdl, 'latency': lat, 'length': len(resp), 'tps': tps, 'preview': resp[:200]}

# Load the persistent DB. Missing or corrupt files start a fresh database;
# the exception list is narrow (was a bare `except:` that also swallowed
# KeyboardInterrupt etc.) and the file handle is now closed via `with`
# (the original `json.load(open(DB))` leaked it).
try:
    with open(DB, encoding='utf-8') as fh:
        db = json.load(fh)
except (OSError, ValueError):
    db = {'benchmarks': [], 'leaderboard': {}, 'total_runs': 0}

run = {'topic': topic, 'timestamp': time.strftime('%Y-%m-%dT%H:%M:%S'), 'results': results}
db['benchmarks'].append(run)
db['total_runs'] = len(db['benchmarks'])
db['last_run'] = run['timestamp']

# Rebuild the leaderboard (total, average, run count per AI) from all
# stored benchmark runs, not just the current one.
scores_agg = {}
counts = {}
for b in db['benchmarks']:
    for ai, r in b['results'].items():
        scores_agg[ai] = scores_agg.get(ai, 0) + r.get('score', 0)
        counts[ai] = counts.get(ai, 0) + 1
db['leaderboard'] = {
    ai: {
        'total': scores_agg[ai],
        'avg': round(scores_agg[ai] / counts[ai], 1),
        'runs': counts[ai],
    }
    for ai in scores_agg
}

# Persist the DB; `with` guarantees the handle is flushed and closed
# (the original `json.dump(db, open(DB, 'w'))` leaked it).
with open(DB, 'w', encoding='utf-8') as fh:
    json.dump(db, fh, indent=2, ensure_ascii=False)

# Print a one-line-per-AI scoreboard, best score first.
for ai, r in sorted(results.items(), key=lambda x: -x[1]['score']):
    print(f'{r["score"]:3d} | {ai:<16s} | {r["provider"]:<12s} | {r["latency"]:5d}ms | {r["length"]:5d}c')
print(f'DONE:{topic}')
|