Files
html/api/generate-training.php
2026-04-17 01:00:02 +02:00

155 lines
6.5 KiB
PHP

<?php
// CLI_ONLY_GUARD — this script is a long-running batch job, not an HTTP endpoint.
// A request arriving through a non-CLI SAPI receives a short JSON notice and the
// script stops, unless the `cli_run` query parameter is present and non-empty.
// NOTE(review): the `cli_run` override lets any HTTP client start the batch run;
// confirm this is intended, or restrict/remove it.
if (!(PHP_SAPI === 'cli' || !empty($_GET['cli_run']))) {
    $notice = [
        'info' => 'batch job, CLI only',
        'usage' => 'php generate-training.php',
        'estimated_duration' => '5-10min',
    ];
    header('Content-Type: application/json');
    http_response_code(200);
    echo json_encode($notice);
    exit;
}
/**
 * WEVIA Training Data Generator — builds fine-tuning examples through the Groq API.
 *
 * Run on S204: php /var/www/weval/wevia-ia/generate-training.php
 */

// Report every notice/warning and allow up to one hour of execution.
error_reporting(E_ALL);
set_time_limit(3600);

// Credentials live outside the web tree.
// NOTE(review): GROQ_KEY is presumably defined by this file — confirm.
require_once '/opt/wevads/vault/credentials.php';

$KEY    = GROQ_KEY;                  // API key sent as the Bearer token
$MODEL  = 'llama-3.3-70b-versatile'; // model identifier placed in each request
$OUTPUT = '/var/www/weval/wevia-ia/finetune-data/train-generated.jsonl'; // JSONL destination
// Prompt catalogue, keyed by domain name. Each entry carries:
//   'sys'       => the system prompt sent with every question of that domain
//   'questions' => the list of user prompts to submit
$DOMAINS = [
    // SAP S/4HANA consulting
    'sap' => [
        'sys' => 'Tu es consultant SAP S/4HANA certifie. Reponds en francais, detaille, technique.',
        'questions' => [
            'Compare SAP S/4HANA Cloud Private vs Public pour pharma',
            'Modules SAP essentiels pour laboratoire pharmaceutique',
            'Migration ECC vers S4HANA brownfield vs greenfield',
            'SAP Fiori deploiement et customisation',
            'ROI projet SAP pour PME Maroc',
            'SAP QM pour conformite BPF pharmaceutique',
            'SAP PP gestion lots production pharma',
            'SAP EWM tracabilite cold chain pharma',
            'Architecture SAP S4HANA HANA database sizing',
            'Vistex gestion incentives dans ecosysteme SAP',
        ],
    ],

    // Pharmaceutical regulation
    'pharma' => [
        'sys' => 'Tu es expert reglementation pharma Maghreb (loi 17-04 Maroc). Cite articles. Francais.',
        'questions' => [
            'Obligations visite medicale pharmaceutique Maroc loi 17-04',
            'Processus AMM medicament au Maroc',
            'Pharmacovigilance post-commercialisation obligations',
            'BPF bonnes pratiques fabrication pharma Maroc',
            'Tracabilite medicaments systeme marocain',
            'Consentement HCP exigences RGPD et loi 09-08',
            'Publicite medicaments cadre legal Maroc',
            'Generiques enregistrement Maroc procedure',
            'Prix medicaments mecanisme fixation Maroc',
            'Sanctions visite medicale non-conforme',
        ],
    ],

    // Cybersecurity
    'cyber' => [
        'sys' => 'Expert cybersecurite OWASP DevSecOps. Commandes concretes. Francais.',
        'questions' => [
            'Audit OWASP Top 10 application PHP PostgreSQL nginx',
            'Hardening nginx configuration securisee complete',
            'Zero Trust architecture PME 50 postes',
            'CrowdSec deploiement et configuration',
            'Securisation PostgreSQL pg_hba.conf SSL',
            'Container security scanning Docker Trivy',
            'SIEM open-source Wazuh vs Elastic Security',
            'Incident response plan PME template',
            'Secrets management Infisical migration depuis .env',
            'Security headers HTTP checklist complete',
        ],
    ],

    // Digital-transformation strategy
    'strategy' => [
        'sys' => 'Consultant senior transformation digitale Maghreb. Structure: Resume, SWOT, Roadmap, Budget EUR, KPI.',
        'questions' => [
            'Strategie transformation digitale PME 150 employes Maroc 500K EUR',
            'Plan migration cloud souverain groupe pharma Maghreb',
            'Strategie IA generative cabinet conseil Casablanca',
            'Digitalisation supply chain agroalimentaire Maroc',
            'Strategie cybersecurite banque regionale Maghreb',
            'Gouvernance SI groupe multi-filiales Maghreb',
            'ROI et business case transformation digitale methodologie',
            'Plan adoption ERP change management KPI',
            'Strategie souverainete numerique institution publique Maroc',
            'Strategie data-driven assurance Maghreb',
        ],
    ],

    // Software development
    'code' => [
        'sys' => 'Senior developer. Code COMPLET fonctionnel avec imports, types, error handling. Zero placeholder.',
        'questions' => [
            'API REST FastAPI Python CRUD patients SQLAlchemy pagination',
            'Docker-compose production nginx PHP-FPM PostgreSQL Redis',
            'Composant React TypeScript dashboard KPI recharts Tailwind',
            'Script Python ETL CSV vers PostgreSQL nettoyage',
            'Pipeline CI/CD GitHub Actions test build deploy Docker',
            'WebSocket chat Python asyncio',
            'Middleware Express.js authentification JWT refresh token',
            'Script Playwright web scraping avec retry',
            'Script Bash monitoring serveur alertes Telegram',
            'Ansible playbook deploiement stack LAMP securisee',
        ],
    ],
];
/**
 * Send one system + user prompt pair to the Groq chat-completions endpoint.
 *
 * Failures are reported on stdout (like the existing rate-limit notice) so a
 * skipped question can be diagnosed from the batch log.
 *
 * @param string $key      API key, sent as a Bearer token.
 * @param string $model    Model identifier placed in the request body.
 * @param string $system   Content of the system-role message.
 * @param string $question Content of the user-role message.
 *
 * @return string|null Trimmed assistant message on HTTP 200 (an empty string when
 *                     the response carries no textual content); null on any failure.
 *                     On HTTP 429 the function sleeps 30 seconds before returning
 *                     null, so an immediate retry by the caller is already delayed.
 */
function callGroq($key, $model, $system, $question) {
    $ch = curl_init("https://api.groq.com/openai/v1/chat/completions");
    if ($ch === false) {
        echo " cURL init failed\n";
        return null;
    }
    curl_setopt_array($ch, [
        CURLOPT_POST => true,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT => 30,
        CURLOPT_HTTPHEADER => ["Content-Type: application/json", "Authorization: Bearer $key"],
        CURLOPT_POSTFIELDS => json_encode([
            "model" => $model,
            "messages" => [
                ["role" => "system", "content" => $system],
                ["role" => "user", "content" => $question]
            ],
            "max_tokens" => 2048, "temperature" => 0.7
        ])
    ]);
    $r = curl_exec($ch);
    $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    // Read the transport error before the handle is released.
    $transportError = ($r === false) ? curl_error($ch) : "";
    curl_close($ch);

    if ($r === false) {
        // DNS failure, timeout, TLS error, ... — no usable body was received.
        echo " cURL error: $transportError\n";
        return null;
    }
    if ($code === 200) {
        $d = json_decode($r, true);
        $content = is_array($d) ? ($d["choices"][0]["message"]["content"] ?? "") : "";
        // Guard trim() against a non-string content field.
        return is_string($content) ? trim($content) : "";
    }
    if ($code === 429) {
        echo " Rate limited, waiting 30s...\n";
        sleep(30);
        return null;
    }
    // Any other status (401, 5xx, ...) used to be dropped silently.
    echo " HTTP $code from Groq API\n";
    return null;
}
// Main batch loop: ask every question of every domain, and append each
// sufficiently long answer to the output file as one JSON line in the
// {"messages": [system, user, assistant]} shape.
$total = 0;
$fh = fopen($OUTPUT, "w");
if ($fh === false) {
    // Stop before spending any API calls when the destination is not writable.
    echo "ERROR: cannot open $OUTPUT for writing\n";
    exit(1);
}
foreach ($DOMAINS as $domain => $config) {
    $questionCount = count($config["questions"]);
    echo "[$domain] " . $questionCount . " questions...\n";
    foreach ($config["questions"] as $i => $q) {
        $answer = callGroq($KEY, $MODEL, $config["sys"], $q);
        if (!$answer) {
            $answer = callGroq($KEY, $MODEL, $config["sys"], $q); // retry
        }
        // Answers of 200 bytes or fewer are treated as too short to keep.
        if ($answer && strlen($answer) > 200) {
            $line = json_encode([
                "messages" => [
                    ["role" => "system", "content" => $config["sys"]],
                    ["role" => "user", "content" => $q],
                    ["role" => "assistant", "content" => $answer]
                ]
            ], JSON_UNESCAPED_UNICODE);
            if ($line === false) {
                // e.g. malformed UTF-8 in the answer: never emit a blank JSONL line.
                echo " [" . ($i+1) . "] SKIP (JSON encode error: " . json_last_error_msg() . ")\n";
            } elseif (fwrite($fh, $line . "\n") === false) {
                echo " [" . ($i+1) . "] SKIP (write error)\n";
            } else {
                $total++;
                echo " [" . ($i+1) . "/" . $questionCount . "] " . strlen($answer) . "ch OK\n";
            }
        } else {
            echo " [" . ($i+1) . "] SKIP\n";
        }
        usleep(2500000); // 2.5s between calls
    }
}
fclose($fh);
echo "\nDONE: $total examples saved to $OUTPUT\n";