155 lines
6.5 KiB
PHP
155 lines
6.5 KiB
PHP
<?php
|
|
// CLI_ONLY_GUARD — this file is a batch job, not an HTTP endpoint.
// Execution continues only when the script runs under the CLI SAPI or the
// request carries a non-empty `cli_run` query parameter. Any other request
// receives a short JSON notice (HTTP 200) and the script stops here.
if (!(php_sapi_name() === 'cli' || !empty($_GET['cli_run']))) {
    http_response_code(200);
    header('Content-Type: application/json');
    echo json_encode([
        'info' => 'batch job, CLI only',
        'usage' => 'php generate-training.php',
        'estimated_duration' => '5-10min',
    ]);
    exit;
}
|
|
|
|
/**
 * WEVIA training-data generator, backed by the Groq API.
 *
 * Intended invocation (on S204):
 *   php /var/www/weval/wevia-ia/generate-training.php
 */

// Report every error level and allow the batch up to one hour of runtime.
error_reporting(E_ALL);
set_time_limit(3600);

// GROQ_KEY is not defined in this file; presumably it comes from this include.
require_once '/opt/wevads/vault/credentials.php';

// API key, model identifier and JSONL destination used by the rest of the script.
$KEY = GROQ_KEY;
$MODEL = 'llama-3.3-70b-versatile';
$OUTPUT = '/var/www/weval/wevia-ia/finetune-data/train-generated.jsonl';
|
|
|
|
// Prompt catalogue, keyed by domain name. Each entry holds:
//   'sys'       — the system prompt used for every question of that domain
//   'questions' — the list of user prompts to send, one API call per item
$DOMAINS = [
    // SAP S/4HANA consulting
    'sap' => [
        'sys' => 'Tu es consultant SAP S/4HANA certifie. Reponds en francais, detaille, technique.',
        'questions' => [
            'Compare SAP S/4HANA Cloud Private vs Public pour pharma',
            'Modules SAP essentiels pour laboratoire pharmaceutique',
            'Migration ECC vers S4HANA brownfield vs greenfield',
            'SAP Fiori deploiement et customisation',
            'ROI projet SAP pour PME Maroc',
            'SAP QM pour conformite BPF pharmaceutique',
            'SAP PP gestion lots production pharma',
            'SAP EWM tracabilite cold chain pharma',
            'Architecture SAP S4HANA HANA database sizing',
            'Vistex gestion incentives dans ecosysteme SAP',
        ],
    ],

    // Pharmaceutical regulation
    'pharma' => [
        'sys' => 'Tu es expert reglementation pharma Maghreb (loi 17-04 Maroc). Cite articles. Francais.',
        'questions' => [
            'Obligations visite medicale pharmaceutique Maroc loi 17-04',
            'Processus AMM medicament au Maroc',
            'Pharmacovigilance post-commercialisation obligations',
            'BPF bonnes pratiques fabrication pharma Maroc',
            'Tracabilite medicaments systeme marocain',
            'Consentement HCP exigences RGPD et loi 09-08',
            'Publicite medicaments cadre legal Maroc',
            'Generiques enregistrement Maroc procedure',
            'Prix medicaments mecanisme fixation Maroc',
            'Sanctions visite medicale non-conforme',
        ],
    ],

    // Cybersecurity / DevSecOps
    'cyber' => [
        'sys' => 'Expert cybersecurite OWASP DevSecOps. Commandes concretes. Francais.',
        'questions' => [
            'Audit OWASP Top 10 application PHP PostgreSQL nginx',
            'Hardening nginx configuration securisee complete',
            'Zero Trust architecture PME 50 postes',
            'CrowdSec deploiement et configuration',
            'Securisation PostgreSQL pg_hba.conf SSL',
            'Container security scanning Docker Trivy',
            'SIEM open-source Wazuh vs Elastic Security',
            'Incident response plan PME template',
            'Secrets management Infisical migration depuis .env',
            'Security headers HTTP checklist complete',
        ],
    ],

    // Digital-transformation strategy
    'strategy' => [
        'sys' => 'Consultant senior transformation digitale Maghreb. Structure: Resume, SWOT, Roadmap, Budget EUR, KPI.',
        'questions' => [
            'Strategie transformation digitale PME 150 employes Maroc 500K EUR',
            'Plan migration cloud souverain groupe pharma Maghreb',
            'Strategie IA generative cabinet conseil Casablanca',
            'Digitalisation supply chain agroalimentaire Maroc',
            'Strategie cybersecurite banque regionale Maghreb',
            'Gouvernance SI groupe multi-filiales Maghreb',
            'ROI et business case transformation digitale methodologie',
            'Plan adoption ERP change management KPI',
            'Strategie souverainete numerique institution publique Maroc',
            'Strategie data-driven assurance Maghreb',
        ],
    ],

    // Software development
    'code' => [
        'sys' => 'Senior developer. Code COMPLET fonctionnel avec imports, types, error handling. Zero placeholder.',
        'questions' => [
            'API REST FastAPI Python CRUD patients SQLAlchemy pagination',
            'Docker-compose production nginx PHP-FPM PostgreSQL Redis',
            'Composant React TypeScript dashboard KPI recharts Tailwind',
            'Script Python ETL CSV vers PostgreSQL nettoyage',
            'Pipeline CI/CD GitHub Actions test build deploy Docker',
            'WebSocket chat Python asyncio',
            'Middleware Express.js authentification JWT refresh token',
            'Script Playwright web scraping avec retry',
            'Script Bash monitoring serveur alertes Telegram',
            'Ansible playbook deploiement stack LAMP securisee',
        ],
    ],
];
|
|
|
|
/**
 * Send one system + user prompt pair to the Groq chat-completions endpoint.
 *
 * Every failure path prints a one-line diagnostic and returns null, so the
 * caller can tell from the log why an item was skipped.
 *
 * @param string $key      API key, sent as a Bearer token.
 * @param string $model    Model identifier placed in the request body.
 * @param string $system   Content of the "system" message.
 * @param string $question Content of the "user" message.
 *
 * @return string|null Trimmed assistant text on HTTP 200 (an empty string when
 *                     the response carries no textual content); null on any
 *                     failure. On HTTP 429 the function sleeps 30 seconds
 *                     before returning null, so an immediate retry by the
 *                     caller is already spaced out.
 */
function callGroq($key, $model, $system, $question) {
    // Encode first: json_encode() returns false on e.g. invalid UTF-8, and
    // sending `false` as the POST body would only produce an opaque API error.
    $payload = json_encode([
        "model" => $model,
        "messages" => [
            ["role" => "system", "content" => $system],
            ["role" => "user", "content" => $question],
        ],
        "max_tokens" => 2048,
        "temperature" => 0.7,
    ]);
    if ($payload === false) {
        echo " Request encoding failed: " . json_last_error_msg() . "\n";
        return null;
    }

    $ch = curl_init("https://api.groq.com/openai/v1/chat/completions");
    if ($ch === false) {
        echo " cURL init failed\n";
        return null;
    }

    curl_setopt_array($ch, [
        CURLOPT_POST => true,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT => 30,
        CURLOPT_HTTPHEADER => ["Content-Type: application/json", "Authorization: Bearer $key"],
        CURLOPT_POSTFIELDS => $payload,
    ]);

    $r = curl_exec($ch);
    // Capture error text and status before the handle is released.
    $err = curl_error($ch);
    $code = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    // Transport-level failure (DNS, TLS, timeout...): there is no HTTP status to inspect.
    if ($r === false) {
        echo " cURL error: $err\n";
        return null;
    }

    if ($code === 200) {
        $d = json_decode($r, true);
        $content = $d["choices"][0]["message"]["content"] ?? "";
        // Guard trim() against a non-string content field.
        return is_string($content) ? trim($content) : "";
    }

    if ($code === 429) {
        echo " Rate limited, waiting 30s...\n";
        sleep(30);
        return null;
    }

    // Any other status (401 bad key, 5xx, ...) would otherwise be skipped silently.
    echo " HTTP $code from Groq API\n";
    return null;
}
|
|
|
|
// Main generation loop: ask every question of every domain, and append one
// JSONL chat example (system / user / assistant) per usable answer.
$total = 0;

// "w" truncates any previous output. Abort immediately when the file cannot
// be opened instead of spending API calls whose results cannot be stored.
$fh = fopen($OUTPUT, "w");
if ($fh === false) {
    echo "ERROR: cannot open $OUTPUT for writing\n";
    exit(1);
}

foreach ($DOMAINS as $domain => $config) {
    $questionCount = count($config["questions"]);
    echo "[$domain] " . $questionCount . " questions...\n";

    foreach ($config["questions"] as $i => $q) {
        $answer = callGroq($KEY, $MODEL, $config["sys"], $q);
        if (!$answer) $answer = callGroq($KEY, $MODEL, $config["sys"], $q); // retry

        // Answers of 200 bytes or fewer are discarded (strlen() counts bytes).
        $line = false;
        if ($answer && strlen($answer) > 200) {
            $line = json_encode([
                "messages" => [
                    ["role" => "system", "content" => $config["sys"]],
                    ["role" => "user", "content" => $q],
                    ["role" => "assistant", "content" => $answer],
                ],
            ], JSON_UNESCAPED_UNICODE);
        }

        // Count an example only when it was encoded AND written; a failed
        // json_encode() must not leave a blank line in the JSONL file.
        if ($line !== false && fwrite($fh, $line . "\n") !== false) {
            $total++;
            echo " [" . ($i + 1) . "/" . $questionCount . "] " . strlen($answer) . "ch OK\n";
        } else {
            echo " [" . ($i + 1) . "] SKIP\n";
        }

        usleep(2500000); // 2.5s between calls
    }
}

fclose($fh);
echo "\nDONE: $total examples saved to $OUTPUT\n";
|