Files
weval-brain/sovereign-reduction-plan.json
2026-04-16 16:09:59 +02:00

50 lines
1.8 KiB
JSON

{
  "current_state": {
    "sovereign_ratio": "86%",
    "cloud_providers": [
      "Cerebras",
      "Groq",
      "NVIDIA",
      "Gemini",
      "SambaNova",
      "Mistral",
      "HF",
      "OpenRouter"
    ],
    "sovereign_providers": [
      "Ollama weval-brain-v3",
      "Ollama qwen3:4b",
      "Ollama nomic-embed",
      "Ollama all-minilm",
      "SearXNG",
      "Qdrant"
    ]
  },
  "target": "95% souverain après fine-tune v4",
  "reduction_plan": {
    "phase1_now": [
      "Ollama weval-brain-v3 en fallback #1 (pas cloud)",
      "SearXNG pour toute recherche web",
      "Qdrant RAG pour knowledge interne",
      "Nomic-embed pour tous les embeddings"
    ],
    "phase2_after_finetune": [
      "weval-brain-v4 (fine-tuned 4816 samples) = PRIMARY pour toutes les questions WEVAL",
      "Cloud = fallback UNIQUEMENT si local timeout",
      "AirLLM pour charger Qwen3-72B offline (benchmark, evaluation)"
    ],
    "phase3_full_sovereign": [
      "HuggingFace Space (T4 gratuit 2h/jour) pour inference API publique",
      "Kaggle GPU (30h/semaine) pour training continu",
      "SGLang remplace Ollama (speculative decoding, prefix cache, 3x faster)",
      "TurboQuant quand dispo dans llama.cpp (6x RAM reduction)"
    ]
  },
  "airllm_role": [
    "Use case: charger Qwen3-72B sur S204 32GB RAM (CPU)",
    "Latence: ~60s/query (batch only, pas temps réel)",
    "Utile pour: benchmark, evaluation, generation données fine-tune",
    "PAS pour: chat temps réel (trop lent)",
    "Alternative: SGLang avec speculative decoding = 5-10s"
  ]
}