{
  "current_state": {
    "sovereign_ratio": "86%",
    "cloud_providers": [
      "Cerebras",
      "Groq",
      "NVIDIA",
      "Gemini",
      "SambaNova",
      "Mistral",
      "HF",
      "OpenRouter"
    ],
    "sovereign_providers": [
      "Ollama weval-brain-v3",
      "Ollama qwen3:4b",
      "Ollama nomic-embed",
      "Ollama all-minilm",
      "SearXNG",
      "Qdrant"
    ]
  },
  "target": "95% sovereign after fine-tune v4",
  "reduction_plan": {
    "phase1_now": [
      "Ollama weval-brain-v3 as fallback #1 (not cloud)",
      "SearXNG for all web search",
      "Qdrant RAG for internal knowledge",
      "Nomic-embed for all embeddings"
    ],
    "phase2_after_finetune": [
      "weval-brain-v4 (fine-tuned on 4816 samples) = PRIMARY for all WEVAL questions",
      "Cloud = fallback ONLY on local timeout",
      "AirLLM to load Qwen3-72B offline (benchmark, evaluation)"
    ],
    "phase3_full_sovereign": [
      "HuggingFace Space (free T4, 2h/day) for the public inference API",
      "Kaggle GPU (30h/week) for continuous training",
      "SGLang replaces Ollama (speculative decoding, prefix cache, 3x faster)",
      "TurboQuant once available in llama.cpp (6x RAM reduction)"
    ]
  },
  "airllm_role": [
    "Use case: load Qwen3-72B on S204 with 32 GB RAM (CPU)",
    "Latency: ~60 s/query (batch only, not real-time)",
    "Useful for: benchmarks, evaluation, fine-tune data generation",
    "NOT for: real-time chat (too slow)",
    "Alternative: SGLang with speculative decoding = 5-10 s"
  ]
}
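
A minimal sketch of the local-first routing described in phase1/phase2: ask Ollama (weval-brain-v3) first, and reach for the cloud only when the local call times out or fails. The endpoint and payload follow Ollama's standard /api/chat API; `call_cloud()` and the 30 s budget are hypothetical stand-ins, not part of the plan above.

```python
import requests

OLLAMA_URL = "http://localhost:11434/api/chat"
LOCAL_TIMEOUT_S = 30  # assumed budget before declaring a local timeout

def call_cloud(prompt: str) -> str:
    # Hypothetical stub: route to whichever cloud provider is configured
    # (Groq, Gemini, OpenRouter, ...). Used ONLY when the local path fails.
    raise NotImplementedError("wire up the configured cloud fallback here")

def ask_local(prompt: str, model: str = "weval-brain-v3") -> str:
    # Sovereign path: one-shot (non-streaming) chat against local Ollama.
    resp = requests.post(
        OLLAMA_URL,
        json={
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
            "stream": False,
        },
        timeout=LOCAL_TIMEOUT_S,
    )
    resp.raise_for_status()
    return resp.json()["message"]["content"]

def ask(prompt: str) -> str:
    try:
        return ask_local(prompt)       # local model first, never cloud-first
    except requests.RequestException:  # includes Timeout
        return call_cloud(prompt)      # cloud strictly as fallback
```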
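
A sketch of the phase1 RAG path (nomic-embed for embeddings, Qdrant for internal knowledge), assuming Ollama's standard /api/embeddings endpoint, the `nomic-embed-text` model tag, a local Qdrant on port 6333, and a hypothetical collection name `weval_knowledge`.

```python
import requests
from qdrant_client import QdrantClient

def embed(text: str) -> list[float]:
    # All embeddings go through local nomic-embed, per phase1.
    resp = requests.post(
        "http://localhost:11434/api/embeddings",
        json={"model": "nomic-embed-text", "prompt": text},
        timeout=30,
    )
    resp.raise_for_status()
    return resp.json()["embedding"]

client = QdrantClient(url="http://localhost:6333")

def rag_search(query: str, limit: int = 5):
    # Vector search over the internal knowledge base in Qdrant.
    hits = client.search(
        collection_name="weval_knowledge",  # hypothetical collection name
        query_vector=embed(query),
        limit=limit,
    )
    return [(hit.score, hit.payload) for hit in hits]
```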
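
For the airllm_role entry, a batch-only sketch following the `AutoModel` pattern from AirLLM's README, with the `.cuda()` call dropped for CPU use on S204. The exact Hugging Face repo id for Qwen3-72B is an assumption to verify before running.

```python
from airllm import AutoModel

MODEL_ID = "Qwen/Qwen3-72B"  # assumed repo id; verify before running

model = AutoModel.from_pretrained(MODEL_ID)

def batch_answer(prompt: str, max_new_tokens: int = 64) -> str:
    # Tokenize on CPU; AirLLM streams layers from disk, so 32 GB RAM suffices.
    tokens = model.tokenizer(
        [prompt], return_tensors="pt", truncation=True, max_length=512
    )
    out = model.generate(
        tokens["input_ids"],
        max_new_tokens=max_new_tokens,
        use_cache=True,
        return_dict_in_generate=True,
    )
    # Expect ~60 s/query on CPU: benchmarks and data generation only.
    return model.tokenizer.decode(out.sequences[0])
```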