Files
wevia-brain/knowledge/data-science-kb.json
2026-04-12 23:01:36 +02:00

77 lines
3.6 KiB
JSON
Executable File

{
"python_data_stack": {
"pandas": {
"desc": "Manipulation et analyse de données tabulaires",
"key_ops": ["read_csv", "groupby", "merge", "pivot_table", "apply", "rolling"],
"best_practices": ["Vectorized operations over loops", "Use dtypes wisely", "Chain operations"]
},
"numpy": {
"desc": "Calcul numérique performant",
"key_ops": ["array operations", "broadcasting", "linear algebra", "FFT", "random"]
},
"scikit_learn": {
"desc": "Machine learning classique",
"algorithms": {
"classification": ["RandomForest", "XGBoost", "SVM", "LogisticRegression", "KNN"],
"regression": ["LinearRegression", "Ridge", "Lasso", "GradientBoosting", "SVR"],
"clustering": ["KMeans", "DBSCAN", "Hierarchical", "GaussianMixture"],
"dimensionality": ["PCA", "t-SNE", "UMAP", "LDA"]
},
"pipeline": ["preprocessing", "feature_selection", "model_selection", "cross_validation", "hyperparameter_tuning"]
},
"visualization": {
"matplotlib": "Graphiques de base, publication-quality",
"seaborn": "Statistical data visualization",
"plotly": "Interactive charts, dashboards",
"altair": "Declarative statistical visualization"
}
},
"ml_ops": {
"experiment_tracking": ["MLflow", "Weights & Biases", "Neptune"],
"model_serving": ["FastAPI", "TensorRT", "Triton", "vLLM", "Ollama"],
"data_pipeline": ["Apache Airflow", "Prefect", "Dagster", "dbt"],
"feature_store": ["Feast", "Tecton", "Hopsworks"],
"monitoring": ["Evidently AI", "Prometheus + Grafana", "WhyLabs"]
},
"llm_engineering": {
"prompt_engineering": {
"techniques": ["Zero-shot", "Few-shot", "Chain-of-Thought", "Tree-of-Thoughts", "ReAct", "Self-Consistency"],
"best_practices": [
"Be specific and detailed in instructions",
"Use delimiters for structured input",
"Specify output format explicitly",
"Break complex tasks into steps",
"Use system prompts for persona and constraints"
]
},
"rag": {
"components": ["Document Loading", "Chunking", "Embedding", "Vector Store", "Retrieval", "Generation"],
"chunking_strategies": ["Fixed-size", "Recursive", "Semantic", "Document-based"],
"embedding_models": ["nomic-embed-text", "BGE", "GTE", "E5", "OpenAI ada-002"],
"vector_stores": ["pgvector", "Chroma", "Weaviate", "Pinecone", "Milvus"],
"retrieval_strategies": ["Similarity search", "MMR", "Hybrid (BM25+vector)", "Re-ranking"]
},
"fine_tuning": {
"methods": ["LoRA", "QLoRA", "Full fine-tuning", "RLHF", "DPO"],
"tools": ["Hugging Face Transformers", "Axolotl", "LLaMA-Factory", "Unsloth"],
"data_prep": ["Instruction format", "Conversation format", "Preference pairs"]
},
"inference_optimization": {
"quantization": ["GGUF (llama.cpp)", "GPTQ", "AWQ", "bitsandbytes 4-bit"],
"serving": ["vLLM (PagedAttention)", "Ollama", "TGI", "llama.cpp server"],
"techniques": ["KV-cache", "Speculative decoding", "Continuous batching", "Flash Attention"]
}
},
"sql_advanced": {
"postgresql": {
"features": ["CTE (WITH)", "Window functions", "LATERAL joins", "JSONB", "Full-text search", "pgvector"],
"optimization": ["EXPLAIN ANALYZE", "Index types (B-tree, GIN, GiST, BRIN)", "Partitioning", "Materialized views"],
"extensions": ["pgvector", "PostGIS", "pg_trgm", "dblink", "pg_stat_statements"]
},
"patterns": {
"analytics": ["Running totals", "Moving averages", "Percentile ranks", "Gap analysis", "Cohort analysis"],
"etl": ["COPY command", "Foreign Data Wrappers", "Logical replication", "pg_dump strategies"]
}
}
}