Files
wevia-brain/modules/rag-engine.php
2026-04-12 23:01:36 +02:00

185 lines
6.3 KiB
PHP
Executable File

<?php
/**
 * WEVIA OPUS — RAG Engine (Retrieval-Augmented Generation).
 *
 * Stores text chunks together with their embeddings in a PostgreSQL table
 * (`kb_embeddings`, pgvector `vector(768)` column) and retrieves the chunks
 * closest to a query by cosine distance. Embeddings are requested from an
 * Ollama-compatible HTTP endpoint (`POST {ollamaUrl}/api/embed`).
 */
class RAGEngine
{
    /** Database connection holding the `kb_embeddings` table. */
    private PDO $pdo;

    /**
     * Model name sent to the embedding endpoint.
     * NOTE(review): the table column is vector(768); presumably this model
     * emits 768-dimensional vectors — confirm before switching models.
     */
    private string $embeddingModel = 'nomic-embed-text';

    /** Base URL of the embedding service, stored without a trailing slash. */
    private string $ollamaUrl;

    /** Default number of rows requested by search(). */
    private int $topK = 5;

    /** Rows whose cosine similarity is below this value are not returned. */
    private float $minSimilarity = 0.3;

    /**
     * @param PDO|null $pdo       Existing connection; when null a local
     *                            PostgreSQL connection is opened.
     * @param string   $ollamaUrl Base URL of the embedding service.
     *
     * @throws PDOException When the fallback connection or the schema setup fails.
     */
    public function __construct(?PDO $pdo = null, string $ollamaUrl = 'http://127.0.0.1:11434')
    {
        // A trailing slash would otherwise produce ".../​/api/embed".
        $this->ollamaUrl = rtrim($ollamaUrl, '/');

        if ($pdo === null) {
            // NOTE(review): hard-coded DSN with the "postgres" role and an empty
            // password — consider moving these to configuration.
            $pdo = new PDO("pgsql:host=127.0.0.1;dbname=wevia_db", "postgres", "");
            $pdo->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
        }

        $this->pdo = $pdo;
        $this->ensureSchema();
    }

    /**
     * Creates the pgvector extension, the `kb_embeddings` table and its
     * IVFFlat cosine index when they do not exist yet.
     */
    private function ensureSchema(): void
    {
        $this->pdo->exec("CREATE EXTENSION IF NOT EXISTS vector");
        $this->pdo->exec("
            CREATE TABLE IF NOT EXISTS kb_embeddings (
                id SERIAL PRIMARY KEY,
                source VARCHAR(255),
                category VARCHAR(100),
                chunk_text TEXT NOT NULL,
                embedding vector(768),
                metadata JSONB DEFAULT '{}',
                created_at TIMESTAMP DEFAULT NOW(),
                access_count INT DEFAULT 0
            )
        ");
        $this->pdo->exec("
            CREATE INDEX IF NOT EXISTS idx_kb_embedding ON kb_embeddings
            USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100)
        ");
    }

    /**
     * Ingests a document: splits it into overlapping chunks (512 words,
     * 64 words of overlap), embeds each chunk and stores it.
     *
     * Chunks for which no embedding could be obtained are skipped.
     *
     * @param string $text     Full document text.
     * @param string $source   Label stored in the `source` column.
     * @param string $category Label stored in the `category` column.
     * @param array  $meta     Arbitrary metadata, stored as JSONB.
     *
     * @return int Number of chunks actually inserted.
     *
     * @throws InvalidArgumentException When $meta cannot be encoded as JSON.
     */
    public function ingest(string $text, string $source, string $category = 'general', array $meta = []): int
    {
        $chunks = $this->chunkText($text, 512, 64);
        if ($chunks === []) {
            return 0;
        }

        // json_encode([]) yields "[]"; store an object so empty metadata
        // matches the column default '{}'.
        $metaJson = $meta === [] ? '{}' : json_encode($meta, JSON_INVALID_UTF8_SUBSTITUTE);
        if ($metaJson === false) {
            throw new InvalidArgumentException('Metadata is not JSON-encodable: ' . json_last_error_msg());
        }

        // Prepared once and reused for every chunk.
        $insert = $this->pdo->prepare("
            INSERT INTO kb_embeddings (source, category, chunk_text, embedding, metadata)
            VALUES (?, ?, ?, ?::vector, ?::jsonb)
        ");

        $inserted = 0;
        foreach ($chunks as $chunk) {
            $embedding = $this->getEmbedding($chunk);
            if (!$embedding) {
                continue;
            }
            $insert->execute([
                $source,
                $category,
                $chunk,
                $this->toVectorLiteral($embedding),
                $metaJson,
            ]);
            $inserted++;
        }

        return $inserted;
    }

    /**
     * Semantic search: returns the stored chunks closest to the query.
     *
     * Rows are ordered by ascending cosine distance; rows whose similarity
     * (1 - distance) is below the configured minimum are dropped. The
     * `access_count` of every returned row is incremented.
     *
     * @param string      $query    Free-text query.
     * @param int|null    $topK     Maximum number of rows fetched; null uses the default.
     * @param string|null $category Restrict the search to one category; null or '' searches all.
     *
     * @return array List of rows with keys id, source, category, chunk_text,
     *               metadata and similarity. Empty when the query could not
     *               be embedded or nothing is similar enough.
     */
    public function search(string $query, ?int $topK = null, ?string $category = null): array
    {
        $limit = $topK ?? $this->topK;
        if ($limit < 1) {
            // A negative LIMIT is a SQL error and zero can never match anything.
            return [];
        }

        $embedding = $this->getEmbedding($query);
        if (!$embedding) {
            return [];
        }
        $vector = $this->toVectorLiteral($embedding);

        $sql = "SELECT id, source, category, chunk_text, metadata,
                       1 - (embedding <=> ?::vector) as similarity
                FROM kb_embeddings
                WHERE 1=1";
        $params = [$vector];

        // Explicit check: a plain truthiness test would ignore the category "0".
        if ($category !== null && $category !== '') {
            $sql .= " AND category = ?";
            $params[] = $category;
        }

        $sql .= " ORDER BY embedding <=> ?::vector LIMIT ?";
        $params[] = $vector;
        $params[] = $limit;

        $stmt = $this->pdo->prepare($sql);
        foreach ($params as $index => $value) {
            // Positional placeholders are 1-based; bind the limit as an integer.
            $stmt->bindValue($index + 1, $value, is_int($value) ? PDO::PARAM_INT : PDO::PARAM_STR);
        }
        $stmt->execute();
        $rows = $stmt->fetchAll(PDO::FETCH_ASSOC);

        $hits = array_values(array_filter(
            $rows,
            fn (array $row): bool => $row['similarity'] !== null
                && (float) $row['similarity'] >= $this->minSimilarity
        ));

        // Only rows actually handed back to the caller count as accessed.
        $this->recordAccess(array_map(static fn (array $row): int => (int) $row['id'], $hits));

        return $hits;
    }

    /**
     * Builds the RAG context block for a query from the best-matching chunks.
     *
     * @param string      $query    Free-text query.
     * @param string|null $category Optional category filter, see search().
     *
     * @return string Formatted context, or '' when nothing relevant was found.
     */
    public function getContext(string $query, ?string $category = null): string
    {
        $results = $this->search($query, $this->topK, $category);
        if ($results === []) {
            return '';
        }

        $context = "--- CONTEXTE KNOWLEDGE BASE ---\n";
        foreach ($results as $r) {
            $sim = round((float) $r['similarity'] * 100);
            $context .= "[{$r['source']}] (pertinence: {$sim}%)\n{$r['chunk_text']}\n\n";
        }
        $context .= "--- FIN CONTEXTE ---\n";

        return $context;
    }

    /**
     * Requests the embedding of a text from the embedding service.
     *
     * @param string $text Text to embed.
     *
     * @return array|null The first embedding vector of the response, or null
     *                    on any transport, HTTP or decoding failure.
     */
    private function getEmbedding(string $text): ?array
    {
        // Substitute invalid UTF-8 instead of letting json_encode() return false.
        $payload = json_encode(
            ['model' => $this->embeddingModel, 'input' => $text],
            JSON_INVALID_UTF8_SUBSTITUTE
        );
        if ($payload === false) {
            return null;
        }

        $ch = curl_init("{$this->ollamaUrl}/api/embed");
        if ($ch === false) {
            return null;
        }
        curl_setopt_array($ch, [
            CURLOPT_POST => true,
            CURLOPT_POSTFIELDS => $payload,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT => 30,
            CURLOPT_HTTPHEADER => ['Content-Type: application/json'],
        ]);

        $raw = curl_exec($ch);
        $status = (int) curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
        // No curl_close(): it has been a no-op since PHP 8.0 and the handle
        // is released when $ch goes out of scope.

        if (!is_string($raw) || $status < 200 || $status >= 300) {
            return null;
        }

        $data = json_decode($raw, true);
        $vector = is_array($data) ? ($data['embeddings'][0] ?? null) : null;

        // Guard the ?array return type against unexpected payload shapes.
        return is_array($vector) && $vector !== [] ? $vector : null;
    }

    /**
     * Splits a text into chunks of at most $chunkSize words, each chunk
     * sharing $overlap words with the previous one.
     *
     * Chunks of 20 characters or fewer are discarded. The last window is the
     * first one that reaches the end of the text, so no chunk consisting only
     * of already-covered overlap words is emitted.
     *
     * @param string $text      Text to split on whitespace.
     * @param int    $chunkSize Maximum number of words per chunk (>= 1).
     * @param int    $overlap   Words shared between consecutive chunks (< $chunkSize).
     *
     * @return array List of chunk strings, words joined by a single space.
     *
     * @throws InvalidArgumentException When the window could never advance.
     */
    private function chunkText(string $text, int $chunkSize = 512, int $overlap = 64): array
    {
        $step = $chunkSize - $overlap;
        if ($chunkSize < 1 || $overlap < 0 || $step < 1) {
            // A step of zero or less would loop forever.
            throw new InvalidArgumentException('chunkSize must be >= 1 and overlap must be in [0, chunkSize).');
        }

        // UTF-8 mode: byte-wise \s can, under some non-C locales, match bytes
        // inside multi-byte characters. NO_EMPTY drops the empty "word"
        // produced by leading or trailing whitespace.
        $words = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY);
        if ($words === false) {
            // Invalid UTF-8 input: fall back to byte-wise splitting.
            $words = preg_split('/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY) ?: [];
        }

        $total = count($words);
        $chunks = [];
        for ($start = 0; $start < $total; $start += $step) {
            $chunk = implode(' ', array_slice($words, $start, $chunkSize));
            if (mb_strlen($chunk) > 20) {
                $chunks[] = $chunk;
            }
            if ($start + $chunkSize >= $total) {
                // This window already reached the last word.
                break;
            }
        }

        return $chunks;
    }

    /**
     * Knowledge-base statistics.
     *
     * @return array ['total_chunks' => int, 'categories' => list of rows with
     *               category, chunks, total_access and sources], ordered by
     *               chunk count descending.
     */
    public function getStats(): array
    {
        $stats = $this->pdo->query("
            SELECT category, COUNT(*) as chunks,
                   SUM(access_count) as total_access,
                   COUNT(DISTINCT source) as sources
            FROM kb_embeddings
            GROUP BY category ORDER BY chunks DESC
        ")->fetchAll(PDO::FETCH_ASSOC);

        $total = (int) $this->pdo->query("SELECT COUNT(*) FROM kb_embeddings")->fetchColumn();

        return ['total_chunks' => $total, 'categories' => $stats];
    }

    /**
     * Formats an embedding as the text literal cast to `vector` in the queries.
     *
     * @param array $embedding List of numeric components.
     */
    private function toVectorLiteral(array $embedding): string
    {
        return '[' . implode(',', $embedding) . ']';
    }

    /**
     * Increments `access_count` for the given row ids in a single statement.
     *
     * @param array $ids List of integer primary keys; nothing happens when empty.
     */
    private function recordAccess(array $ids): void
    {
        if ($ids === []) {
            return;
        }

        $placeholders = implode(',', array_fill(0, count($ids), '?'));
        $update = $this->pdo->prepare(
            "UPDATE kb_embeddings SET access_count = access_count + 1 WHERE id IN ({$placeholders})"
        );
        $update->execute(array_values($ids));
    }
}