<?php
/**
 * ╔══════════════════════════════════════════════════════════╗
 * ║  WEVIA OPUS — RAG Engine (Retrieval-Augmented Gen)       ║
 * ║  pgvector + Ollama Embeddings + Semantic Search          ║
 * ╚══════════════════════════════════════════════════════════╝
 */

/**
 * Retrieval-augmented-generation helper backed by a PostgreSQL table of
 * text chunks and their embeddings.
 *
 * Text is split into overlapping word windows, each window is embedded through
 * the Ollama HTTP API and stored in `kb_embeddings`; queries are embedded the
 * same way and matched with the `<=>` distance operator.
 */
class RAGEngine {
    /** Database handle used for every kb_embeddings read and write. */
    private PDO $pdo;

    /** Model name sent with every embedding request. */
    private string $embeddingModel = 'nomic-embed-text';

    /** Base URL of the Ollama HTTP API, stored without a trailing slash. */
    private string $ollamaUrl;

    /** Number of rows requested from a search when the caller gives no limit. */
    private int $topK = 5;

    /** Rows whose similarity is below this value are removed from search results. */
    private float $minSimilarity = 0.3;

    /**
     * @param PDO|null $pdo       Existing connection to reuse. When null, a local
     *                            connection to the `wevia_db` database is opened.
     * @param string   $ollamaUrl Base URL of the Ollama API.
     *
     * @throws PDOException When the connection or the schema setup fails.
     */
    public function __construct(?PDO $pdo = null, string $ollamaUrl = 'http://127.0.0.1:11434') {
        // Avoid building "host//api/embed" when the caller passes a trailing slash.
        $this->ollamaUrl = rtrim($ollamaUrl, '/');

        if ($pdo) {
            // An injected connection keeps whatever error mode the caller configured.
            $this->pdo = $pdo;
        } else {
            // NOTE(review): hard-coded passwordless `postgres` login — confirm this is intended.
            $this->pdo = new PDO("pgsql:host=127.0.0.1;dbname=wevia_db", "postgres", "");
            $this->pdo->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
        }

        $this->ensureSchema();
    }

    /**
     * Creates the vector extension, the kb_embeddings table and its index if
     * they do not exist yet. Runs on every construction.
     */
    private function ensureSchema(): void {
        $this->pdo->exec("CREATE EXTENSION IF NOT EXISTS vector");

        // vector(768) is presumably the output size of the configured embedding
        // model — confirm before switching $embeddingModel.
        $this->pdo->exec("
            CREATE TABLE IF NOT EXISTS kb_embeddings (
                id SERIAL PRIMARY KEY,
                source VARCHAR(255),
                category VARCHAR(100),
                chunk_text TEXT NOT NULL,
                embedding vector(768),
                metadata JSONB DEFAULT '{}',
                created_at TIMESTAMP DEFAULT NOW(),
                access_count INT DEFAULT 0
            )
        ");

        $this->pdo->exec("
            CREATE INDEX IF NOT EXISTS idx_kb_embedding ON kb_embeddings
            USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100)
        ");
    }

    /**
     * Ingests a document: splits it into overlapping chunks, embeds each chunk
     * and stores it.
     *
     * @param string $text     Full document text.
     * @param string $source   Label stored in the `source` column.
     * @param string $category Label stored in the `category` column.
     * @param array  $meta     Arbitrary metadata stored as JSONB with every chunk.
     *
     * @return int Number of chunks actually stored (chunks without an embedding are skipped).
     *
     * @throws InvalidArgumentException When $meta cannot be encoded as JSON.
     */
    public function ingest(string $text, string $source, string $category = 'general', array $meta = []): int {
        // json_encode([]) yields "[]"; use "{}" so empty metadata matches the column default.
        $metaJson = $meta === [] ? '{}' : json_encode($meta, JSON_INVALID_UTF8_SUBSTITUTE);
        if ($metaJson === false) {
            // Fail before any row is written instead of sending `false` to the database.
            throw new InvalidArgumentException('Metadata cannot be encoded as JSON: ' . json_last_error_msg());
        }

        $insert = null;
        $count = 0;

        foreach ($this->chunkText($text, 512, 64) as $chunk) {
            $embedding = $this->getEmbedding($chunk);
            if (!$embedding) {
                continue;
            }

            // Prepared once, on first use, and reused for every following chunk.
            $insert = $insert ?? $this->pdo->prepare("
                INSERT INTO kb_embeddings (source, category, chunk_text, embedding, metadata)
                VALUES (?, ?, ?, ?::vector, ?::jsonb)
            ");
            $insert->execute([
                $source,
                $category,
                $chunk,
                $this->toVectorLiteral($embedding),
                $metaJson,
            ]);
            $count++;
        }

        return $count;
    }

    /**
     * Semantic search: returns the stored chunks closest to the query.
     *
     * @param string      $query    Free-text query to embed.
     * @param int|null    $topK     Maximum number of rows to fetch; defaults to $this->topK.
     * @param string|null $category Restrict the search to one category; null or '' means all.
     *
     * @return array List of rows (id, source, category, chunk_text, metadata, similarity)
     *               ordered by increasing distance, limited to rows whose similarity
     *               reaches $this->minSimilarity. Empty when the query cannot be embedded.
     */
    public function search(string $query, ?int $topK = null, ?string $category = null): array {
        $limit = $topK ?? $this->topK;
        if ($limit < 1) {
            return [];
        }

        $embedding = $this->getEmbedding($query);
        if (!$embedding) {
            return [];
        }

        $vecStr = $this->toVectorLiteral($embedding);

        $sql = "SELECT id, source, category, chunk_text, metadata,
                       1 - (embedding <=> ?::vector) as similarity
                FROM kb_embeddings
                WHERE 1=1";
        $params = [$vecStr];

        // Explicit comparison so that a category literally named '0' still filters.
        if ($category !== null && $category !== '') {
            $sql .= " AND category = ?";
            $params[] = $category;
        }

        $sql .= " ORDER BY embedding <=> ?::vector LIMIT ?";
        $params[] = $vecStr;
        $params[] = $limit;

        $stmt = $this->pdo->prepare($sql);
        foreach ($params as $position => $value) {
            // Positional placeholders are 1-based; bind LIMIT as an integer, not a string.
            $stmt->bindValue($position + 1, $value, is_int($value) ? PDO::PARAM_INT : PDO::PARAM_STR);
        }
        $stmt->execute();
        $rows = $stmt->fetchAll(PDO::FETCH_ASSOC);

        // array_filter keeps the original keys; re-index so callers get a plain list.
        $threshold = $this->minSimilarity;
        $hits = array_values(array_filter(
            $rows,
            static fn(array $row): bool => (float) $row['similarity'] >= $threshold
        ));

        // Count an access only for rows that are really handed back, in one statement.
        if ($hits !== []) {
            $ids = array_map(static fn(array $row): int => (int) $row['id'], $hits);
            $placeholders = implode(',', array_fill(0, count($ids), '?'));
            $update = $this->pdo->prepare(
                "UPDATE kb_embeddings SET access_count = access_count + 1 WHERE id IN ($placeholders)"
            );
            $update->execute($ids);
        }

        return $hits;
    }

    /**
     * Builds the RAG context block for a query.
     *
     * @param string      $query    Free-text query.
     * @param string|null $category Optional category filter passed to search().
     *
     * @return string Formatted context, or '' when nothing relevant was found.
     */
    public function getContext(string $query, ?string $category = null): string {
        $results = $this->search($query, $this->topK, $category);

        if (empty($results)) {
            return '';
        }

        $context = "--- CONTEXTE KNOWLEDGE BASE ---\n";
        foreach ($results as $r) {
            // Similarity shown as a whole-number percentage.
            $sim = round((float) $r['similarity'] * 100);
            $context .= "[{$r['source']}] (pertinence: {$sim}%)\n{$r['chunk_text']}\n\n";
        }
        $context .= "--- FIN CONTEXTE ---\n";

        return $context;
    }

    /**
     * Requests the embedding of a text from Ollama.
     *
     * @param string $text Text to embed.
     *
     * @return array|null List of numeric components, or null when the request
     *                    fails or the response holds no usable vector.
     */
    private function getEmbedding(string $text): ?array {
        // Substitute invalid UTF-8 instead of letting json_encode() fail on it.
        $payload = json_encode(
            ['model' => $this->embeddingModel, 'input' => $text],
            JSON_INVALID_UTF8_SUBSTITUTE
        );
        if ($payload === false) {
            return null;
        }

        $ch = curl_init("{$this->ollamaUrl}/api/embed");
        if ($ch === false) {
            return null;
        }

        curl_setopt_array($ch, [
            CURLOPT_POST => true,
            CURLOPT_POSTFIELDS => $payload,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT => 30,
            CURLOPT_HTTPHEADER => ['Content-Type: application/json']
        ]);
        $raw = curl_exec($ch);
        curl_close($ch);

        // curl_exec() returns false on transport errors.
        if (!is_string($raw)) {
            return null;
        }

        $data = json_decode($raw, true);
        $vector = is_array($data) ? ($data['embeddings'][0] ?? null) : null;
        if (!is_array($vector) || $vector === []) {
            return null;
        }

        // The vector is later joined into a SQL vector literal; accept numbers only.
        foreach ($vector as $component) {
            if (!is_int($component) && !is_float($component)) {
                return null;
            }
        }

        return array_values($vector);
    }

    /**
     * Formats an embedding as the bracketed, comma-separated text that the
     * queries cast with `::vector`.
     *
     * @param array $embedding Numeric components.
     */
    private function toVectorLiteral(array $embedding): string {
        return '[' . implode(',', $embedding) . ']';
    }

    /**
     * Splits a text into chunks of whitespace-separated words, consecutive
     * chunks sharing $overlap words.
     *
     * @param string $text      Text to split.
     * @param int    $chunkSize Maximum number of words per chunk (>= 1).
     * @param int    $overlap   Words shared with the previous chunk (0 <= $overlap < $chunkSize).
     *
     * @return array List of chunk strings; chunks of 20 characters or fewer are dropped.
     *
     * @throws InvalidArgumentException When the sizes would make the window stall or move backwards.
     */
    private function chunkText(string $text, int $chunkSize = 512, int $overlap = 64): array {
        if ($chunkSize < 1 || $overlap < 0 || $overlap >= $chunkSize) {
            // A step of zero or less would never reach the end of the text.
            throw new InvalidArgumentException('chunkSize must be >= 1 and overlap must be in [0, chunkSize).');
        }

        // NO_EMPTY drops the empty tokens produced by leading/trailing whitespace.
        $words = preg_split('/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY);
        if ($words === false) {
            return [];
        }

        $chunks = [];
        $total = count($words);
        $step = $chunkSize - $overlap;

        for ($i = 0; $i < $total; $i += $step) {
            $chunk = implode(' ', array_slice($words, $i, $chunkSize));
            if (mb_strlen($chunk) > 20) {
                $chunks[] = $chunk;
            }

            // This window already reached the last word: any further window
            // would only repeat text that is part of this chunk.
            if ($i + $chunkSize >= $total) {
                break;
            }
        }

        return $chunks;
    }

    /**
     * Knowledge-base statistics.
     *
     * @return array ['total_chunks' => int, 'categories' => rows of
     *               (category, chunks, total_access, sources) ordered by chunk count].
     */
    public function getStats(): array {
        $stats = $this->pdo->query("
            SELECT category, COUNT(*) as chunks,
                   SUM(access_count) as total_access,
                   COUNT(DISTINCT source) as sources
            FROM kb_embeddings
            GROUP BY category ORDER BY chunks DESC
        ")->fetchAll(PDO::FETCH_ASSOC);

        $total = (int) $this->pdo->query("SELECT COUNT(*) FROM kb_embeddings")->fetchColumn();

        return ['total_chunks' => $total, 'categories' => $stats];
    }
}