Files
wevia-brain/s89-ai-apis/kb-ingestor.php
2026-04-12 23:01:36 +02:00

484 lines
20 KiB
PHP
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
/**
* WEVIA KB DOCUMENT INGESTOR v1.0
*
* Parse multi-format documents (PDF, PPTX, XLSX, DOCX, MD, TXT, images)
* Extract text content and store in knowledge_base + kb_documents
*
* USAGE:
* ?action=ingest&file=/path/to/file.pdf Ingest single file
* ?action=ingest&dir=/opt/wevads/storage/kb/ Ingest all files in dir
* ?action=scan List parseable files
* ?action=stats KB stats
* ?action=fix-encoding Fix UTF-8 corruption in all KB tables
* ?action=consolidate Merge fragmented KB tables into knowledge_base
*
* Supports: PDF, PPTX, XLSX, DOCX, MD, TXT, CSV, HTML, PNG/JPG/TIFF (OCR)
* Zero external dependency: uses pdftotext, tesseract, python3 (all local)
*/
header('Content-Type: application/json; charset=utf-8');
set_time_limit(300);
$pdo = new PDO('pgsql:host=localhost;dbname=adx_system', 'admin', 'admin123');
$pdo->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
$pdo->exec("SET search_path TO admin,affiliate,public");
$pdo->exec("SET client_encoding TO 'UTF8'");
$action = $_GET['action'] ?? $_POST['action'] ?? 'stats';
$file = $_GET['file'] ?? $_POST['file'] ?? null;
$dir = $_GET['dir'] ?? $_POST['dir'] ?? null;
$STORAGE = '/opt/wevads/storage/kb/';
$UPLOAD = '/opt/wevads/storage/uploads/';
// ============================================================
// UTF-8 SANITIZER — Prevents encoding corruption
// ============================================================
function sanitizeUTF8(string $text): string {
// Fix double-encoded UTF-8 (Latin-1 stored as UTF-8)
$fixed = @iconv('UTF-8', 'UTF-8//IGNORE', $text);
if ($fixed === false) {
// Try converting from common encodings
foreach (['Windows-1252', 'ISO-8859-1', 'ISO-8859-15'] as $enc) {
$try = @iconv($enc, 'UTF-8//TRANSLIT', $text);
if ($try !== false) return $try;
}
return mb_convert_encoding($text, 'UTF-8', 'UTF-8,ISO-8859-1,Windows-1252');
}
// Fix common mojibake patterns (é → é, è → è, etc.)
$mojibake = [
'é' => 'é', 'è' => 'è', 'ê' => 'ê', 'ë' => 'ë',
'à ' => 'à', 'â' => 'â', 'ç' => 'ç', 'ô' => 'ô',
'ù' => 'ù', 'û' => 'û', 'ï' => 'ï', 'î' => 'î',
'ü' => 'ü', 'ö' => 'ö', 'ä' => 'ä', 'ñ' => 'ñ',
'É' => 'É', 'È' => 'È', 'À' => 'À', 'Ç' => 'Ç',
'Ã"' => 'Ô', 'Û' => 'Û', 'Ã' => 'Í', '’' => "'",
'“' => '"', 'â€' => '"', 'â€"' => '—', 'â€"' => '',
'…' => '…', '°' => '°', '«' => '«', '»' => '»',
'€' => '€', '£' => '£',
];
$fixed = str_replace(array_keys($mojibake), array_values($mojibake), $fixed);
// Remove null bytes and other control chars (keep newlines/tabs)
$fixed = preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/', '', $fixed);
return trim($fixed);
}
// ============================================================
// DOCUMENT PARSERS — Zero external dependency
// ============================================================
function parsePDF(string $path): array {
$output = [];
exec("pdftotext -layout " . escapeshellarg($path) . " - 2>/dev/null", $output, $rc);
$text = implode("\n", $output);
if (empty(trim($text))) {
// Fallback: OCR scanned PDF
$tmpDir = tempnam('/tmp', 'pdf_');
unlink($tmpDir);
mkdir($tmpDir);
exec("pdftoppm -png -r 300 " . escapeshellarg($path) . " $tmpDir/page 2>/dev/null");
$pages = glob("$tmpDir/page-*.png");
$text = '';
foreach ($pages as $page) {
exec("tesseract " . escapeshellarg($page) . " stdout -l fra+eng 2>/dev/null", $ocrOut);
$text .= implode("\n", $ocrOut) . "\n\n";
$ocrOut = [];
}
exec("rm -rf " . escapeshellarg($tmpDir));
}
// Get page count
exec("pdfinfo " . escapeshellarg($path) . " 2>/dev/null | grep Pages", $info);
$pages = 0;
if (!empty($info)) {
preg_match('/(\d+)/', $info[0], $m);
$pages = (int)($m[1] ?? 0);
}
return ['text' => sanitizeUTF8($text), 'pages' => $pages, 'method' => empty($output) ? 'ocr' : 'text'];
}
function parsePPTX(string $path): array {
$script = <<<'PY'
import sys
from pptx import Presentation
prs = Presentation(sys.argv[1])
texts = []
for i, slide in enumerate(prs.slides, 1):
slide_text = []
for shape in slide.shapes:
if hasattr(shape, "text") and shape.text.strip():
slide_text.append(shape.text.strip())
if shape.has_table:
for row in shape.table.rows:
row_text = [cell.text.strip() for cell in row.cells if cell.text.strip()]
if row_text:
slide_text.append(" | ".join(row_text))
if slide_text:
texts.append(f"--- Slide {i} ---\n" + "\n".join(slide_text))
print("\n\n".join(texts))
PY;
$tmpPy = tempnam('/tmp', 'pptx_') . '.py';
file_put_contents($tmpPy, $script);
$output = shell_exec("python3 $tmpPy " . escapeshellarg($path) . " 2>/dev/null");
unlink($tmpPy);
$slides = substr_count($output ?? '', '--- Slide');
return ['text' => sanitizeUTF8($output ?? ''), 'slides' => $slides];
}
function parseXLSX(string $path): array {
$script = <<<'PY'
import sys, openpyxl
wb = openpyxl.load_workbook(sys.argv[1], data_only=True)
for sheet in wb.sheetnames:
ws = wb[sheet]
print(f"=== {sheet} ===")
for row in ws.iter_rows(values_only=True):
cells = [str(c).strip() if c is not None else '' for c in row]
if any(cells):
print(" | ".join(cells))
print()
PY;
$tmpPy = tempnam('/tmp', 'xlsx_') . '.py';
file_put_contents($tmpPy, $script);
$output = shell_exec("python3 $tmpPy " . escapeshellarg($path) . " 2>/dev/null");
unlink($tmpPy);
$sheets = substr_count($output ?? '', '===');
return ['text' => sanitizeUTF8($output ?? ''), 'sheets' => $sheets / 2];
}
function parseDOCX(string $path): array {
$script = <<<'PY'
import sys
from docx import Document
doc = Document(sys.argv[1])
texts = []
for para in doc.paragraphs:
if para.text.strip():
texts.append(para.text.strip())
for table in doc.tables:
for row in table.rows:
cells = [cell.text.strip() for cell in row.cells if cell.text.strip()]
if cells:
texts.append(" | ".join(cells))
print("\n".join(texts))
PY;
$tmpPy = tempnam('/tmp', 'docx_') . '.py';
file_put_contents($tmpPy, $script);
$output = shell_exec("python3 $tmpPy " . escapeshellarg($path) . " 2>/dev/null");
unlink($tmpPy);
return ['text' => sanitizeUTF8($output ?? '')];
}
function parseImage(string $path): array {
$output = [];
exec("tesseract " . escapeshellarg($path) . " stdout -l fra+eng 2>/dev/null", $output, $rc);
return ['text' => sanitizeUTF8(implode("\n", $output)), 'method' => 'ocr'];
}
function parseText(string $path): array {
$raw = file_get_contents($path);
return ['text' => sanitizeUTF8($raw)];
}
function parseCSV(string $path): array {
$lines = [];
if (($handle = fopen($path, 'r')) !== false) {
while (($data = fgetcsv($handle)) !== false) {
$lines[] = implode(' | ', array_filter($data, fn($v) => $v !== null && $v !== ''));
}
fclose($handle);
}
return ['text' => sanitizeUTF8(implode("\n", $lines)), 'rows' => count($lines)];
}
// ============================================================
// ROUTER
// ============================================================
function getParser(string $ext): ?string {
$map = [
'pdf' => 'parsePDF', 'pptx' => 'parsePPTX', 'ppt' => 'parsePPTX',
'xlsx' => 'parseXLSX', 'xls' => 'parseXLSX', 'csv' => 'parseCSV',
'docx' => 'parseDOCX', 'doc' => 'parseDOCX',
'md' => 'parseText', 'txt' => 'parseText', 'html' => 'parseText',
'htm' => 'parseText', 'json' => 'parseText', 'xml' => 'parseText',
'png' => 'parseImage', 'jpg' => 'parseImage', 'jpeg' => 'parseImage',
'tiff' => 'parseImage', 'tif' => 'parseImage', 'bmp' => 'parseImage',
'webp' => 'parseImage',
];
return $map[strtolower($ext)] ?? null;
}
function ingestFile(PDO $pdo, string $path): array {
if (!file_exists($path)) return ['error' => 'File not found: ' . $path];
$ext = pathinfo($path, PATHINFO_EXTENSION);
$parser = getParser($ext);
if (!$parser) return ['error' => "Unsupported format: .$ext"];
$filename = basename($path);
$title = pathinfo($filename, PATHINFO_FILENAME);
$title = str_replace(['_', '-'], ' ', $title);
$title = ucfirst(trim(preg_replace('/\s+/', ' ', $title)));
$size = filesize($path);
// Parse
$start = microtime(true);
$result = $parser($path);
$parseTime = round((microtime(true) - $start) * 1000);
$text = $result['text'] ?? '';
if (empty(trim($text))) return ['error' => "No text extracted from $filename", 'parser' => $parser];
// Determine category from path or content
$category = detectCategory($path, $text);
// Store in knowledge_base
$stmt = $pdo->prepare("INSERT INTO admin.knowledge_base (title, category, content, source, created_at, updated_at)
VALUES (:title, :cat, :content, :source, NOW(), NOW()) RETURNING id");
$stmt->execute([
':title' => $title,
':cat' => $category,
':content' => $text,
':source' => 'ingestor:' . $filename,
]);
$kbId = $stmt->fetchColumn();
// Store in kb_documents
$code = 'KB_' . str_pad($kbId, 4, '0', STR_PAD_LEFT);
$stmt = $pdo->prepare("INSERT INTO admin.kb_documents (code, title, category, file_path, is_active, created_at)
VALUES (:code, :title, :cat, :path, true, NOW()) ON CONFLICT DO NOTHING");
$stmt->execute([
':code' => $code,
':title' => $title,
':cat' => $category,
':path' => $path,
]);
return [
'status' => 'ingested',
'kb_id' => $kbId,
'code' => $code,
'title' => $title,
'category' => $category,
'format' => $ext,
'file_size' => $size,
'text_length'=> strlen($text),
'text_words' => str_word_count($text),
'parse_ms' => $parseTime,
'extra' => array_diff_key($result, ['text' => 1]),
];
}
function detectCategory(string $path, string $text): string {
$pathLower = strtolower($path);
$textLower = strtolower(substr($text, 0, 2000));
$categories = [
'Intelligence Artificielle' => ['ia', 'intelligence artificielle', 'machine learning', 'deep learning', 'neural', 'llm', 'gpt'],
'Cybersécurité' => ['cyber', 'sécurité', 'firewall', 'owasp', 'vulnerability', 'pentest'],
'Email Marketing' => ['email', 'deliverability', 'warmup', 'inbox', 'spam', 'dkim', 'spf', 'dmarc', 'isp'],
'Blockchain Web3' => ['blockchain', 'web3', 'crypto', 'nft', 'ethereum', 'smart contract'],
'Supply Chain' => ['supply chain', 'logistique', 'procurement', 'inventory'],
'Coding DevOps' => ['docker', 'kubernetes', 'ci/cd', 'devops', 'git', 'deployment'],
'Droit' => ['droit', 'juridique', 'rgpd', 'compliance', 'contrat'],
'IoT' => ['iot', 'capteur', 'sensor', 'industry 4.0', 'embedded'],
'IT Consulting' => ['consulting', 'transformation', 'digital', 'stratégie'],
'Infrastructure' => ['serveur', 'server', 'nginx', 'apache', 'postgresql', 'linux'],
'WEVADS' => ['wevads', 'arsenal', 'brain', 'pipeline', 'adx'],
];
foreach ($categories as $cat => $keywords) {
foreach ($keywords as $kw) {
if (strpos($pathLower, $kw) !== false || strpos($textLower, $kw) !== false) {
return $cat;
}
}
}
return 'General';
}
// ============================================================
// ACTIONS
// ============================================================
switch ($action) {
case 'ingest':
if ($file) {
echo json_encode(ingestFile($pdo, $file), JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE);
} elseif ($dir) {
if (!is_dir($dir)) die(json_encode(['error' => 'Directory not found']));
$results = [];
$exts = ['pdf','pptx','ppt','xlsx','xls','csv','docx','doc','md','txt','html','png','jpg','jpeg','tiff'];
foreach (scandir($dir) as $f) {
if ($f[0] === '.') continue;
$ext = strtolower(pathinfo($f, PATHINFO_EXTENSION));
if (in_array($ext, $exts)) {
$results[] = ingestFile($pdo, rtrim($dir, '/') . '/' . $f);
}
}
$ok = count(array_filter($results, fn($r) => ($r['status'] ?? '') === 'ingested'));
echo json_encode([
'total' => count($results),
'ingested' => $ok,
'errors' => count($results) - $ok,
'results' => $results
], JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE);
} else {
echo json_encode(['error' => 'Specify ?file=/path or ?dir=/path']);
}
break;
case 'scan':
$scanDirs = [$STORAGE, $UPLOAD, '/opt/wevads/hamid-files/'];
$files = [];
foreach ($scanDirs as $d) {
if (!is_dir($d)) continue;
foreach (scandir($d) as $f) {
if ($f[0] === '.') continue;
$ext = strtolower(pathinfo($f, PATHINFO_EXTENSION));
$parser = getParser($ext);
if ($parser) {
$files[] = [
'path' => $d . $f,
'name' => $f,
'ext' => $ext,
'size' => filesize($d . $f),
'parser' => $parser,
];
}
}
}
echo json_encode(['parseable_files' => count($files), 'files' => $files], JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE);
break;
case 'fix-encoding':
$tables = [
'knowledge_base' => 'content',
'hamid_knowledge' => 'content',
'sentinel_knowledge' => 'content',
'isp_knowledge' => 'content',
'chatbot_knowledge' => 'answer',
];
$fixed = [];
foreach ($tables as $table => $col) {
try {
$stmt = $pdo->query("SELECT id, $col FROM admin.$table WHERE $col IS NOT NULL AND $col != ''");
$count = 0;
while ($row = $stmt->fetch(PDO::FETCH_ASSOC)) {
$original = $row[$col];
$cleaned = sanitizeUTF8($original);
if ($cleaned !== $original) {
$upd = $pdo->prepare("UPDATE admin.$table SET $col = :val WHERE id = :id");
$upd->execute([':val' => $cleaned, ':id' => $row['id']]);
$count++;
}
}
$fixed[$table] = $count;
} catch (Exception $e) {
$fixed[$table] = 'error: ' . $e->getMessage();
}
}
// Also fix titles in knowledge_base
try {
$stmt = $pdo->query("SELECT id, title FROM admin.knowledge_base WHERE title IS NOT NULL");
$tc = 0;
while ($row = $stmt->fetch(PDO::FETCH_ASSOC)) {
$cleaned = sanitizeUTF8($row['title']);
if ($cleaned !== $row['title']) {
$pdo->prepare("UPDATE admin.knowledge_base SET title = :val WHERE id = :id")->execute([':val' => $cleaned, ':id' => $row['id']]);
$tc++;
}
}
$fixed['knowledge_base_titles'] = $tc;
} catch (Exception $e) {}
echo json_encode(['action' => 'fix-encoding', 'fixed_rows' => $fixed], JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE);
break;
case 'consolidate':
// Merge fragmented tables into knowledge_base
$sources = [
'hamid_knowledge' => "SELECT topic as title, 'hamid' as category, content, source, last_updated as created_at FROM admin.hamid_knowledge WHERE content IS NOT NULL AND content != ''",
'sentinel_knowledge' => "SELECT topic as title, 'sentinel' as category, content, source, last_updated as created_at FROM admin.sentinel_knowledge WHERE content IS NOT NULL AND content != ''",
'brain_knowledge' => "SELECT topic as title, 'brain' as category, content, source, last_updated as created_at FROM admin.brain_knowledge WHERE content IS NOT NULL AND content != ''",
];
$merged = [];
foreach ($sources as $table => $sql) {
try {
$rows = $pdo->query($sql)->fetchAll(PDO::FETCH_ASSOC);
$count = 0;
foreach ($rows as $row) {
// Check if already exists (avoid duplicates)
$check = $pdo->prepare("SELECT COUNT(*) FROM admin.knowledge_base WHERE title = :t AND source = :s");
$check->execute([':t' => $row['title'], ':s' => $row['source'] ?? $table]);
if ($check->fetchColumn() == 0) {
$ins = $pdo->prepare("INSERT INTO admin.knowledge_base (title, category, content, source, created_at, updated_at) VALUES (:t, :c, :co, :s, :d, NOW())");
$ins->execute([
':t' => sanitizeUTF8($row['title'] ?? 'Untitled'),
':c' => $row['category'] ?? $table,
':co' => sanitizeUTF8($row['content']),
':s' => ($row['source'] ?? $table) . ':consolidated',
':d' => $row['created_at'] ?? date('Y-m-d H:i:s'),
]);
$count++;
}
}
$merged[$table] = ['total' => count($rows), 'new' => $count, 'skipped' => count($rows) - $count];
} catch (Exception $e) {
$merged[$table] = 'error: ' . $e->getMessage();
}
}
$total = $pdo->query("SELECT COUNT(*) FROM admin.knowledge_base")->fetchColumn();
echo json_encode(['action' => 'consolidate', 'merged' => $merged, 'knowledge_base_total' => $total], JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE);
break;
case 'stats':
default:
$stats = [];
$tables = ['knowledge_base','hamid_knowledge','sentinel_knowledge','isp_knowledge',
'kb_documents','chatbot_knowledge','brain_knowledge','weval_mind_learning'];
foreach ($tables as $t) {
try {
$stats[$t] = (int)$pdo->query("SELECT COUNT(*) FROM admin.$t")->fetchColumn();
} catch (Exception $e) {
$stats[$t] = 'N/A';
}
}
// Categories in knowledge_base
$cats = $pdo->query("SELECT category, COUNT(*) as cnt FROM admin.knowledge_base GROUP BY category ORDER BY cnt DESC LIMIT 20")->fetchAll(PDO::FETCH_ASSOC);
// Supported formats
$formats = ['pdf','pptx','xlsx','docx','md','txt','csv','html','png','jpg','tiff'];
echo json_encode([
'kb_tables' => $stats,
'total_entries' => array_sum(array_filter($stats, 'is_int')),
'categories' => $cats,
'supported_formats' => $formats,
'storage_path' => $STORAGE,
'upload_path' => $UPLOAD,
'tools' => [
'pdftotext' => !empty(shell_exec('which pdftotext')),
'tesseract' => !empty(shell_exec('which tesseract')),
'python3' => !empty(shell_exec('which python3')),
'python-pptx' => !empty(shell_exec("pip3 show python-pptx 2>/dev/null")),
'python-docx' => !empty(shell_exec("pip3 show python-docx 2>/dev/null")),
'openpyxl' => !empty(shell_exec("pip3 show openpyxl 2>/dev/null")),
],
], JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE);
break;
}