Files
wevia-brain/modules/advanced/web-intelligence.php
2026-04-12 23:01:36 +02:00

229 lines
8.8 KiB
PHP
Executable File

<?php
/**
 * WEVIA OPUS — Web Intelligence Engine
 *
 * Collects and analyses information from the web:
 * - structured scraping (HTML → data)
 * - content analysis (sentiment, entities, summary) through an Ollama model
 * - competitor / trend monitoring (page comparison)
 * - RSS / Atom feed parsing
 */
class WebIntelligence
{
    /** Maximum number of characters of text embedded in a model prompt. */
    private const MAX_PROMPT_CHARS = 4000;

    /** Maximum number of feed items returned by parseFeed(). */
    private const MAX_FEED_ITEMS = 20;

    /** Explicit flags so entity decoding does not depend on the PHP version's defaults. */
    private const ENTITY_FLAGS = ENT_QUOTES | ENT_HTML5;

    /** Tags whose boundaries separate words once the markup is stripped. */
    private const BLOCK_TAGS = 'address|article|blockquote|br|dd|div|dl|dt|figcaption|figure|form|h[1-6]|hr|li|main|ol|p|pre|section|table|tbody|td|tfoot|th|thead|tr|ul';

    private string $ollamaUrl;
    private string $model;
    private array $headers;

    /**
     * @param string $ollamaUrl Base URL of the Ollama server (a trailing slash is tolerated).
     * @param string $model     Name of the Ollama model used by analyzeContent().
     */
    public function __construct(string $ollamaUrl = 'http://localhost:11434', string $model = 'llama3.1:8b')
    {
        $this->ollamaUrl = rtrim($ollamaUrl, '/');
        $this->model = $model;
        $this->headers = [
            'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept: text/html,application/xhtml+xml,application/xml;q=0.9',
            'Accept-Language: fr-FR,fr;q=0.9,en;q=0.8,ar;q=0.7',
        ];
    }

    /**
     * Downloads a URL over HTTP(S), following up to 5 redirects.
     *
     * "success" only reflects the transport: a response with a 4xx/5xx status
     * is still returned with success = true and its status in "http_code".
     *
     * @param string $url     Address to fetch; only http:// and https:// are allowed.
     * @param int    $timeout Total request timeout in seconds.
     *
     * @return array On failure: success (false), error, http_code.
     *               On success: success (true), html, http_code, content_type,
     *               url_final, size, time (seconds, rounded to 2 decimals).
     */
    public function fetchPage(string $url, int $timeout = 30): array
    {
        $ch = curl_init($url);
        if ($ch === false) {
            return ['success' => false, 'error' => 'Unable to initialise cURL', 'http_code' => 0];
        }

        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => 5,
            CURLOPT_TIMEOUT => $timeout,
            CURLOPT_HTTPHEADER => $this->headers,
            CURLOPT_SSL_VERIFYPEER => true,
            CURLOPT_ENCODING => 'gzip, deflate',
            // The URL comes from callers: never let it (or a redirect) reach file://, gopher://, etc.
            CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
            CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
        ]);

        $html = curl_exec($ch);
        $info = curl_getinfo($ch);
        $error = curl_error($ch);
        curl_close($ch);

        if ($html === false || $error !== '') {
            return [
                'success' => false,
                'error' => $error !== '' ? $error : 'Unknown cURL error',
                'http_code' => $info['http_code'] ?? 0,
            ];
        }

        return [
            'success' => true,
            'html' => $html,
            'http_code' => $info['http_code'],
            'content_type' => $info['content_type'],
            'url_final' => $info['url'],
            'size' => $info['size_download'],
            'time' => round($info['total_time'], 2),
        ];
    }

    /**
     * Extracts the readable text of an HTML document on a single line.
     *
     * script, style, nav, footer, header and aside elements are dropped with
     * their content, the remaining tags are stripped, and every whitespace
     * run is collapsed to one space.
     *
     * @param string $html Raw HTML markup.
     *
     * @return string Trimmed text without markup.
     */
    public function extractText(string $html): string
    {
        // preg_replace() returns null on PCRE failure (e.g. backtrack limit on a
        // huge page); fall back to the previous value instead of losing the text.
        $content = preg_replace(
            '/<(script|style|nav|footer|header|aside)\b[^>]*>.*?<\/\1\s*>/is',
            '',
            $html
        ) ?? $html;

        // strip_tags() removes tags without leaving a separator, which would glue
        // "<p>a</p><p>b</p>" into "ab"; pad block-level tags with a space first.
        $content = preg_replace('/<\/?(?:' . self::BLOCK_TAGS . ')\b/i', ' $0', $content) ?? $content;

        $text = strip_tags($content);
        $text = preg_replace('/\s+/', ' ', $text) ?? $text;

        return trim($text);
    }

    /**
     * Extracts SEO metadata from an HTML document.
     *
     * @param string $html Raw HTML markup.
     *
     * @return array Keys, in this order when present: title, description,
     *               og_<name> for each og:<name> meta tag (last one wins),
     *               external_links (de-duplicated absolute http(s) hrefs, as a list),
     *               link_count, headers (list of ['level' => int, 'text' => string]).
     */
    public function extractMeta(string $html): array
    {
        $meta = [];

        if (preg_match('/<title\b[^>]*>(.*?)<\/title>/is', $html, $titleMatch)) {
            $meta['title'] = $this->decodeEntities(trim($titleMatch[1]));
        }

        // Parse every <meta> tag attribute by attribute so that attribute order
        // and quotes of the other kind inside a value (content="L'avenir") work.
        $description = null;
        $openGraph = [];
        preg_match_all('/<meta\b(?:[^>"\']|"[^"]*"|\'[^\']*\')*>/i', $html, $tagMatches);
        foreach ($tagMatches[0] ?? [] as $tag) {
            $attributes = $this->parseAttributes($tag);
            if (!isset($attributes['content'])) {
                continue;
            }
            $content = $attributes['content'];

            if ($description === null && strtolower($attributes['name'] ?? '') === 'description') {
                $description = $this->decodeEntities(trim($content));
            }
            if (preg_match('/^og:(\w+)\z/i', $attributes['property'] ?? '', $ogMatch)) {
                $openGraph['og_' . $ogMatch[1]] = $this->decodeEntities($content);
            }
        }
        if ($description !== null) {
            $meta['description'] = $description;
        }
        foreach ($openGraph as $key => $value) {
            $meta[$key] = $value;
        }

        // array_unique() keeps the original keys; re-index so the result is a
        // real list (and JSON-encodes as an array, not an object).
        preg_match_all('/<a\s+[^>]*href=["\'](https?:\/\/[^"\']+)["\']/i', $html, $linkMatches);
        $meta['external_links'] = array_values(array_unique($linkMatches[1] ?? []));
        $meta['link_count'] = count($meta['external_links']);

        preg_match_all('/<h([1-6])[^>]*>(.*?)<\/h\1>/is', $html, $headerMatches);
        $meta['headers'] = [];
        foreach ($headerMatches[1] ?? [] as $i => $level) {
            $meta['headers'][] = [
                'level' => (int) $level,
                // Trim after stripping, otherwise whitespace inside nested tags survives.
                'text' => trim($this->decodeEntities(strip_tags($headerMatches[2][$i]))),
            ];
        }

        return $meta;
    }

    /**
     * Sends text to the Ollama model and returns its JSON answer decoded.
     *
     * @param string $text         Text to analyse; truncated to MAX_PROMPT_CHARS characters.
     * @param string $analysisType One of 'sentiment', 'entities', 'summary', 'full';
     *                             any other value falls back to 'full'.
     *
     * @return array The decoded JSON object produced by the model, or
     *               ['raw' => string, 'parse_error' => true] when no JSON could be
     *               decoded, with an extra 'error' key when the request itself failed.
     */
    public function analyzeContent(string $text, string $analysisType = 'full'): array
    {
        $text = mb_substr($text, 0, self::MAX_PROMPT_CHARS); // keep the prompt small for the LLM

        $prompts = [
            'sentiment' => "Analyse le sentiment de ce texte. Retourne UNIQUEMENT un JSON:\n{\"sentiment\":\"positif|neutre|négatif|mixte\",\"confidence\":0.0-1.0,\"key_emotions\":[\"liste\"]}\n\nTexte: $text",
            'entities' => "Extrais les entités nommées de ce texte. Retourne UNIQUEMENT un JSON:\n{\"persons\":[],\"organizations\":[],\"locations\":[],\"products\":[],\"dates\":[],\"amounts\":[]}\n\nTexte: $text",
            'summary' => "Résume ce texte en 3 phrases maximum. Retourne UNIQUEMENT un JSON:\n{\"summary\":\"...\",\"key_points\":[\"point1\",\"point2\"],\"topic\":\"sujet principal\"}\n\nTexte: $text",
            'full' => "Analyse complète de ce texte. Retourne UNIQUEMENT un JSON:\n{\n \"summary\":\"3 phrases max\",\n \"sentiment\":\"positif|neutre|négatif\",\n \"key_points\":[\"liste\"],\n \"entities\":{\"persons\":[],\"organizations\":[],\"products\":[]},\n \"topic\":\"sujet\",\n \"language\":\"fr|en|ar\",\n \"actionable_insights\":[\"liste\"]\n}\n\nTexte: $text",
        ];
        $prompt = $prompts[$analysisType] ?? $prompts['full'];

        // Scraped pages are not always valid UTF-8; without this flag json_encode()
        // returns false and an empty body would be posted.
        $payload = json_encode([
            'model' => $this->model,
            'prompt' => $prompt,
            'stream' => false,
            'options' => ['temperature' => 0.2, 'num_predict' => 1024],
        ], JSON_INVALID_UTF8_SUBSTITUTE);
        if ($payload === false) {
            return ['raw' => '', 'parse_error' => true, 'error' => json_last_error_msg()];
        }

        $ch = curl_init("{$this->ollamaUrl}/api/generate");
        if ($ch === false) {
            return ['raw' => '', 'parse_error' => true, 'error' => 'Unable to initialise cURL'];
        }
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_POST => true,
            CURLOPT_POSTFIELDS => $payload,
            CURLOPT_HTTPHEADER => ['Content-Type: application/json'],
            CURLOPT_TIMEOUT => 60,
        ]);
        $resp = curl_exec($ch);
        $error = curl_error($ch);
        curl_close($ch);

        if ($resp === false) {
            return ['raw' => '', 'parse_error' => true, 'error' => $error];
        }

        $data = json_decode($resp, true);
        if (is_array($data) && isset($data['error']) && is_string($data['error'])) {
            // The server answered with an "error" field instead of a completion.
            return ['raw' => '', 'parse_error' => true, 'error' => $data['error']];
        }
        $response = (is_array($data) && is_string($data['response'] ?? null)) ? $data['response'] : '';

        return $this->decodeModelJson($response);
    }

    /**
     * Fetches two pages and returns basic metrics plus a model summary for each
     * (useful for competitor monitoring).
     *
     * @param string $url1 First page address.
     * @param string $url2 Second page address.
     *
     * @return array ['error' => string] when either fetch fails, otherwise
     *               page1 / page2 (url, title, text_length in characters, links)
     *               and analysis1 / analysis2 (analyzeContent() 'summary' results).
     */
    public function comparePages(string $url1, string $url2): array
    {
        $page1 = $this->fetchPage($url1);
        $page2 = $this->fetchPage($url2);
        if (!$page1['success'] || !$page2['success']) {
            return ['error' => 'Failed to fetch one or both pages'];
        }

        $text1 = $this->extractText($page1['html']);
        $text2 = $this->extractText($page2['html']);
        $meta1 = $this->extractMeta($page1['html']);
        $meta2 = $this->extractMeta($page2['html']);

        return [
            'page1' => [
                'url' => $url1,
                'title' => $meta1['title'] ?? '',
                'text_length' => mb_strlen($text1), // characters, not bytes
                'links' => $meta1['link_count'] ?? 0,
            ],
            'page2' => [
                'url' => $url2,
                'title' => $meta2['title'] ?? '',
                'text_length' => mb_strlen($text2),
                'links' => $meta2['link_count'] ?? 0,
            ],
            'analysis1' => $this->analyzeContent($text1, 'summary'),
            'analysis2' => $this->analyzeContent($text2, 'summary'),
        ];
    }

    /**
     * Downloads and parses an RSS 2.0 or Atom feed.
     *
     * @param string $url Feed address.
     *
     * @return array ['error' => string] on fetch or XML failure, otherwise
     *               feed_title, item_count (all items found) and items (at most
     *               MAX_FEED_ITEMS entries with title, link, description, pubDate, category).
     */
    public function parseFeed(string $url): array
    {
        $result = $this->fetchPage($url);
        if (!$result['success']) {
            return ['error' => $result['error']];
        }

        // Collect libxml errors internally instead of silencing with "@", and
        // forbid network access while parsing the untrusted document.
        $previous = libxml_use_internal_errors(true);
        $xml = simplexml_load_string((string) $result['html'], SimpleXMLElement::class, LIBXML_NONET);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        // Compare with false explicitly: a valid but empty root element is a
        // falsy SimpleXMLElement and must not be reported as invalid XML.
        if ($xml === false) {
            return ['error' => 'Invalid XML'];
        }

        $items = [];
        if (isset($xml->channel->item)) {
            // RSS 2.0
            foreach ($xml->channel->item as $item) {
                $items[] = [
                    'title' => (string) $item->title,
                    'link' => (string) $item->link,
                    'description' => strip_tags((string) $item->description),
                    'pubDate' => (string) $item->pubDate,
                    'category' => (string) ($item->category ?? ''),
                ];
            }
        } elseif (isset($xml->entry)) {
            // Atom
            foreach ($xml->entry as $entry) {
                $items[] = [
                    'title' => (string) $entry->title,
                    'link' => $this->atomEntryLink($entry),
                    // <summary> is optional in Atom; fall back to <content>.
                    'description' => strip_tags((string) ($entry->summary ?? $entry->content ?? '')),
                    'pubDate' => (string) ($entry->updated ?? $entry->published ?? ''),
                    'category' => (string) ($entry->category['term'] ?? ''),
                ];
            }
        }

        return [
            'feed_title' => (string) ($xml->channel->title ?? $xml->title ?? ''),
            'item_count' => count($items),
            'items' => array_slice($items, 0, self::MAX_FEED_ITEMS),
        ];
    }

    /**
     * Decodes HTML entities as UTF-8 with fixed flags.
     */
    private function decodeEntities(string $value): string
    {
        return html_entity_decode($value, self::ENTITY_FLAGS, 'UTF-8');
    }

    /**
     * Parses the attributes of a single HTML tag into a lower-cased name => raw value map.
     * Handles double-quoted, single-quoted and unquoted values; the first occurrence of a name wins.
     */
    private function parseAttributes(string $tag): array
    {
        $attributes = [];
        $pattern = '/([a-zA-Z_:][-\w:.]*)\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+))/';
        if (preg_match_all($pattern, $tag, $pairs, PREG_SET_ORDER | PREG_UNMATCHED_AS_NULL)) {
            foreach ($pairs as $pair) {
                $name = strtolower($pair[1]);
                if (!array_key_exists($name, $attributes)) {
                    // Exactly one of the three value groups matched; the others are null.
                    $attributes[$name] = $pair[2] ?? $pair[3] ?? $pair[4] ?? '';
                }
            }
        }

        return $attributes;
    }

    /**
     * Turns a model answer into an array: strips a Markdown code fence, then
     * falls back to the outermost {...} span when the answer is wrapped in prose.
     */
    private function decodeModelJson(string $response): array
    {
        $candidate = trim($response);
        $candidate = preg_replace('/^```(?:json)?\s*/i', '', $candidate) ?? $candidate;
        $candidate = preg_replace('/\s*```$/', '', $candidate) ?? $candidate;

        $result = json_decode($candidate, true);
        if (is_array($result)) {
            return $result;
        }

        $start = strpos($candidate, '{');
        $end = strrpos($candidate, '}');
        if ($start !== false && $end !== false && $end > $start) {
            $result = json_decode(substr($candidate, $start, $end - $start + 1), true);
            if (is_array($result)) {
                return $result;
            }
        }

        return ['raw' => $candidate, 'parse_error' => true];
    }

    /**
     * Returns the href of an Atom entry's rel="alternate" link (a missing rel
     * counts as alternate), or of its first link when there is no alternate one.
     */
    private function atomEntryLink(SimpleXMLElement $entry): string
    {
        $fallback = '';
        foreach ($entry->link as $link) {
            $href = (string) ($link['href'] ?? '');
            $rel = (string) ($link['rel'] ?? 'alternate');
            if ($rel === 'alternate') {
                return $href;
            }
            if ($fallback === '') {
                $fallback = $href;
            }
        }

        return $fallback;
    }
}