<?php

/**
 * WEVIA OPUS — Web Intelligence Engine
 *
 * Collects and analyses information from the web:
 * - Structured scraping (HTML → data)
 * - Content analysis (sentiment, entities, summary)
 * - Competitor / trend monitoring
 * - RSS/Atom feed parsing
 */

/**
 * Fetches web pages and feeds, extracts text and SEO metadata with regular
 * expressions, and sends text to an Ollama server for LLM-based analysis.
 */
class WebIntelligence
{
    /** Model requested from Ollama when the caller does not choose one. */
    private const DEFAULT_MODEL = 'llama3.1:8b';

    /** Maximum number of characters of text forwarded to the LLM. */
    private const MAX_PROMPT_CHARS = 4000;

    /** Maximum number of items returned by parseFeed(). */
    private const MAX_FEED_ITEMS = 20;

    /** Explicit flags so entity decoding does not depend on the PHP version's defaults. */
    private const ENTITY_FLAGS = ENT_QUOTES | ENT_HTML5;

    /** Base URL of the Ollama server, stored without a trailing slash. */
    private string $ollamaUrl;

    /** @var list<string> Raw HTTP header lines sent with every page fetch. */
    private array $headers;

    /**
     * @param string $ollamaUrl Base URL of the Ollama HTTP API.
     */
    public function __construct(string $ollamaUrl = 'http://localhost:11434')
    {
        // Avoid building "http://host//api/generate" when the caller passes a trailing slash.
        $this->ollamaUrl = rtrim($ollamaUrl, '/');
        $this->headers = [
            'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept: text/html,application/xhtml+xml,application/xml;q=0.9',
            'Accept-Language: fr-FR,fr;q=0.9,en;q=0.8,ar;q=0.7',
        ];
    }

    /**
     * Downloads a URL over HTTP(S), following up to 5 redirects.
     *
     * HTTP error statuses (4xx/5xx) are still reported as success; callers
     * that care must inspect 'http_code'.
     *
     * @param string $url     Address to fetch.
     * @param int    $timeout Total transfer timeout in seconds.
     *
     * @return array On failure: ['success' => false, 'error' => string, 'http_code' => int].
     *               On success: 'success', 'html', 'http_code', 'content_type',
     *               'url_final', 'size' and 'time' (seconds, rounded to 2 decimals).
     */
    public function fetchPage(string $url, int $timeout = 30): array
    {
        $ch = curl_init($url);
        if ($ch === false) {
            return ['success' => false, 'error' => 'Unable to initialise cURL', 'http_code' => 0];
        }

        // The URL is caller-supplied: restrict the initial request and every
        // redirect to HTTP(S) so file://, gopher:// etc. can never be reached.
        $allowedProtocols = CURLPROTO_HTTP | CURLPROTO_HTTPS;

        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => 5,
            CURLOPT_TIMEOUT => $timeout,
            CURLOPT_HTTPHEADER => $this->headers,
            CURLOPT_SSL_VERIFYPEER => true,
            CURLOPT_ENCODING => 'gzip, deflate',
            CURLOPT_PROTOCOLS => $allowedProtocols,
            CURLOPT_REDIR_PROTOCOLS => $allowedProtocols,
        ]);

        $html = curl_exec($ch);
        $info = curl_getinfo($ch);
        $error = curl_error($ch);
        curl_close($ch);

        // curl_exec() returns false on failure; never hand a boolean back as 'html'.
        if ($html === false || $error !== '') {
            return [
                'success' => false,
                'error' => $error !== '' ? $error : 'Request failed without a cURL error message',
                'http_code' => $info['http_code'] ?? 0,
            ];
        }

        return [
            'success' => true,
            'html' => $html,
            'http_code' => $info['http_code'],
            'content_type' => $info['content_type'],
            'url_final' => $info['url'],
            'size' => $info['size_download'],
            'time' => round($info['total_time'], 2),
        ];
    }

    /**
     * Reduces an HTML document to plain text on a single line.
     *
     * script/style/nav/footer/header/aside elements are dropped with their
     * content, remaining tags are stripped, entities are decoded and every run
     * of whitespace is collapsed to one space.
     *
     * @param string $html Raw HTML markup.
     *
     * @return string Trimmed text without tags.
     */
    public function extractText(string $html): string
    {
        // The lookahead keeps "<header" from matching custom tags such as "<headerbar>".
        $withoutChrome = preg_replace(
            '/<(script|style|nav|footer|header|aside)(?=[\s\/>])[^>]*>.*?<\/\1\s*>/is',
            '',
            $html
        );
        if ($withoutChrome === null) {
            // PCRE failure (e.g. backtrack limit on a huge page): fall back to the raw markup.
            $withoutChrome = $html;
        }

        // Decode after stripping so that an escaped "&lt;b&gt;" stays literal text.
        $text = html_entity_decode(strip_tags($withoutChrome), self::ENTITY_FLAGS, 'UTF-8');

        // UTF-8 aware collapse first; on invalid UTF-8 preg_replace() yields
        // null, in which case the byte-wise pattern is used instead.
        $collapsed = preg_replace('/\s+/u', ' ', $text)
            ?? preg_replace('/\s+/', ' ', $text)
            ?? $text;

        return trim($collapsed);
    }

    /**
     * Extracts SEO-related metadata from an HTML document.
     *
     * @param string $html Raw HTML markup.
     *
     * @return array Keys: 'title' and 'description' (only when found),
     *               'og_<name>' for each Open Graph property, 'external_links'
     *               (unique absolute http/https hrefs, sequentially indexed),
     *               'link_count' and 'headers' (list of ['level' => int, 'text' => string]).
     */
    public function extractMeta(string $html): array
    {
        $meta = [];

        if (preg_match('/<title[^>]*>(.*?)<\/title>/is', $html, $titleMatch)) {
            $meta['title'] = $this->decodeEntities(trim($titleMatch[1]));
        }

        // Collected separately so the result keeps the key order
        // title, description, og_*, external_links, link_count, headers.
        $description = null;
        $openGraph = [];

        // Match whole <meta> tags; quoted values may contain '>' or the other quote character.
        if (preg_match_all('/<meta\b(?:[^>"\']++|"[^"]*+"|\'[^\']*+\')*+>/i', $html, $metaTags)) {
            foreach ($metaTags[0] as $tag) {
                $attrs = $this->parseAttributes($tag);
                if (!isset($attrs['content'])) {
                    continue;
                }

                // The first description wins.
                if ($description === null && strcasecmp($attrs['name'] ?? '', 'description') === 0) {
                    $description = $this->decodeEntities(trim($attrs['content']));
                }

                // For Open Graph a later duplicate overwrites an earlier one.
                if (preg_match('/^og:(\w+)$/i', $attrs['property'] ?? '', $ogMatch)) {
                    $openGraph['og_' . $ogMatch[1]] = $this->decodeEntities($attrs['content']);
                }
            }
        }

        if ($description !== null) {
            $meta['description'] = $description;
        }
        foreach ($openGraph as $key => $value) {
            $meta[$key] = $value;
        }

        preg_match_all('/<a\s+[^>]*href=["\'](https?:\/\/[^"\']+)["\']/i', $html, $linkMatches);
        $links = [];
        foreach ($linkMatches[1] ?? [] as $href) {
            // href values are HTML-escaped in markup ("&amp;" between query parameters).
            $links[] = $this->decodeEntities($href);
        }
        // array_unique() keeps the original keys; reindex so the list has no gaps.
        $meta['external_links'] = array_values(array_unique($links));
        $meta['link_count'] = count($meta['external_links']);

        $meta['headers'] = [];
        if (preg_match_all('/<h([1-6])[^>]*>(.*?)<\/h\1>/is', $html, $headingMatches, PREG_SET_ORDER)) {
            foreach ($headingMatches as $heading) {
                $meta['headers'][] = [
                    'level' => (int) $heading[1],
                    // Trim after stripping so whitespace around inner tags is removed too.
                    'text' => trim($this->decodeEntities(strip_tags($heading[2]))),
                ];
            }
        }

        return $meta;
    }

    /**
     * Asks the Ollama server to analyse a text and decodes the JSON it returns.
     *
     * @param string $text         Text to analyse; only the first 4000 characters are sent.
     * @param string $analysisType 'sentiment', 'entities', 'summary' or 'full';
     *                             unknown values fall back to 'full'.
     * @param string $model        Ollama model name.
     *
     * @return array The decoded JSON object produced by the model, or
     *               ['raw' => string, 'parse_error' => true] when no JSON could
     *               be decoded (plus an 'error' message when the request failed).
     */
    public function analyzeContent(
        string $text,
        string $analysisType = 'full',
        string $model = self::DEFAULT_MODEL
    ): array {
        $text = mb_substr($text, 0, self::MAX_PROMPT_CHARS);

        // Scraped text may contain invalid UTF-8, which would make json_encode() return false.
        $payload = json_encode(
            [
                'model' => $model,
                'prompt' => $this->buildPrompt($analysisType, $text),
                'stream' => false,
                'options' => ['temperature' => 0.2, 'num_predict' => 1024],
            ],
            JSON_INVALID_UTF8_SUBSTITUTE
        );
        if ($payload === false) {
            return ['raw' => '', 'parse_error' => true, 'error' => 'Unable to encode request: ' . json_last_error_msg()];
        }

        $ch = curl_init("{$this->ollamaUrl}/api/generate");
        if ($ch === false) {
            return ['raw' => '', 'parse_error' => true, 'error' => 'Unable to initialise cURL'];
        }
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_POST => true,
            CURLOPT_POSTFIELDS => $payload,
            CURLOPT_HTTPHEADER => ['Content-Type: application/json'],
            CURLOPT_TIMEOUT => 60,
        ]);
        $resp = curl_exec($ch);
        $error = curl_error($ch);
        curl_close($ch);

        if (!is_string($resp)) {
            return [
                'raw' => '',
                'parse_error' => true,
                'error' => $error !== '' ? $error : 'LLM request failed',
            ];
        }

        $data = json_decode($resp, true);
        $response = is_array($data) && is_string($data['response'] ?? null) ? $data['response'] : '';

        $result = $this->decodeLlmJson($response);

        // Surface a server-side error message instead of returning a bare parse failure.
        if (($result['parse_error'] ?? false) === true && is_array($data) && is_string($data['error'] ?? null)) {
            $result['error'] = $data['error'];
        }

        return $result;
    }

    /**
     * Fetches two pages and returns basic statistics plus an LLM summary of each.
     *
     * @param string $url1 First page.
     * @param string $url2 Second page.
     *
     * @return array ['error' => string, 'details' => array] when either fetch
     *               fails, otherwise 'page1', 'page2', 'analysis1' and 'analysis2'.
     *               'text_length' is counted in characters.
     */
    public function comparePages(string $url1, string $url2): array
    {
        $page1 = $this->fetchPage($url1);
        $page2 = $this->fetchPage($url2);

        if (!$page1['success'] || !$page2['success']) {
            return [
                'error' => 'Failed to fetch one or both pages',
                'details' => [
                    'page1' => $page1['error'] ?? null,
                    'page2' => $page2['error'] ?? null,
                ],
            ];
        }

        $text1 = $this->extractText($page1['html']);
        $text2 = $this->extractText($page2['html']);
        $meta1 = $this->extractMeta($page1['html']);
        $meta2 = $this->extractMeta($page2['html']);

        return [
            'page1' => [
                'url' => $url1,
                'title' => $meta1['title'] ?? '',
                // Characters, not bytes: the extracted text is multibyte.
                'text_length' => mb_strlen($text1),
                'links' => $meta1['link_count'] ?? 0,
            ],
            'page2' => [
                'url' => $url2,
                'title' => $meta2['title'] ?? '',
                'text_length' => mb_strlen($text2),
                'links' => $meta2['link_count'] ?? 0,
            ],
            'analysis1' => $this->analyzeContent($text1, 'summary'),
            'analysis2' => $this->analyzeContent($text2, 'summary'),
        ];
    }

    /**
     * Downloads and parses an RSS 2.0 or Atom feed.
     *
     * @param string $url Feed address.
     *
     * @return array ['error' => string] on failure, otherwise 'feed_title',
     *               'item_count' (all items found) and 'items' (first 20 items,
     *               each with 'title', 'link', 'description', 'pubDate', 'category').
     */
    public function parseFeed(string $url): array
    {
        $result = $this->fetchPage($url);
        if (!$result['success']) {
            return ['error' => $result['error']];
        }

        // Collect libxml errors internally instead of silencing warnings with "@".
        $previous = libxml_use_internal_errors(true);
        // LIBXML_NONET: parsing a remote document must not trigger network access.
        $xml = simplexml_load_string((string) $result['html'], \SimpleXMLElement::class, LIBXML_NONET);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        // Compare with false explicitly: a SimpleXMLElement without children
        // is falsy, so "!$xml" would reject a well-formed but empty document.
        if ($xml === false) {
            return ['error' => 'Invalid XML'];
        }

        $items = [];

        if (isset($xml->channel->item)) {
            // RSS 2.0
            foreach ($xml->channel->item as $item) {
                $items[] = [
                    'title' => (string) $item->title,
                    'link' => (string) $item->link,
                    'description' => strip_tags((string) $item->description),
                    'pubDate' => (string) $item->pubDate,
                    'category' => (string) ($item->category ?? ''),
                ];
            }
        } elseif (isset($xml->entry)) {
            // Atom
            foreach ($xml->entry as $entry) {
                $items[] = [
                    'title' => (string) $entry->title,
                    'link' => $this->atomLink($entry),
                    // Entries without <summary> usually carry their text in <content>.
                    'description' => strip_tags((string) ($entry->summary ?? $entry->content ?? '')),
                    'pubDate' => (string) $entry->updated,
                    // Same key as RSS items so both formats share one shape.
                    'category' => (string) ($entry->category['term'] ?? ''),
                ];
            }
        }

        return [
            'feed_title' => (string) ($xml->channel->title ?? $xml->title ?? ''),
            'item_count' => count($items),
            'items' => array_slice($items, 0, self::MAX_FEED_ITEMS),
        ];
    }

    /** Decodes HTML entities as UTF-8 with fixed flags. */
    private function decodeEntities(string $value): string
    {
        return html_entity_decode($value, self::ENTITY_FLAGS, 'UTF-8');
    }

    /**
     * Parses the attributes of one HTML tag into a map keyed by lower-cased name.
     *
     * Values may be double-quoted, single-quoted or unquoted; for a repeated
     * attribute the first occurrence is kept.
     *
     * @return array<string, string>
     */
    private function parseAttributes(string $tag): array
    {
        $attributes = [];
        $pattern = '/([a-zA-Z_:][-\w:.]*)\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+))/';

        if (preg_match_all($pattern, $tag, $sets, PREG_SET_ORDER | PREG_UNMATCHED_AS_NULL)) {
            foreach ($sets as $set) {
                $name = strtolower($set[1]);
                if (!array_key_exists($name, $attributes)) {
                    // Exactly one of the three value groups matched.
                    $attributes[$name] = $set[2] ?? $set[3] ?? $set[4] ?? '';
                }
            }
        }

        return $attributes;
    }

    /** Builds the LLM prompt for an analysis type; unknown types use the 'full' prompt. */
    private function buildPrompt(string $analysisType, string $text): string
    {
        $templates = [
            'sentiment' => "Analyse le sentiment de ce texte. Retourne UNIQUEMENT un JSON:\n{\"sentiment\":\"positif|neutre|négatif|mixte\",\"confidence\":0.0-1.0,\"key_emotions\":[\"liste\"]}\n\nTexte: ",
            'entities' => "Extrais les entités nommées de ce texte. Retourne UNIQUEMENT un JSON:\n{\"persons\":[],\"organizations\":[],\"locations\":[],\"products\":[],\"dates\":[],\"amounts\":[]}\n\nTexte: ",
            'summary' => "Résume ce texte en 3 phrases maximum. Retourne UNIQUEMENT un JSON:\n{\"summary\":\"...\",\"key_points\":[\"point1\",\"point2\"],\"topic\":\"sujet principal\"}\n\nTexte: ",
            'full' => "Analyse complète de ce texte. Retourne UNIQUEMENT un JSON:\n{\n \"summary\":\"3 phrases max\",\n \"sentiment\":\"positif|neutre|négatif\",\n \"key_points\":[\"liste\"],\n \"entities\":{\"persons\":[],\"organizations\":[],\"products\":[]},\n \"topic\":\"sujet\",\n \"language\":\"fr|en|ar\",\n \"actionable_insights\":[\"liste\"]\n}\n\nTexte: ",
        ];

        return ($templates[$analysisType] ?? $templates['full']) . $text;
    }

    /**
     * Decodes the JSON object contained in an LLM reply.
     *
     * Markdown code fences are removed first; if the remainder is not valid
     * JSON, the span from the first '{' to the last '}' is tried, which copes
     * with replies that wrap the JSON in explanatory prose.
     *
     * @return array Decoded data, or ['raw' => string, 'parse_error' => true].
     */
    private function decodeLlmJson(string $response): array
    {
        $candidate = trim($response);
        $candidate = preg_replace('/^```(?:json)?\s*/i', '', $candidate) ?? $candidate;
        $candidate = preg_replace('/\s*```$/', '', $candidate) ?? $candidate;

        $decoded = json_decode($candidate, true);
        if (is_array($decoded)) {
            return $decoded;
        }

        $start = strpos($candidate, '{');
        $end = strrpos($candidate, '}');
        if ($start !== false && $end !== false && $end > $start) {
            $decoded = json_decode(substr($candidate, $start, $end - $start + 1), true);
            if (is_array($decoded)) {
                return $decoded;
            }
        }

        return ['raw' => $candidate, 'parse_error' => true];
    }

    /**
     * Picks the link of an Atom entry: the first rel="alternate" link (a
     * missing rel counts as "alternate"), otherwise the first link with an href.
     */
    private function atomLink(\SimpleXMLElement $entry): string
    {
        $fallback = '';

        foreach ($entry->link as $link) {
            $href = (string) ($link['href'] ?? '');
            if ($href === '') {
                continue;
            }
            if ((string) ($link['rel'] ?? 'alternate') === 'alternate') {
                return $href;
            }
            if ($fallback === '') {
                $fallback = $href;
            }
        }

        return $fallback;
    }
}