wevia-brain/modules/advanced/web-intelligence.php

<?php
/**
 * WEVIA OPUS — Web Intelligence Engine
 *
 * Collecte et analyse d'informations web:
 * - Scraping structuré (HTML → données)
 * - Analyse de contenu (sentiment, entités, résumé)
 * - Monitoring de compétiteurs/tendances
 * - Feed RSS/Atom parsing
 */

class WebIntelligence {

    /** Model requested from Ollama when the constructor is not given one. */
    private const DEFAULT_MODEL = 'llama3.1:8b';

    /** Maximum number of characters of page text embedded in an LLM prompt. */
    private const MAX_PROMPT_CHARS = 4000;

    /** Maximum number of items returned by parseFeed(). */
    private const MAX_FEED_ITEMS = 20;

    /** Flags shared by every html_entity_decode() call so quotes decode on all PHP versions. */
    private const ENTITY_FLAGS = ENT_QUOTES | ENT_HTML5;

    private string $ollamaUrl;
    private string $model;
    private array $headers;

    /**
     * @param string $ollamaUrl Base URL of the Ollama server; a trailing slash is tolerated.
     * @param string $model     Model name sent in the "model" field of /api/generate requests.
     */
    public function __construct(string $ollamaUrl = 'http://localhost:11434', string $model = self::DEFAULT_MODEL) {
        // Strip trailing slashes so "{$this->ollamaUrl}/api/generate" never contains "//".
        $this->ollamaUrl = rtrim($ollamaUrl, '/');
        $this->model = $model;
        $this->headers = [
            'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept: text/html,application/xhtml+xml,application/xml;q=0.9',
            'Accept-Language: fr-FR,fr;q=0.9,en;q=0.8,ar;q=0.7'
        ];
    }

    /**
     * Downloads a URL with cURL, following up to 5 redirects.
     *
     * Only HTTP and HTTPS are allowed (including for redirects) because the URL
     * is caller-supplied. The HTTP status is reported but not interpreted: a
     * 4xx/5xx response still yields 'success' => true.
     *
     * @param string $url     URL to fetch.
     * @param int    $timeout Total transfer timeout in seconds (passed to CURLOPT_TIMEOUT).
     * @return array On failure: ['success' => false, 'error' => string, 'http_code' => int].
     *               On success: ['success' => true, 'html', 'http_code', 'content_type',
     *               'url_final', 'size', 'time'] taken from the body and curl_getinfo().
     */
    public function fetchPage(string $url, int $timeout = 30): array {
        $ch = curl_init($url);
        if ($ch === false) {
            return ['success' => false, 'error' => 'Unable to initialise cURL', 'http_code' => 0];
        }

        $allowedProtocols = CURLPROTO_HTTP | CURLPROTO_HTTPS;
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => 5,
            CURLOPT_TIMEOUT => $timeout,
            CURLOPT_HTTPHEADER => $this->headers,
            CURLOPT_SSL_VERIFYPEER => true,
            CURLOPT_ENCODING => 'gzip, deflate',
            // Keep file://, gopher://, etc. out of reach of caller-supplied URLs and redirects.
            CURLOPT_PROTOCOLS => $allowedProtocols,
            CURLOPT_REDIR_PROTOCOLS => $allowedProtocols
        ]);

        $html = curl_exec($ch);
        $info = curl_getinfo($ch);
        $error = curl_error($ch);
        curl_close($ch);

        // curl_exec() returns false on failure; never hand that back as page content.
        if ($html === false || $error !== '') {
            return [
                'success' => false,
                'error' => $error !== '' ? $error : 'Unknown cURL error',
                'http_code' => $info['http_code'] ?? 0
            ];
        }

        return [
            'success' => true,
            'html' => $html,
            'http_code' => $info['http_code'],
            'content_type' => $info['content_type'],
            'url_final' => $info['url'],
            'size' => $info['size_download'],
            'time' => round($info['total_time'], 2)
        ];
    }

    /**
     * Reduces an HTML document to plain text.
     *
     * Removes script/style/nav/footer/header/aside elements with their content,
     * strips the remaining tags, decodes HTML entities and collapses every run of
     * whitespace (newlines included) into a single space.
     *
     * @param string $html Raw HTML.
     * @return string Single-line text without leading or trailing whitespace.
     */
    public function extractText(string $html): string {
        // preg_replace() returns null on a PCRE error (e.g. backtrack limit on huge pages);
        // fall back to the untouched input rather than passing null to strip_tags().
        $withoutBoilerplate = preg_replace(
            '/<(script|style|nav|footer|header|aside)\b[^>]*>.*?<\/\1>/is',
            '',
            $html
        ) ?? $html;

        $text = strip_tags($withoutBoilerplate);

        // Decode after stripping so that escaped markup such as "&lt;b&gt;" stays literal text.
        $text = html_entity_decode($text, self::ENTITY_FLAGS, 'UTF-8');

        // &nbsp; decodes to U+00A0, which the byte-oriented \s below does not match.
        $text = str_replace("\xC2\xA0", ' ', $text);

        $text = preg_replace('/\s+/', ' ', $text) ?? $text;

        return trim($text);
    }

    /**
     * Extracts SEO-related metadata from an HTML document using regular expressions.
     *
     * @param string $html Raw HTML.
     * @return array Keys:
     *               - 'title' (only if a <title> is found)
     *               - 'description' (only if a meta description with content is found)
     *               - 'og_<name>' for each <meta property="og:<name>"> (last occurrence wins)
     *               - 'external_links': de-duplicated list of absolute http(s) hrefs
     *               - 'link_count': size of 'external_links'
     *               - 'headers': list of ['level' => int, 'text' => string] for h1..h6
     */
    public function extractMeta(string $html): array {
        $meta = [];

        // Title
        if (preg_match('/<title\b[^>]*>(.*?)<\/title>/is', $html, $m)) {
            $meta['title'] = $this->cleanInlineText($m[1]);
        }

        // Meta description and Open Graph, independent of attribute order and quote style.
        foreach ($this->parseMetaTags($html) as $attrs) {
            if (!isset($attrs['content'])) {
                continue;
            }

            $name = strtolower($attrs['name'] ?? '');
            if ($name === 'description' && !isset($meta['description'])) {
                $meta['description'] = $this->cleanInlineText($attrs['content']);
            }

            if (preg_match('/^og:(\w+)$/', $attrs['property'] ?? '', $og)) {
                $meta['og_' . $og[1]] = trim(html_entity_decode($attrs['content'], self::ENTITY_FLAGS, 'UTF-8'));
            }
        }

        // Links: absolute http(s) URLs only; "&amp;" in an href means "&".
        preg_match_all('/<a\s+[^>]*href=["\'](https?:\/\/[^"\']+)["\']/i', $html, $linkMatches);
        $links = [];
        foreach ($linkMatches[1] ?? [] as $href) {
            $links[] = html_entity_decode($href, self::ENTITY_FLAGS, 'UTF-8');
        }
        // array_unique() keeps the original keys; re-index so the result stays a list
        // (and therefore serialises as a JSON array).
        $meta['external_links'] = array_values(array_unique($links));
        $meta['link_count'] = count($meta['external_links']);

        // Headers structure
        preg_match_all('/<h([1-6])\b[^>]*>(.*?)<\/h\1>/is', $html, $headerMatches);
        $meta['headers'] = [];
        foreach ($headerMatches[1] ?? [] as $i => $level) {
            $meta['headers'][] = [
                'level' => (int)$level,
                'text' => $this->cleanInlineText(strip_tags($headerMatches[2][$i]))
            ];
        }

        return $meta;
    }

    /**
     * Sends (a truncated copy of) the text to Ollama and decodes the JSON answer.
     *
     * @param string $text         Text to analyse; only the first 4000 characters are sent.
     * @param string $analysisType 'sentiment', 'entities', 'summary' or 'full'.
     *                             Unknown values fall back to 'full'.
     * @return array The decoded JSON produced by the model, or
     *               ['raw' => string, 'parse_error' => true] when no JSON could be decoded.
     *               An additional 'error' key describes request/transport failures.
     */
    public function analyzeContent(string $text, string $analysisType = 'full'): array {
        $text = mb_substr($text, 0, self::MAX_PROMPT_CHARS); // Keep the prompt small for the LLM

        $prompts = [
            'sentiment' => "Analyse le sentiment de ce texte. Retourne UNIQUEMENT un JSON:\n{\"sentiment\":\"positif|neutre|négatif|mixte\",\"confidence\":0.0-1.0,\"key_emotions\":[\"liste\"]}\n\nTexte: $text",
            'entities' => "Extrais les entités nommées de ce texte. Retourne UNIQUEMENT un JSON:\n{\"persons\":[],\"organizations\":[],\"locations\":[],\"products\":[],\"dates\":[],\"amounts\":[]}\n\nTexte: $text",
            'summary' => "Résume ce texte en 3 phrases maximum. Retourne UNIQUEMENT un JSON:\n{\"summary\":\"...\",\"key_points\":[\"point1\",\"point2\"],\"topic\":\"sujet principal\"}\n\nTexte: $text",
            'full' => "Analyse complète de ce texte. Retourne UNIQUEMENT un JSON:\n{\n  \"summary\":\"3 phrases max\",\n  \"sentiment\":\"positif|neutre|négatif\",\n  \"key_points\":[\"liste\"],\n  \"entities\":{\"persons\":[],\"organizations\":[],\"products\":[]},\n  \"topic\":\"sujet\",\n  \"language\":\"fr|en|ar\",\n  \"actionable_insights\":[\"liste\"]\n}\n\nTexte: $text"
        ];

        $prompt = $prompts[$analysisType] ?? $prompts['full'];

        // Scraped text is frequently not valid UTF-8; substitute bad bytes instead of
        // letting json_encode() return false and posting an empty body.
        $payload = json_encode([
            'model' => $this->model, 'prompt' => $prompt,
            'stream' => false, 'options' => ['temperature' => 0.2, 'num_predict' => 1024]
        ], JSON_INVALID_UTF8_SUBSTITUTE);
        if ($payload === false) {
            return ['raw' => '', 'parse_error' => true, 'error' => 'Unable to encode request: ' . json_last_error_msg()];
        }

        $ch = curl_init("{$this->ollamaUrl}/api/generate");
        if ($ch === false) {
            return ['raw' => '', 'parse_error' => true, 'error' => 'Unable to initialise cURL'];
        }
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_POST => true,
            CURLOPT_POSTFIELDS => $payload,
            CURLOPT_HTTPHEADER => ['Content-Type: application/json'],
            CURLOPT_TIMEOUT => 60
        ]);
        $resp = curl_exec($ch);
        $transportError = curl_error($ch);
        curl_close($ch);

        if ($resp === false) {
            return [
                'raw' => '',
                'parse_error' => true,
                'error' => $transportError !== '' ? $transportError : 'Unknown cURL error'
            ];
        }

        $data = json_decode($resp, true);
        if (is_array($data) && isset($data['error']) && !isset($data['response'])) {
            // The server answered with an "error" field instead of a generation.
            $message = is_string($data['error']) ? $data['error'] : (string)json_encode($data['error']);
            return ['raw' => '', 'parse_error' => true, 'error' => $message];
        }

        $response = is_array($data) ? ($data['response'] ?? '') : '';

        return $this->decodeModelJson(is_string($response) ? $response : '');
    }

    /**
     * Fetches two pages and returns basic statistics plus an LLM summary for each
     * (useful for competitive monitoring).
     *
     * @param string $url1 First URL.
     * @param string $url2 Second URL.
     * @return array ['page1', 'page2', 'analysis1', 'analysis2'], or
     *               ['error' => string, 'details' => array] if either fetch fails.
     *               'text_length' is counted in characters, not bytes.
     */
    public function comparePages(string $url1, string $url2): array {
        $page1 = $this->fetchPage($url1);
        $page2 = $this->fetchPage($url2);

        if (!$page1['success'] || !$page2['success']) {
            return [
                'error' => 'Failed to fetch one or both pages',
                // Lets callers see which fetch failed and why.
                'details' => [
                    'page1' => $page1['success'] ? null : $page1['error'],
                    'page2' => $page2['success'] ? null : $page2['error']
                ]
            ];
        }

        $text1 = $this->extractText($page1['html']);
        $text2 = $this->extractText($page2['html']);
        $meta1 = $this->extractMeta($page1['html']);
        $meta2 = $this->extractMeta($page2['html']);

        return [
            'page1' => ['url' => $url1, 'title' => $meta1['title'] ?? '', 'text_length' => mb_strlen($text1), 'links' => $meta1['link_count'] ?? 0],
            'page2' => ['url' => $url2, 'title' => $meta2['title'] ?? '', 'text_length' => mb_strlen($text2), 'links' => $meta2['link_count'] ?? 0],
            'analysis1' => $this->analyzeContent($text1, 'summary'),
            'analysis2' => $this->analyzeContent($text2, 'summary')
        ];
    }

    /**
     * Downloads and parses an RSS 2.0 or Atom feed.
     *
     * @param string $url Feed URL.
     * @return array ['feed_title' => string, 'item_count' => int, 'items' => array] where
     *               'item_count' is the number of items found in the feed and 'items'
     *               holds at most the first 20 of them, each with 'title', 'link',
     *               'description', 'pubDate' and 'category'.
     *               ['error' => string] when the fetch fails or the body is not XML.
     */
    public function parseFeed(string $url): array {
        $result = $this->fetchPage($url);
        if (!$result['success']) return ['error' => $result['error']];

        // Collect libxml errors internally instead of silencing warnings with "@",
        // and restore the caller's setting afterwards.
        $previousSetting = libxml_use_internal_errors(true);
        $xml = simplexml_load_string((string)$result['html'], 'SimpleXMLElement', LIBXML_NONET);
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        // Strict comparison: a valid element without children is falsy in boolean context.
        if ($xml === false) return ['error' => 'Invalid XML'];

        $items = [];

        // RSS 2.0
        if (isset($xml->channel->item)) {
            foreach ($xml->channel->item as $item) {
                $items[] = [
                    'title' => (string)$item->title,
                    'link' => (string)$item->link,
                    'description' => strip_tags((string)$item->description),
                    'pubDate' => (string)$item->pubDate,
                    'category' => (string)($item->category ?? '')
                ];
            }
        }
        // Atom
        elseif (isset($xml->entry)) {
            foreach ($xml->entry as $entry) {
                $items[] = [
                    'title' => (string)$entry->title,
                    'link' => $this->atomEntryLink($entry),
                    // Entries may carry <content> without a <summary>.
                    'description' => strip_tags((string)($entry->summary ?? $entry->content ?? '')),
                    'pubDate' => (string)$entry->updated,
                    // Same item shape as RSS; Atom stores the label in the "term" attribute.
                    'category' => (string)($entry->category['term'] ?? '')
                ];
            }
        }

        return [
            'feed_title' => (string)($xml->channel->title ?? $xml->title ?? ''),
            'item_count' => count($items),
            'items' => array_slice($items, 0, self::MAX_FEED_ITEMS)
        ];
    }

    /**
     * Decodes entities, collapses whitespace runs to one space and trims.
     */
    private function cleanInlineText(string $fragment): string {
        $decoded = html_entity_decode($fragment, self::ENTITY_FLAGS, 'UTF-8');
        $collapsed = preg_replace('/\s+/', ' ', $decoded) ?? $decoded;

        return trim($collapsed);
    }

    /**
     * Returns the attributes of every <meta> tag as a list of maps with
     * lower-cased attribute names and raw (still entity-encoded) values.
     */
    private function parseMetaTags(string $html): array {
        // Quoted values are consumed as a unit so a ">" or the other quote character
        // inside a value does not end the tag or the value early.
        $tagPattern = '/<meta\b((?:[^>"\']|"[^"]*"|\'[^\']*\')*)>/i';
        // Branch reset (?|...) puts the value in group 2 whichever quoting style matched.
        $attrPattern = '/([a-zA-Z_:][-\w:.]*)\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+))/';

        if (!preg_match_all($tagPattern, $html, $tagMatches)) {
            return [];
        }

        $tags = [];
        foreach ($tagMatches[1] as $attrString) {
            if (!preg_match_all($attrPattern, $attrString, $attrMatches, PREG_SET_ORDER)) {
                continue;
            }

            $attrs = [];
            foreach ($attrMatches as $match) {
                $attrName = strtolower($match[1]);
                // First occurrence wins when an attribute is repeated.
                if (!array_key_exists($attrName, $attrs)) {
                    $attrs[$attrName] = $match[2];
                }
            }
            $tags[] = $attrs;
        }

        return $tags;
    }

    /**
     * Extracts a JSON document from the model's textual answer.
     *
     * Strips a surrounding Markdown code fence, then falls back to the span between
     * the first "{" and the last "}" when the answer is wrapped in prose.
     */
    private function decodeModelJson(string $response): array {
        $cleaned = trim($response);
        $cleaned = preg_replace('/^```(?:json)?\s*/i', '', $cleaned) ?? $cleaned;
        $cleaned = preg_replace('/\s*```$/', '', $cleaned) ?? $cleaned;

        $decoded = json_decode($cleaned, true);

        if (!is_array($decoded)) {
            $start = strpos($cleaned, '{');
            $end = strrpos($cleaned, '}');
            if ($start !== false && $end !== false && $end > $start) {
                $decoded = json_decode(substr($cleaned, $start, $end - $start + 1), true);
            }
        }

        return is_array($decoded) ? $decoded : ['raw' => $cleaned, 'parse_error' => true];
    }

    /**
     * Picks the href of an Atom entry: the first rel="alternate" link (a missing
     * rel counts as alternate), otherwise the first link that has an href.
     */
    private function atomEntryLink(SimpleXMLElement $entry): string {
        $fallback = '';

        foreach ($entry->link as $link) {
            $href = (string)($link['href'] ?? '');
            if ($href === '') {
                continue;
            }

            $rel = (string)($link['rel'] ?? 'alternate');
            if ($rel === 'alternate') {
                return $href;
            }

            if ($fallback === '') {
                $fallback = $href;
            }
        }

        return $fallback;
    }
}