#!/usr/bin/env python3
"""
TEMPLATE SCRAPER - Extraction of public email templates

Sources: Milled.com, ReallyGoodEmails, etc.
"""

import sys
import json
import re
import psycopg2
import requests
from bs4 import BeautifulSoup
from datetime import datetime

# Connection settings for the local ADX system PostgreSQL database.
DB_CONFIG = {
    'host': 'localhost',
    'database': 'adx_system',
    'user': 'admin',
    'password': 'admin123',
}

# Browser-like User-Agent so scraped sites serve their normal HTML.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
}

def get_db():
    """Open and return a fresh connection built from DB_CONFIG."""
    connection = psycopg2.connect(**DB_CONFIG)
    return connection

def save_template(source, url, brand, category, subject, body_html, headers_json=None):
    """Persist one scraped email template into admin.scraped_templates.

    Args:
        source: Scraper identifier (e.g. 'milled', 'reallygoodemails').
        url: Page the template was scraped from.
        brand: Sender brand name.
        category: Site category the template was found under.
        subject: Email subject line.
        body_html: Raw HTML body; truncated to 100k chars before insert.
        headers_json: Optional dict of headers, stored as a JSON string.

    Returns:
        The new row id on success, or None when the insert failed.
    """
    conn = get_db()
    cur = conn.cursor()

    try:
        cur.execute("""
            INSERT INTO admin.scraped_templates
            (source, source_url, brand, category, subject, body_html, headers_json)
            VALUES (%s, %s, %s, %s, %s, %s, %s)
            RETURNING id
        """, (source, url, brand, category, subject,
              body_html[:100000] if body_html else None,
              json.dumps(headers_json) if headers_json else None))

        # Read the RETURNING value BEFORE committing so that a fetch
        # failure still goes through the rollback path below.
        new_id = cur.fetchone()[0]
        conn.commit()
        return new_id
    except Exception as e:
        # Best-effort script: report the failure and signal it with None
        # instead of propagating.
        print(f"DB Error: {e}")
        conn.rollback()
        return None
    finally:
        cur.close()
        conn.close()

def scrape_milled(category='retail', limit=10):
    """Scrape email templates from Milled.com for a given category.

    Fetches the category listing page, follows up to `limit` email
    detail links and stores each email's HTML via save_template().

    Returns:
        A list of dicts ({'id', 'brand', 'subject'}) for saved templates.
    """
    print(f"🔍 Scraping Milled.com - {category}...")

    listing_url = f"https://milled.com/{category}"
    saved = []

    try:
        listing_resp = requests.get(listing_url, headers=HEADERS, timeout=15)
        listing_soup = BeautifulSoup(listing_resp.text, 'html.parser')

        # Email detail pages follow the pattern /<brand>/<slug>-<digits>.
        anchors = listing_soup.find_all('a', href=re.compile(r'/[^/]+/[^/]+-\d+'))[:limit]

        for anchor in anchors:
            href = anchor.get('href')
            if not href.startswith('http'):
                href = 'https://milled.com' + href

            print(f" 📧 {href[:50]}...")

            try:
                detail_resp = requests.get(href, headers=HEADERS, timeout=15)
                detail_soup = BeautifulSoup(detail_resp.text, 'html.parser')

                # Brand name sits in the page's main heading.
                heading = detail_soup.find('h1')
                brand = heading.text.strip() if heading else 'Unknown'

                title_tag = detail_soup.find('title')
                subject = title_tag.text.strip() if title_tag else ''

                # The email markup lives in a div whose class mentions
                # email/content/body.
                body_div = detail_soup.find('div', class_=re.compile('email|content|body'))
                body_html = str(body_div) if body_div else ''

                if body_html:
                    template_id = save_template('milled', href, brand, category, subject, body_html)
                    if template_id:
                        saved.append({'id': template_id, 'brand': brand, 'subject': subject[:40]})
                        print(f" ✅ Saved #{template_id}")

            except Exception as e:
                print(f" ❌ Error: {e}")

    except Exception as e:
        print(f"❌ Scraping error: {e}")

    return saved

def scrape_reallygoodemails(category='promotional', limit=10):
    """Scrape email templates from ReallyGoodEmails for a given category.

    Walks up to `limit` <article> cards on the category page, follows
    each card's link, pulls the email HTML out of the embedded iframe
    and stores it via save_template().

    Returns:
        A list of dicts ({'id', 'brand'}) for saved templates.
    """
    print(f"🔍 Scraping ReallyGoodEmails - {category}...")

    listing_url = f"https://reallygoodemails.com/category/{category}"
    saved = []

    try:
        listing_resp = requests.get(listing_url, headers=HEADERS, timeout=15)
        listing_soup = BeautifulSoup(listing_resp.text, 'html.parser')

        # Each email is presented as an <article> card on the listing.
        cards = listing_soup.find_all('article')[:limit]

        for card in cards:
            anchor = card.find('a', href=True)
            if not anchor:
                continue

            href = anchor.get('href')
            if not href.startswith('http'):
                href = 'https://reallygoodemails.com' + href

            print(f" 📧 {href[:50]}...")

            try:
                detail_resp = requests.get(href, headers=HEADERS, timeout=15)
                detail_soup = BeautifulSoup(detail_resp.text, 'html.parser')

                heading = detail_soup.find('h1')
                brand = heading.text.strip() if heading else 'Unknown'

                subheading = detail_soup.find('h2')
                subject = subheading.text.strip() if subheading else ''

                # The actual email HTML is served inside an iframe; fetch
                # the iframe source to get the raw body.
                iframe = detail_soup.find('iframe')
                body_html = ''
                if iframe and iframe.get('src'):
                    iframe_resp = requests.get(iframe.get('src'), headers=HEADERS, timeout=15)
                    body_html = iframe_resp.text

                if body_html:
                    template_id = save_template('reallygoodemails', href, brand, category, subject, body_html)
                    if template_id:
                        saved.append({'id': template_id, 'brand': brand})
                        print(f" ✅ Saved #{template_id}")

            except Exception as e:
                print(f" ❌ Error: {e}")

    except Exception as e:
        print(f"❌ Scraping error: {e}")

    return saved

def process_templates_to_patterns():
    """Convert unprocessed scraped templates into Send Data patterns.

    Reads up to 50 rows from admin.scraped_templates where
    is_processed = false, extracts link and image URLs from the stored
    HTML, inserts one row per template into admin.send_data_patterns,
    and marks the source row as processed. Commits once at the end.

    Returns:
        The number of templates converted.
    """
    conn = get_db()
    cur = conn.cursor()

    try:
        cur.execute("""
            SELECT id, source, source_url, brand, category, subject, body_html
            FROM admin.scraped_templates
            WHERE is_processed = false
            LIMIT 50
        """)

        templates = cur.fetchall()
        print(f"🔄 Processing {len(templates)} templates...")

        processed = 0
        # NOTE: renamed the first field from `id` to `tpl_id` to avoid
        # shadowing the builtin.
        for tpl_id, source, url, brand, category, subject, body_html in templates:
            # Extract up to 20 links and up to 10 image URLs from the body.
            links = re.findall(r'href=["\']([^"\']+)["\']', body_html or '', re.IGNORECASE)[:20]
            images = [img for img in re.findall(r'src=["\']([^"\']+)["\']', body_html or '', re.IGNORECASE)
                      if any(ext in img.lower() for ext in ['.png', '.jpg', '.gif'])][:10]

            # Save as a pattern; inbox_potential is hard-coded 'high'.
            cur.execute("""
                INSERT INTO admin.send_data_patterns
                (source, source_id, from_name, subject_template, body_html, links_json, images_json, inbox_potential)
                VALUES (%s, %s, %s, %s, %s, %s, %s, 'high')
                RETURNING id
            """, (
                f'scraper_{source}', url, brand, subject,
                body_html[:50000] if body_html else None,
                json.dumps(links), json.dumps(images)
            ))

            # Mark the source template as processed.
            cur.execute("UPDATE admin.scraped_templates SET is_processed = true WHERE id = %s", (tpl_id,))
            processed += 1

        conn.commit()
    finally:
        # Release DB resources even if an insert fails mid-batch.
        cur.close()
        conn.close()

    print(f"✅ {processed} templates convertis en patterns")
    return processed

def show_stats():
    """Print a per-source summary table of scraped templates."""
    conn = get_db()
    cur = conn.cursor()

    cur.execute("SELECT source, COUNT(*), COUNT(CASE WHEN is_processed THEN 1 END) FROM admin.scraped_templates GROUP BY source")
    rows = cur.fetchall()

    cur.close()
    conn.close()

    print("📊 SCRAPER STATS")
    print("=" * 40)
    print(f"{'Source':<20} {'Total':<10} {'Processed'}")
    print("-" * 40)
    for source, total, done in rows:
        print(f"{source:<20} {total:<10} {done}")

def main():
    """Command-line entry point: dispatch on sys.argv[1].

    Commands: milled, rge, process, stats. Prints usage when no
    command is given; reports unknown commands explicitly.
    """
    if len(sys.argv) < 2:
        print("""
Usage:
python3 template_scraper.py milled [category] [limit] - Scrape Milled.com
python3 template_scraper.py rge [category] [limit] - Scrape ReallyGoodEmails
python3 template_scraper.py process - Convert to patterns
python3 template_scraper.py stats - Show statistics

Categories (Milled): retail, travel, food, tech, finance
Categories (RGE): promotional, newsletter, transactional, welcome
""")
        return

    cmd = sys.argv[1].lower()

    if cmd == 'milled':
        category = sys.argv[2] if len(sys.argv) > 2 else 'retail'
        limit = int(sys.argv[3]) if len(sys.argv) > 3 else 10
        templates = scrape_milled(category, limit)
        print(f"\n✅ {len(templates)} templates scrapés")

    elif cmd == 'rge':
        category = sys.argv[2] if len(sys.argv) > 2 else 'promotional'
        limit = int(sys.argv[3]) if len(sys.argv) > 3 else 10
        templates = scrape_reallygoodemails(category, limit)
        print(f"\n✅ {len(templates)} templates scrapés")

    elif cmd == 'process':
        process_templates_to_patterns()

    elif cmd == 'stats':
        show_stats()

    else:
        # Previously an unrecognized command exited silently; surface it.
        print(f"❌ Unknown command: {cmd}")


if __name__ == '__main__':
    main()