Files
wevads-platform/scripts/template_scraper.py
2026-02-26 04:53:11 +01:00

261 lines
8.9 KiB
Python
Executable File

#!/usr/bin/env python3
"""
TEMPLATE SCRAPER - Extraction de templates publics
Sources: Milled.com, ReallyGoodEmails, etc.
"""
import sys
import json
import re
import psycopg2
import requests
from bs4 import BeautifulSoup
from datetime import datetime
DB_CONFIG = {
'host': 'localhost',
'database': 'adx_system',
'user': 'admin',
'password': 'admin123'
}
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
def get_db():
    """Open and return a new psycopg2 connection built from DB_CONFIG."""
    connection = psycopg2.connect(**DB_CONFIG)
    return connection
def save_template(source, url, brand, category, subject, body_html, headers_json=None):
    """Insert one scraped template row and return its new id (None on failure).

    The body is capped at 100k characters and headers are serialized to JSON
    before insertion. Commits on success, rolls back on any database error.
    """
    connection = get_db()
    cursor = connection.cursor()
    # Pre-compute the nullable column values outside the SQL call.
    body_value = body_html[:100000] if body_html else None
    headers_value = json.dumps(headers_json) if headers_json else None
    try:
        cursor.execute("""
        INSERT INTO admin.scraped_templates
        (source, source_url, brand, category, subject, body_html, headers_json)
        VALUES (%s, %s, %s, %s, %s, %s, %s)
        RETURNING id
        """, (source, url, brand, category, subject, body_value, headers_value))
        connection.commit()
        return cursor.fetchone()[0]
    except Exception as e:
        print(f"DB Error: {e}")
        connection.rollback()
        return None
    finally:
        cursor.close()
        connection.close()
def scrape_milled(category='retail', limit=10):
    """Scrape public email templates from Milled.com for one category.

    Fetches the category listing, follows up to ``limit`` email detail links,
    extracts brand / subject / body HTML and persists each via save_template().

    Args:
        category: Milled category slug (e.g. 'retail', 'travel').
        limit: maximum number of email pages to fetch.

    Returns:
        List of dicts ({'id', 'brand', 'subject'}) for templates saved.
    """
    print(f"🔍 Scraping Milled.com - {category}...")
    base_url = f"https://milled.com/{category}"
    templates = []
    try:
        resp = requests.get(base_url, headers=HEADERS, timeout=15)
        # Fail fast on 4xx/5xx instead of silently parsing an error page.
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')
        # Email detail pages look like /<brand>/<slug>-<numeric id>.
        email_links = soup.find_all('a', href=re.compile(r'/[^/]+/[^/]+-\d+'))[:limit]
        seen = set()  # the same email can be linked multiple times on the listing
        for link in email_links:
            href = link.get('href')
            if not href.startswith('http'):
                href = 'https://milled.com' + href
            if href in seen:
                continue
            seen.add(href)
            print(f" 📧 {href[:50]}...")
            try:
                email_resp = requests.get(href, headers=HEADERS, timeout=15)
                email_resp.raise_for_status()
                email_soup = BeautifulSoup(email_resp.text, 'html.parser')
                # Brand name is the page H1; subject comes from the <title>.
                brand_el = email_soup.find('h1')
                brand = brand_el.text.strip() if brand_el else 'Unknown'
                subject_el = email_soup.find('title')
                subject = subject_el.text.strip() if subject_el else ''
                # The email body lives in a div whose class mentions email/content/body.
                email_body = email_soup.find('div', class_=re.compile('email|content|body'))
                body_html = str(email_body) if email_body else ''
                if body_html:
                    template_id = save_template('milled', href, brand, category, subject, body_html)
                    if template_id:
                        templates.append({'id': template_id, 'brand': brand, 'subject': subject[:40]})
                        print(f" ✅ Saved #{template_id}")
            except Exception as e:
                print(f" ❌ Error: {e}")
    except Exception as e:
        print(f"❌ Scraping error: {e}")
    return templates
def scrape_reallygoodemails(category='promotional', limit=10):
    """Scrape email templates from ReallyGoodEmails for one category.

    Fetches the category page, follows up to ``limit`` article links, pulls
    the email body from the embedded iframe and persists each template.

    Args:
        category: RGE category slug (e.g. 'promotional', 'newsletter').
        limit: maximum number of articles to process.

    Returns:
        List of dicts ({'id', 'brand'}) for templates saved.
    """
    print(f"🔍 Scraping ReallyGoodEmails - {category}...")
    base_url = f"https://reallygoodemails.com/category/{category}"
    templates = []
    try:
        resp = requests.get(base_url, headers=HEADERS, timeout=15)
        # Fail fast on 4xx/5xx instead of silently parsing an error page.
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')
        # Each email preview is wrapped in an <article> element.
        articles = soup.find_all('article')[:limit]
        for article in articles:
            link = article.find('a', href=True)
            if not link:
                continue
            href = link.get('href')
            if not href.startswith('http'):
                href = 'https://reallygoodemails.com' + href
            print(f" 📧 {href[:50]}...")
            try:
                email_resp = requests.get(href, headers=HEADERS, timeout=15)
                email_resp.raise_for_status()
                email_soup = BeautifulSoup(email_resp.text, 'html.parser')
                brand_el = email_soup.find('h1')
                brand = brand_el.text.strip() if brand_el else 'Unknown'
                subject_el = email_soup.find('h2')
                subject = subject_el.text.strip() if subject_el else ''
                # The rendered email lives inside an iframe; fetch its document.
                iframe = email_soup.find('iframe')
                body_html = ''
                if iframe and iframe.get('src'):
                    iframe_src = iframe.get('src')
                    # Some iframe srcs are site-relative; make them absolute first.
                    if not iframe_src.startswith('http'):
                        iframe_src = 'https://reallygoodemails.com' + iframe_src
                    iframe_resp = requests.get(iframe_src, headers=HEADERS, timeout=15)
                    iframe_resp.raise_for_status()
                    body_html = iframe_resp.text
                if body_html:
                    template_id = save_template('reallygoodemails', href, brand, category, subject, body_html)
                    if template_id:
                        templates.append({'id': template_id, 'brand': brand})
                        print(f" ✅ Saved #{template_id}")
            except Exception as e:
                print(f" ❌ Error: {e}")
    except Exception as e:
        print(f"❌ Scraping error: {e}")
    return templates
def process_templates_to_patterns():
    """Convert unprocessed scraped templates into Send Data patterns.

    Reads up to 50 rows flagged ``is_processed = false``, extracts link and
    image URLs from each body, inserts a row into admin.send_data_patterns
    and marks the source row as processed. Each template is committed
    individually so one bad row no longer aborts the whole batch.

    Returns:
        Number of templates successfully converted.
    """
    conn = get_db()
    cur = conn.cursor()
    processed = 0
    try:
        cur.execute("""
        SELECT id, source, source_url, brand, category, subject, body_html
        FROM admin.scraped_templates
        WHERE is_processed = false
        LIMIT 50
        """)
        templates = cur.fetchall()
        print(f"🔄 Processing {len(templates)} templates...")
        for tpl in templates:
            # `tpl_id` (not `id`) to avoid shadowing the builtin.
            tpl_id, source, url, brand, category, subject, body_html = tpl
            html = body_html or ''
            # First 20 hyperlink targets found in the body.
            links = re.findall(r'href=["\']([^"\']+)["\']', html, re.IGNORECASE)[:20]
            # First 10 image sources (png/jpg/gif only).
            images = [img for img in re.findall(r'src=["\']([^"\']+)["\']', html, re.IGNORECASE)
                      if any(ext in img.lower() for ext in ['.png', '.jpg', '.gif'])][:10]
            try:
                cur.execute("""
                INSERT INTO admin.send_data_patterns
                (source, source_id, from_name, subject_template, body_html, links_json, images_json, inbox_potential)
                VALUES (%s, %s, %s, %s, %s, %s, %s, 'high')
                RETURNING id
                """, (
                    f'scraper_{source}', url, brand, subject,
                    body_html[:50000] if body_html else None,
                    json.dumps(links), json.dumps(images)
                ))
                cur.execute("UPDATE admin.scraped_templates SET is_processed = true WHERE id = %s", (tpl_id,))
                # Commit per template so earlier successes survive a later failure.
                conn.commit()
                processed += 1
            except Exception as e:
                print(f"DB Error on template {tpl_id}: {e}")
                # Clear the aborted transaction so the next row can proceed.
                conn.rollback()
    finally:
        # Always release the connection, even if the SELECT itself fails.
        cur.close()
        conn.close()
    print(f"{processed} templates convertis en patterns")
    return processed
def show_stats():
    """Print a per-source table of scraped template counts (total / processed)."""
    connection = get_db()
    cursor = connection.cursor()
    cursor.execute(
        "SELECT source, COUNT(*), COUNT(CASE WHEN is_processed THEN 1 END) "
        "FROM admin.scraped_templates GROUP BY source"
    )
    rows = cursor.fetchall()
    cursor.close()
    connection.close()
    # Fixed-width columns: source (20), total (10), processed.
    print("📊 SCRAPER STATS")
    print("=" * 40)
    print(f"{'Source':<20} {'Total':<10} {'Processed'}")
    print("-" * 40)
    for source_name, total_count, processed_count in rows:
        print(f"{source_name:<20} {total_count:<10} {processed_count}")
def main():
    """CLI dispatcher: parse sys.argv and run the requested scraper action.

    Commands: milled / rge (scrape), process (convert to patterns),
    stats (summary table). Prints usage when no command is given and an
    error for unrecognized commands (previously these were silently ignored).
    """
    if len(sys.argv) < 2:
        print("""
Usage:
python3 template_scraper.py milled [category] [limit] - Scrape Milled.com
python3 template_scraper.py rge [category] [limit] - Scrape ReallyGoodEmails
python3 template_scraper.py process - Convert to patterns
python3 template_scraper.py stats - Show statistics
Categories (Milled): retail, travel, food, tech, finance
Categories (RGE): promotional, newsletter, transactional, welcome
""")
        return
    cmd = sys.argv[1].lower()
    if cmd == 'milled':
        category = sys.argv[2] if len(sys.argv) > 2 else 'retail'
        limit = int(sys.argv[3]) if len(sys.argv) > 3 else 10
        templates = scrape_milled(category, limit)
        print(f"\n{len(templates)} templates scrapés")
    elif cmd == 'rge':
        category = sys.argv[2] if len(sys.argv) > 2 else 'promotional'
        limit = int(sys.argv[3]) if len(sys.argv) > 3 else 10
        templates = scrape_reallygoodemails(category, limit)
        print(f"\n{len(templates)} templates scrapés")
    elif cmd == 'process':
        process_templates_to_patterns()
    elif cmd == 'stats':
        show_stats()
    else:
        # Fix: unknown commands used to fall through with no feedback at all.
        print(f"❌ Unknown command: {cmd}")
# Entry point: run the CLI only when executed as a script, not on import.
if __name__ == '__main__':
main()