#!/usr/bin/env python3
"""
TEMPLATE SCRAPER - Extraction of public email templates

Sources: Milled.com, ReallyGoodEmails, etc.
"""

import sys
import json
import re
import psycopg2
import requests
from bs4 import BeautifulSoup
from datetime import datetime

# Connection settings for the local ADX system PostgreSQL database.
DB_CONFIG = {
    'host': 'localhost',
    'database': 'adx_system',
    'user': 'admin',
    'password': 'admin123',
}

# Browser-like User-Agent so scraped sites serve their normal HTML.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
}

def get_db():
    """Open and return a fresh connection built from DB_CONFIG."""
    connection = psycopg2.connect(**DB_CONFIG)
    return connection

def save_template(source, url, brand, category, subject, body_html, headers_json=None):
    """Persist one scraped email template into admin.scraped_templates.

    Args:
        source: Scraper identifier (e.g. 'milled', 'reallygoodemails').
        url: Page the template was scraped from.
        brand: Sender brand name.
        category: Site category the template was found under.
        subject: Email subject line.
        body_html: Raw HTML body; truncated to 100k chars before insert.
        headers_json: Optional dict of headers, stored as a JSON string.

    Returns:
        The new row id on success, or None when the insert failed.
    """
    conn = get_db()
    cur = conn.cursor()

    try:
        cur.execute("""
            INSERT INTO admin.scraped_templates
            (source, source_url, brand, category, subject, body_html, headers_json)
            VALUES (%s, %s, %s, %s, %s, %s, %s)
            RETURNING id
        """, (source, url, brand, category, subject,
              body_html[:100000] if body_html else None,
              json.dumps(headers_json) if headers_json else None))

        # Read the RETURNING value BEFORE committing so that a fetch
        # failure still goes through the rollback path below.
        new_id = cur.fetchone()[0]
        conn.commit()
        return new_id
    except Exception as e:
        # Best-effort script: report the failure and signal it with None
        # instead of propagating.
        print(f"DB Error: {e}")
        conn.rollback()
        return None
    finally:
        cur.close()
        conn.close()

def scrape_milled(category='retail', limit=10):
    """Scrape email templates from Milled.com for a given category.

    Fetches the category listing page, follows up to `limit` email
    detail links and stores each email's HTML via save_template().

    Returns:
        A list of dicts ({'id', 'brand', 'subject'}) for saved templates.
    """
    print(f"🔍 Scraping Milled.com - {category}...")

    listing_url = f"https://milled.com/{category}"
    saved = []

    try:
        listing_resp = requests.get(listing_url, headers=HEADERS, timeout=15)
        listing_soup = BeautifulSoup(listing_resp.text, 'html.parser')

        # Email detail pages follow the pattern /<brand>/<slug>-<digits>.
        anchors = listing_soup.find_all('a', href=re.compile(r'/[^/]+/[^/]+-\d+'))[:limit]

        for anchor in anchors:
            href = anchor.get('href')
            if not href.startswith('http'):
                href = 'https://milled.com' + href

            print(f" 📧 {href[:50]}...")

            try:
                detail_resp = requests.get(href, headers=HEADERS, timeout=15)
                detail_soup = BeautifulSoup(detail_resp.text, 'html.parser')

                # Brand name sits in the page's main heading.
                heading = detail_soup.find('h1')
                brand = heading.text.strip() if heading else 'Unknown'

                title_tag = detail_soup.find('title')
                subject = title_tag.text.strip() if title_tag else ''

                # The email markup lives in a div whose class mentions
                # email/content/body.
                body_div = detail_soup.find('div', class_=re.compile('email|content|body'))
                body_html = str(body_div) if body_div else ''

                if body_html:
                    template_id = save_template('milled', href, brand, category, subject, body_html)
                    if template_id:
                        saved.append({'id': template_id, 'brand': brand, 'subject': subject[:40]})
                        print(f" ✅ Saved #{template_id}")

            except Exception as e:
                print(f" ❌ Error: {e}")

    except Exception as e:
        print(f"❌ Scraping error: {e}")

    return saved

def scrape_reallygoodemails(category='promotional', limit=10):
    """Scrape email templates from ReallyGoodEmails for a given category.

    Walks up to `limit` <article> cards on the category page, follows
    each card's link, pulls the email HTML out of the embedded iframe
    and stores it via save_template().

    Returns:
        A list of dicts ({'id', 'brand'}) for saved templates.
    """
    print(f"🔍 Scraping ReallyGoodEmails - {category}...")

    listing_url = f"https://reallygoodemails.com/category/{category}"
    saved = []

    try:
        listing_resp = requests.get(listing_url, headers=HEADERS, timeout=15)
        listing_soup = BeautifulSoup(listing_resp.text, 'html.parser')

        # Each email is presented as an <article> card on the listing.
        cards = listing_soup.find_all('article')[:limit]

        for card in cards:
            anchor = card.find('a', href=True)
            if not anchor:
                continue

            href = anchor.get('href')
            if not href.startswith('http'):
                href = 'https://reallygoodemails.com' + href

            print(f" 📧 {href[:50]}...")

            try:
                detail_resp = requests.get(href, headers=HEADERS, timeout=15)
                detail_soup = BeautifulSoup(detail_resp.text, 'html.parser')

                heading = detail_soup.find('h1')
                brand = heading.text.strip() if heading else 'Unknown'

                subheading = detail_soup.find('h2')
                subject = subheading.text.strip() if subheading else ''

                # The actual email HTML is served inside an iframe; fetch
                # the iframe source to get the raw body.
                iframe = detail_soup.find('iframe')
                body_html = ''
                if iframe and iframe.get('src'):
                    iframe_resp = requests.get(iframe.get('src'), headers=HEADERS, timeout=15)
                    body_html = iframe_resp.text

                if body_html:
                    template_id = save_template('reallygoodemails', href, brand, category, subject, body_html)
                    if template_id:
                        saved.append({'id': template_id, 'brand': brand})
                        print(f" ✅ Saved #{template_id}")

            except Exception as e:
                print(f" ❌ Error: {e}")

    except Exception as e:
        print(f"❌ Scraping error: {e}")

    return saved

def process_templates_to_patterns():
    """Convert unprocessed scraped templates into Send Data patterns.

    Reads up to 50 rows from admin.scraped_templates where
    is_processed = false, extracts link and image URLs from the stored
    HTML, inserts one row per template into admin.send_data_patterns,
    and marks the source row as processed. Commits once at the end.

    Returns:
        The number of templates converted.
    """
    conn = get_db()
    cur = conn.cursor()

    try:
        cur.execute("""
            SELECT id, source, source_url, brand, category, subject, body_html
            FROM admin.scraped_templates
            WHERE is_processed = false
            LIMIT 50
        """)

        templates = cur.fetchall()
        print(f"🔄 Processing {len(templates)} templates...")

        processed = 0
        # NOTE: renamed the first field from `id` to `tpl_id` to avoid
        # shadowing the builtin.
        for tpl_id, source, url, brand, category, subject, body_html in templates:
            # Extract up to 20 links and up to 10 image URLs from the body.
            links = re.findall(r'href=["\']([^"\']+)["\']', body_html or '', re.IGNORECASE)[:20]
            images = [img for img in re.findall(r'src=["\']([^"\']+)["\']', body_html or '', re.IGNORECASE)
                      if any(ext in img.lower() for ext in ['.png', '.jpg', '.gif'])][:10]

            # Save as a pattern; inbox_potential is hard-coded 'high'.
            cur.execute("""
                INSERT INTO admin.send_data_patterns
                (source, source_id, from_name, subject_template, body_html, links_json, images_json, inbox_potential)
                VALUES (%s, %s, %s, %s, %s, %s, %s, 'high')
                RETURNING id
            """, (
                f'scraper_{source}', url, brand, subject,
                body_html[:50000] if body_html else None,
                json.dumps(links), json.dumps(images)
            ))

            # Mark the source template as processed.
            cur.execute("UPDATE admin.scraped_templates SET is_processed = true WHERE id = %s", (tpl_id,))
            processed += 1

        conn.commit()
    finally:
        # Release DB resources even if an insert fails mid-batch.
        cur.close()
        conn.close()

    print(f"✅ {processed} templates convertis en patterns")
    return processed

def show_stats():
    """Print a per-source summary table of scraped templates."""
    conn = get_db()
    cur = conn.cursor()

    cur.execute("SELECT source, COUNT(*), COUNT(CASE WHEN is_processed THEN 1 END) FROM admin.scraped_templates GROUP BY source")
    rows = cur.fetchall()

    cur.close()
    conn.close()

    print("📊 SCRAPER STATS")
    print("=" * 40)
    print(f"{'Source':<20} {'Total':<10} {'Processed'}")
    print("-" * 40)
    for source, total, done in rows:
        print(f"{source:<20} {total:<10} {done}")

def main():
    """Command-line entry point: dispatch on sys.argv[1].

    Commands: milled, rge, process, stats. Prints usage when no
    command is given; reports unknown commands explicitly.
    """
    if len(sys.argv) < 2:
        print("""
Usage:
python3 template_scraper.py milled [category] [limit] - Scrape Milled.com
python3 template_scraper.py rge [category] [limit] - Scrape ReallyGoodEmails
python3 template_scraper.py process - Convert to patterns
python3 template_scraper.py stats - Show statistics

Categories (Milled): retail, travel, food, tech, finance
Categories (RGE): promotional, newsletter, transactional, welcome
""")
        return

    cmd = sys.argv[1].lower()

    if cmd == 'milled':
        category = sys.argv[2] if len(sys.argv) > 2 else 'retail'
        limit = int(sys.argv[3]) if len(sys.argv) > 3 else 10
        templates = scrape_milled(category, limit)
        print(f"\n✅ {len(templates)} templates scrapés")

    elif cmd == 'rge':
        category = sys.argv[2] if len(sys.argv) > 2 else 'promotional'
        limit = int(sys.argv[3]) if len(sys.argv) > 3 else 10
        templates = scrape_reallygoodemails(category, limit)
        print(f"\n✅ {len(templates)} templates scrapés")

    elif cmd == 'process':
        process_templates_to_patterns()

    elif cmd == 'stats':
        show_stats()

    else:
        # Previously an unrecognized command exited silently; surface it.
        print(f"❌ Unknown command: {cmd}")


if __name__ == '__main__':
    main()