Files
html/api/wem-thumb-worker.py
2026-04-17 22:35:01 +02:00

177 lines
5.5 KiB
Python
Executable File

#!/usr/bin/env python3
"""
wem-thumb-worker.py — Generate thumbnail + metadata for a page
Called by wem-screen-thumb.php in background or by cron refresh.
Usage: python3 wem-thumb-worker.py <path-or-url>
Example: python3 wem-thumb-worker.py ethica-chatbot.html
         python3 wem-thumb-worker.py https://wevads.weval-consulting.com/<page>
Output:
  /var/www/html/api/screenshots/wem/<md5-of-meta-key>.png (320x200)
  Updates /var/www/html/api/wem-page-meta.json with {title, http, mtime, thumb, url, ts}
Doctrine 84.
"""
import fcntl
import hashlib
import json
import os
import re
import subprocess
import sys
import tempfile
import time
from pathlib import Path
# Filesystem layout and upstreams used by the worker.
WEB_ROOT = "/var/www/html"
THUMB_DIR = "/var/www/html/api/screenshots/wem"
META_FILE = "/var/www/html/api/wem-page-meta.json"
LOCAL_UPSTREAM = "http://127.0.0.1:5890"  # S204 local web server
VIEWPORT = {'width': 1024, 'height': 640}  # 16:10, same aspect as the thumbnail
THUMB_SIZE = (320, 200)

# Recognised public hosts; group(1) is the page path below the host root.
_WEVADS_RE = re.compile(r'^https?://wevads\.weval-consulting\.com/(.+)$')
_MAIN_RE = re.compile(r'^https?://(?:www\.)?weval-consulting\.com/(.+)$')
_ETHICA_RE = re.compile(r'^https?://ethica\.wevup\.app/(.+)$')


class ScreenshotError(Exception):
    """Raised when the browser could not write the raw screenshot."""


def resolve_target(raw: str) -> dict:
    """Translate the CLI argument into the page path, meta key and URLs.

    ``raw`` is either a bare path (``ethica-chatbot.html``) or a full URL on
    one of the recognised hosts. The result is a dict with:

    * ``path``          -- page path without a leading slash
    * ``key``           -- key used in the meta JSON (and hashed for the
                           thumbnail name); prefixed with ``wevads/`` or
                           ``ethica/`` to disambiguate from main-site pages
    * ``url_for_meta``  -- public URL stored in the meta entry
    * ``url_for_fetch`` -- URL the headless browser actually loads
    * ``local_files``   -- whether ``path`` may be looked up under WEB_ROOT
                           to obtain a file mtime

    The route is decided by which host matched, never by searching the key
    for a substring, so a main-site page such as ``wevads-ia.html`` is still
    fetched from the local upstream.
    """
    m = _WEVADS_RE.match(raw)
    if m:
        path = m.group(1)
        # The wevads vhost lives on S95 (reachable at 10.1.0.3:5890 over
        # WireGuard) but needs a Host header there, which breaks Playwright's
        # connection reuse; so load the public URL (via CF) instead.
        return {
            'path': path,
            'key': f"wevads/{path}",
            'url_for_meta': raw,
            'url_for_fetch': raw,
            'local_files': False,  # served by another machine
        }

    m = _MAIN_RE.match(raw)
    if m:
        path = m.group(1)
        return {
            'path': path,
            'key': path,
            'url_for_meta': raw,
            'url_for_fetch': f"{LOCAL_UPSTREAM}/{path}",
            'local_files': True,
        }

    m = _ETHICA_RE.match(raw)
    if m:
        path = m.group(1)
        # ethica.wevup.app is also vhost-routed, so use the public URL.
        return {
            'path': path,
            'key': f"ethica/{path}",
            'url_for_meta': raw,
            'url_for_fetch': raw,
            # NOTE(review): kept enabled because the original comment says
            # ethica "proxies locally" -- confirm it shares WEB_ROOT.
            'local_files': True,
        }

    # Bare path -> page on the S204 main site.
    path = raw.lstrip('/')
    return {
        'path': path,
        'key': path,
        'url_for_meta': f"https://weval-consulting.com/{path}",
        'url_for_fetch': f"{LOCAL_UPSTREAM}/{path}",
        'local_files': True,
    }


def remove_quietly(path: str) -> None:
    """Delete ``path`` if possible; a missing or undeletable file is ignored."""
    try:
        os.unlink(path)
    except OSError:
        pass


def chown_for_web(path: str) -> None:
    """Best-effort ``chown www-data:www-data`` so the web server owns ``path``."""
    try:
        subprocess.run(['chown', 'www-data:www-data', path], check=False)
    except Exception:
        pass


def load_meta(meta_file: str = META_FILE) -> dict:
    """Return the meta JSON as a dict, or ``{}`` if missing, invalid or not a dict."""
    try:
        with open(meta_file, encoding='utf-8') as f:
            meta = json.load(f)
    except (OSError, ValueError):
        # ValueError covers both JSON syntax errors and bad UTF-8.
        return {}
    return meta if isinstance(meta, dict) else {}


def update_meta(key: str, entry: dict, meta_file: str = META_FILE) -> None:
    """Set ``meta[key] = entry`` in the meta JSON without losing other entries.

    The file is re-read immediately before writing, under an exclusive
    ``flock`` on ``<meta_file>.lock``, so workers running in parallel do not
    overwrite each other's entries. The new content goes to a uniquely named
    temp file in the same directory and is moved into place with
    ``os.replace``. If the lock cannot be taken the update still proceeds
    (best effort). Errors while writing propagate to the caller.
    """
    lock_fd = None
    try:
        lock_fd = os.open(meta_file + '.lock', os.O_RDONLY | os.O_CREAT, 0o644)
        fcntl.flock(lock_fd, fcntl.LOCK_EX)
    except OSError as e:
        print(f"meta lock unavailable, continuing unlocked: {e}", file=sys.stderr)

    try:
        meta = load_meta(meta_file)
        meta[key] = entry
        fd, tmp_meta = tempfile.mkstemp(
            prefix=os.path.basename(meta_file) + '.',
            suffix='.tmp',
            dir=os.path.dirname(meta_file) or '.',
        )
        try:
            # Explicit UTF-8: ensure_ascii=False would otherwise fail on
            # non-ASCII titles when the locale encoding is ASCII.
            with os.fdopen(fd, 'w', encoding='utf-8') as f:
                json.dump(meta, f, indent=2, ensure_ascii=False)
            os.chmod(tmp_meta, 0o644)  # mkstemp creates the file as 0600
            os.replace(tmp_meta, meta_file)
        except BaseException:
            remove_quietly(tmp_meta)
            raise
    finally:
        if lock_fd is not None:
            os.close(lock_fd)  # closing the descriptor releases the flock


def capture_page(url: str, fallback_title: str, png_path: str) -> tuple:
    """Load ``url`` in headless Chromium and save a viewport screenshot.

    Returns ``(http_status, title)``. ``http_status`` is the response status,
    0 when navigation produced no response object, or -1 when navigation
    raised (the page is still screenshotted in that case). ``title`` is the
    page title, or ``fallback_title`` when empty/unavailable, stripped and
    cut to 120 characters.

    Raises ScreenshotError if the screenshot itself fails.
    """
    from playwright.sync_api import sync_playwright

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True, args=['--no-sandbox', '--disable-dev-shm-usage'])
        try:
            context = browser.new_context(viewport=VIEWPORT, ignore_https_errors=True)
            page = context.new_page()

            http_status = 0
            try:
                resp = page.goto(url, wait_until='domcontentloaded', timeout=12000)
                http_status = resp.status if resp else 0
            except Exception as e:
                http_status = -1
                print(f"nav error: {e}")

            # Give a beat for any fast JS render.
            try:
                page.wait_for_timeout(1500)
            except Exception:
                pass

            try:
                title = page.title() or fallback_title
            except Exception:
                title = fallback_title
            title = title.strip()[:120]

            try:
                page.screenshot(path=png_path, full_page=False, type='png')
            except Exception as e:
                raise ScreenshotError(str(e)) from e
        finally:
            browser.close()
    return http_status, title


def make_thumbnail(png_path: str, thumb_out: str) -> None:
    """Scale the raw screenshot to THUMB_SIZE and write it to ``thumb_out``.

    Pillow is used when it works; on any failure ImageMagick ``convert`` is
    run instead (its exit status is not checked).
    """
    try:
        from PIL import Image
        with Image.open(png_path) as img:
            # Viewport and thumbnail share the 16:10 aspect, so a single
            # resize is enough; nothing is cropped or padded.
            img.resize(THUMB_SIZE, Image.LANCZOS).save(thumb_out, 'PNG', optimize=True)
    except Exception as e:
        print(f"pillow resize failed, using ImageMagick: {e}", file=sys.stderr)
        size = f"{THUMB_SIZE[0]}x{THUMB_SIZE[1]}!"
        subprocess.run(['convert', png_path, '-resize', size, thumb_out], check=False)


def local_mtime(path: str) -> str:
    """Return the mtime of ``WEB_ROOT/path`` formatted ``%d%b``, or '' if unavailable."""
    try:
        stamp = os.path.getmtime(f"{WEB_ROOT}/{path}")
    except (OSError, ValueError):
        return ''
    return time.strftime('%d%b', time.localtime(stamp))


def main(argv=None) -> int:
    """Generate the thumbnail and meta entry for one page.

    ``argv`` defaults to ``sys.argv``. Returns the process exit code:
    0 on success, 1 on usage error, 2 when the screenshot failed, 3 on any
    other failure. The per-page lock file ``/tmp/wem-thumb-<md5>.lock`` is
    removed on the way out in every case except the usage error.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        print("Usage: wem-thumb-worker.py <path>")
        return 1

    target = resolve_target(argv[1])
    path = target['path']
    key = target['key']

    h = hashlib.md5(key.encode()).hexdigest()
    thumb_out = f"{THUMB_DIR}/{h}.png"
    # This script only removes the lock file; it never creates it.
    lockfile = f"/tmp/wem-thumb-{h}.lock"
    png_tmp = f"/tmp/wem-raw-{h}.png"

    try:
        http_status, title = capture_page(target['url_for_fetch'], path, png_tmp)
        make_thumbnail(png_tmp, thumb_out)
        chown_for_web(thumb_out)

        # File mtime only makes sense for pages served from this machine.
        mtime = local_mtime(path) if target['local_files'] else ''

        # The key is the URL-derived key used by the HTML for lookup.
        update_meta(key, {
            'title': title,
            'http': http_status,
            'mtime': mtime,
            'thumb': f"/api/screenshots/wem/{h}.png",
            'url': target['url_for_meta'],
            'ts': int(time.time()),
        })
        chown_for_web(META_FILE)

        print(f"OK {path} http={http_status} title={title[:40]}")
        return 0
    except ScreenshotError as e:
        print(f"screenshot error: {e}")
        return 2
    except Exception as e:
        print(f"FATAL {path}: {e}")
        return 3
    finally:
        remove_quietly(png_tmp)
        remove_quietly(lockfile)


if __name__ == "__main__":
    sys.exit(main())