# html/api/wem-thumb-refresh-all.py

#!/usr/bin/env python3
"""
wem-thumb-refresh-all.py — Batch refresh thumbnails every 6h via cron
Reads /var/www/html/api/wem-inventory.json → iterate categorized_pages
Calls wem-thumb-worker.py for each page (parallel, max 4 workers)
Logs to /tmp/wem-refresh.log

Doctrine 84.
"""
import json, os, subprocess, time, hashlib
from concurrent.futures import ThreadPoolExecutor, as_completed

# Inventory file listing the pages to thumbnail, grouped by category.
INV = '/var/www/html/api/wem-inventory.json'
# Script invoked once for every page whose thumbnail needs regenerating.
WORKER = '/var/www/html/api/wem-thumb-worker.py'
# Directory holding the cached thumbnails, stored as <md5 of page key>.png.
THUMB_DIR = '/var/www/html/api/screenshots/wem'
# Maximum thumbnail age, in seconds, before it is regenerated.
TTL = 21600  # 6h

# Wall-clock start, used for the duration reported at the end of the run.
start = time.time()
print(f"=== WEM thumb refresh start {time.strftime('%Y-%m-%d %H:%M:%S')} ===")

# Load the inventory. JSON is UTF-8 by specification, so decode it explicitly
# instead of depending on whatever locale encoding the cron environment has.
try:
    with open(INV, encoding='utf-8') as f:
        inv = json.load(f)
except Exception as e:
    # Top-level boundary: report any read/parse failure and stop with a
    # non-zero status. SystemExit is raised directly because the `exit`
    # helper is injected by the `site` module and is not guaranteed to exist.
    print(f"inventory read fail: {e}")
    raise SystemExit(1)

# Collect all scannable pages (internal S204 + S95 wevads + ethica)
pages = set()
# `dict.get(key, {})` only covers a missing key: an explicit JSON null (or any
# other non-object value) would still reach `.items()` and crash. Anything
# that is not a dict is treated as "no pages", the same way malformed
# categories and entries are skipped below.
cp = inv.get('categorized_pages') if isinstance(inv, dict) else None
if not isinstance(cp, dict):
    cp = {}
import re as _re
# Absolute URLs are only accepted when they point at one of these hosts.
WHITELIST = _re.compile(r'^https?://(?:(?:www\.)?weval-consulting\.com|wevads\.weval-consulting\.com|ethica\.wevup\.app)/')

# Only the per-category lists matter here; the category names are not used.
for lst in cp.values():
    if not isinstance(lst, list):
        continue
    for p in lst:
        if not isinstance(p, str):
            continue
        if p.startswith('http'):
            # Full URL: keep it verbatim — worker resolves to right upstream
            if WHITELIST.match(p) and p.endswith('.html'):
                pages.add(p)
        else:
            # Site-relative path: drop leading slashes and reject any path
            # containing '..' (parent-directory traversal).
            path = p.lstrip('/')
            if path.endswith('.html') and '..' not in path:
                pages.add(path)

print(f"Found {len(pages)} internal HTML pages to refresh")

# Filter: skip if thumb exists and < TTL (use same hash key logic as worker)
import re as _re2
def _key_for(p):
    m = _re2.match(r'^https?://wevads\.weval-consulting\.com/(.+)$', p)
    if m: return f"wevads/{m.group(1)}"
    m = _re2.match(r'^https?://ethica\.wevup\.app/(.+)$', p)
    if m: return f"ethica/{m.group(1)}"
    m = _re2.match(r'^https?://(?:www\.)?weval-consulting\.com/(.+)$', p)
    if m: return m.group(1)
    return p.lstrip('/')

# Split the pages into those whose thumbnail is still fresh (younger than
# TTL) and those that need to be (re)generated.
fresh = 0
todo = []
for path in pages:
    key = _key_for(path)
    h = hashlib.md5(key.encode()).hexdigest()
    thumb = f"{THUMB_DIR}/{h}.png"
    # Ask for the mtime directly rather than exists() followed by getmtime():
    # a file removed between those two calls would raise and abort the run.
    try:
        age = time.time() - os.path.getmtime(thumb)
    except OSError:
        # Missing or inaccessible thumbnail: regenerate it.
        todo.append(path)
        continue
    if age < TTL:
        fresh += 1
    else:
        todo.append(path)

print(f"Fresh (skipped): {fresh} / Todo: {len(todo)}")

def run_one(path):
    """Run the thumbnail worker for one page and summarise the outcome.

    Args:
        path: Page identifier handed to the worker as its only argument.

    Returns:
        A ``(path, code, message)`` tuple. ``code`` is the worker's exit
        status, ``-1`` if it exceeded the 25-second timeout, or ``-2`` if it
        could not be run. ``message`` is at most 80 characters: the worker's
        stdout, or, when a failed worker printed nothing to stdout, the last
        line of its stderr.
    """
    try:
        r = subprocess.run(
            ['python3', WORKER, path],
            # errors='replace': an undecodable byte in the worker's output
            # must not turn a finished run into an ERR result.
            capture_output=True, text=True, errors='replace', timeout=25
        )
    except subprocess.TimeoutExpired:
        return path, -1, "TIMEOUT"
    except Exception as e:
        return path, -2, f"ERR: {e}"

    msg = r.stdout.strip()
    if r.returncode != 0 and not msg:
        # A failing worker may have written only to stderr; without this
        # fallback the caller would get an empty message for the failure.
        # The last line is used because that is where a Python traceback
        # puts the exception itself.
        err_lines = r.stderr.strip().splitlines()
        if err_lines:
            msg = err_lines[-1].strip()
    return path, r.returncode, msg[:80]

# Fan the stale pages out to at most four concurrent worker runs and report
# each result as soon as it completes (completion order, not inventory order).
ok = 0
fail = 0
with ThreadPoolExecutor(max_workers=4) as pool:
    pending = [pool.submit(run_one, page) for page in todo]
    for done in as_completed(pending):
        page, rc, note = done.result()
        if rc != 0:
            # Non-zero covers worker failures as well as the -1 / -2
            # sentinels returned by run_one.
            fail += 1
            print(f"  ✗ {page}: {note}")
            continue
        ok += 1
        print(f"  ✓ {page}: {note}")

# Final one-line summary of the whole run.
dur = round(time.time() - start, 1)
print(f"\n=== DONE in {dur}s — ok={ok} fail={fail} skipped={fresh} total={len(pages)} ===")