100 lines
3.1 KiB
Python
Executable File
100 lines
3.1 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
wem-thumb-refresh-all.py — Batch-refresh thumbnails; meant to run every 6h via cron.

Reads /var/www/html/api/wem-inventory.json and iterates over its
categorized_pages entries.
Calls wem-thumb-worker.py for each stale page (in parallel, max 4 workers).
Logs to /tmp/wem-refresh.log (the script itself prints to stdout, so the
redirection is presumably done by the cron entry).

Doctrine 84.
|
|
"""
|
|
import json, os, subprocess, time, hashlib
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
|
|
# Paths and freshness window used throughout this script.
INV = '/var/www/html/api/wem-inventory.json'
WORKER = '/var/www/html/api/wem-thumb-worker.py'
THUMB_DIR = '/var/www/html/api/screenshots/wem'
TTL = 21600  # 6h

start = time.time()
print(f"=== WEM thumb refresh start {time.strftime('%Y-%m-%d %H:%M:%S')} ===")

# Load the inventory. Without it there is nothing to refresh, so report the
# problem and stop with a non-zero exit status.
try:
    # JSON is read as UTF-8 explicitly so the result does not depend on the
    # locale the script happens to be started with.
    with open(INV, encoding='utf-8') as f:
        inv = json.load(f)
except Exception as e:
    print(f"inventory read fail: {e}")
    # SystemExit instead of the site-provided exit() helper, which is meant
    # for interactive use and may be absent.
    raise SystemExit(1)
|
|
|
|
# Collect all scannable pages (internal S204 + S95 wevads + ethica)
import re as _re

# Absolute URLs are only accepted when they point at one of these hosts.
WHITELIST = _re.compile(r'^https?://(?:(?:www\.)?weval-consulting\.com|wevads\.weval-consulting\.com|ethica\.wevup\.app)/')


def _collect_pages(categorized):
    """Return the set of refreshable page identifiers found in *categorized*.

    Args:
        categorized: Mapping of category name to a list of page strings.
            Anything that is not a dict yields an empty set; values that are
            not lists and entries that are not strings are skipped.

    Returns:
        A set of strings. Absolute ``http:``/``https:`` URLs are kept
        unchanged when they match WHITELIST and end in ``.html``. Every other
        entry is treated as a site-relative path: leading slashes are
        removed and it is kept when it ends in ``.html`` and contains no
        ``..``.
    """
    found = set()
    if not isinstance(categorized, dict):
        return found
    for entries in categorized.values():
        if not isinstance(entries, list):
            continue
        for entry in entries:
            if not isinstance(entry, str):
                continue
            # Test for a real URL scheme rather than the bare prefix "http",
            # so relative names such as "http-errors.html" are not mistaken
            # for URLs and dropped by the whitelist.
            if entry.startswith(('http:', 'https:')):
                if WHITELIST.match(entry) and entry.endswith('.html'):
                    found.add(entry)  # keep full URL — worker resolves to right upstream
                continue
            rel = entry.lstrip('/')
            if rel.endswith('.html') and '..' not in rel:
                found.add(rel)
    return found


# A malformed inventory (non-dict root or non-dict categorized_pages) results
# in an empty page set instead of an AttributeError.
cp = inv.get('categorized_pages', {}) if isinstance(inv, dict) else {}
pages = _collect_pages(cp)

print(f"Found {len(pages)} internal HTML pages to refresh")
|
|
|
|
# Filter: skip if thumb exists and < TTL (use same hash key logic as worker)
|
|
import re as _re2
|
|
def _key_for(p):
|
|
m = _re2.match(r'^https?://wevads\.weval-consulting\.com/(.+)$', p)
|
|
if m: return f"wevads/{m.group(1)}"
|
|
m = _re2.match(r'^https?://ethica\.wevup\.app/(.+)$', p)
|
|
if m: return f"ethica/{m.group(1)}"
|
|
m = _re2.match(r'^https?://(?:www\.)?weval-consulting\.com/(.+)$', p)
|
|
if m: return m.group(1)
|
|
return p.lstrip('/')
|
|
|
|
# Split the pages into those whose thumbnail is younger than TTL ("fresh",
# only counted) and those that must be regenerated ("todo").
fresh = 0
todo = []
now = time.time()
# Sorted so that the work list, and therefore the log, is reproducible.
for path in sorted(pages):
    key = _key_for(path)
    h = hashlib.md5(key.encode()).hexdigest()
    thumb = f"{THUMB_DIR}/{h}.png"
    try:
        # A single stat call: no window in which the file can vanish between
        # an existence check and the mtime read.
        age = now - os.path.getmtime(thumb)
    except OSError:
        # No readable thumbnail: it has to be generated.
        todo.append(path)
        continue
    if age < TTL:
        fresh += 1
    else:
        todo.append(path)

print(f"Fresh (skipped): {fresh} / Todo: {len(todo)}")
|
|
|
|
def run_one(path, timeout=25):
    """Run the thumbnail worker for one page and summarise the outcome.

    Args:
        path: Page identifier, passed verbatim as the worker's only argument.
        timeout: Seconds to wait for the worker before giving up.

    Returns:
        A ``(path, code, message)`` tuple. ``code`` is the worker's exit
        status, ``-1`` when the timeout expired, or ``-2`` when any other
        exception occurred. ``message`` is the first 80 characters of the
        worker's stripped stdout; when the worker failed without writing to
        stdout, its stripped stderr is used instead. For the two error
        codes it is ``"TIMEOUT"`` or ``"ERR: <exception>"``.
    """
    try:
        r = subprocess.run(
            ['python3', WORKER, path],
            capture_output=True, text=True, timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return path, -1, "TIMEOUT"
    except Exception as e:
        # Best effort: one broken page must not abort the whole batch.
        return path, -2, f"ERR: {e}"

    summary = r.stdout.strip()
    if r.returncode != 0 and not summary:
        # A failing worker usually explains itself on stderr; without this
        # the failure line in the log would be empty.
        summary = r.stderr.strip()
    return path, r.returncode, summary[:80]
|
|
|
|
# Hand every stale page to the worker, at most four at a time, and report
# each result as soon as it is available.
ok = 0
fail = 0
with ThreadPoolExecutor(max_workers=4) as pool:
    pending = [pool.submit(run_one, page) for page in todo]
    for finished in as_completed(pending):
        page, status, note = finished.result()
        if status == 0:
            ok += 1
            print(f" ✓ {page}: {note}")
            continue
        fail += 1
        print(f" ✗ {page}: {note}")

# Closing summary line with the total wall-clock duration.
dur = round(time.time() - start, 1)
print(f"\n=== DONE in {dur}s — ok={ok} fail={fail} skipped={fresh} total={len(pages)} ===")
|