Files
html/api/duplicates-registry.php
opus 48d793ea5f
Some checks failed
WEVAL NonReg / nonreg (push) Has been cancelled
auto-sync via WEVIA git_sync_all intent 2026-04-21T15:34:28+02:00
2026-04-21 15:34:28 +02:00

100 lines
3.4 KiB
PHP

<?php
// Wave 209 · /api/duplicates-registry.php
// Duplicate screens registry. Non-destructive audit: returns groups with
// canonical file + legacy candidates + recommendations. Zero delete.
// Project include (its contents are not visible in this file); the "@"
// operator silences diagnostics raised by the include statement itself.
@require_once __DIR__ . '/wevia-sanitizer-guard.php';

// The endpoint always answers with JSON and allows cross-origin reads.
header('Content-Type: application/json; charset=utf-8');
header('Access-Control-Allow-Origin: *');

$DOCROOT = '/var/www/html';

// scandir() returns false when the directory is missing or unreadable.
// Passing that straight to array_filter() aborts the request (TypeError on
// PHP 8) before any JSON is written, so the failure is reported explicitly.
$listing = scandir($DOCROOT);
if ($listing === false) {
    http_response_code(500);
    echo json_encode([
        'ok' => false,
        'ts' => date('c'),
        'error' => 'docroot_unreadable',
        'version' => 'wave-209'
    ]);
    exit;
}

// Keep only regular files directly under the docroot whose name ends in
// ".html". array_filter() preserves the original scandir() keys.
$pages = array_filter($listing, function ($f) use ($DOCROOT) {
    return substr($f, -5) === '.html' && is_file("$DOCROOT/$f");
});
// Bucket every page under a normalized base name: drop the ".html"
// extension, then a trailing version / new / legacy / old / timestamp / saas
// marker, then a trailing "-pre-<word>" marker.
$base_groups = [];
$suffix_patterns = [
    '/-v\d+$|-new$|-legacy$|-old$|\d{8,14}$|-saas$|-v\d+-\w+$/',
    '/-pre-\w+$/',
];
foreach ($pages as $page) {
    $stem = substr($page, 0, -strlen('.html'));
    // Given an array of patterns, preg_replace() applies them in order,
    // each one working on the result of the previous one.
    $stem = preg_replace($suffix_patterns, '', $stem);
    if (!isset($base_groups[$stem])) {
        $base_groups[$stem] = [];
    }
    $base_groups[$stem][] = $page;
}
// Duplicates only: every base name shared by two or more pages becomes one
// group made of a canonical file plus the remaining "legacy" files.
// $dups collects the groups; $total_dups counts legacy files across groups.
$dups = [];
$total_dups = 0;
$suffix_marker = '/-v\d+|-legacy|-old|-saas|-new/';
foreach ($base_groups as $base => $files) {
    if (count($files) < 2) continue;

    // Order candidates so the canonical file comes first:
    //   1. names without a version/legacy/old/saas/new marker,
    //   2. then the shorter name,
    //   3. then the most recently modified file.
    usort($files, function ($a, $b) use ($DOCROOT, $suffix_marker) {
        $has_suffix_a = preg_match($suffix_marker, $a);
        $has_suffix_b = preg_match($suffix_marker, $b);
        if ($has_suffix_a !== $has_suffix_b) return $has_suffix_a - $has_suffix_b;
        if (strlen($a) !== strlen($b)) return strlen($a) - strlen($b);
        return filemtime("$DOCROOT/$b") - filemtime("$DOCROOT/$a");
    });
    $canonical = $files[0];
    $legacy = array_slice($files, 1);

    // Stat each file once; the sizes are reused for the similarity check so
    // the per-file entries and the comparison always see the same numbers.
    $sizes = [];
    $entries = [];
    foreach ($files as $f) {
        $path = "$DOCROOT/$f";
        $sizes[$f] = filesize($path);
        $entries[] = [
            'file' => $f,
            'size_bytes' => $sizes[$f],
            'last_modified' => date('c', filemtime($path)),
            'is_canonical' => $f === $canonical,
            'url' => "/$f"
        ];
    }

    // A legacy file is a "close copy" when its size differs from the
    // canonical size by less than 10% of the larger of the two. The
    // $can_size > 0 guard also keeps the divisor from being zero.
    $can_size = $sizes[$canonical];
    $close_copies = [];
    foreach ($legacy as $l) {
        $l_size = $sizes[$l];
        if ($can_size > 0 && abs($can_size - $l_size) / max($can_size, $l_size) < 0.1) {
            $close_copies[] = $l;
        }
    }

    $recommendation = count($close_copies) > 0
        ? 'Archive legacy copies (add HTTP 301 to canonical) - content quasi-identical'
        : 'Keep for now - legacy has distinct content, verify if still referenced';

    $dups[] = [
        // PHP turns decimal-looking string keys (e.g. "404") into integers,
        // so cast back to keep "base" a string in every group.
        'base' => (string) $base,
        'count' => count($files),
        'canonical' => $canonical,
        'canonical_url' => "/$canonical",
        'legacy_files' => $legacy,
        'close_copies' => $close_copies,
        'recommendation' => $recommendation,
        'files' => $entries
    ];
    $total_dups += count($legacy);
}
// Largest groups first.
usort($dups, function ($a, $b) { return $b['count'] - $a['count']; });

// Total number of close copies across all groups.
$total_close_copies = 0;
foreach ($dups as $g) {
    $total_close_copies += count($g['close_copies']);
}

$out = [
    'ok' => true,
    'ts' => date('c'),
    'total_pages' => count($pages),
    'total_duplicate_groups' => count($dups),
    'total_legacy_files' => $total_dups,
    'total_close_copies' => $total_close_copies,
    'groups' => $dups,
    'version' => 'wave-209',
    'doctrine' => 'non-destructive · document canonical + legacy · zero delete'
];

// json_encode() returns false when a value cannot be encoded (for example a
// file name containing invalid UTF-8 bytes). Echoing that would send an empty
// 200 response, so answer with an explicit error payload instead.
$json = json_encode($out, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE);
if ($json === false) {
    http_response_code(500);
    $json = json_encode([
        'ok' => false,
        'ts' => date('c'),
        'error' => 'json_encode_failed',
        // Read before the outer json_encode() call resets the error state.
        'json_error' => json_last_error(),
        'version' => 'wave-209'
    ]);
}
echo $json;