$files) {
    // A group with a single file has no duplicates to report.
    if (count($files) < 2) {
        continue;
    }

    // Stat every file of the group exactly once. The values are reused by the
    // sort comparator, the per-file entries and the size-similarity check, so
    // the size reported in the output is the same one used for the comparison
    // and the comparator does not touch the filesystem on every call.
    $suffix_pattern = '/-v\d+|-legacy|-old|-saas|-new/';
    $meta = [];
    foreach ($files as $f) {
        $path = "$DOCROOT/$f";
        $meta[$f] = [
            'has_suffix' => preg_match($suffix_pattern, $f),
            'size'       => filesize($path),
            'mtime'      => filemtime($path),
        ];
    }

    // Determine canonical = file with the cleanest name; ties fall back to the
    // shortest name and then to the newest modification time.
    usort($files, function ($a, $b) use ($meta) {
        // Prefer files without -vN / -legacy / -old / -saas / -new in the name
        $has_suffix_a = $meta[$a]['has_suffix'];
        $has_suffix_b = $meta[$b]['has_suffix'];
        if ($has_suffix_a !== $has_suffix_b) {
            return $has_suffix_a - $has_suffix_b;
        }
        // Then prefer the shorter name
        if (strlen($a) !== strlen($b)) {
            return strlen($a) - strlen($b);
        }
        // Then the newest mtime first
        return $meta[$b]['mtime'] - $meta[$a]['mtime'];
    });

    $canonical = $files[0];
    $legacy = array_slice($files, 1);

    // One descriptive entry per file, canonical included, in sorted order.
    $entries = [];
    foreach ($files as $f) {
        $entries[] = [
            'file' => $f,
            'size_bytes' => $meta[$f]['size'],
            'last_modified' => date('c', $meta[$f]['mtime']),
            'is_canonical' => $f === $canonical,
            'url' => "/$f"
        ];
    }

    // Size similarity between canonical and legacy: a legacy file is a
    // "close copy" when its size differs from the canonical size by less than
    // 10% of the larger of the two. The $can_size > 0 guard keeps the division
    // safe; as a consequence an empty canonical never has close copies.
    $can_size = $meta[$canonical]['size'];
    $close_copies = [];
    foreach ($legacy as $l) {
        $l_size = $meta[$l]['size'];
        if ($can_size > 0 && abs($can_size - $l_size) / max($can_size, $l_size) < 0.1) {
            $close_copies[] = $l;
        }
    }

    $recommendation = count($close_copies) > 0
        ? 'Archive legacy copies (add HTTP 301 to canonical) - content quasi-identical'
        : 'Keep for now - legacy has distinct content, verify if still referenced';

    $dups[] = [
        'base' => $base,
        'count' => count($files),
        'canonical' => $canonical,
        'canonical_url' => "/$canonical",
        'legacy_files' => $legacy,
        'close_copies' => $close_copies,
        'recommendation' => $recommendation,
        'files' => $entries
    ];
    // Every non-canonical file of the group counts as a legacy file.
    $total_dups += count($legacy);
}

// Sort by count desc (largest duplicate groups first)
usort($dups, function ($a, $b) {
    return $b['count'] - $a['count'];
});

$out = [
    'ok' => true,
    'ts' => date('c'),
    'total_pages' => count($pages),
    'total_duplicate_groups' => count($dups),
    'total_legacy_files' => $total_dups,
    'total_close_copies' => array_sum(array_map(function ($g) {
        return count($g['close_copies']);
    }, $dups)),
    'groups' => $dups,
    'version' => 'wave-209',
    'doctrine' => 'non-destructive · document canonical + legacy · zero delete'
];

$json = json_encode($out, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE);
if ($json === false) {
    // json_encode() returns false when the data cannot be encoded (for example
    // a file name that is not valid UTF-8). Emit an explicit error payload
    // instead of silently sending an empty body.
    $json = json_encode([
        'ok' => false,
        'ts' => date('c'),
        'error' => 'json_encode failed: ' . json_last_error_msg()
    ]);
}
echo $json;