<?php
/**
 * WEVIA OPUS — Error Taxonomy & Recovery
 *
 * Classifies error messages and proposes recovery strategies:
 * - Network errors (timeout, DNS, SSL, connection refused)
 * - LLM errors (overload, context overflow, unknown model, generation failure, empty response)
 * - Database errors (connection, query, constraint, timeout)
 * - File system errors (permission, not found, disk full)
 * - Application errors (validation, auth, rate limit)
 */

/**
 * Classifies raw error messages against a fixed symptom taxonomy and maps the
 * resulting error code to recovery strategies, some of which can be executed
 * automatically via autoRecover().
 *
 * Matching is a case-insensitive substring search. Categories, types and
 * symptoms are scanned in declaration order and the FIRST hit wins, so the
 * order of the taxonomy below is significant (e.g. "Connection refused" is
 * always reported as a network error, never as a database error).
 */
class ErrorTaxonomy {

    /**
     * category => type => definition, where a definition holds the keys
     * 'code', 'severity', 'retryable', 'symptoms' and 'common_causes'.
     *
     * @var array<string, array<string, array<string, mixed>>>
     */
    private array $taxonomy;

    /**
     * strategy name => config. Every config has 'description' and
     * 'applicable_to' (list of error codes, or '*' for all codes).
     *
     * @var array<string, array<string, mixed>>
     */
    private array $recoveryStrategies;

    /**
     * One entry per classify() call: the classification plus 'timestamp'
     * and 'raw' (the message truncated to 500 characters).
     *
     * @var list<array<string, mixed>>
     */
    private array $errorLog = [];

    /**
     * Builds the static taxonomy and strategy tables.
     */
    public function __construct() {
        $this->initTaxonomy();
        $this->initRecoveryStrategies();
    }

    /**
     * Populates the symptom taxonomy. Declaration order matters: classify()
     * returns the first symptom that matches.
     */
    private function initTaxonomy(): void {
        $this->taxonomy = [
            'network' => [
                'timeout' => [
                    'code' => 'NET_TIMEOUT', 'severity' => 'medium', 'retryable' => true,
                    'symptoms' => ['ETIMEDOUT', 'Connection timed out', 'curl_errno 28'],
                    'common_causes' => ['Server overloaded', 'Network congestion', 'DNS slow'],
                ],
                'dns_failure' => [
                    'code' => 'NET_DNS', 'severity' => 'high', 'retryable' => true,
                    'symptoms' => ['Could not resolve host', 'Name resolution failed', 'getaddrinfo failed'],
                    'common_causes' => ['DNS server down', 'Invalid hostname', 'Network disconnected'],
                ],
                'ssl_error' => [
                    'code' => 'NET_SSL', 'severity' => 'high', 'retryable' => false,
                    'symptoms' => ['SSL certificate problem', 'certificate verify failed', 'SSL_ERROR'],
                    'common_causes' => ['Expired certificate', 'Self-signed cert', 'Wrong hostname'],
                ],
                'connection_refused' => [
                    'code' => 'NET_REFUSED', 'severity' => 'high', 'retryable' => true,
                    'symptoms' => ['Connection refused', 'ECONNREFUSED', 'port 5821'],
                    'common_causes' => ['Service not running', 'Firewall blocking', 'Wrong port'],
                ],
            ],
            'llm' => [
                'overload' => [
                    'code' => 'LLM_OVERLOAD', 'severity' => 'medium', 'retryable' => true,
                    'symptoms' => ['429 Too Many Requests', 'Rate limit', 'Server busy'],
                    'common_causes' => ['Too many concurrent requests', 'API quota exceeded'],
                ],
                'context_overflow' => [
                    'code' => 'LLM_CTX_OVERFLOW', 'severity' => 'medium', 'retryable' => false,
                    'symptoms' => ['context length', 'maximum context', 'token limit'],
                    'common_causes' => ['Prompt too long', 'Conversation history too big'],
                ],
                'model_not_found' => [
                    'code' => 'LLM_NO_MODEL', 'severity' => 'high', 'retryable' => false,
                    'symptoms' => ['model not found', 'not a valid model', 'unknown model'],
                    'common_causes' => ['Model not pulled', 'Wrong model name', 'Ollama not running'],
                ],
                'generation_error' => [
                    'code' => 'LLM_GEN_ERROR', 'severity' => 'medium', 'retryable' => true,
                    'symptoms' => ['internal server error', '500', 'generation failed'],
                    'common_causes' => ['GPU out of memory', 'Model corrupted', 'Concurrent load'],
                ],
                'empty_response' => [
                    'code' => 'LLM_EMPTY', 'severity' => 'medium', 'retryable' => true,
                    'symptoms' => ['empty response', 'no content', 'null message'],
                    'common_causes' => ['Model confused', 'Temperature too low', 'Prompt issue'],
                ],
            ],
            'database' => [
                'connection_failed' => [
                    'code' => 'DB_CONN', 'severity' => 'critical', 'retryable' => true,
                    'symptoms' => ['could not connect', 'Connection refused', 'FATAL:', 'pg_connect'],
                    'common_causes' => ['PostgreSQL not running', 'Wrong credentials', 'Max connections reached'],
                ],
                'query_error' => [
                    'code' => 'DB_QUERY', 'severity' => 'medium', 'retryable' => false,
                    'symptoms' => ['syntax error', 'ERROR:', 'relation does not exist', 'column does not exist'],
                    'common_causes' => ['SQL syntax error', 'Table/column renamed', 'Missing migration'],
                ],
                'constraint_violation' => [
                    'code' => 'DB_CONSTRAINT', 'severity' => 'low', 'retryable' => false,
                    'symptoms' => ['unique constraint', 'foreign key', 'not-null constraint', 'check constraint'],
                    'common_causes' => ['Duplicate entry', 'Missing reference', 'Invalid data'],
                ],
                'timeout' => [
                    'code' => 'DB_TIMEOUT', 'severity' => 'high', 'retryable' => true,
                    'symptoms' => ['statement timeout', 'canceling statement', 'lock timeout'],
                    'common_causes' => ['Slow query', 'Missing index', 'Table lock', 'Dead lock'],
                ],
            ],
            'filesystem' => [
                'permission_denied' => [
                    'code' => 'FS_PERM', 'severity' => 'high', 'retryable' => false,
                    'symptoms' => ['Permission denied', 'EACCES', 'Operation not permitted'],
                    'common_causes' => ['Wrong file ownership', 'SELinux', 'Read-only filesystem'],
                ],
                'not_found' => [
                    'code' => 'FS_NOTFOUND', 'severity' => 'medium', 'retryable' => false,
                    'symptoms' => ['No such file', 'ENOENT', 'file not found'],
                    'common_causes' => ['Wrong path', 'File deleted', 'Typo in filename'],
                ],
                'disk_full' => [
                    'code' => 'FS_FULL', 'severity' => 'critical', 'retryable' => false,
                    'symptoms' => ['No space left', 'ENOSPC', 'disk full'],
                    'common_causes' => ['Logs too big', 'Backups not cleaned', 'Models filling disk'],
                ],
            ],
            'application' => [
                'validation' => [
                    'code' => 'APP_VALID', 'severity' => 'low', 'retryable' => false,
                    'symptoms' => ['validation failed', 'invalid', 'required field'],
                    'common_causes' => ['Missing input', 'Wrong format', 'Out of range'],
                ],
                'auth_failed' => [
                    'code' => 'APP_AUTH', 'severity' => 'high', 'retryable' => false,
                    'symptoms' => ['401 Unauthorized', '403 Forbidden', 'authentication failed', 'invalid token'],
                    'common_causes' => ['Expired token', 'Wrong credentials', 'Insufficient permissions'],
                ],
                'rate_limited' => [
                    'code' => 'APP_RATE', 'severity' => 'medium', 'retryable' => true,
                    'symptoms' => ['429', 'rate limit', 'too many requests', 'throttled'],
                    'common_causes' => ['Too many requests', 'Burst limit', 'Daily quota'],
                ],
            ],
        ];
    }

    /**
     * Populates the recovery strategy table. Declaration order is the order
     * in which getRecoveryStrategies() lists (and autoRecover() tries) them.
     */
    private function initRecoveryStrategies(): void {
        $this->recoveryStrategies = [
            'retry_with_backoff' => [
                'description' => 'Retry with exponential backoff',
                'max_attempts' => 3,
                'delays' => [1, 2, 4],
                'applicable_to' => ['NET_TIMEOUT', 'NET_DNS', 'NET_REFUSED', 'LLM_OVERLOAD', 'LLM_GEN_ERROR', 'LLM_EMPTY', 'DB_CONN', 'DB_TIMEOUT', 'APP_RATE'],
            ],
            'fallback_model' => [
                'description' => 'Switch to a smaller/different model',
                'fallback_chain' => ['deepseek-r1:32b' => 'deepseek-r1:14b', 'llama3.3:70b' => 'llama3.1:8b', 'qwen2.5-coder:32b' => 'qwen2.5-coder:14b'],
                'applicable_to' => ['LLM_OVERLOAD', 'LLM_GEN_ERROR', 'LLM_CTX_OVERFLOW'],
            ],
            'compress_context' => [
                'description' => 'Reduce context window by summarizing history',
                'applicable_to' => ['LLM_CTX_OVERFLOW'],
            ],
            'reconnect_db' => [
                'description' => 'Close and reopen database connection',
                'applicable_to' => ['DB_CONN', 'DB_TIMEOUT'],
            ],
            'cleanup_disk' => [
                'description' => 'Clean temp files, old logs, and caches',
                'commands' => [
                    'find /tmp -name "wevia_*" -mtime +1 -delete',
                    'find /opt/wevads/logs/ -name "*.log" -mtime +7 -delete',
                    'journalctl --vacuum-time=3d',
                ],
                'applicable_to' => ['FS_FULL'],
            ],
            'alert_admin' => [
                'description' => 'Send alert to admin when recovery fails',
                'applicable_to' => ['*'], // All errors after recovery fails
            ],
        ];
    }

    /**
     * Classifies an error message and records the result in the error log.
     *
     * @param string $errorMessage Raw error text; matched case-insensitively
     *                             as a substring search against each symptom.
     * @param int    $httpCode     Accepted for interface compatibility; it is
     *                             currently NOT consulted by the classifier.
     *
     * @return array<string, mixed> Keys: category, type, code, severity,
     *                              retryable, common_causes, matched_symptom,
     *                              recovery. Unmatched messages yield code
     *                              'UNKNOWN' and additionally carry 'raw'.
     */
    public function classify(string $errorMessage, int $httpCode = 0): array {
        $errorLower = mb_strtolower($errorMessage);

        foreach ($this->taxonomy as $category => $types) {
            foreach ($types as $typeName => $typeConfig) {
                foreach ($typeConfig['symptoms'] as $symptom) {
                    if (mb_stripos($errorLower, mb_strtolower($symptom)) === false) {
                        continue;
                    }

                    $classification = [
                        'category' => $category,
                        'type' => $typeName,
                        'code' => $typeConfig['code'],
                        'severity' => $typeConfig['severity'],
                        'retryable' => $typeConfig['retryable'],
                        'common_causes' => $typeConfig['common_causes'],
                        'matched_symptom' => $symptom,
                    ];
                    $classification['recovery'] = $this->getRecoveryStrategies($typeConfig['code']);
                    $this->logClassification($classification, $errorMessage);

                    return $classification;
                }
            }
        }

        // Nothing matched. Same shape as a matched result (empty causes, no
        // symptom) so callers can read every key unconditionally.
        $classification = [
            'category' => 'unknown',
            'type' => 'unclassified',
            'code' => 'UNKNOWN',
            'severity' => 'medium',
            'retryable' => false,
            'common_causes' => [],
            'matched_symptom' => null,
            'raw' => mb_substr($errorMessage, 0, 500),
            'recovery' => [['strategy' => 'alert_admin', 'description' => 'Unknown error — alert admin']],
        ];
        // Unknown errors are logged too; otherwise errorSummary() would
        // silently under-count exactly the errors that need attention.
        $this->logClassification($classification, $errorMessage);

        return $classification;
    }

    /**
     * Appends a classification to the in-memory error log, stamped with the
     * current time and the raw message truncated to 500 characters.
     *
     * @param array<string, mixed> $classification
     */
    private function logClassification(array $classification, string $errorMessage): void {
        $this->errorLog[] = array_merge($classification, [
            'timestamp' => date('Y-m-d H:i:s'),
            'raw' => mb_substr($errorMessage, 0, 500),
        ]);
    }

    /**
     * Returns the recovery strategies applicable to an error code, in
     * declaration order. Strategies whose 'applicable_to' contains '*'
     * apply to every code.
     *
     * @return list<array{strategy: string, description: string}>
     */
    public function getRecoveryStrategies(string $errorCode): array {
        $strategies = [];
        foreach ($this->recoveryStrategies as $name => $config) {
            $applies = in_array($errorCode, $config['applicable_to'], true)
                || in_array('*', $config['applicable_to'], true);
            if ($applies) {
                $strategies[] = ['strategy' => $name, 'description' => $config['description']];
            }
        }

        return $strategies;
    }

    /**
     * Classifies the error and attempts the automated recovery strategies
     * in order.
     *
     * Only two strategies are executable here:
     * - retry_with_backoff: runs only for retryable errors and only when a
     *   callback is supplied. It BLOCKS via sleep() before each attempt.
     * - cleanup_disk: runs the configured shell commands, which delete old
     *   temp/log files. It reports success without verifying freed space.
     *
     * The 'retryable' flag gates retrying only. It must not short-circuit
     * the whole method, otherwise cleanup_disk (whose only code, FS_FULL,
     * is non-retryable) could never run.
     *
     * @param string        $errorMessage  Raw error text to classify.
     * @param int           $httpCode      Forwarded to classify().
     * @param callable|null $retryCallback Operation to re-run; any return
     *                                     value other than false counts as
     *                                     success. Exceptions it throws are
     *                                     treated as a failed attempt.
     *
     * @return array<string, mixed> On success: recovered=true, strategy, and
     *                              attempt/result (retry) or action (cleanup).
     *                              On failure: recovered=false, classification,
     *                              action, plus last_error when a retry threw.
     */
    public function autoRecover(string $errorMessage, int $httpCode = 0, ?callable $retryCallback = null): array {
        $classification = $this->classify($errorMessage, $httpCode);
        $lastError = null;

        // Try the strategies in order.
        foreach ($classification['recovery'] as $strategy) {
            switch ($strategy['strategy']) {
                case 'retry_with_backoff':
                    if (!$classification['retryable'] || $retryCallback === null) {
                        break;
                    }
                    $config = $this->recoveryStrategies['retry_with_backoff'];
                    for ($attempt = 1; $attempt <= $config['max_attempts']; $attempt++) {
                        sleep($config['delays'][$attempt - 1] ?? 4);
                        try {
                            $result = $retryCallback();
                        } catch (\Exception $e) {
                            // Failed attempt: keep the reason for the final
                            // report instead of discarding it.
                            $lastError = $e->getMessage();
                            continue;
                        }
                        if ($result !== false) {
                            return ['recovered' => true, 'strategy' => 'retry_with_backoff', 'attempt' => $attempt, 'result' => $result];
                        }
                    }
                    break;

                case 'cleanup_disk':
                    // shell_exec may be disabled via disable_functions;
                    // calling it then would raise a fatal Error.
                    if (!function_exists('shell_exec')) {
                        break;
                    }
                    foreach ($this->recoveryStrategies['cleanup_disk']['commands'] as $cmd) {
                        shell_exec($cmd);
                    }

                    return ['recovered' => true, 'strategy' => 'cleanup_disk', 'action' => 'Disk cleanup executed'];
            }
        }

        $failure = [
            'recovered' => false,
            'classification' => $classification,
            'action' => $classification['retryable']
                ? 'All recovery strategies failed'
                : 'Manual intervention required',
        ];
        if ($lastError !== null) {
            $failure['last_error'] = $lastError;
        }

        return $failure;
    }

    /**
     * Returns every classification recorded so far, oldest first.
     *
     * @return list<array<string, mixed>>
     */
    public function getErrorLog(): array {
        return $this->errorLog;
    }

    /**
     * Summarises the error log for reporting.
     *
     * @return array{total: int, by_category: array<string, int>, by_severity: array<string, int>}
     */
    public function errorSummary(): array {
        $summary = ['total' => count($this->errorLog), 'by_category' => [], 'by_severity' => []];
        foreach ($this->errorLog as $err) {
            $cat = $err['category'] ?? 'unknown';
            $sev = $err['severity'] ?? 'unknown';
            $summary['by_category'][$cat] = ($summary['by_category'][$cat] ?? 0) + 1;
            $summary['by_severity'][$sev] = ($summary['by_severity'][$sev] ?? 0) + 1;
        }

        return $summary;
    }
}
|