Move src/ → apps/server/ and packages/web/ → apps/web/ to adopt standard monorepo conventions (apps/ for runnable apps, packages/ for reusable libraries). Update all config files, shared package imports, test fixtures, and documentation to reflect new paths. Key fixes: - Update workspace config to ["apps/*", "packages/*"] - Update tsconfig.json rootDir/include for apps/server/ - Add apps/web/** to vitest exclude list - Update drizzle.config.ts schema path - Fix ensure-schema.ts migration path detection (3 levels up in dev, 2 levels up in dist) - Fix tests/integration/cli-server.test.ts import paths - Update packages/shared imports to apps/server/ paths - Update all docs/ files with new paths
233 lines
7.0 KiB
TypeScript
233 lines
7.0 KiB
TypeScript
/**
 * ErrorAnalyzer — Intelligent error classification and handling strategies.
 *
 * Analyzes various error conditions from agent processes and classifies them
 * for appropriate retry and recovery strategies. Replaces scattered error
 * handling with centralized, comprehensive error analysis.
 */
|
|
|
|
import { createModuleLogger } from '../../logger/index.js';
|
|
import type { SignalManager } from './signal-manager.js';
|
|
import type { AgentError, AgentErrorType } from './retry-policy.js';
|
|
|
|
// Logger for this module, created via createModuleLogger with the name 'error-analyzer'.
const log = createModuleLogger('error-analyzer');
|
|
|
|
/**
 * Common error patterns for different providers, grouped by error category.
 *
 * Each category lists regular expressions that are tested against the
 * combined error message and stderr text. A single match is enough to place
 * an error in that category.
 *
 * The numeric HTTP status patterns (401, 429) are guarded with digit
 * lookarounds so they only match the status code itself and not a longer
 * number that merely contains those digits (e.g. a port such as 14010 or a
 * byte count such as 4290).
 */
const ERROR_PATTERNS = {
  auth_failure: [
    /unauthorized/i,
    /invalid.*(token|key|credential)/i,
    /authentication.*failed/i,
    /(?<!\d)401(?!\d)/,
    /access.*denied/i,
    /invalid.*session/i,
    /expired.*token/i,
  ],
  usage_limit: [
    /rate.*(limit|exceeded)/i,
    /quota.*exceeded/i,
    /too.*many.*requests/i,
    /(?<!\d)429(?!\d)/,
    /usage.*limit/i,
    /throttled/i,
    /credit.*insufficient/i,
    /api.*limit.*reached/i,
  ],
  timeout: [
    /timeout/i,
    /timed.*out/i,
    /deadline.*exceeded/i,
    /connection.*timeout/i,
    /read.*timeout/i,
  ],
  process_crash: [
    /segmentation.*fault/i,
    /core.*dumped/i,
    /fatal.*error/i,
    /killed/i,
    /aborted/i,
  ],
};
|
|
|
|
/**
 * Classifies failures reported by agent processes so callers can pick a
 * retry or recovery strategy.
 *
 * Classification inspects, in this order: auth-failure patterns, usage-limit
 * patterns, timeout patterns, a clean exit without a signal file, crash
 * patterns / non-zero exit codes, and finally falls back to `unknown`.
 * The first matching rule wins.
 */
export class AgentErrorAnalyzer {
  /**
   * Exit codes treated as retryable crashes.
   * 130 and 143 are also covered by the 129–255 range check in
   * `isTransientCrash`; they are listed explicitly for readability.
   */
  private static readonly TRANSIENT_EXIT_CODES: ReadonlySet<number> = new Set([
    130, // SIGINT (interrupted)
    143, // SIGTERM (terminated)
    124, // timeout command
    1, // Generic error (might be transient)
  ]);

  /** stderr fragments that suggest the crash was caused by a temporary condition. */
  private static readonly TRANSIENT_STDERR_PATTERNS: readonly RegExp[] = [
    /temporary/i,
    /network.*error/i,
    /connection.*refused/i,
    /service.*unavailable/i,
    /disk.*full/i,
    /out.*of.*memory/i,
  ];

  /**
   * @param signalManager Used to check whether an agent workdir contains a signal file.
   */
  constructor(private readonly signalManager: SignalManager) {}

  /**
   * Analyze an error and classify it for retry strategy.
   * Combines multiple signals: error message, exit code, stderr, and workdir state.
   *
   * @param error The thrown error or a plain error message.
   * @param exitCode Exit code of the agent process, if known.
   * @param stderr Captured stderr of the agent process, if any.
   * @param agentWorkdir Agent working directory; when given together with
   *   exit code 0, it is checked for a signal file.
   * @returns The classified error with retry/persistence flags.
   */
  async analyzeError(
    error: Error | string,
    exitCode?: number | null,
    stderr?: string,
    agentWorkdir?: string
  ): Promise<AgentError> {
    const errorMessage = error instanceof Error ? error.message : String(error);
    // Patterns are matched against the message and stderr together.
    const fullContext = [errorMessage, stderr].filter(Boolean).join(' ');

    log.debug({
      errorMessage,
      exitCode,
      hasStderr: !!stderr,
      hasWorkdir: !!agentWorkdir
    }, 'analyzing agent error');

    // Fields that are identical for every classification result.
    const common = {
      exitCode,
      originalError: error instanceof Error ? error : undefined,
    };

    // Check for auth failure patterns
    if (this.matchesPattern(fullContext, ERROR_PATTERNS.auth_failure)) {
      return {
        type: 'auth_failure',
        message: errorMessage,
        isTransient: true,
        requiresAccountSwitch: false,
        shouldPersistToDB: true,
        ...common,
      };
    }

    // Check for usage limit patterns
    if (this.matchesPattern(fullContext, ERROR_PATTERNS.usage_limit)) {
      return {
        type: 'usage_limit',
        message: errorMessage,
        isTransient: false,
        requiresAccountSwitch: true,
        shouldPersistToDB: true,
        ...common,
      };
    }

    // Check for timeout patterns
    if (this.matchesPattern(fullContext, ERROR_PATTERNS.timeout)) {
      return {
        type: 'timeout',
        message: errorMessage,
        isTransient: true,
        requiresAccountSwitch: false,
        shouldPersistToDB: true,
        ...common,
      };
    }

    // Special case: process completed successfully but no signal.json
    if (agentWorkdir && exitCode === 0) {
      const hasSignal = await this.signalManager.checkSignalExists(agentWorkdir);
      if (!hasSignal) {
        log.debug({ agentWorkdir }, 'process completed successfully but no signal.json found');
        return {
          type: 'missing_signal',
          message: 'Process completed successfully but no signal.json was generated',
          isTransient: true,
          requiresAccountSwitch: false,
          shouldPersistToDB: false,
          ...common,
        };
      }
    }

    // Check for process crash patterns; any known non-zero exit code also counts as a crash.
    const hasFailureExitCode = exitCode != null && exitCode !== 0;
    if (this.matchesPattern(fullContext, ERROR_PATTERNS.process_crash) || hasFailureExitCode) {
      return {
        type: 'process_crash',
        message: errorMessage,
        // Determine if crash is transient based on exit code and patterns
        isTransient: this.isTransientCrash(exitCode, stderr),
        requiresAccountSwitch: false,
        shouldPersistToDB: true,
        ...common,
      };
    }

    // Unknown error type
    log.debug({
      errorMessage,
      exitCode,
      // Stays undefined when there is no stderr instead of logging "undefined...".
      stderr: AgentErrorAnalyzer.truncateForLog(stderr, 200)
    }, 'error does not match known patterns, classifying as unknown');

    return {
      type: 'unknown',
      message: errorMessage,
      isTransient: false,
      requiresAccountSwitch: false,
      shouldPersistToDB: true,
      ...common,
    };
  }

  /**
   * Validate credentials with a brief test request using invalid token.
   * This helps distinguish between token expiry vs. account exhaustion.
   *
   * Not yet implemented: currently only logs the request and always
   * resolves to `true`.
   *
   * @param accountId Account whose token should be validated.
   * @returns Always `true` for now.
   */
  async validateTokenWithInvalidRequest(accountId: string): Promise<boolean> {
    // User requirement: "brief check with invalid access token to determine behavior"
    // This would need integration with credential system and is provider-specific
    // For now, return true to indicate token appears valid
    log.debug({ accountId }, 'token validation requested (not yet implemented)');
    return true;
  }

  /**
   * Check if error message or stderr matches any of the given patterns.
   *
   * @returns `false` for empty text, otherwise whether at least one pattern matches.
   */
  private matchesPattern(text: string, patterns: readonly RegExp[]): boolean {
    if (!text) return false;
    return patterns.some(pattern => pattern.test(text));
  }

  /**
   * Determine if a process crash is likely transient (can be retried).
   * Based on exit codes and stderr content.
   *
   * @param exitCode Exit code of the crashed process, if known.
   * @param stderr Captured stderr, if any.
   * @returns `true` when the exit code or stderr suggests a retry may succeed.
   */
  private isTransientCrash(exitCode?: number | null, stderr?: string): boolean {
    if (exitCode != null) {
      if (AgentErrorAnalyzer.TRANSIENT_EXIT_CODES.has(exitCode)) {
        log.debug({ exitCode }, 'exit code indicates transient failure');
        return true;
      }

      // Exit codes 129–255 conventionally mean "terminated by signal" (128 + signal number).
      if (exitCode > 128 && exitCode < 256) {
        log.debug({ exitCode }, 'signal-based exit code may be transient');
        return true;
      }
    }

    // Check stderr for transient patterns
    if (stderr && AgentErrorAnalyzer.TRANSIENT_STDERR_PATTERNS.some(pattern => pattern.test(stderr))) {
      log.debug(
        { stderr: AgentErrorAnalyzer.truncateForLog(stderr, 100) },
        'stderr indicates transient failure'
      );
      return true;
    }

    log.debug({ exitCode, hasStderr: !!stderr }, 'crash appears non-transient');
    return false;
  }

  /**
   * Shorten text for log output.
   *
   * @param text Text to shorten; `undefined` is passed through unchanged.
   * @param maxLength Maximum number of characters to keep before the ellipsis.
   * @returns The text itself when it fits, otherwise its first `maxLength`
   *   characters followed by `...`.
   */
  private static truncateForLog(text: string | undefined, maxLength: number): string | undefined {
    if (text === undefined) return undefined;
    return text.length > maxLength ? `${text.substring(0, maxLength)}...` : text;
  }
}