refactor: Restructure monorepo to apps/server/ and apps/web/ layout
Move src/ → apps/server/ and packages/web/ → apps/web/ to adopt standard monorepo conventions (apps/ for runnable apps, packages/ for reusable libraries). Update all config files, shared package imports, test fixtures, and documentation to reflect new paths. Key fixes: - Update workspace config to ["apps/*", "packages/*"] - Update tsconfig.json rootDir/include for apps/server/ - Add apps/web/** to vitest exclude list - Update drizzle.config.ts schema path - Fix ensure-schema.ts migration path detection (3 levels up in dev, 2 levels up in dist) - Fix tests/integration/cli-server.test.ts import paths - Update packages/shared imports to apps/server/ paths - Update all docs/ files with new paths
This commit is contained in:
233
apps/server/agent/lifecycle/error-analyzer.ts
Normal file
233
apps/server/agent/lifecycle/error-analyzer.ts
Normal file
@@ -0,0 +1,233 @@
|
||||
/**
|
||||
* ErrorAnalyzer — Intelligent error classification and handling strategies.
|
||||
*
|
||||
* Analyzes various error conditions from agent processes and classifies them
|
||||
* for appropriate retry and recovery strategies. Replaces scattered error
|
||||
* handling with centralized, comprehensive error analysis.
|
||||
*/
|
||||
|
||||
import { createModuleLogger } from '../../logger/index.js';
|
||||
import type { SignalManager } from './signal-manager.js';
|
||||
import type { AgentError, AgentErrorType } from './retry-policy.js';
|
||||
|
||||
// Logger obtained from createModuleLogger with the 'error-analyzer' label; used by all code in this file.
const log = createModuleLogger('error-analyzer');
|
||||
|
||||
/**
 * Regular expressions used to classify agent failures, grouped by error category.
 *
 * Each group is tested against the combined error message and stderr text; a
 * single matching pattern is enough to select the category.
 *
 * The numeric HTTP status patterns (401, 429) are anchored so that they only
 * match when the code is not embedded in a longer digit run. Without the
 * anchors, unrelated numbers such as a PID ("14012") or a duration ("4290ms")
 * would be misread as an auth failure or a usage limit.
 */
const ERROR_PATTERNS = {
  auth_failure: [
    /unauthorized/i,
    /invalid.*(token|key|credential)/i,
    /authentication.*failed/i,
    /(?:^|\D)401(?:\D|$)/, // HTTP 401 as a standalone number
    /access.*denied/i,
    /invalid.*session/i,
    /expired.*token/i,
  ],
  usage_limit: [
    /rate.*(limit|exceeded)/i,
    /quota.*exceeded/i,
    /too.*many.*requests/i,
    /(?:^|\D)429(?:\D|$)/, // HTTP 429 as a standalone number
    /usage.*limit/i,
    /throttled/i,
    /credit.*insufficient/i,
    /api.*limit.*reached/i,
  ],
  timeout: [
    /timeout/i,
    /timed.*out/i,
    /deadline.*exceeded/i,
    /connection.*timeout/i,
    /read.*timeout/i,
  ],
  process_crash: [
    /segmentation.*fault/i,
    /core.*dumped/i,
    /fatal.*error/i,
    /killed/i,
    /aborted/i,
  ],
};
|
||||
|
||||
/**
 * Classifies failures reported by agent processes so that callers can pick a
 * retry or recovery strategy from the resulting {@link AgentError}.
 *
 * Classification precedence inside {@link AgentErrorAnalyzer.analyzeError}:
 * auth failure → usage limit → timeout → missing signal → process crash → unknown.
 */
export class AgentErrorAnalyzer {
  /** Exit codes that are always treated as transient crashes. */
  private static readonly TRANSIENT_EXIT_CODES: ReadonlySet<number> = new Set([
    130, // SIGINT (interrupted)
    143, // SIGTERM (terminated)
    124, // timeout command
    1, // Generic error (might be transient)
  ]);

  /** stderr fragments that mark a crash as transient. */
  private static readonly TRANSIENT_STDERR_PATTERNS: readonly RegExp[] = [
    /temporary/i,
    /network.*error/i,
    /connection.*refused/i,
    /service.*unavailable/i,
    /disk.*full/i,
    /out.*of.*memory/i,
  ];

  constructor(private readonly signalManager: SignalManager) {}

  /**
   * Analyze an error and classify it for retry strategy.
   *
   * The error message and stderr are joined and matched against the known
   * pattern groups; the exit code and the presence of a signal in the agent
   * workdir are used when no earlier pattern group matches.
   *
   * @param error - The thrown error or a plain error message.
   * @param exitCode - Exit code of the agent process, if known.
   * @param stderr - Captured stderr output, if any.
   * @param agentWorkdir - Agent working directory; only consulted when the
   *   process exited with code 0, to check whether a signal exists.
   * @returns The classified error. `originalError` is set only when `error`
   *   was an `Error` instance.
   */
  async analyzeError(
    error: Error | string,
    exitCode?: number | null,
    stderr?: string,
    agentWorkdir?: string
  ): Promise<AgentError> {
    const errorMessage = error instanceof Error ? error.message : String(error);
    const originalError = error instanceof Error ? error : undefined;
    // Empty/undefined parts are dropped so the patterns see one clean string.
    const fullContext = [errorMessage, stderr].filter(Boolean).join(' ');

    log.debug({
      errorMessage,
      exitCode,
      hasStderr: !!stderr,
      hasWorkdir: !!agentWorkdir
    }, 'analyzing agent error');

    // Check for auth failure patterns
    if (this.matchesPattern(fullContext, ERROR_PATTERNS.auth_failure)) {
      return {
        type: 'auth_failure',
        message: errorMessage,
        isTransient: true,
        requiresAccountSwitch: false,
        shouldPersistToDB: true,
        exitCode,
        originalError,
      };
    }

    // Check for usage limit patterns
    if (this.matchesPattern(fullContext, ERROR_PATTERNS.usage_limit)) {
      return {
        type: 'usage_limit',
        message: errorMessage,
        isTransient: false,
        requiresAccountSwitch: true,
        shouldPersistToDB: true,
        exitCode,
        originalError,
      };
    }

    // Check for timeout patterns
    if (this.matchesPattern(fullContext, ERROR_PATTERNS.timeout)) {
      return {
        type: 'timeout',
        message: errorMessage,
        isTransient: true,
        requiresAccountSwitch: false,
        shouldPersistToDB: true,
        exitCode,
        originalError,
      };
    }

    // Special case: process completed successfully but no signal.json
    if (agentWorkdir && exitCode === 0) {
      const hasSignal = await this.signalManager.checkSignalExists(agentWorkdir);
      if (!hasSignal) {
        log.debug({ agentWorkdir }, 'process completed successfully but no signal.json found');
        return {
          type: 'missing_signal',
          message: 'Process completed successfully but no signal.json was generated',
          isTransient: true,
          requiresAccountSwitch: false,
          shouldPersistToDB: false,
          exitCode,
          originalError,
        };
      }
    }

    // Check for process crash patterns; any known non-zero exit code also counts as a crash.
    const exitedWithFailure = exitCode != null && exitCode !== 0;
    if (this.matchesPattern(fullContext, ERROR_PATTERNS.process_crash) || exitedWithFailure) {
      // Determine if crash is transient based on exit code and stderr content
      const isTransient = this.isTransientCrash(exitCode, stderr);

      return {
        type: 'process_crash',
        message: errorMessage,
        isTransient,
        requiresAccountSwitch: false,
        shouldPersistToDB: true,
        exitCode,
        originalError,
      };
    }

    // Unknown error type
    log.debug({
      errorMessage,
      exitCode,
      // Stays undefined when there is no stderr instead of logging "undefined...".
      stderr: AgentErrorAnalyzer.previewForLog(stderr, 200)
    }, 'error does not match known patterns, classifying as unknown');

    return {
      type: 'unknown',
      message: errorMessage,
      isTransient: false,
      requiresAccountSwitch: false,
      shouldPersistToDB: true,
      exitCode,
      originalError,
    };
  }

  /**
   * Validate credentials with a brief test request using invalid token.
   * This helps distinguish between token expiry vs. account exhaustion.
   *
   * Not yet implemented: the method only logs the request and always
   * resolves to `true`.
   *
   * @param accountId - Account whose token should be validated.
   * @returns Always `true` in the current implementation.
   */
  async validateTokenWithInvalidRequest(accountId: string): Promise<boolean> {
    // User requirement: "brief check with invalid access token to determine behavior"
    // This would need integration with credential system and is provider-specific
    // For now, return true to indicate token appears valid
    log.debug({ accountId }, 'token validation requested (not yet implemented)');
    return true;
  }

  /**
   * Check if the given text matches any of the given patterns.
   *
   * @param text - Text to test; an empty string never matches.
   * @param patterns - Patterns to try.
   * @returns `true` if at least one pattern matches.
   */
  private matchesPattern(text: string, patterns: readonly RegExp[]): boolean {
    if (!text) return false;
    return patterns.some(pattern => pattern.test(text));
  }

  /**
   * Determine if a process crash is likely transient (can be retried).
   *
   * A crash is considered transient when the exit code is one of the listed
   * transient codes, when it lies strictly between 128 and 256, or when stderr
   * contains one of the transient patterns.
   *
   * @param exitCode - Exit code of the process, if known.
   * @param stderr - Captured stderr output, if any.
   * @returns `true` if the crash looks retryable, otherwise `false`.
   */
  private isTransientCrash(exitCode?: number | null, stderr?: string): boolean {
    if (exitCode != null) {
      if (AgentErrorAnalyzer.TRANSIENT_EXIT_CODES.has(exitCode)) {
        log.debug({ exitCode }, 'exit code indicates transient failure');
        return true;
      }

      // Codes strictly between 128 and 256 are treated as signal-based exits.
      if (exitCode > 128 && exitCode < 256) {
        log.debug({ exitCode }, 'signal-based exit code may be transient');
        return true;
      }
    }

    // Check stderr for transient patterns
    if (stderr && AgentErrorAnalyzer.TRANSIENT_STDERR_PATTERNS.some(pattern => pattern.test(stderr))) {
      log.debug(
        { stderr: AgentErrorAnalyzer.previewForLog(stderr, 100) },
        'stderr indicates transient failure'
      );
      return true;
    }

    log.debug({ exitCode, hasStderr: !!stderr }, 'crash appears non-transient');
    return false;
  }

  /**
   * Shorten text for log output.
   *
   * @param text - Text to shorten; `undefined` is passed through unchanged.
   * @param maxLength - Maximum number of characters kept before the ellipsis.
   * @returns The text itself when it fits, otherwise its first `maxLength`
   *   characters followed by `...`.
   */
  private static previewForLog(text: string | undefined, maxLength: number): string | undefined {
    if (text === undefined) return undefined;
    return text.length > maxLength ? `${text.substring(0, maxLength)}...` : text;
  }
}
|
||||
Reference in New Issue
Block a user