refactor: Restructure monorepo to apps/server/ and apps/web/ layout
Move src/ → apps/server/ and packages/web/ → apps/web/ to adopt standard monorepo conventions (apps/ for runnable apps, packages/ for reusable libraries). Update all config files, shared package imports, test fixtures, and documentation to reflect the new paths.

Key fixes:
- Update workspace config to ["apps/*", "packages/*"]
- Update tsconfig.json rootDir/include for apps/server/
- Add apps/web/** to vitest exclude list
- Update drizzle.config.ts schema path
- Fix ensure-schema.ts migration path detection (3 levels up in dev, 2 levels up in dist)
- Fix tests/integration/cli-server.test.ts import paths
- Update packages/shared imports to apps/server/ paths
- Update all docs/ files with new paths
This commit is contained in:
358
apps/server/agent/lifecycle/controller.ts
Normal file
358
apps/server/agent/lifecycle/controller.ts
Normal file
@@ -0,0 +1,358 @@
|
||||
/**
|
||||
* AgentLifecycleController — Unified orchestrator for complete agent lifecycle.
|
||||
*
|
||||
* Replaces scattered lifecycle logic with comprehensive orchestration including:
|
||||
* - Always clear signal.json before spawn/resume
|
||||
* - Robust process completion waiting
|
||||
* - Retry up to 3 times with comprehensive error handling
|
||||
* - Auth/usage limit error detection with account switching
|
||||
* - Missing signal recovery with instruction prompts
|
||||
* - Debug mode archival vs production cleanup
|
||||
*/
|
||||
|
||||
import { createModuleLogger } from '../../logger/index.js';
|
||||
import type { AgentRepository } from '../../db/repositories/agent-repository.js';
|
||||
import type { AccountRepository } from '../../db/repositories/account-repository.js';
|
||||
import type { ProcessManager } from '../process-manager.js';
|
||||
import type { CleanupManager } from '../cleanup-manager.js';
|
||||
import type { SpawnAgentOptions } from '../types.js';
|
||||
import type { SignalManager, SignalData } from './signal-manager.js';
|
||||
import type { RetryPolicy, AgentError } from './retry-policy.js';
|
||||
import { AgentExhaustedError, AgentFailureError } from './retry-policy.js';
|
||||
import type { AgentErrorAnalyzer } from './error-analyzer.js';
|
||||
import type { CleanupStrategy, AgentInfo } from './cleanup-strategy.js';
|
||||
|
||||
// Module-scoped structured logger tagged with this module's name.
const log = createModuleLogger('lifecycle-controller');
|
||||
|
||||
/**
 * Outcome of waiting for an agent process to finish (see
 * AgentLifecycleController.waitForCompletion).
 */
export interface CompletionResult {
  /** True when the agent produced a valid signal.json before the timeout. */
  success: boolean;
  /** Parsed signal.json contents; present only on success. */
  signal?: SignalData;
  /** Failure cause when success is false. */
  error?: Error;
  /** Process exit code; null when not available from the process manager. */
  exitCode?: number | null;
  /** Captured stderr, when available, for error analysis. */
  stderr?: string;
}
|
||||
|
||||
/**
 * Input for resuming a paused agent with the user's answers.
 */
export interface ResumeAgentOptions {
  /** Identifier of the agent to resume. */
  agentId: string;
  /** Question key → user-provided answer, forwarded back to the agent. */
  answers: Record<string, string>;
}
|
||||
|
||||
export class AgentLifecycleController {
|
||||
constructor(
|
||||
private signalManager: SignalManager,
|
||||
private retryPolicy: RetryPolicy,
|
||||
private errorAnalyzer: AgentErrorAnalyzer,
|
||||
private processManager: ProcessManager,
|
||||
private repository: AgentRepository,
|
||||
private cleanupManager: CleanupManager,
|
||||
private cleanupStrategy: CleanupStrategy,
|
||||
private accountRepository?: AccountRepository,
|
||||
private debug: boolean = false,
|
||||
) {}
|
||||
|
||||
/**
|
||||
* Execute spawn operation with comprehensive retry and error handling.
|
||||
* Always clears signal.json before starting and waits for process completion.
|
||||
*/
|
||||
async spawnWithRetry(
|
||||
spawnFn: (options: SpawnAgentOptions) => Promise<AgentInfo>,
|
||||
options: SpawnAgentOptions
|
||||
): Promise<AgentInfo> {
|
||||
log.info({
|
||||
taskId: options.taskId,
|
||||
provider: options.provider,
|
||||
initiativeId: options.initiativeId,
|
||||
mode: options.mode
|
||||
}, 'starting agent spawn with retry');
|
||||
|
||||
return this.executeWithRetry('spawn', spawnFn, options);
|
||||
}
|
||||
|
||||
/**
|
||||
* Execute resume operation with comprehensive retry and error handling.
|
||||
* Always clears signal.json before resuming and waits for process completion.
|
||||
*/
|
||||
async resumeWithRetry(
|
||||
resumeFn: (agentId: string, answers: Record<string, string>) => Promise<void>,
|
||||
options: ResumeAgentOptions
|
||||
): Promise<void> {
|
||||
log.info({
|
||||
agentId: options.agentId,
|
||||
answerKeys: Object.keys(options.answers)
|
||||
}, 'starting agent resume with retry');
|
||||
|
||||
await this.executeWithRetry('resume', async () => {
|
||||
await resumeFn(options.agentId, options.answers);
|
||||
const agent = await this.repository.findById(options.agentId);
|
||||
if (!agent) throw new Error(`Agent '${options.agentId}' not found after resume`);
|
||||
return this.toAgentInfo(agent);
|
||||
}, options);
|
||||
}
|
||||
|
||||
/**
|
||||
* Main retry orchestrator for spawn/resume operations.
|
||||
*/
|
||||
private async executeWithRetry<T>(
|
||||
operation: 'spawn' | 'resume',
|
||||
operationFn: (options: T) => Promise<AgentInfo>,
|
||||
options: T
|
||||
): Promise<AgentInfo> {
|
||||
|
||||
for (let attempt = 1; attempt <= this.retryPolicy.maxAttempts; attempt++) {
|
||||
try {
|
||||
log.debug({ operation, attempt, maxAttempts: this.retryPolicy.maxAttempts }, 'starting attempt');
|
||||
|
||||
// Execute operation
|
||||
const agent = await operationFn(options);
|
||||
const agentWorkdir = this.processManager.getAgentWorkdir(agent.worktreeId);
|
||||
|
||||
// CRITICAL: Always clear signal.json before start
|
||||
log.debug({ agentId: agent.id, agentWorkdir }, 'clearing signal.json before process start');
|
||||
await this.signalManager.clearSignal(agentWorkdir);
|
||||
|
||||
// Wait for process completion with robust detection
|
||||
const result = await this.waitForCompletion(agent);
|
||||
|
||||
if (result.success) {
|
||||
// Handle post-completion cleanup
|
||||
await this.handlePostCompletion(agent);
|
||||
log.info({
|
||||
agentId: agent.id,
|
||||
name: agent.name,
|
||||
attempt,
|
||||
operation
|
||||
}, 'agent lifecycle completed successfully');
|
||||
return agent;
|
||||
}
|
||||
|
||||
// Analyze error and determine retry strategy
|
||||
const agentError = await this.errorAnalyzer.analyzeError(
|
||||
result.error || new Error('Unknown completion failure'),
|
||||
result.exitCode,
|
||||
result.stderr,
|
||||
agentWorkdir
|
||||
);
|
||||
|
||||
// Persist error to DB if required
|
||||
if (agentError.shouldPersistToDB) {
|
||||
await this.persistError(agent.id, agentError);
|
||||
}
|
||||
|
||||
// Handle account switching for usage limits
|
||||
if (agentError.requiresAccountSwitch) {
|
||||
await this.handleAccountExhaustion(agent.id);
|
||||
throw new AgentExhaustedError(agentError.message, agentError);
|
||||
}
|
||||
|
||||
// Check if should retry
|
||||
if (!this.retryPolicy.shouldRetry(agentError, attempt)) {
|
||||
log.warn({
|
||||
agentId: agent.id,
|
||||
errorType: agentError.type,
|
||||
attempt,
|
||||
maxAttempts: this.retryPolicy.maxAttempts
|
||||
}, 'max retry attempts reached or error not retriable');
|
||||
throw new AgentFailureError(agentError.message, agentError);
|
||||
}
|
||||
|
||||
// Handle special retry cases
|
||||
if (agentError.type === 'missing_signal') {
|
||||
// This would need to modify the options to add instruction prompt
|
||||
// For now, log the special case
|
||||
log.info({
|
||||
agentId: agent.id,
|
||||
attempt
|
||||
}, 'will retry with missing signal instruction (not yet implemented)');
|
||||
}
|
||||
|
||||
// Wait before retry
|
||||
const delay = this.retryPolicy.getRetryDelay(attempt);
|
||||
log.info({
|
||||
agentId: agent.id,
|
||||
attempt,
|
||||
delay,
|
||||
errorType: agentError.type,
|
||||
errorMessage: agentError.message
|
||||
}, 'retrying after delay');
|
||||
await this.delay(delay);
|
||||
|
||||
} catch (error) {
|
||||
if (error instanceof AgentExhaustedError || error instanceof AgentFailureError) {
|
||||
throw error; // Don't retry these
|
||||
}
|
||||
|
||||
if (attempt === this.retryPolicy.maxAttempts) {
|
||||
log.error({
|
||||
operation,
|
||||
attempt,
|
||||
error: error instanceof Error ? error.message : String(error)
|
||||
}, 'final attempt failed, giving up');
|
||||
throw error;
|
||||
}
|
||||
|
||||
log.warn({
|
||||
operation,
|
||||
attempt,
|
||||
error: error instanceof Error ? error.message : String(error)
|
||||
}, 'attempt failed, will retry');
|
||||
}
|
||||
}
|
||||
|
||||
throw new Error('Unexpected: retry loop completed without success or terminal error');
|
||||
}
|
||||
|
||||
/**
|
||||
* Wait for process completion with robust signal detection.
|
||||
* Replaces scattered completion detection with unified approach.
|
||||
*/
|
||||
private async waitForCompletion(agent: AgentInfo): Promise<CompletionResult> {
|
||||
const agentWorkdir = this.processManager.getAgentWorkdir(agent.worktreeId);
|
||||
|
||||
log.debug({
|
||||
agentId: agent.id,
|
||||
name: agent.name,
|
||||
agentWorkdir
|
||||
}, 'waiting for process completion');
|
||||
|
||||
// Wait for process to exit (this would need integration with ProcessManager)
|
||||
// For now, simulate with a timeout approach
|
||||
// TODO: Implement waitForProcessCompletion in ProcessManager
|
||||
|
||||
// Wait for signal within reasonable timeout (30 seconds)
|
||||
const signal = await this.signalManager.waitForSignal(agentWorkdir, 30000);
|
||||
|
||||
if (signal) {
|
||||
log.debug({
|
||||
agentId: agent.id,
|
||||
signalStatus: signal.status
|
||||
}, 'agent completed with valid signal');
|
||||
return { success: true, signal };
|
||||
}
|
||||
|
||||
// No signal found - this is an error condition
|
||||
log.warn({
|
||||
agentId: agent.id,
|
||||
agentWorkdir
|
||||
}, 'process completed without valid signal.json');
|
||||
|
||||
return {
|
||||
success: false,
|
||||
error: new Error('Process completed without valid signal.json'),
|
||||
exitCode: null // Would get from ProcessManager
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle post-completion cleanup based on agent status and debug mode.
|
||||
*/
|
||||
private async handlePostCompletion(agent: AgentInfo): Promise<void> {
|
||||
// Only cleanup if agent is not waiting for user input
|
||||
if (agent.status === 'waiting_for_input') {
|
||||
log.debug({ agentId: agent.id }, 'agent waiting for input, skipping cleanup');
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
const cleanupAction = await this.cleanupStrategy.shouldCleanup(agent, this.debug);
|
||||
await this.cleanupStrategy.executeCleanup(agent, cleanupAction);
|
||||
|
||||
log.debug({
|
||||
agentId: agent.id,
|
||||
name: agent.name,
|
||||
cleanupAction
|
||||
}, 'post-completion cleanup executed');
|
||||
} catch (error) {
|
||||
log.warn({
|
||||
agentId: agent.id,
|
||||
error: error instanceof Error ? error.message : String(error)
|
||||
}, 'post-completion cleanup failed');
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Persist error details to database for debugging.
|
||||
*/
|
||||
private async persistError(agentId: string, error: AgentError): Promise<void> {
|
||||
try {
|
||||
const errorData = {
|
||||
errorType: error.type,
|
||||
errorMessage: error.message,
|
||||
exitCode: error.exitCode,
|
||||
isTransient: error.isTransient,
|
||||
requiresAccountSwitch: error.requiresAccountSwitch,
|
||||
updatedAt: new Date(),
|
||||
};
|
||||
|
||||
// This would need database schema updates to store error details
|
||||
// For now, just update with basic error info
|
||||
await this.repository.update(agentId, {
|
||||
exitCode: error.exitCode,
|
||||
updatedAt: new Date(),
|
||||
});
|
||||
|
||||
log.debug({
|
||||
agentId,
|
||||
errorType: error.type,
|
||||
exitCode: error.exitCode
|
||||
}, 'error details persisted to database');
|
||||
} catch (dbError) {
|
||||
log.warn({
|
||||
agentId,
|
||||
error: dbError instanceof Error ? dbError.message : String(dbError)
|
||||
}, 'failed to persist error to database');
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle account exhaustion by marking account as exhausted.
|
||||
*/
|
||||
private async handleAccountExhaustion(agentId: string): Promise<void> {
|
||||
if (!this.accountRepository) {
|
||||
log.debug({ agentId }, 'no account repository available for exhaustion handling');
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
const agent = await this.repository.findById(agentId);
|
||||
if (!agent?.accountId) {
|
||||
log.debug({ agentId }, 'agent has no account ID for exhaustion handling');
|
||||
return;
|
||||
}
|
||||
|
||||
// Mark account as exhausted for 1 hour
|
||||
const exhaustedUntil = new Date(Date.now() + 60 * 60 * 1000);
|
||||
await this.accountRepository.markExhausted(agent.accountId, exhaustedUntil);
|
||||
|
||||
log.info({
|
||||
agentId,
|
||||
accountId: agent.accountId,
|
||||
exhaustedUntil
|
||||
}, 'marked account as exhausted due to usage limits');
|
||||
} catch (error) {
|
||||
log.warn({
|
||||
agentId,
|
||||
error: error instanceof Error ? error.message : String(error)
|
||||
}, 'failed to mark account as exhausted');
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple delay utility for retry backoff.
|
||||
*/
|
||||
private delay(ms: number): Promise<void> {
|
||||
return new Promise(resolve => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert database agent record to AgentInfo.
|
||||
*/
|
||||
private toAgentInfo(agent: any): AgentInfo {
|
||||
return {
|
||||
id: agent.id,
|
||||
name: agent.name,
|
||||
status: agent.status,
|
||||
initiativeId: agent.initiativeId,
|
||||
worktreeId: agent.worktreeId,
|
||||
};
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user