fix: prevent stale duplicate planning tasks from blocking phase completion
Three fixes for phases getting stuck when a detail task crashes and is retried: 1. detailPhase mutation (architect.ts): clean up orphaned pending/in_progress detail tasks before creating new ones, preventing duplicates at the source 2. orchestrator recovery: detect and complete stale duplicate planning tasks (same category+phase, one completed, one pending) 3. ensureBranch: catch "already exists" TOCTOU race instead of blocking phase
This commit is contained in:
@@ -20,7 +20,7 @@ import type { ProjectRepository } from '../db/repositories/project-repository.js
|
|||||||
import type { AgentRepository } from '../db/repositories/agent-repository.js';
|
import type { AgentRepository } from '../db/repositories/agent-repository.js';
|
||||||
import type { DispatchManager, PhaseDispatchManager } from '../dispatch/types.js';
|
import type { DispatchManager, PhaseDispatchManager } from '../dispatch/types.js';
|
||||||
import type { ConflictResolutionService } from '../coordination/conflict-resolution-service.js';
|
import type { ConflictResolutionService } from '../coordination/conflict-resolution-service.js';
|
||||||
import { phaseBranchName, taskBranchName } from '../git/branch-naming.js';
|
import { phaseBranchName, taskBranchName, isPlanningCategory } from '../git/branch-naming.js';
|
||||||
import { ensureProjectClone } from '../git/project-clones.js';
|
import { ensureProjectClone } from '../git/project-clones.js';
|
||||||
import { createModuleLogger } from '../logger/index.js';
|
import { createModuleLogger } from '../logger/index.js';
|
||||||
import { phaseMetaCache, fileDiffCache } from '../review/diff-cache.js';
|
import { phaseMetaCache, fileDiffCache } from '../review/diff-cache.js';
|
||||||
@@ -637,6 +637,23 @@ export class ExecutionOrchestrator {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Clean up stale duplicate planning tasks (e.g. a crashed detail task
|
||||||
|
// that was reset to pending, then a new detail task was created and completed).
|
||||||
|
const tasksAfterRecovery = await this.taskRepository.findByPhaseId(phase.id);
|
||||||
|
const completedPlanningNames = new Set<string>();
|
||||||
|
for (const t of tasksAfterRecovery) {
|
||||||
|
if (isPlanningCategory(t.category) && t.status === 'completed') {
|
||||||
|
completedPlanningNames.add(`${t.category}:${t.phaseId}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (const t of tasksAfterRecovery) {
|
||||||
|
if (isPlanningCategory(t.category) && t.status === 'pending' && completedPlanningNames.has(`${t.category}:${t.phaseId}`)) {
|
||||||
|
await this.taskRepository.update(t.id, { status: 'completed', summary: 'Superseded by retry' });
|
||||||
|
tasksRecovered++;
|
||||||
|
log.info({ taskId: t.id, category: t.category }, 'recovered stale duplicate planning task');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Re-read tasks after recovery updates and check if phase is now fully done
|
// Re-read tasks after recovery updates and check if phase is now fully done
|
||||||
const updatedTasks = await this.taskRepository.findByPhaseId(phase.id);
|
const updatedTasks = await this.taskRepository.findByPhaseId(phase.id);
|
||||||
const allDone = updatedTasks.every((t) => t.status === 'completed');
|
const allDone = updatedTasks.every((t) => t.status === 'completed');
|
||||||
|
|||||||
@@ -46,8 +46,17 @@ export class SimpleGitBranchManager implements BranchManager {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
await git.branch([branch, baseBranch]);
|
try {
|
||||||
log.info({ repoPath, branch, baseBranch }, 'branch created');
|
await git.branch([branch, baseBranch]);
|
||||||
|
log.info({ repoPath, branch, baseBranch }, 'branch created');
|
||||||
|
} catch (err) {
|
||||||
|
// Handle TOCTOU race: branch may have been created between the exists check and now
|
||||||
|
if (err instanceof Error && err.message.includes('already exists')) {
|
||||||
|
log.debug({ repoPath, branch }, 'branch created by concurrent process');
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
throw err;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async mergeBranch(repoPath: string, sourceBranch: string, targetBranch: string): Promise<MergeResult> {
|
async mergeBranch(repoPath: string, sourceBranch: string, targetBranch: string): Promise<MergeResult> {
|
||||||
|
|||||||
@@ -337,6 +337,14 @@ export function architectProcedures(publicProcedure: ProcedureBuilder) {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Clean up orphaned pending/in_progress detail tasks from previous failed attempts
|
||||||
|
const phaseTasks = await taskRepo.findByPhaseId(input.phaseId);
|
||||||
|
for (const t of phaseTasks) {
|
||||||
|
if (t.category === 'detail' && (t.status === 'pending' || t.status === 'in_progress')) {
|
||||||
|
await taskRepo.update(t.id, { status: 'completed', summary: 'Superseded by retry' });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const detailTaskName = input.taskName ?? `Detail: ${phase.name}`;
|
const detailTaskName = input.taskName ?? `Detail: ${phase.name}`;
|
||||||
const task = await taskRepo.create({
|
const task = await taskRepo.create({
|
||||||
phaseId: phase.id,
|
phaseId: phase.id,
|
||||||
|
|||||||
@@ -149,8 +149,11 @@ When an agent crashes (`agent:crashed` event), the orchestrator automatically re
|
|||||||
On server restart, `recoverDispatchQueues()` also recovers:
|
On server restart, `recoverDispatchQueues()` also recovers:
|
||||||
- Stuck `in_progress` tasks whose agents are dead (status is not `running` or `waiting_for_input`) — reset to `pending` and re-queued
|
- Stuck `in_progress` tasks whose agents are dead (status is not `running` or `waiting_for_input`) — reset to `pending` and re-queued
|
||||||
- Erroneously `blocked` tasks whose agents completed successfully (status is `idle` or `stopped`) — marked `completed` so the phase can progress. This handles the legacy case where conflict resolution incorrectly blocked already-completed tasks.
|
- Erroneously `blocked` tasks whose agents completed successfully (status is `idle` or `stopped`) — marked `completed` so the phase can progress. This handles the legacy case where conflict resolution incorrectly blocked already-completed tasks.
|
||||||
|
- Stale duplicate planning tasks — if a phase has both a completed and a pending task of the same planning category (e.g. two `detail` tasks from a crash-and-retry), the pending one is marked `completed` with summary "Superseded by retry"
|
||||||
- Fully-completed `in_progress` phases — after task recovery, if all tasks in an `in_progress` phase are completed, triggers `handlePhaseAllTasksDone` to complete/review the phase
|
- Fully-completed `in_progress` phases — after task recovery, if all tasks in an `in_progress` phase are completed, triggers `handlePhaseAllTasksDone` to complete/review the phase
|
||||||
|
|
||||||
|
The `detailPhase` mutation in `architect.ts` also cleans up orphaned pending/in_progress detail tasks before creating new ones, preventing duplicates at the source.
|
||||||
|
|
||||||
Manual retry via `retryBlockedTask()` resets `retryCount` to 0, giving the task a fresh set of automatic retries.
|
Manual retry via `retryBlockedTask()` resets `retryCount` to 0, giving the task a fresh set of automatic retries.
|
||||||
|
|
||||||
### Coalesced Scheduling
|
### Coalesced Scheduling
|
||||||
|
|||||||
Reference in New Issue
Block a user