Skip to content

Commit

Permalink
Retry heartbeat timeouts by putting back in the queue (#1689)
Browse files Browse the repository at this point in the history
* If there’s a heartbeat error and no attempts we put it back in the queue to try again

* When nacking, return whether it was put back in the queue or not

* Try and nack, if it fails then fail the run

* Consolidated switch statement

* Fail executing/retrying runs
  • Loading branch information
matt-aitken authored Feb 10, 2025
1 parent bc7d445 commit 4dd42cd
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 30 deletions.
9 changes: 6 additions & 3 deletions apps/webapp/app/v3/marqs/index.server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -638,7 +638,8 @@ export class MarQS {
}

/**
* Negative acknowledge a message, which will requeue the message
* Negative acknowledge a message, which will requeue the message.
* Returns whether it went back into the queue or not.
*/
public async nackMessage(
messageId: string,
Expand All @@ -657,7 +658,7 @@ export class MarQS {
updates,
service: this.name,
});
return;
return false;
}

const nackCount = await this.#getNackCount(messageId);
Expand All @@ -676,7 +677,7 @@ export class MarQS {

// If we have reached the maximum nack count, we will ack the message
await this.acknowledgeMessage(messageId, "maximum nack count reached");
return;
return false;
}

span.setAttributes({
Expand Down Expand Up @@ -705,6 +706,8 @@ export class MarQS {
});

await this.options.subscriber?.messageNacked(message);

return true;
},
{
kind: SpanKind.CONSUMER,
Expand Down
58 changes: 31 additions & 27 deletions apps/webapp/app/v3/taskRunHeartbeatFailed.server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,27 +30,42 @@ export class TaskRunHeartbeatFailedService extends BaseService {
supportsLazyAttempts: true,
},
},
_count: {
select: {
attempts: true,
},
},
},
});

if (!taskRun) {
logger.error("[RequeueTaskRunService] Task run not found", {
logger.error("[TaskRunHeartbeatFailedService] Task run not found", {
runId,
});

return;
}

const service = new FailedTaskRunService();

switch (taskRun.status) {
case "PENDING": {
if (taskRun.lockedAt) {
case "PENDING":
case "WAITING_TO_RESUME":
case "PAUSED": {
const backInQueue = await marqs?.nackMessage(taskRun.id);

if (backInQueue) {
logger.debug(
"[RequeueTaskRunService] Failing task run because the heartbeat failed and it's PENDING but locked",
`[TaskRunHeartbeatFailedService] ${taskRun.status} run is back in the queue run`,
{
taskRun,
}
);
} else {
logger.debug(
`[TaskRunHeartbeatFailedService] ${taskRun.status} run not back in the queue, failing`,
{ taskRun }
);

const service = new FailedTaskRunService();

await service.call(taskRun.friendlyId, {
ok: false,
id: taskRun.friendlyId,
Expand All @@ -61,19 +76,13 @@ export class TaskRunHeartbeatFailedService extends BaseService {
message: "Did not receive a heartbeat from the worker in time",
},
});
} else {
logger.debug("[RequeueTaskRunService] Nacking task run", { taskRun });

await marqs?.nackMessage(taskRun.id);
}

break;
}
case "EXECUTING":
case "RETRYING_AFTER_FAILURE": {
logger.debug("[RequeueTaskRunService] Failing task run", { taskRun });

const service = new FailedTaskRunService();
logger.debug(`[RequeueTaskRunService] ${taskRun.status} failing task run`, { taskRun });

await service.call(taskRun.friendlyId, {
ok: false,
Expand All @@ -90,23 +99,18 @@ export class TaskRunHeartbeatFailedService extends BaseService {
}
case "DELAYED":
case "WAITING_FOR_DEPLOY": {
logger.debug("[RequeueTaskRunService] Removing task run from queue", { taskRun });
logger.debug(
`[TaskRunHeartbeatFailedService] ${taskRun.status} Removing task run from queue`,
{ taskRun }
);

await marqs?.acknowledgeMessage(
taskRun.id,
"Run is either DELAYED or WAITING_FOR_DEPLOY so we cannot requeue it in RequeueTaskRunService"
"Run is either DELAYED or WAITING_FOR_DEPLOY so we cannot requeue it in TaskRunHeartbeatFailedService"
);

break;
}
case "WAITING_TO_RESUME":
case "PAUSED": {
logger.debug("[RequeueTaskRunService] Requeueing task run", { taskRun });

await marqs?.nackMessage(taskRun.id);

break;
}
case "SYSTEM_FAILURE":
case "INTERRUPTED":
case "CRASHED":
Expand All @@ -115,11 +119,11 @@ export class TaskRunHeartbeatFailedService extends BaseService {
case "EXPIRED":
case "TIMED_OUT":
case "CANCELED": {
logger.debug("[RequeueTaskRunService] Task run is completed", { taskRun });
logger.debug("[TaskRunHeartbeatFailedService] Task run is completed", { taskRun });

await marqs?.acknowledgeMessage(
taskRun.id,
"Task run is already completed in RequeueTaskRunService"
"Task run is already completed in TaskRunHeartbeatFailedService"
);

try {
Expand All @@ -135,7 +139,7 @@ export class TaskRunHeartbeatFailedService extends BaseService {
delayInMs: taskRun.lockedToVersion?.supportsLazyAttempts ? 5_000 : undefined,
});
} catch (error) {
logger.error("[RequeueTaskRunService] Error signaling run cancellation", {
logger.error("[TaskRunHeartbeatFailedService] Error signaling run cancellation", {
runId: taskRun.id,
error: error instanceof Error ? error.message : error,
});
Expand Down

0 comments on commit 4dd42cd

Please sign in to comment.