Skip to content
This repository has been archived by the owner on Dec 13, 2023. It is now read-only.

Commit

Permalink
Improvements to QueueDAO resiliency changes to verify the case when u…
Browse files Browse the repository at this point in the history
…pdate task is called with IN_PROGRESS state.
  • Loading branch information
kishorebanala committed Oct 14, 2020
1 parent f836543 commit 61d403f
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 36 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -520,14 +520,7 @@ public void rewind(String workflowId, boolean useLatestDefinitions) {
} catch (Exception e) {
Monitors.recordWorkflowStartError(workflowDef.getName(), WorkflowContext.get().getClientApp());
LOGGER.error("Unable to restart workflow: {}", workflowDef.getName(), e);

// It's possible the terminate workflow call hits an exception as well, in that case we want to log both
// errors to help diagnosis.
try {
terminateWorkflow(workflowId, "Error when restarting the workflow");
} catch (Exception rwe) {
LOGGER.error("Could not terminate the workflowId: " + workflowId, rwe);
}
terminateWorkflow(workflowId, "Error when restarting the workflow");
throw e;
}

Expand Down Expand Up @@ -878,40 +871,48 @@ public void updateTask(TaskResult taskResult) {
task.setEndTime(System.currentTimeMillis());
}

// Try and ignore QueueDAO operations based on task status
try {
String updateTaskQueueDesc = "Updating Task queues for taskId: " + task.getTaskId();
String taskQueueOperation = "updateTaskQueues";
// Update message in Task queue based on Task status
switch (task.getStatus()) {
case COMPLETED:
case CANCELED:
case FAILED:
case FAILED_WITH_TERMINAL_ERROR:
case TIMED_OUT:
try {
queueDAO.remove(taskQueueName, taskResult.getTaskId());
LOGGER.debug("Task: {} removed from taskQueue: {} since the task status is {}", task, taskQueueName, task.getStatus().name());
} catch (Exception e) {
// Ignore exceptions on queue remove as it wouldn't impact task and workflow execution, and will be cleaned up eventually
String errorMsg = String.format("Error removing the message in queue for task: %s for workflow: %s", task.getTaskId(), workflowId);
LOGGER.warn(errorMsg, e);
Monitors.recordTaskQueueOpError(task.getTaskType(), workflowInstance.getWorkflowName());
}
break;
case IN_PROGRESS:
case SCHEDULED:
try {
String postponeTaskMessageDesc = "Postponing Task message in queue for taskId: " + task.getTaskId();
String postponeTaskMessageOperation = "postponeTaskMessage";

new RetryUtil<>().retryOnException(() -> {
switch (task.getStatus()) {
case COMPLETED:
case CANCELED:
case FAILED:
case FAILED_WITH_TERMINAL_ERROR:
case TIMED_OUT:
queueDAO.remove(taskQueueName, taskResult.getTaskId());
LOGGER.debug("Task: {} removed from taskQueue: {} since the task status is {}", task, taskQueueName, task.getStatus().name());
break;
case IN_PROGRESS:
case SCHEDULED:
new RetryUtil<>().retryOnException(() -> {
// postpone based on callbackAfterSeconds
long callBack = taskResult.getCallbackAfterSeconds();
queueDAO.postpone(taskQueueName, task.getTaskId(), task.getWorkflowPriority(), callBack);
LOGGER.debug("Task: {} postponed in taskQueue: {} since the task status is {} with callbackAfterSeconds: {}", task, taskQueueName, task.getStatus().name(), callBack);
break;
default:
break;
return null;
}, null, null, 2, postponeTaskMessageDesc, postponeTaskMessageOperation);
} catch (Exception e) {
// Throw exceptions on queue postpone, this would impact task execution
String errorMsg = String.format("Error postponing the message in queue for task: %s for workflow: %s", task.getTaskId(), workflowId);
LOGGER.error(errorMsg, e);
Monitors.recordTaskQueueOpError(task.getTaskType(), workflowInstance.getWorkflowName());
throw new ApplicationException(Code.BACKEND_ERROR, e);
}
return null;
}, null, null, 2, updateTaskQueueDesc, taskQueueOperation);
} catch (Exception e) {
String errorMsg = String.format("Error updating the queue for task: %s for workflow: %s", task.getTaskId(), workflowId);
LOGGER.warn(errorMsg, e);
Monitors.recordTaskQueueOpError(task.getTaskType(), workflowInstance.getWorkflowName());
break;
default:
break;
}


// Throw an ApplicationException if below operations fail to avoid workflow inconsistencies.
try {
String updateTaskDesc = "Updating Task with taskId: " + task.getTaskId();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
/*
* Copyright 2020 Netflix, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.netflix.conductor.test.resiliency

import com.netflix.conductor.common.metadata.tasks.Task
Expand Down Expand Up @@ -417,7 +432,7 @@ class QueueResiliencySpec extends Specification {
pollResult == null
}

def "Verify updateTask succeeds when QueueDAO is unavailable"() {
def "Verify updateTask with COMPLETE status succeeds when QueueDAO is unavailable"() {
when: "Start a simple workflow"
def workflowInstanceId = workflowResource.startWorkflow(new StartWorkflowRequest()
.withName(SIMPLE_TWO_TASK_WORKFLOW)
Expand Down Expand Up @@ -445,11 +460,44 @@ class QueueResiliencySpec extends Specification {
def result = taskResource.updateTask(taskResult)

then: "updateTask returns successfully without any exceptions"
queueDAO.remove(*_) >> { throw new IllegalStateException("Queue remove failed from Spy") }
1 * queueDAO.remove(*_) >> { throw new IllegalStateException("Queue remove failed from Spy") }
result == task.getTaskId()
notThrown(Exception)
}

def "Verify updateTask with IN_PROGRESS state fails when QueueDAO is unavailable"() {
when: "Start a simple workflow"
def workflowInstanceId = workflowResource.startWorkflow(new StartWorkflowRequest()
.withName(SIMPLE_TWO_TASK_WORKFLOW)
.withVersion(1))

then: "Verify workflow is started"
with(workflowResource.getExecutionStatus(workflowInstanceId, true)) {
status == Workflow.WorkflowStatus.RUNNING
tasks.size() == 1
tasks[0].taskType == 'integration_task_1'
tasks[0].status == Task.Status.SCHEDULED
}

when: "The first task 'integration_task_1' is polled"
def task = taskResource.poll("integration_task_1", "test", null)

then: "Verify task is returned successfully"
task
task.status == Task.Status.IN_PROGRESS
task.taskType == 'integration_task_1'

when: "the above task is updated, while QueueDAO is unavailable"
def taskResult = new TaskResult(task)
taskResult.setStatus(TaskResult.Status.IN_PROGRESS)
taskResult.setCallbackAfterSeconds(120)
def result = taskResource.updateTask(taskResult)

then: "updateTask fails with an exception"
2 * queueDAO.postpone(*_) >> { throw new IllegalStateException("Queue postpone failed from Spy") }
thrown(Exception)
}

def "verify removeTaskFromQueue fail when QueueDAO is unavailable"() {
when: "Start a simple workflow"
def workflowInstanceId = workflowResource.startWorkflow(new StartWorkflowRequest()
Expand Down

0 comments on commit 61d403f

Please sign in to comment.