Skip to content

Commit

Permalink
fix: Better message formating for nodes (#5160)
Browse files Browse the repository at this point in the history
Signed-off-by: Simon Behar <simbeh7@gmail.com>
  • Loading branch information
simster7 authored Feb 22, 2021
1 parent d33b5cc commit bec80c8
Show file tree
Hide file tree
Showing 4 changed files with 10 additions and 16 deletions.
3 changes: 0 additions & 3 deletions workflow/common/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,6 @@ const (
AnnotationKeyRBACRule = workflow.WorkflowFullName + "/rbac-rule"
AnnotationKeyRBACRulePrecedence = workflow.WorkflowFullName + "/rbac-rule-precedence"

// AnnotationKeyNodeMessage is the pod metadata annotation key the executor will use to
// communicate errors encountered by the executor during artifact load/save, etc...
AnnotationKeyNodeMessage = workflow.WorkflowFullName + "/node-message"
// AnnotationKeyTemplate is the pod metadata annotation key containing the container template as JSON
AnnotationKeyTemplate = workflow.WorkflowFullName + "/template"
// AnnotationKeyOutputs is the pod metadata annotation key containing the container outputs
Expand Down
11 changes: 5 additions & 6 deletions workflow/controller/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -1270,7 +1270,6 @@ func inferFailedReason(pod *apiv1.Pod) (wfv1.NodePhase, string) {
// Pod has a nice error message. Use that.
return wfv1.NodeFailed, pod.Status.Message
}
annotatedMsg := pod.Annotations[common.AnnotationKeyNodeMessage]

// We only get one message to set for the overall node status.
// If multiple containers failed, in order of preference:
Expand Down Expand Up @@ -1309,18 +1308,18 @@ func inferFailedReason(pod *apiv1.Pod) (wfv1.NodePhase, string) {
continue
}

msg := fmt.Sprintf("exit code %d: %s; %s; %s", t.ExitCode, t.Reason, t.Message, annotatedMsg)
msg := fmt.Sprintf("%s (exit code %d)", t.Reason, t.ExitCode)
if t.Message != "" {
msg = fmt.Sprintf("%s: %s", msg, t.Message)
}

switch ctr.Name {
case common.InitContainerName:
return wfv1.NodeError, msg
case common.MainContainerName:
return wfv1.NodeFailed, msg
case common.WaitContainerName:
// executor is expected to annotate a message to the pod upon any errors.
// If we failed to see the annotated message, it is likely the pod ran with
// insufficient privileges. Give a hint to that effect.
return wfv1.NodeError, fmt.Sprintf("%s; verify serviceaccount %s:%s has necessary privileges", msg, pod.Namespace, pod.Spec.ServiceAccountName)
return wfv1.NodeError, msg
default:
if t.ExitCode == 137 || t.ExitCode == 143 {
// if the sidecar was SIGKILL'd (exit code 137) assume it was because argoexec
Expand Down
4 changes: 1 addition & 3 deletions workflow/executor/executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,13 +120,11 @@ func NewExecutor(clientset kubernetes.Interface, podName, namespace, podAnnotati
// HandleError is a helper to annotate the pod with the error message upon a unexpected executor panic or error
func (we *WorkflowExecutor) HandleError(ctx context.Context) {
if r := recover(); r != nil {
_ = we.AddAnnotation(ctx, common.AnnotationKeyNodeMessage, fmt.Sprintf("%v", r))
util.WriteTeriminateMessage(fmt.Sprintf("%v", r))
log.Fatalf("executor panic: %+v\n%s", r, debug.Stack())
} else {
if len(we.errors) > 0 {
util.WriteTeriminateMessage(we.errors[0].Error())
_ = we.AddAnnotation(ctx, common.AnnotationKeyNodeMessage, we.errors[0].Error())
}
}
}
Expand Down Expand Up @@ -1059,7 +1057,7 @@ func (we *WorkflowExecutor) monitorDeadline(ctx context.Context, containerNames
message = "Step exceeded its deadline"
}
log.Info(message)
_ = we.AddAnnotation(ctx, common.AnnotationKeyNodeMessage, message)
util.WriteTeriminateMessage(message)
log.Infof("Killing main container")
terminationGracePeriodDuration, _ := we.GetTerminationGracePeriodDuration(ctx)
err := we.RuntimeExecutor.Kill(ctx, containerNames, terminationGracePeriodDuration)
Expand Down
8 changes: 4 additions & 4 deletions workflow/executor/resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ import (

"github.com/argoproj/argo-workflows/v3/errors"
wfv1 "github.com/argoproj/argo-workflows/v3/pkg/apis/workflow/v1alpha1"
"github.com/argoproj/argo-workflows/v3/util"
argoerr "github.com/argoproj/argo-workflows/v3/util/errors"
"github.com/argoproj/argo-workflows/v3/workflow/common"
os_specific "github.com/argoproj/argo-workflows/v3/workflow/executor/os-specific"
)

Expand Down Expand Up @@ -125,7 +125,7 @@ func (g gjsonLabels) Get(label string) string {

// signalMonitoring start the goroutine which listens for a SIGUSR2.
// Upon receiving of the signal, We update the pod annotation and exit the process.
func (we *WorkflowExecutor) signalMonitoring(ctx context.Context) {
func (we *WorkflowExecutor) signalMonitoring() {
log.Infof("Starting SIGUSR2 signal monitor")
sigs := make(chan os.Signal, 1)

Expand All @@ -134,7 +134,7 @@ func (we *WorkflowExecutor) signalMonitoring(ctx context.Context) {
for {
<-sigs
log.Infof("Received SIGUSR2 signal. Process is terminated")
_ = we.AddAnnotation(ctx, common.AnnotationKeyNodeMessage, "Received user signal to terminate the workflow")
util.WriteTeriminateMessage("Received user signal to terminate the workflow")
os.Exit(130)
}
}()
Expand All @@ -143,7 +143,7 @@ func (we *WorkflowExecutor) signalMonitoring(ctx context.Context) {
// WaitResource waits for a specific resource to satisfy either the success or failure condition
func (we *WorkflowExecutor) WaitResource(ctx context.Context, resourceNamespace string, resourceName string) error {
// Monitor the SIGTERM
we.signalMonitoring(ctx)
we.signalMonitoring()

if we.Template.Resource.SuccessCondition == "" && we.Template.Resource.FailureCondition == "" {
return nil
Expand Down

0 comments on commit bec80c8

Please sign in to comment.