From 3f2c2a2b2363e61f7329aedda81d61ba4c4e6989 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Mon, 30 Oct 2023 10:51:36 +0800 Subject: [PATCH] disttask: add more retryable error (#48033) (#48045) ref pingcap/tidb#46258, close pingcap/tidb#48034 --- pkg/disttask/framework/scheduler/BUILD.bazel | 2 ++ pkg/disttask/framework/scheduler/scheduler.go | 16 +++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/pkg/disttask/framework/scheduler/BUILD.bazel b/pkg/disttask/framework/scheduler/BUILD.bazel index 7d4d274cf0b80..09dde40ab890b 100644 --- a/pkg/disttask/framework/scheduler/BUILD.bazel +++ b/pkg/disttask/framework/scheduler/BUILD.bazel @@ -20,10 +20,12 @@ go_library( "//pkg/disttask/framework/storage", "//pkg/domain/infosync", "//pkg/metrics", + "//pkg/parser/terror", "//pkg/resourcemanager/pool/spool", "//pkg/resourcemanager/util", "//pkg/util", "//pkg/util/backoff", + "//pkg/util/dbterror", "//pkg/util/logutil", "@com_github_pingcap_errors//:errors", "@com_github_pingcap_failpoint//:failpoint", diff --git a/pkg/disttask/framework/scheduler/scheduler.go b/pkg/disttask/framework/scheduler/scheduler.go index bcefa5c79b65a..6253a2c6cab8b 100644 --- a/pkg/disttask/framework/scheduler/scheduler.go +++ b/pkg/disttask/framework/scheduler/scheduler.go @@ -29,7 +29,9 @@ import ( "github.com/pingcap/tidb/pkg/disttask/framework/storage" "github.com/pingcap/tidb/pkg/domain/infosync" "github.com/pingcap/tidb/pkg/metrics" + "github.com/pingcap/tidb/pkg/parser/terror" "github.com/pingcap/tidb/pkg/util/backoff" + "github.com/pingcap/tidb/pkg/util/dbterror" "github.com/pingcap/tidb/pkg/util/logutil" "go.uber.org/zap" ) @@ -575,6 +577,18 @@ func (s *BaseScheduler) finishSubtaskAndUpdateState(ctx context.Context, subtask metrics.IncDistTaskSubTaskCnt(subtask) } +// TODO: abstract interface for each business to implement it. +func isRetryableError(err error) bool { + originErr := errors.Cause(err) + if tErr, ok := originErr.(*terror.Error); ok { + sqlErr := terror.ToSQLError(tErr) + _, ok := dbterror.ReorgRetryableErrCodes[sqlErr.Code] + return ok + } + // can't retry Unknown err + return false +} + // markSubTaskCanceledOrFailed check the error type and decide the subtasks' state. // 1. Only cancel subtasks when meet ErrCancelSubtask. // 2. Only fail subtasks when meet non retryable error. @@ -584,7 +598,7 @@ func (s *BaseScheduler) markSubTaskCanceledOrFailed(ctx context.Context, subtask if ctx.Err() != nil && context.Cause(ctx) == ErrCancelSubtask { logutil.Logger(s.logCtx).Warn("subtask canceled", zap.Error(err)) s.updateSubtaskStateAndError(subtask, proto.TaskStateCanceled, nil) - } else if common.IsRetryableError(err) { + } else if common.IsRetryableError(err) || isRetryableError(err) { logutil.Logger(s.logCtx).Warn("met retryable error", zap.Error(err)) } else if errors.Cause(err) != context.Canceled { logutil.Logger(s.logCtx).Warn("subtask failed", zap.Error(err))