-
Notifications
You must be signed in to change notification settings - Fork 5.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
br: cli refactor backup error handling logic #54697
Changes from all commits
e8e870f
976ad25
f917357
597dd71
0642669
62b39d0
6b994c9
7e3a7e9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,196 @@ | ||
// Copyright 2024 PingCAP, Inc. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
package utils | ||
|
||
import ( | ||
"fmt" | ||
"strings" | ||
"sync" | ||
|
||
backuppb "github.com/pingcap/kvproto/pkg/brpb" | ||
"github.com/pingcap/log" | ||
"go.uber.org/zap" | ||
) | ||
|
||
// UNSAFE! TODO: remove and map them to error types | ||
var retryableErrorMsg = []string{ | ||
"server closed", | ||
"connection refused", | ||
"connection reset by peer", | ||
"channel closed", | ||
"error trying to connect", | ||
"connection closed before message completed", | ||
"body write aborted", | ||
"error during dispatch", | ||
"put object timeout", | ||
"timeout after", | ||
"internalerror", | ||
"not read from or written to within the timeout period", | ||
"<code>requesttimeout</code>", | ||
"<code>invalidpart</code>", | ||
"end of file before message length reached", | ||
} | ||
|
||
// non-retryable error messages | ||
// UNSAFE! TODO: remove and map them to error types | ||
const ( | ||
ioMsg = "io" | ||
notFoundMsg = "notfound" | ||
permissionDeniedMsg = "permissiondenied" | ||
) | ||
|
||
// error messages | ||
const ( | ||
unreachableRetryMsg = "unreachable retry" | ||
retryOnKvErrorMsg = "retry on kv error" | ||
retryOnRegionErrorMsg = "retry on region error" | ||
clusterIdMismatchMsg = "cluster id mismatch" | ||
unknownErrorMsg = "unknown error" | ||
contextCancelledMsg = "context canceled" | ||
retryOnUnknownErrorMsg = "unknown error, retry it for a few times" | ||
noRetryOnUnknownErrorMsg = "unknown error, retried too many times, give up" | ||
retryableStorageErrorMsg = "retryable storage error" | ||
) | ||
|
||
type ErrorHandlingResult struct { | ||
Strategy ErrorHandlingStrategy | ||
Reason string | ||
} | ||
|
||
type ErrorHandlingStrategy int | ||
|
||
const ( | ||
// StrategyRetry error can be retried but will consume the backoff attempt quota. | ||
StrategyRetry ErrorHandlingStrategy = iota | ||
// StrategyGiveUp means unrecoverable error happened and the BR should exit | ||
// for example: | ||
// 1. permission not valid. | ||
// 2. data not found. | ||
// 3. retry too many times | ||
StrategyGiveUp | ||
// StrategyUnknown for StrategyUnknown error | ||
StrategyUnknown | ||
) | ||
|
||
type ErrorContext struct { | ||
mu sync.Mutex | ||
// encounter times for one context on a store | ||
// we may use this value to determine the retry policy | ||
encounterTimes map[uint64]int | ||
// unknown error retry limitation. | ||
// encounter many times error makes Retry to GiveUp. | ||
encounterTimesLimitation int | ||
description string | ||
} | ||
|
||
func NewErrorContext(scenario string, limitation int) *ErrorContext { | ||
return &ErrorContext{ | ||
description: scenario, | ||
encounterTimes: make(map[uint64]int), | ||
encounterTimesLimitation: limitation, | ||
} | ||
} | ||
|
||
func NewDefaultContext() *ErrorContext { | ||
return &ErrorContext{ | ||
description: "default", | ||
encounterTimes: make(map[uint64]int), | ||
encounterTimesLimitation: 1, | ||
} | ||
} | ||
|
||
func HandleBackupError(err *backuppb.Error, storeId uint64, ec *ErrorContext) ErrorHandlingResult { | ||
if err == nil { | ||
return ErrorHandlingResult{StrategyRetry, unreachableRetryMsg} | ||
} | ||
res := handleBackupProtoError(err) | ||
// try the best effort handle unknown error based on their error message | ||
if res.Strategy == StrategyUnknown && len(err.Msg) != 0 { | ||
return HandleUnknownBackupError(err.Msg, storeId, ec) | ||
} | ||
return res | ||
} | ||
|
||
func handleBackupProtoError(e *backuppb.Error) ErrorHandlingResult { | ||
switch e.Detail.(type) { | ||
case *backuppb.Error_KvError: | ||
return ErrorHandlingResult{StrategyRetry, retryOnKvErrorMsg} | ||
case *backuppb.Error_RegionError: | ||
return ErrorHandlingResult{StrategyRetry, retryOnRegionErrorMsg} | ||
case *backuppb.Error_ClusterIdError: | ||
return ErrorHandlingResult{StrategyGiveUp, clusterIdMismatchMsg} | ||
} | ||
return ErrorHandlingResult{StrategyUnknown, unknownErrorMsg} | ||
} | ||
|
||
// HandleUnknownBackupError UNSAFE! TODO: remove this method and map all the current unknown errors to error types | ||
func HandleUnknownBackupError(msg string, uuid uint64, ec *ErrorContext) ErrorHandlingResult { | ||
// UNSAFE! TODO: use meaningful error code instead of unstructured message to find failed to write error. | ||
logger := log.L().With(zap.String("description", ec.description)) | ||
if messageIsNotFoundStorageError(msg) { | ||
reason := fmt.Sprintf("File or directory not found on TiKV Node (store id: %v). "+ | ||
"workaround: please ensure br and tikv nodes share a same storage and the user of br and tikv has same uid.", | ||
uuid) | ||
return ErrorHandlingResult{StrategyGiveUp, reason} | ||
} | ||
if messageIsPermissionDeniedStorageError(msg) { | ||
reason := fmt.Sprintf("I/O permission denied error occurs on TiKV Node(store id: %v). "+ | ||
"workaround: please ensure tikv has permission to read from & write to the storage.", | ||
uuid) | ||
return ErrorHandlingResult{StrategyGiveUp, reason} | ||
} | ||
msgLower := strings.ToLower(msg) | ||
if strings.Contains(msgLower, contextCancelledMsg) { | ||
return ErrorHandlingResult{StrategyGiveUp, contextCancelledMsg} | ||
} | ||
|
||
if MessageIsRetryableStorageError(msg) { | ||
logger.Warn(retryableStorageErrorMsg, zap.String("error", msg)) | ||
return ErrorHandlingResult{StrategyRetry, retryableStorageErrorMsg} | ||
} | ||
|
||
// retry enough on same store | ||
ec.mu.Lock() | ||
defer ec.mu.Unlock() | ||
ec.encounterTimes[uuid]++ | ||
if ec.encounterTimes[uuid] <= ec.encounterTimesLimitation { | ||
return ErrorHandlingResult{StrategyRetry, retryOnUnknownErrorMsg} | ||
} | ||
return ErrorHandlingResult{StrategyGiveUp, noRetryOnUnknownErrorMsg} | ||
} | ||
|
||
// messageIsNotFoundStorageError checks whether the message returning from TiKV is "NotFound" storage I/O error | ||
func messageIsNotFoundStorageError(msg string) bool { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @3pointer does the storage here mean S3 or any backup destination? At first I thought it's TiKV internal but there is a proto type for it already so got a bit confused. Thanks There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. at this context storage means TiKV. |
||
msgLower := strings.ToLower(msg) | ||
return strings.Contains(msgLower, ioMsg) && strings.Contains(msgLower, notFoundMsg) | ||
} | ||
|
||
// MessageIsPermissionDeniedStorageError checks whether the message returning from TiKV is "PermissionDenied" storage I/O error | ||
func messageIsPermissionDeniedStorageError(msg string) bool { | ||
msgLower := strings.ToLower(msg) | ||
return strings.Contains(msgLower, permissionDeniedMsg) | ||
} | ||
|
||
// MessageIsRetryableStorageError checks whether the message returning from TiKV is retryable ExternalStorageError. | ||
func MessageIsRetryableStorageError(msg string) bool { | ||
msgLower := strings.ToLower(msg) | ||
// UNSAFE! TODO: Add a error type for retryable connection error. | ||
for _, errStr := range retryableErrorMsg { | ||
if strings.Contains(msgLower, errStr) { | ||
return true | ||
} | ||
} | ||
return false | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
package utils | ||
|
||
import ( | ||
"testing" | ||
|
||
backuppb "github.com/pingcap/kvproto/pkg/brpb" | ||
"github.com/pingcap/kvproto/pkg/errorpb" | ||
"github.com/stretchr/testify/require" | ||
) | ||
|
||
func TestHandleError(t *testing.T) { | ||
ec := NewErrorContext("test", 3) | ||
// Test case 1: Error is nil | ||
result := HandleBackupError(nil, 123, ec) | ||
require.Equal(t, ErrorHandlingResult{Strategy: StrategyRetry, Reason: unreachableRetryMsg}, result) | ||
|
||
// Test case 2: Error is KvError and can be ignored | ||
kvError := &backuppb.Error_KvError{} | ||
result = HandleBackupError(&backuppb.Error{Detail: kvError}, 123, ec) | ||
require.Equal(t, ErrorHandlingResult{Strategy: StrategyRetry, Reason: retryOnKvErrorMsg}, result) | ||
|
||
// Test case 3: Error is RegionError and can be ignored | ||
regionError := &backuppb.Error_RegionError{ | ||
RegionError: &errorpb.Error{NotLeader: &errorpb.NotLeader{RegionId: 1}}} | ||
result = HandleBackupError(&backuppb.Error{Detail: regionError}, 123, ec) | ||
require.Equal(t, ErrorHandlingResult{Strategy: StrategyRetry, Reason: retryOnRegionErrorMsg}, result) | ||
|
||
// Test case 4: Error is ClusterIdError | ||
clusterIdError := &backuppb.Error_ClusterIdError{} | ||
result = HandleBackupError(&backuppb.Error{Detail: clusterIdError}, 123, ec) | ||
require.Equal(t, ErrorHandlingResult{Strategy: StrategyGiveUp, Reason: clusterIdMismatchMsg}, result) | ||
} | ||
|
||
func TestHandleErrorMsg(t *testing.T) { | ||
ec := NewErrorContext("test", 3) | ||
|
||
// Test messageIsNotFoundStorageError | ||
msg := "IO: files Notfound error" | ||
uuid := uint64(456) | ||
expectedReason := "File or directory not found on TiKV Node (store id: 456). workaround: please ensure br and tikv nodes share a same storage and the user of br and tikv has same uid." | ||
expectedResult := ErrorHandlingResult{Strategy: StrategyGiveUp, Reason: expectedReason} | ||
actualResult := HandleUnknownBackupError(msg, uuid, ec) | ||
require.Equal(t, expectedResult, actualResult) | ||
|
||
// Test messageIsPermissionDeniedStorageError | ||
msg = "I/O permissiondenied error occurs on TiKV Node(store id: 456)." | ||
expectedReason = "I/O permission denied error occurs on TiKV Node(store id: 456). workaround: please ensure tikv has permission to read from & write to the storage." | ||
expectedResult = ErrorHandlingResult{Strategy: StrategyGiveUp, Reason: expectedReason} | ||
actualResult = HandleUnknownBackupError(msg, uuid, ec) | ||
require.Equal(t, expectedResult, actualResult) | ||
|
||
// Test MessageIsRetryableStorageError | ||
msg = "server closed" | ||
expectedResult = ErrorHandlingResult{Strategy: StrategyRetry, Reason: retryableStorageErrorMsg} | ||
actualResult = HandleUnknownBackupError(msg, uuid, ec) | ||
require.Equal(t, expectedResult, actualResult) | ||
|
||
// Test unknown error | ||
msg = "unknown error" | ||
expectedResult = ErrorHandlingResult{Strategy: StrategyRetry, Reason: retryOnUnknownErrorMsg} | ||
actualResult = HandleUnknownBackupError(msg, uuid, ec) | ||
require.Equal(t, expectedResult, actualResult) | ||
|
||
// Test retry too many times | ||
_ = HandleUnknownBackupError(msg, uuid, ec) | ||
_ = HandleUnknownBackupError(msg, uuid, ec) | ||
expectedResult = ErrorHandlingResult{Strategy: StrategyGiveUp, Reason: noRetryOnUnknownErrorMsg} | ||
actualResult = HandleUnknownBackupError(msg, uuid, ec) | ||
require.Equal(t, expectedResult, actualResult) | ||
} |
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
new file should add license first, you can copy from other files.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ah! good catch