Skip to content

Circuit breakers: add client.ErrCircuitBreakerOpen type #7324

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 24 additions & 1 deletion pkg/distributor/distributor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"testing"
"time"

"github.com/failsafe-go/failsafe-go/circuitbreaker"
"github.com/go-kit/log"
"github.com/gogo/status"
"github.com/grafana/dskit/flagext"
Expand Down Expand Up @@ -1044,6 +1045,21 @@ func TestDistributor_PushHAInstances(t *testing.T) {
}
}

func TestDistributor_PushWithCircuitBreakers(t *testing.T) {
ds, _, _ := prepare(t, prepConfig{
numIngesters: 3,
happyIngesters: 3,
numDistributors: 1,
circuitBreakerOpen: true,
})

ctx := user.InjectOrgID(context.Background(), "user")
err := ds[0].push(ctx, NewParsedRequest(makeWriteRequest(123456789000, 10, 0, false, true, "foo")))
require.Error(t, err)
require.ErrorIs(t, err, circuitbreaker.ErrOpen)
require.ErrorAs(t, err, &client.ErrCircuitBreakerOpen{})
}

func TestDistributor_PushQuery(t *testing.T) {
const metricName = "foo"
ctx := user.InjectOrgID(context.Background(), "user")
Expand Down Expand Up @@ -3511,7 +3527,8 @@ type prepConfig struct {

configure func(*Config)

timeOut bool
timeOut bool
circuitBreakerOpen bool
}

// totalIngesters takes into account ingesterStateByZone and numIngesters.
Expand Down Expand Up @@ -3636,6 +3653,7 @@ func prepareIngesterZone(zone string, state ingesterZoneState, cfg prepConfig) [
zone: zone,
labelNamesStreamResponseDelay: labelNamesStreamResponseDelay,
timeOut: cfg.timeOut,
circuitBreakerOpen: cfg.circuitBreakerOpen,
})
}
return ingesters
Expand Down Expand Up @@ -4111,6 +4129,7 @@ type mockIngester struct {
timeOut bool
tokens []uint32
id int
circuitBreakerOpen bool
}

func (i *mockIngester) address() string {
Expand Down Expand Up @@ -4157,6 +4176,10 @@ func (i *mockIngester) Push(ctx context.Context, req *mimirpb.WriteRequest, _ ..
return nil, context.DeadlineExceeded
}

if i.circuitBreakerOpen {
return nil, client.ErrCircuitBreakerOpen{}
}

if len(req.Timeseries) > 0 && i.timeseries == nil {
i.timeseries = map[uint32]*mimirpb.PreallocTimeseries{}
}
Expand Down
3 changes: 3 additions & 0 deletions pkg/distributor/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"github.com/pkg/errors"
"google.golang.org/grpc/codes"

"github.com/grafana/mimir/pkg/ingester/client"
"github.com/grafana/mimir/pkg/mimirpb"
"github.com/grafana/mimir/pkg/util"
"github.com/grafana/mimir/pkg/util/globalerror"
Expand Down Expand Up @@ -220,6 +221,8 @@ func toGRPCError(pushErr error, serviceOverloadErrorEnabled bool) error {
case mimirpb.TOO_MANY_CLUSTERS:
errCode = codes.FailedPrecondition
}
} else if errors.As(pushErr, &client.ErrCircuitBreakerOpen{}) {
errCode = codes.Unavailable
}
stat := status.New(errCode, pushErr.Error())
if errDetails != nil {
Expand Down
21 changes: 21 additions & 0 deletions pkg/distributor/errors_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"net/http"
"testing"

"github.com/failsafe-go/failsafe-go/circuitbreaker"
"github.com/gogo/status"
"github.com/grafana/dskit/grpcutil"
"github.com/grafana/dskit/httpgrpc"
Expand All @@ -18,6 +19,7 @@ import (
"google.golang.org/grpc/codes"
grpcstatus "google.golang.org/grpc/status"

"github.com/grafana/mimir/pkg/ingester/client"
"github.com/grafana/mimir/pkg/mimirpb"
)

Expand Down Expand Up @@ -329,6 +331,16 @@ func TestToGRPCError(t *testing.T) {
expectedErrorMsg: fmt.Sprintf("%s %s: %s", failedPushingToIngesterMessage, ingesterID, originalMsg),
expectedErrorDetails: &mimirpb.ErrorDetails{Cause: mimirpb.UNKNOWN_CAUSE},
},
"a client.ErrCircuitBreakerOpen error gets translated into an Unavailable error without details": {
err: client.ErrCircuitBreakerOpen{},
expectedGRPCCode: codes.Unavailable,
expectedErrorMsg: circuitbreaker.ErrOpen.Error(),
},
"a wrapped client.ErrCircuitBreakerOpen error gets translated into an Unavailable error without details": {
err: errors.Wrap(client.ErrCircuitBreakerOpen{}, fmt.Sprintf("%s %s", failedPushingToIngesterMessage, ingesterID)),
expectedGRPCCode: codes.Unavailable,
expectedErrorMsg: fmt.Sprintf("%s %s: %s", failedPushingToIngesterMessage, ingesterID, circuitbreaker.ErrOpen),
},
}
for name, tc := range testCases {
t.Run(name, func(t *testing.T) {
Expand Down Expand Up @@ -364,6 +376,15 @@ func TestWrapIngesterPushError(t *testing.T) {
require.NoError(t, err)
})

// Ensure that client.ErrCircuitBreakerOpen error gets correctly wrapped.
t.Run("an ErrCircuitBreakerOpen error gives an ErrCircuitBreakerOpen error", func(t *testing.T) {
ingesterPushErr := client.ErrCircuitBreakerOpen{}
err := wrapIngesterPushError(ingesterPushErr, ingesterID)
require.Error(t, err)
require.ErrorIs(t, err, circuitbreaker.ErrOpen)
require.ErrorAs(t, err, &client.ErrCircuitBreakerOpen{})
})

// Ensure that the errors created by httpgrpc get translated into
// other errors created by httpgrpc with the same code, and with
// a more explanatory message.
Expand Down
3 changes: 3 additions & 0 deletions pkg/distributor/push.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"github.com/grafana/dskit/tenant"
"github.com/opentracing/opentracing-go"

"github.com/grafana/mimir/pkg/ingester/client"
"github.com/grafana/mimir/pkg/mimirpb"
"github.com/grafana/mimir/pkg/util"
"github.com/grafana/mimir/pkg/util/globalerror"
Expand Down Expand Up @@ -227,6 +228,8 @@ func toHTTPStatus(ctx context.Context, pushErr error, limits *validation.Overrid
case mimirpb.TSDB_UNAVAILABLE:
return http.StatusServiceUnavailable
}
} else if errors.As(pushErr, &client.ErrCircuitBreakerOpen{}) {
return http.StatusServiceUnavailable
}

return http.StatusInternalServerError
Expand Down
14 changes: 13 additions & 1 deletion pkg/distributor/push_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ import (
"bytes"
"compress/gzip"
"context"
"errors"
"fmt"
"io"
"net/http"
Expand All @@ -18,13 +17,15 @@ import (
"testing"
"time"

"github.com/failsafe-go/failsafe-go/circuitbreaker"
"github.com/go-kit/log"
"github.com/golang/snappy"
"github.com/grafana/dskit/flagext"
"github.com/grafana/dskit/httpgrpc"
"github.com/grafana/dskit/httpgrpc/server"
"github.com/grafana/dskit/middleware"
"github.com/grafana/dskit/user"
"github.com/pkg/errors"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/prompb"
"github.com/prometheus/prometheus/storage/remote"
Expand All @@ -35,6 +36,7 @@ import (
"go.opentelemetry.io/collector/pdata/pmetric/pmetricotlp"
"google.golang.org/grpc/codes"

"github.com/grafana/mimir/pkg/ingester/client"
"github.com/grafana/mimir/pkg/mimirpb"
"github.com/grafana/mimir/pkg/util"
"github.com/grafana/mimir/pkg/util/test"
Expand Down Expand Up @@ -1134,6 +1136,16 @@ func TestHandler_ToHTTPStatus(t *testing.T) {
expectedHTTPStatus: http.StatusInternalServerError,
expectedErrorMsg: fmt.Sprintf("%s %s: %s", failedPushingToIngesterMessage, ingesterID, context.DeadlineExceeded),
},
"a client.ErrCircuitBreakerOpen error gets translated into an HTTP 503": {
err: client.ErrCircuitBreakerOpen{},
expectedHTTPStatus: http.StatusServiceUnavailable,
expectedErrorMsg: circuitbreaker.ErrOpen.Error(),
},
"a wrapped client.ErrCircuitBreakerOpen error gets translated into an HTTP 503": {
err: errors.Wrap(client.ErrCircuitBreakerOpen{}, fmt.Sprintf("%s %s", failedPushingToIngesterMessage, ingesterID)),
expectedHTTPStatus: http.StatusServiceUnavailable,
expectedErrorMsg: fmt.Sprintf("%s %s: %s", failedPushingToIngesterMessage, ingesterID, circuitbreaker.ErrOpen),
},
}
for name, tc := range testCases {
t.Run(name, func(t *testing.T) {
Expand Down
18 changes: 18 additions & 0 deletions pkg/ingester/client/circuitbreaker.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package client
import (
"context"
"errors"
"time"

"github.com/failsafe-go/failsafe-go"
"github.com/failsafe-go/failsafe-go/circuitbreaker"
Expand Down Expand Up @@ -41,6 +42,22 @@ var (
}
)

type ErrCircuitBreakerOpen struct {
remainingDelay time.Duration
}

func (e ErrCircuitBreakerOpen) RemainingDelay() time.Duration {
return e.remainingDelay
}

func (e ErrCircuitBreakerOpen) Error() string {
return circuitbreaker.ErrOpen.Error()
}

func (e ErrCircuitBreakerOpen) Unwrap() error {
return circuitbreaker.ErrOpen
}

func NewCircuitBreaker(inst ring.InstanceDesc, cfg CircuitBreakerConfig, metrics *Metrics, logger log.Logger) grpc.UnaryClientInterceptor {
// Initialize each of the known labels for circuit breaker metrics for this particular ingester
transitionOpen := metrics.circuitBreakerTransitions.WithLabelValues(inst.Id, circuitbreaker.OpenState.String())
Expand Down Expand Up @@ -88,6 +105,7 @@ func NewCircuitBreaker(inst ring.InstanceDesc, cfg CircuitBreakerConfig, metrics

if err != nil && errors.Is(err, circuitbreaker.ErrOpen) {
countOpen.Inc()
return ErrCircuitBreakerOpen{remainingDelay: breaker.RemainingDelay()}
}

return err
Expand Down
14 changes: 12 additions & 2 deletions pkg/ingester/client/circuitbreaker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,12 +88,13 @@ func TestNewCircuitBreaker(t *testing.T) {
conn := grpc.ClientConn{}
reg := prometheus.NewPedanticRegistry()
inst := ring.InstanceDesc{Id: "test", Addr: "localhost:8080"}
coolDown := 10 * time.Second
breaker := NewCircuitBreaker(inst, CircuitBreakerConfig{
Enabled: true,
FailureThreshold: 1,
FailureExecutionThreshold: 1,
ThresholdingPeriod: 60 * time.Second,
CooldownPeriod: 60 * time.Second,
CooldownPeriod: coolDown,
}, NewMetrics(reg), test.NewTestingLogger(t))

// Initial request that should succeed because the circuit breaker is "closed"
Expand All @@ -109,6 +110,15 @@ func TestNewCircuitBreaker(t *testing.T) {
// Subsequent requests should fail with this specific error once "open"
err = breaker(context.Background(), "/cortex.Ingester/Push", "", "", &conn, success)
require.ErrorIs(t, err, circuitbreaker.ErrOpen)
var errCBOpen1 ErrCircuitBreakerOpen
require.ErrorAs(t, err, &errCBOpen1)
require.Less(t, errCBOpen1.RemainingDelay(), coolDown)

err = breaker(context.Background(), "/cortex.Ingester/Push", "", "", &conn, success)
require.ErrorIs(t, err, circuitbreaker.ErrOpen)
var errCBOpen2 ErrCircuitBreakerOpen
require.ErrorAs(t, err, &errCBOpen2)
require.Less(t, errCBOpen2.RemainingDelay(), errCBOpen1.RemainingDelay())

// Non-ingester methods shouldn't be short-circuited
err = breaker(context.Background(), "Different.Method", "", "", &conn, success)
Expand All @@ -118,7 +128,7 @@ func TestNewCircuitBreaker(t *testing.T) {
require.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(`
# HELP cortex_ingester_client_circuit_breaker_results_total Results of executing requests via the circuit breaker
# TYPE cortex_ingester_client_circuit_breaker_results_total counter
cortex_ingester_client_circuit_breaker_results_total{ingester="test",result="circuit_breaker_open"} 1
cortex_ingester_client_circuit_breaker_results_total{ingester="test",result="circuit_breaker_open"} 2
cortex_ingester_client_circuit_breaker_results_total{ingester="test",result="error"} 1
cortex_ingester_client_circuit_breaker_results_total{ingester="test",result="success"} 1
`), "cortex_ingester_client_circuit_breaker_results_total"))
Expand Down