fix: use greater-than-or-equal check for the 100% alert threshold (#597)

alexluong · alexluong · commit cda99eb3617b · 2025-12-17T20:59:03.000+07:00
* test: e2e suite testing destination disable

* fix: use greater-than-or-equal check for the 100% alert threshold
diff --git a/cmd/e2e/suites_test.go b/cmd/e2e/suites_test.go
@@ -223,3 +223,150 @@ func TestBasicSuiteWithDeploymentID(t *testing.T) {
 		deploymentID:   "dp_e2e_test",
 	})
 }
+
+// TestAutoDisableWithoutCallbackURL reproduces issue #596: ALERT_AUTO_DISABLE_DESTINATION=true
+// with no ALERT_CALLBACK_URL set. It publishes 21 failing events and asserts disabled_at is non-nil.
+// Run with: go test -v -run TestAutoDisableWithoutCallbackURL ./cmd/e2e/...
+func TestAutoDisableWithoutCallbackURL(t *testing.T) {
+	t.Parallel()
+	if testing.Short() {
+		t.Skip("skipping e2e test")
+	}
+
+	// Start the test infrastructure; its cleanup is deferred until this test returns
+	testinfraCleanup := testinfra.Start(t)
+	defer testinfraCleanup()
+	gin.SetMode(gin.TestMode)
+	mockServerBaseURL := testinfra.GetMockServer(t)
+
+	// Basic config with the alert callback URL cleared (the issue #596 scenario)
+	cfg := configs.Basic(t, configs.BasicOpts{
+		LogStorage: configs.LogStorageTypePostgres,
+	})
+	cfg.Alert.CallbackURL = ""              // No callback URL
+	cfg.Alert.AutoDisableDestination = true // Auto-disable enabled
+	cfg.Alert.ConsecutiveFailureCount = 20  // Default threshold
+
+	require.NoError(t, cfg.Validate(config.Flags{}))
+
+	// Run the application in a goroutine; appDone is closed once Run returns
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	appDone := make(chan struct{})
+	go func() {
+		defer close(appDone)
+		application := app.New(&cfg)
+		if err := application.Run(ctx); err != nil {
+			log.Println("Application stopped:", err)
+		}
+	}()
+	defer func() {
+		cancel()
+		<-appDone
+	}()
+
+	// Fixed 2s pause to let services start (no readiness check) - NOTE(review): timing-sensitive
+	time.Sleep(2 * time.Second)
+
+	// Build the API client against the local API port
+	client := httpclient.New(fmt.Sprintf("http://localhost:%d/api/v1", cfg.APIPort), cfg.APIKey)
+	mockServerInfra := testinfra.NewMockServerInfra(mockServerBaseURL)
+
+	// Tenant and destination IDs derived from the current nanosecond timestamp
+	tenantID := fmt.Sprintf("tenant_%d", time.Now().UnixNano())
+	destinationID := fmt.Sprintf("dest_%d", time.Now().UnixNano())
+	secret := "testsecret1234567890abcdefghijklmnop"
+
+	// Create the tenant; the test requires a 201 response
+	resp, err := client.Do(httpclient.Request{
+		Method:  httpclient.MethodPUT,
+		Path:    "/" + tenantID,
+		Headers: map[string]string{"Authorization": "Bearer " + cfg.APIKey},
+	})
+	require.NoError(t, err)
+	require.Equal(t, 201, resp.StatusCode, "failed to create tenant")
+
+	// Register the destination with the mock server (BaseURL is the mock server, not the API)
+	resp, err = client.Do(httpclient.Request{
+		Method:  httpclient.MethodPUT,
+		BaseURL: mockServerBaseURL,
+		Path:    "/destinations",
+		Body: map[string]interface{}{
+			"id":   destinationID,
+			"type": "webhook",
+			"config": map[string]interface{}{
+				"url": fmt.Sprintf("%s/webhook/%s", mockServerBaseURL, destinationID),
+			},
+			"credentials": map[string]interface{}{
+				"secret": secret,
+			},
+		},
+	})
+	require.NoError(t, err)
+	require.Equal(t, 200, resp.StatusCode, "failed to configure mock server")
+
+	// Create the webhook destination under the tenant with topics "*", pointing at the mock server
+	resp, err = client.Do(httpclient.Request{
+		Method:  httpclient.MethodPOST,
+		Path:    "/" + tenantID + "/destinations",
+		Headers: map[string]string{"Authorization": "Bearer " + cfg.APIKey},
+		Body: map[string]interface{}{
+			"id":     destinationID,
+			"type":   "webhook",
+			"topics": "*",
+			"config": map[string]interface{}{
+				"url": fmt.Sprintf("%s/webhook/%s", mockServerBaseURL, destinationID),
+			},
+			"credentials": map[string]interface{}{
+				"secret": secret,
+			},
+		},
+	})
+	require.NoError(t, err)
+	require.Equal(t, 201, resp.StatusCode, "failed to create destination")
+
+	// Publish 21 events (threshold is 20); should_err="true" presumably makes the mock fail them - TODO confirm
+	for i := 0; i < 21; i++ {
+		resp, err = client.Do(httpclient.Request{
+			Method:  httpclient.MethodPOST,
+			Path:    "/publish",
+			Headers: map[string]string{"Authorization": "Bearer " + cfg.APIKey},
+			Body: map[string]interface{}{
+				"tenant_id":          tenantID,
+				"topic":              "user.created",
+				"eligible_for_retry": false,
+				"metadata": map[string]any{
+					"should_err": "true",
+				},
+				"data": map[string]any{
+					"index": i,
+				},
+			},
+		})
+		require.NoError(t, err)
+		require.Equal(t, 202, resp.StatusCode, "failed to publish event %d", i)
+	}
+
+	// Fixed 1s pause for deliveries to be processed (no polling) - NOTE(review): timing-sensitive
+	time.Sleep(time.Second)
+
+	// Fetch the destination so its disabled state can be inspected
+	resp, err = client.Do(httpclient.Request{
+		Method:  httpclient.MethodGET,
+		Path:    "/" + tenantID + "/destinations/" + destinationID,
+		Headers: map[string]string{"Authorization": "Bearer " + cfg.APIKey},
+	})
+	require.NoError(t, err)
+	require.Equal(t, 200, resp.StatusCode, "failed to get destination")
+
+	// The body must be a map; a non-nil disabled_at entry is the pass condition
+	bodyMap, ok := resp.Body.(map[string]interface{})
+	require.True(t, ok, "response body should be a map")
+
+	disabledAt := bodyMap["disabled_at"]
+	assert.NotNil(t, disabledAt, "destination should be disabled (disabled_at should not be null) - issue #596")
+
+	// No cleanup happens here: the blank assignment only marks mockServerInfra as used
+	_ = mockServerInfra
+}
diff --git a/internal/alert/evaluator.go b/internal/alert/evaluator.go
@@ -69,9 +69,21 @@ func (e *alertEvaluator) ShouldAlert(failures int) (int, bool) {
 	}
 
 	// Get current alert level
+	// Iterate from highest to lowest threshold
 	for i := len(e.thresholds) - 1; i >= 0; i-- {
-		if failures == e.thresholds[i].failures {
-			return e.thresholds[i].percentage, true
+		threshold := e.thresholds[i]
+
+		// For the 100% threshold (auto-disable), use >= to ensure we don't miss it
+		// if concurrent processing causes us to skip over the exact count.
+		// For other thresholds, use exact match to avoid duplicate alerts.
+		if threshold.percentage == 100 {
+			if failures >= threshold.failures {
+				return threshold.percentage, true
+			}
+		} else {
+			if failures == threshold.failures {
+				return threshold.percentage, true
+			}
 		}
 	}
 
diff --git a/internal/alert/monitor_test.go b/internal/alert/monitor_test.go
@@ -168,3 +168,71 @@ func TestAlertMonitor_ConsecutiveFailures_Reset(t *testing.T) {
 	// Verify the destination was never disabled
 	disabler.AssertNotCalled(t, "DisableDestination")
 }
+
+func TestAlertMonitor_ConsecutiveFailures_AboveThreshold(t *testing.T) {
+	// Verifies that failures beyond the 100% threshold (20 here) still notify and disable:
+	// 25 failures must yield 9 notifications and 6 DisableDestination calls, so the disable
+	// is not missed if concurrent processing causes us to skip over the exact threshold count.
+	t.Parallel()
+	ctx := context.Background()
+	logger := testutil.CreateTestLogger(t)
+	redisClient := testutil.CreateTestRedisClient(t)
+	notifier := &mockAlertNotifier{}
+	notifier.On("Notify", mock.Anything, mock.Anything).Return(nil)
+	disabler := &mockDestinationDisabler{}
+	disabler.On("DisableDestination", mock.Anything, mock.Anything, mock.Anything).Return(nil)
+
+	monitor := alert.NewAlertMonitor(
+		logger,
+		redisClient,
+		alert.WithNotifier(notifier),
+		alert.WithDisabler(disabler),
+		alert.WithAutoDisableFailureCount(20),
+		alert.WithAlertThresholds([]int{50, 70, 90, 100}),
+	)
+
+	dest := &alert.AlertDestination{ID: "dest_above", TenantID: "tenant_above"}
+	event := &models.Event{Topic: "test.event"}
+	deliveryEvent := &models.DeliveryEvent{Event: *event}
+	attempt := alert.DeliveryAttempt{
+		Success:       false,
+		DeliveryEvent: deliveryEvent,
+		Destination:   dest,
+		DeliveryResponse: map[string]interface{}{
+			"status": "500",
+		},
+		Timestamp: time.Now(),
+	}
+
+	// Send the same failed attempt 25 times (5 more than the threshold of 20)
+	for i := 1; i <= 25; i++ {
+		require.NoError(t, monitor.HandleAttempt(ctx, attempt))
+	}
+
+	// Count every Notify call; those reporting ConsecutiveFailures >= 20 (failure 20 at the
+	// 100% threshold plus failures 21-25 above it) must carry WillDisable=true
+	var notifyCallCount int
+	var disableNotifyCount int
+	for _, call := range notifier.Calls {
+		if call.Method == "Notify" {
+			notifyCallCount++
+			alertData := call.Arguments.Get(1).(alert.ConsecutiveFailureAlert)
+			if alertData.Data.ConsecutiveFailures >= 20 {
+				disableNotifyCount++
+				require.True(t, alertData.Data.WillDisable, "WillDisable should be true at and above 100%")
+			}
+		}
+	}
+	// 4 alerts at thresholds (10, 14, 18, 20) + 5 alerts for 21-25
+	require.Equal(t, 9, notifyCallCount, "Should have sent 9 notifications (4 at thresholds + 5 above)")
+	require.Equal(t, 6, disableNotifyCount, "Should have 6 notifications with WillDisable=true (20-25)")
+
+	// Count DisableDestination calls: one is expected per failure >= 20, i.e. 6 in total
+	var disableCallCount int
+	for _, call := range disabler.Calls {
+		if call.Method == "DisableDestination" {
+			disableCallCount++
+		}
+	}
+	require.Equal(t, 6, disableCallCount, "Should have called disable 6 times (for failures 20-25)")
+}