Revert "Add /liveness endpoint to elastic-agent (#4499)" (#4583)
This reverts commit 29ce53e.
cmacknz authored Apr 16, 2024
1 parent 29ce53e commit eca5bc7
Showing 22 changed files with 88 additions and 975 deletions.
13 changes: 0 additions & 13 deletions _meta/config/common.p2.yml.tmpl
@@ -66,19 +66,6 @@ inputs:
# # The name of the output to use for monitoring data.
# use_output: monitoring
# # exposes agent metrics using http, by default sockets and named pipes are used
# #
# # `http` Also exposes a /liveness endpoint that will return an HTTP code depending on agent status:
# # 200: Agent is healthy
# # 500: A component or unit is in a failed state
# # 503: The agent coordinator is unresponsive
# #
# # You can pass a `failon` parameter to the /liveness endpoint to determine what component state will result in a 500.
# # For example: `curl 'localhost:6792/liveness?failon=degraded'` will return 500 if a component is in a degraded state.
# # The possible values for `failon` are:
# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
# http:
# # enables http endpoint
# enabled: false
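For context, the comment block removed above documented the reverted /liveness endpoint. A minimal Go sketch of a client-side check against that endpoint, assuming the port 6792 and the failon semantics described in the removed comments (the endpoint no longer exists after this revert):

```go
package main

import (
	"fmt"
	"net/http"
	"time"
)

// checkLiveness probes the reverted /liveness endpoint and maps its
// documented status codes to a pass/fail result. failon selects which
// states count as failures: "degraded", "failed", or "heartbeat".
func checkLiveness(baseURL, failon string) (bool, error) {
	client := &http.Client{Timeout: 5 * time.Second}
	resp, err := client.Get(fmt.Sprintf("%s/liveness?failon=%s", baseURL, failon))
	if err != nil {
		return false, err
	}
	defer resp.Body.Close()

	switch resp.StatusCode {
	case http.StatusOK: // 200: agent is healthy
		return true, nil
	case http.StatusInternalServerError: // 500: a component or unit tripped the failon check
		return false, nil
	case http.StatusServiceUnavailable: // 503: the agent coordinator is unresponsive
		return false, nil
	default:
		return false, fmt.Errorf("unexpected status %d", resp.StatusCode)
	}
}

func main() {
	healthy, err := checkLiveness("http://localhost:6792", "degraded")
	if err != nil {
		fmt.Println("liveness check failed:", err)
		return
	}
	fmt.Println("agent healthy:", healthy)
}
```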
15 changes: 1 addition & 14 deletions _meta/config/common.reference.p2.yml.tmpl
@@ -144,20 +144,7 @@ inputs:
# pprof.enabled: false
# # The name of the output to use for monitoring data.
# use_output: monitoring
# # Exposes agent metrics using http, by default sockets and named pipes are used.
# #
# # `http` Also exposes a /liveness endpoint that will return an HTTP code depending on agent status:
# # 200: Agent is healthy
# # 500: A component or unit is in a failed state
# # 503: The agent coordinator is unresponsive
# #
# # You can pass a `failon` parameter to the /liveness endpoint to determine what component state will result in a 500.
# # For example: `curl 'localhost:6792/liveness?failon=degraded'` will return 500 if a component is in a degraded state.
# # The possible values for `failon` are:
# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
# # exposes agent metrics using http, by default sockets and named pipes are used
# http:
# # enables http endpoint
# enabled: false
13 changes: 0 additions & 13 deletions _meta/config/elastic-agent.docker.yml.tmpl
@@ -116,19 +116,6 @@ inputs:
# # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost
# pprof.enabled: false
# # exposes agent metrics using http, by default sockets and named pipes are used
# #
# # `http` Also exposes a /liveness endpoint that will return an HTTP code depending on agent status:
# # 200: Agent is healthy
# # 500: A component or unit is in a failed state
# # 503: The agent coordinator is unresponsive
# #
# # You can pass a `failon` parameter to the /liveness endpoint to determine what component state will result in a 500.
# # For example: `curl 'localhost:6792/liveness?failon=degraded'` will return 500 if a component is in a degraded state.
# # The possible values for `failon` are:
# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
# http:
# # enables http endpoint
# enabled: false
10 changes: 1 addition & 9 deletions _meta/elastic-agent.yml
@@ -103,15 +103,7 @@ inputs:
# logs: false
# # enables metrics monitoring
# metrics: false
# # Exposes agent metrics using http, by default sockets and named pipes are used.
# # Also exposes a /liveness endpoint that will return an HTTP code depending on agent status:
# # 200: Agent is healthy
# # 500: A component or unit is in a failed state
# # 503: The agent coordinator is unresponsive
# # You can pass a `failon` parameter to the /liveness endpoint to determine what component state will result in a 500.
# # For example: `curl 'localhost:6792/liveness?failon=degraded'` will return 500 if a component is in a degraded state.
# # The two possible values for `failon` are `degraded` and `failed`. If no `failon` parameter is provided, the default
# # behavior is `failon=failed`
# # exposes agent metrics using http, by default sockets and named pipes are used
# http:
# # enables http endpoint
# enabled: false
32 changes: 0 additions & 32 deletions changelog/fragments/1711653910-add-liveness-endpoint.yaml

This file was deleted.

13 changes: 0 additions & 13 deletions elastic-agent.docker.yml
@@ -116,19 +116,6 @@ inputs:
# # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost
# pprof.enabled: false
# # exposes agent metrics using http, by default sockets and named pipes are used
# #
# # `http` Also exposes a /liveness endpoint that will return an HTTP code depending on agent status:
# # 200: Agent is healthy
# # 500: A component or unit is in a failed state
# # 503: The agent coordinator is unresponsive
# #
# # You can pass a `failon` parameter to the /liveness endpoint to determine what component state will result in a 500.
# # For example: `curl 'localhost:6792/liveness?failon=degraded'` will return 500 if a component is in a degraded state.
# # The possible values for `failon` are:
# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
# http:
# # enables http endpoint
# enabled: false
15 changes: 1 addition & 14 deletions elastic-agent.reference.yml
@@ -150,20 +150,7 @@ inputs:
# pprof.enabled: false
# # The name of the output to use for monitoring data.
# use_output: monitoring
# # Exposes agent metrics using http, by default sockets and named pipes are used.
# #
# # `http` Also exposes a /liveness endpoint that will return an HTTP code depending on agent status:
# # 200: Agent is healthy
# # 500: A component or unit is in a failed state
# # 503: The agent coordinator is unresponsive
# #
# # You can pass a `failon` parameter to the /liveness endpoint to determine what component state will result in a 500.
# # For example: `curl 'localhost:6792/liveness?failon=degraded'` will return 500 if a component is in a degraded state.
# # The possible values for `failon` are:
# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
# # exposes agent metrics using http, by default sockets and named pipes are used
# http:
# # enables http endpoint
# enabled: false
13 changes: 0 additions & 13 deletions elastic-agent.yml
@@ -72,19 +72,6 @@ inputs:
# # The name of the output to use for monitoring data.
# use_output: monitoring
# # exposes agent metrics using http, by default sockets and named pipes are used
# #
# # `http` Also exposes a /liveness endpoint that will return an HTTP code depending on agent status:
# # 200: Agent is healthy
# # 500: A component or unit is in a failed state
# # 503: The agent coordinator is unresponsive
# #
# # You can pass a `failon` parameter to the /liveness endpoint to determine what component state will result in a 500.
# # For example: `curl 'localhost:6792/liveness?failon=degraded'` will return 500 if a component is in a degraded state.
# # The possible values for `failon` are:
# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
# http:
# # enables http endpoint
# enabled: false
24 changes: 0 additions & 24 deletions internal/pkg/agent/application/coordinator/coordinator.go
@@ -279,11 +279,6 @@ type Coordinator struct {

// mx sync.RWMutex
// protection protection.Config

// a sync channel that can be called by other components to check if the main coordinator
// loop in runLoopIteration() is active and listening.
// Should only be interacted with via CoordinatorActive() or runLoopIteration()
heartbeatChan chan struct{}
}

// The channels Coordinator reads to receive updates from the various managers.
@@ -377,7 +372,6 @@ func New(logger *logger.Logger, cfg *configuration.Configuration, logLevel logp.
logLevelCh: make(chan logp.Level),
overrideStateChan: make(chan *coordinatorOverrideState),
upgradeDetailsChan: make(chan *details.Details),
heartbeatChan: make(chan struct{}),
}
// Setup communication channels for any non-nil components. This pattern
// lets us transparently accept nil managers / simulated events during
@@ -418,22 +412,6 @@ func (c *Coordinator) State() State {
return c.stateBroadcaster.Get()
}

// CoordinatorActive is a blocking method that waits for a channel response
// from the coordinator loop. This can be used to as a basic health check,
// as we'll timeout and return false if the coordinator run loop doesn't
// respond to our channel.
func (c *Coordinator) CoordinatorActive(timeout time.Duration) bool {
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()

select {
case <-c.heartbeatChan:
return true
case <-ctx.Done():
return false
}
}

func (c *Coordinator) RegisterMonitoringServer(s configReloader) {
c.monitoringServerReloader = s
}
@@ -999,8 +977,6 @@ func (c *Coordinator) runLoopIteration(ctx context.Context) {
case upgradeDetails := <-c.upgradeDetailsChan:
c.setUpgradeDetails(upgradeDetails)

case c.heartbeatChan <- struct{}{}:

case componentState := <-c.managerChans.runtimeManagerUpdate:
// New component change reported by the runtime manager via
// Coordinator.watchRuntimeComponents(), merge it with the
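The removed heartbeatChan and CoordinatorActive implement a standard Go liveness pattern: the run loop offers a send on an unbuffered channel as one of its select cases, so a completed receive on the other side proves the loop is still iterating. A self-contained sketch of the pattern (names are illustrative, not the agent's API):

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// runLoop services its work in a select; offering a send on the unbuffered
// heartbeat channel as one case means the send can only complete while the
// loop is actively selecting, so a successful receive proves liveness.
func runLoop(ctx context.Context, heartbeat chan struct{}) {
	for {
		select {
		case <-ctx.Done():
			return
		case heartbeat <- struct{}{}: // answer a liveness probe
		}
	}
}

// active mirrors the shape of the removed CoordinatorActive: block until
// the loop answers the heartbeat or the timeout expires.
func active(heartbeat chan struct{}, timeout time.Duration) bool {
	select {
	case <-heartbeat:
		return true
	case <-time.After(timeout):
		return false
	}
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	heartbeat := make(chan struct{}) // unbuffered on purpose

	go runLoop(ctx, heartbeat)
	fmt.Println("responsive:", active(heartbeat, time.Second)) // true

	cancel()                          // stop the loop
	time.Sleep(10 * time.Millisecond) // let it observe cancellation
	fmt.Println("responsive:", active(heartbeat, 100*time.Millisecond)) // false
}
```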
internal/pkg/agent/application/coordinator/coordinator_test.go
@@ -14,7 +14,6 @@ import (
"context"
"errors"
"fmt"
"net"
"testing"
"time"

@@ -571,7 +570,7 @@ func TestCoordinatorPolicyChangeUpdatesMonitorReloader(t *testing.T) {
}

monitoringServer := &fakeMonitoringServer{}
newServerFn := func(*monitoringCfg.MonitoringConfig) (reload.ServerController, error) {
newServerFn := func() (reload.ServerController, error) {
return monitoringServer, nil
}
monitoringReloader := reload.NewServerReloader(newServerFn, logger, monitoringCfg.DefaultConfig())
@@ -1055,7 +1054,3 @@ func (fs *fakeMonitoringServer) Reset() {
fs.stopTriggered = false
fs.startTriggered = false
}

func (fs *fakeMonitoringServer) Addr() net.Addr {
return nil
}
12 changes: 1 addition & 11 deletions internal/pkg/agent/application/monitoring/handler.go
@@ -8,9 +8,6 @@ import (
"encoding/json"
"fmt"
"net/http"
"time"

"github.com/elastic/elastic-agent/internal/pkg/agent/application/coordinator"
)

const errTypeUnexpected = "UNEXPECTED"
@@ -19,13 +16,6 @@ type apiError interface {
Status() int
}

// CoordinatorState is used by the HTTP handlers that take a coordinator object.
// This interface exists to help make testing easier.
type CoordinatorState interface {
State() coordinator.State
CoordinatorActive(timeout time.Duration) bool
}

func createHandler(fn func(w http.ResponseWriter, r *http.Request) error) *apiHandler {
return &apiHandler{
innerFn: fn,
@@ -40,7 +30,7 @@ type apiHandler struct {
func (h *apiHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
err := h.innerFn(w, r)
if err != nil {
switch e := err.(type) { //nolint:errorlint // Will need refactor.
switch e := err.(type) { // nolint:errorlint // Will need refactor.
case apiError:
w.WriteHeader(e.Status())
default:
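The surviving apiHandler above turns error-returning handler funcs into http.Handlers, using the apiError interface to pick a status code. A self-contained sketch of how such a wrapper is typically wired up (statusError and the route are hypothetical, for illustration only):

```go
package main

import (
	"fmt"
	"net/http"
)

// apiError mirrors the interface kept in handler.go: an error that
// knows which HTTP status code it should produce.
type apiError interface {
	error
	Status() int
}

// statusError is a hypothetical concrete apiError for the example.
type statusError struct {
	code int
	msg  string
}

func (e *statusError) Error() string { return e.msg }
func (e *statusError) Status() int   { return e.code }

// apiHandler wraps an error-returning handler func, following the shape
// of createHandler/apiHandler in handler.go.
type apiHandler struct {
	innerFn func(w http.ResponseWriter, r *http.Request) error
}

func (h *apiHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	if err := h.innerFn(w, r); err != nil {
		// Errors that carry a status use it; anything else becomes a 500.
		if e, ok := err.(apiError); ok {
			w.WriteHeader(e.Status())
		} else {
			w.WriteHeader(http.StatusInternalServerError)
		}
		fmt.Fprintln(w, err.Error())
	}
}

func main() {
	http.Handle("/always-fails", &apiHandler{
		innerFn: func(w http.ResponseWriter, r *http.Request) error {
			return &statusError{code: http.StatusServiceUnavailable, msg: "coordinator unresponsive"}
		},
	})
	_ = http.ListenAndServe("localhost:8080", nil)
}
```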
88 changes: 0 additions & 88 deletions internal/pkg/agent/application/monitoring/liveness.go

This file was deleted.

