diff --git a/_meta/config/common.p2.yml.tmpl b/_meta/config/common.p2.yml.tmpl index 99305a43ce1..7ca36f155a1 100644 --- a/_meta/config/common.p2.yml.tmpl +++ b/_meta/config/common.p2.yml.tmpl @@ -66,19 +66,6 @@ inputs: # # The name of the output to use for monitoring data. # use_output: monitoring # # exposes agent metrics using http, by default sockets and named pipes are used -# # -# # `http` Also exposes a /liveness endpoint that will return an HTTP code depending on agent status: -# # 200: Agent is healthy -# # 500: A component or unit is in a failed state -# # 503: The agent coordinator is unresponsive -# # -# # You can pass a `failon` parameter to the /liveness endpoint to determine what component state will result in a 500. -# # For example: `curl 'localhost:6792/liveness?failon=degraded'` will return 500 if a component is in a degraded state. -# # The possible values for `failon` are: -# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive. -# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive. -# # `heartbeat`: return an error only if the agent coordinator is unresponsive. -# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat` # http: # # enables http endpoint # enabled: false diff --git a/_meta/config/common.reference.p2.yml.tmpl b/_meta/config/common.reference.p2.yml.tmpl index 4c729a57e81..325cd13ab18 100644 --- a/_meta/config/common.reference.p2.yml.tmpl +++ b/_meta/config/common.reference.p2.yml.tmpl @@ -144,20 +144,7 @@ inputs: # pprof.enabled: false # # The name of the output to use for monitoring data. # use_output: monitoring -# # Exposes agent metrics using http, by default sockets and named pipes are used. -# # -# # `http` Also exposes a /liveness endpoint that will return an HTTP code depending on agent status: -# # 200: Agent is healthy -# # 500: A component or unit is in a failed state -# # 503: The agent coordinator is unresponsive -# # -# # You can pass a `failon` parameter to the /liveness endpoint to determine what component state will result in a 500. -# # For example: `curl 'localhost:6792/liveness?failon=degraded'` will return 500 if a component is in a degraded state. -# # The possible values for `failon` are: -# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive. -# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive. -# # `heartbeat`: return an error only if the agent coordinator is unresponsive. -# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat` +# # exposes agent metrics using http, by default sockets and named pipes are used # http: # # enables http endpoint # enabled: false diff --git a/_meta/config/elastic-agent.docker.yml.tmpl b/_meta/config/elastic-agent.docker.yml.tmpl index bc8ebee12f0..134aecf3249 100644 --- a/_meta/config/elastic-agent.docker.yml.tmpl +++ b/_meta/config/elastic-agent.docker.yml.tmpl @@ -116,19 +116,6 @@ inputs: # # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost # pprof.enabled: false # # exposes agent metrics using http, by default sockets and named pipes are used -# # -# # `http` Also exposes a /liveness endpoint that will return an HTTP code depending on agent status: -# # 200: Agent is healthy -# # 500: A component or unit is in a failed state -# # 503: The agent coordinator is unresponsive -# # -# # You can pass a `failon` parameter to the /liveness endpoint to determine what component state will result in a 500. -# # For example: `curl 'localhost:6792/liveness?failon=degraded'` will return 500 if a component is in a degraded state. -# # The possible values for `failon` are: -# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive. -# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive. -# # `heartbeat`: return an error only if the agent coordinator is unresponsive. -# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat` # http: # # enables http endpoint # enabled: false diff --git a/_meta/elastic-agent.yml b/_meta/elastic-agent.yml index c25bffafae3..b2be165ba2f 100644 --- a/_meta/elastic-agent.yml +++ b/_meta/elastic-agent.yml @@ -103,15 +103,7 @@ inputs: # logs: false # # enables metrics monitoring # metrics: false -# # Exposes agent metrics using http, by default sockets and named pipes are used. -# # Also exposes a /liveness endpoint that will return an HTTP code depending on agent status: -# # 200: Agent is healthy -# # 500: A component or unit is in a failed state -# # 503: The agent coordinator is unresponsive -# # You can pass a `failon` parameter to the /liveness endpoint to determine what component state will result in a 500. -# # For example: `curl 'localhost:6792/liveness?failon=degraded'` will return 500 if a component is in a degraded state. -# # The two possible values for `failon` are `degraded` and `failed`. If no `failon` parameter is provided, the default -# # behavior is `failon=failed` +# # exposes agent metrics using http, by default sockets and named pipes are used # http: # # enables http endpoint # enabled: false diff --git a/changelog/fragments/1711653910-add-liveness-endpoint.yaml b/changelog/fragments/1711653910-add-liveness-endpoint.yaml deleted file mode 100644 index 058e99b088c..00000000000 --- a/changelog/fragments/1711653910-add-liveness-endpoint.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# Kind can be one of: -# - breaking-change: a change to previously-documented behavior -# - deprecation: functionality that is being removed in a later release -# - bug-fix: fixes a problem in a previous version -# - enhancement: extends functionality but does not break or fix existing behavior -# - feature: new functionality -# - known-issue: problems that we are aware of in a given version -# - security: impacts on the security of a product or a user’s deployment. -# - upgrade: important information for someone upgrading from a prior version -# - other: does not fit into any of the other categories -kind: feature - -# Change summary; a 80ish characters long description of the change. -summary: Add a configurable /liveness endpoint. - -# Long description; in case the summary is not enough to describe the change -# this field accommodate a description without length limits. -# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. -description: Adds a liveness endpoint suitable for use as a k8s liveness probe. - -# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. -component: monitoring - -# PR URL; optional; the PR number that added the changeset. -# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. -# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. -# Please provide it if you are adding a fragment for a different PR. -pr: https://github.com/elastic/elastic-agent/pull/4499 - -# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). -# If not present is automatically filled by the tooling with the issue linked to the PR number. -#issue: https://github.com/owner/repo/1234 diff --git a/elastic-agent.docker.yml b/elastic-agent.docker.yml index eac46becfd8..87aa2cdd0cd 100644 --- a/elastic-agent.docker.yml +++ b/elastic-agent.docker.yml @@ -116,19 +116,6 @@ inputs: # # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost # pprof.enabled: false # # exposes agent metrics using http, by default sockets and named pipes are used -# # -# # `http` Also exposes a /liveness endpoint that will return an HTTP code depending on agent status: -# # 200: Agent is healthy -# # 500: A component or unit is in a failed state -# # 503: The agent coordinator is unresponsive -# # -# # You can pass a `failon` parameter to the /liveness endpoint to determine what component state will result in a 500. -# # For example: `curl 'localhost:6792/liveness?failon=degraded'` will return 500 if a component is in a degraded state. -# # The possible values for `failon` are: -# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive. -# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive. -# # `heartbeat`: return an error only if the agent coordinator is unresponsive. -# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat` # http: # # enables http endpoint # enabled: false diff --git a/elastic-agent.reference.yml b/elastic-agent.reference.yml index 66e8bbb0951..7ee19406674 100644 --- a/elastic-agent.reference.yml +++ b/elastic-agent.reference.yml @@ -150,20 +150,7 @@ inputs: # pprof.enabled: false # # The name of the output to use for monitoring data. # use_output: monitoring -# # Exposes agent metrics using http, by default sockets and named pipes are used. -# # -# # `http` Also exposes a /liveness endpoint that will return an HTTP code depending on agent status: -# # 200: Agent is healthy -# # 500: A component or unit is in a failed state -# # 503: The agent coordinator is unresponsive -# # -# # You can pass a `failon` parameter to the /liveness endpoint to determine what component state will result in a 500. -# # For example: `curl 'localhost:6792/liveness?failon=degraded'` will return 500 if a component is in a degraded state. -# # The possible values for `failon` are: -# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive. -# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive. -# # `heartbeat`: return an error only if the agent coordinator is unresponsive. -# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat` +# # exposes agent metrics using http, by default sockets and named pipes are used # http: # # enables http endpoint # enabled: false diff --git a/elastic-agent.yml b/elastic-agent.yml index 0b4e2d38e3f..342fd50f432 100644 --- a/elastic-agent.yml +++ b/elastic-agent.yml @@ -72,19 +72,6 @@ inputs: # # The name of the output to use for monitoring data. # use_output: monitoring # # exposes agent metrics using http, by default sockets and named pipes are used -# # -# # `http` Also exposes a /liveness endpoint that will return an HTTP code depending on agent status: -# # 200: Agent is healthy -# # 500: A component or unit is in a failed state -# # 503: The agent coordinator is unresponsive -# # -# # You can pass a `failon` parameter to the /liveness endpoint to determine what component state will result in a 500. -# # For example: `curl 'localhost:6792/liveness?failon=degraded'` will return 500 if a component is in a degraded state. -# # The possible values for `failon` are: -# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive. -# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive. -# # `heartbeat`: return an error only if the agent coordinator is unresponsive. -# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat` # http: # # enables http endpoint # enabled: false diff --git a/internal/pkg/agent/application/coordinator/coordinator.go b/internal/pkg/agent/application/coordinator/coordinator.go index 1d0dd63e5f2..10be2cc2aa8 100644 --- a/internal/pkg/agent/application/coordinator/coordinator.go +++ b/internal/pkg/agent/application/coordinator/coordinator.go @@ -279,11 +279,6 @@ type Coordinator struct { // mx sync.RWMutex // protection protection.Config - - // a sync channel that can be called by other components to check if the main coordinator - // loop in runLoopIteration() is active and listening. - // Should only be interacted with via CoordinatorActive() or runLoopIteration() - heartbeatChan chan struct{} } // The channels Coordinator reads to receive updates from the various managers. @@ -377,7 +372,6 @@ func New(logger *logger.Logger, cfg *configuration.Configuration, logLevel logp. logLevelCh: make(chan logp.Level), overrideStateChan: make(chan *coordinatorOverrideState), upgradeDetailsChan: make(chan *details.Details), - heartbeatChan: make(chan struct{}), } // Setup communication channels for any non-nil components. This pattern // lets us transparently accept nil managers / simulated events during @@ -418,22 +412,6 @@ func (c *Coordinator) State() State { return c.stateBroadcaster.Get() } -// CoordinatorActive is a blocking method that waits for a channel response -// from the coordinator loop. This can be used to as a basic health check, -// as we'll timeout and return false if the coordinator run loop doesn't -// respond to our channel. -func (c *Coordinator) CoordinatorActive(timeout time.Duration) bool { - ctx, cancel := context.WithTimeout(context.Background(), timeout) - defer cancel() - - select { - case <-c.heartbeatChan: - return true - case <-ctx.Done(): - return false - } -} - func (c *Coordinator) RegisterMonitoringServer(s configReloader) { c.monitoringServerReloader = s } @@ -999,8 +977,6 @@ func (c *Coordinator) runLoopIteration(ctx context.Context) { case upgradeDetails := <-c.upgradeDetailsChan: c.setUpgradeDetails(upgradeDetails) - case c.heartbeatChan <- struct{}{}: - case componentState := <-c.managerChans.runtimeManagerUpdate: // New component change reported by the runtime manager via // Coordinator.watchRuntimeComponents(), merge it with the diff --git a/internal/pkg/agent/application/coordinator/coordinator_unit_test.go b/internal/pkg/agent/application/coordinator/coordinator_unit_test.go index 2831e3e6a4e..62074106e5b 100644 --- a/internal/pkg/agent/application/coordinator/coordinator_unit_test.go +++ b/internal/pkg/agent/application/coordinator/coordinator_unit_test.go @@ -14,7 +14,6 @@ import ( "context" "errors" "fmt" - "net" "testing" "time" @@ -571,7 +570,7 @@ func TestCoordinatorPolicyChangeUpdatesMonitorReloader(t *testing.T) { } monitoringServer := &fakeMonitoringServer{} - newServerFn := func(*monitoringCfg.MonitoringConfig) (reload.ServerController, error) { + newServerFn := func() (reload.ServerController, error) { return monitoringServer, nil } monitoringReloader := reload.NewServerReloader(newServerFn, logger, monitoringCfg.DefaultConfig()) @@ -1055,7 +1054,3 @@ func (fs *fakeMonitoringServer) Reset() { fs.stopTriggered = false fs.startTriggered = false } - -func (fs *fakeMonitoringServer) Addr() net.Addr { - return nil -} diff --git a/internal/pkg/agent/application/monitoring/handler.go b/internal/pkg/agent/application/monitoring/handler.go index 04a440ee74a..6bec3eb37f2 100644 --- a/internal/pkg/agent/application/monitoring/handler.go +++ b/internal/pkg/agent/application/monitoring/handler.go @@ -8,9 +8,6 @@ import ( "encoding/json" "fmt" "net/http" - "time" - - "github.com/elastic/elastic-agent/internal/pkg/agent/application/coordinator" ) const errTypeUnexpected = "UNEXPECTED" @@ -19,13 +16,6 @@ type apiError interface { Status() int } -// CoordinatorState is used by the HTTP handlers that take a coordinator object. -// This interface exists to help make testing easier. -type CoordinatorState interface { - State() coordinator.State - CoordinatorActive(timeout time.Duration) bool -} - func createHandler(fn func(w http.ResponseWriter, r *http.Request) error) *apiHandler { return &apiHandler{ innerFn: fn, @@ -40,7 +30,7 @@ type apiHandler struct { func (h *apiHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { err := h.innerFn(w, r) if err != nil { - switch e := err.(type) { //nolint:errorlint // Will need refactor. + switch e := err.(type) { // nolint:errorlint // Will need refactor. case apiError: w.WriteHeader(e.Status()) default: diff --git a/internal/pkg/agent/application/monitoring/liveness.go b/internal/pkg/agent/application/monitoring/liveness.go deleted file mode 100644 index 3f90c507036..00000000000 --- a/internal/pkg/agent/application/monitoring/liveness.go +++ /dev/null @@ -1,88 +0,0 @@ -// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one -// or more contributor license agreements. Licensed under the Elastic License; -// you may not use this file except in compliance with the Elastic License. - -package monitoring - -import ( - "fmt" - "net/http" - "time" - - "github.com/elastic/elastic-agent-client/v7/pkg/client" -) - -const formValueKey = "failon" - -type LivenessFailConfig struct { - Degraded bool `yaml:"degraded" config:"degraded"` - Failed bool `yaml:"failed" config:"failed"` - Heartbeat bool `yaml:"heartbeat" config:"heartbeat"` -} - -// process the form values we get via HTTP -func handleFormValues(req *http.Request) (LivenessFailConfig, error) { - err := req.ParseForm() - if err != nil { - return LivenessFailConfig{}, fmt.Errorf("Error parsing form: %w", err) - } - - defaultUserCfg := LivenessFailConfig{Degraded: false, Failed: false, Heartbeat: true} - - for formKey := range req.Form { - if formKey != formValueKey { - return defaultUserCfg, fmt.Errorf("got invalid HTTP form key: '%s'", formKey) - } - } - - userConfig := req.Form.Get(formValueKey) - switch userConfig { - case "failed": - return LivenessFailConfig{Degraded: false, Failed: true, Heartbeat: true}, nil - case "degraded": - return LivenessFailConfig{Failed: true, Degraded: true, Heartbeat: true}, nil - case "heartbeat", "": - return defaultUserCfg, nil - default: - return defaultUserCfg, fmt.Errorf("got unexpected value for `%s` attribute: %s", formValueKey, userConfig) - } -} - -func livenessHandler(coord CoordinatorState) func(http.ResponseWriter, *http.Request) error { - return func(w http.ResponseWriter, r *http.Request) error { - w.Header().Set("Content-Type", "application/json; charset=utf-8") - - state := coord.State() - isUp := coord.CoordinatorActive(time.Second * 10) - // the coordinator check is always on, so if that fails, always return false - if !isUp { - w.WriteHeader(http.StatusServiceUnavailable) - return nil - } - - failConfig, err := handleFormValues(r) - if err != nil { - return fmt.Errorf("error handling form values: %w", err) - } - - // if user has requested `coordinator` mode, just revert to that, skip everything else - if !failConfig.Degraded && !failConfig.Failed && failConfig.Heartbeat { - if !isUp { - w.WriteHeader(http.StatusServiceUnavailable) - return nil - } - } - - unhealthyComponent := false - for _, comp := range state.Components { - if (failConfig.Failed && comp.State.State == client.UnitStateFailed) || (failConfig.Degraded && comp.State.State == client.UnitStateDegraded) { - unhealthyComponent = true - } - } - // bias towards the coordinator check, since it can be otherwise harder to diagnose - if unhealthyComponent { - w.WriteHeader(http.StatusInternalServerError) - } - return nil - } -} diff --git a/internal/pkg/agent/application/monitoring/liveness_test.go b/internal/pkg/agent/application/monitoring/liveness_test.go deleted file mode 100644 index 36aa8d28409..00000000000 --- a/internal/pkg/agent/application/monitoring/liveness_test.go +++ /dev/null @@ -1,373 +0,0 @@ -// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one -// or more contributor license agreements. Licensed under the Elastic License; -// you may not use this file except in compliance with the Elastic License. - -package monitoring - -import ( - "context" - "fmt" - "net/http" - "net/http/httptest" - "testing" - "time" - - "github.com/stretchr/testify/require" - - "github.com/elastic/elastic-agent-client/v7/pkg/client" - "github.com/elastic/elastic-agent/internal/pkg/agent/application/coordinator" - "github.com/elastic/elastic-agent/pkg/component" - "github.com/elastic/elastic-agent/pkg/component/runtime" -) - -type mockCoordinator struct { - state coordinator.State - isUp bool -} - -func (mc mockCoordinator) State() coordinator.State { - return mc.state -} - -func (mc mockCoordinator) CoordinatorActive(_ time.Duration) bool { - return mc.isUp -} - -func TestProcessHTTPHandler(t *testing.T) { - ctx, cancel := context.WithTimeout(context.Background(), time.Second*5) - defer cancel() - testCases := []struct { - name string - coord mockCoordinator - expectedCode int - liveness bool - failon string - }{ - { - name: "default-failed", - coord: mockCoordinator{ - isUp: true, - state: coordinator.State{ - Components: []runtime.ComponentComponentState{ - { - LegacyPID: "2", - State: runtime.ComponentState{State: client.UnitStateFailed}, - Component: component.Component{ - ID: "test-component", - InputSpec: &component.InputRuntimeSpec{ - BinaryName: "testbeat", - }, - }, - }, - }, - }, - }, - expectedCode: 200, - liveness: true, - failon: "heartbeat", - }, - { - name: "default-healthy", - coord: mockCoordinator{ - isUp: true, - state: coordinator.State{ - Components: []runtime.ComponentComponentState{ - { - LegacyPID: "2", - State: runtime.ComponentState{State: client.UnitStateHealthy}, - Component: component.Component{ - ID: "test-component", - InputSpec: &component.InputRuntimeSpec{ - BinaryName: "testbeat", - }, - }, - }, - }, - }, - }, - expectedCode: 200, - liveness: true, - failon: "heartbeat", - }, - { - name: "degraded", - coord: mockCoordinator{ - isUp: true, - state: coordinator.State{ - Components: []runtime.ComponentComponentState{ - { - LegacyPID: "2", - State: runtime.ComponentState{State: client.UnitStateDegraded}, - Component: component.Component{ - ID: "test-component", - InputSpec: &component.InputRuntimeSpec{ - BinaryName: "testbeat", - }, - }, - }, - }, - }, - }, - expectedCode: 500, - liveness: true, - failon: "degraded", - }, - { - name: "degraded-check-off", - coord: mockCoordinator{ - isUp: true, - state: coordinator.State{ - Components: []runtime.ComponentComponentState{ - { - LegacyPID: "2", - State: runtime.ComponentState{State: client.UnitStateDegraded}, - Component: component.Component{ - ID: "test-component", - InputSpec: &component.InputRuntimeSpec{ - BinaryName: "testbeat", - }, - }, - }, - }, - }, - }, - expectedCode: 200, - liveness: true, - failon: "failed", - }, - { - name: "degraded-liveness-off", - coord: mockCoordinator{ - isUp: true, - state: coordinator.State{ - Components: []runtime.ComponentComponentState{ - { - LegacyPID: "2", - State: runtime.ComponentState{State: client.UnitStateDegraded}, - Component: component.Component{ - ID: "test-component", - InputSpec: &component.InputRuntimeSpec{ - BinaryName: "testbeat", - }, - }, - }, - }, - }, - }, - expectedCode: 200, - liveness: false, - failon: "degraded", - }, - { - name: "healthy", - coord: mockCoordinator{ - isUp: true, - state: coordinator.State{ - Components: []runtime.ComponentComponentState{ - { - LegacyPID: "5", - State: runtime.ComponentState{State: client.UnitStateHealthy}, - Component: component.Component{ - ID: "test-component3", - InputSpec: &component.InputRuntimeSpec{ - BinaryName: "testbeat", - }, - }, - }, - }, - }, - }, - expectedCode: 200, - liveness: true, - failon: "degraded", - }, - { - name: "coord-fail-only-healthy", - coord: mockCoordinator{ - isUp: false, - state: coordinator.State{ - Components: []runtime.ComponentComponentState{ - { - LegacyPID: "5", - State: runtime.ComponentState{State: client.UnitStateHealthy}, - Component: component.Component{ - ID: "test-component3", - InputSpec: &component.InputRuntimeSpec{ - BinaryName: "testbeat", - }, - }, - }, - }, - }, - }, - expectedCode: 503, - liveness: true, - failon: "heartbeat", - }, - { - name: "coord-fail-only-failed", - coord: mockCoordinator{ - isUp: false, - state: coordinator.State{ - Components: []runtime.ComponentComponentState{ - { - LegacyPID: "5", - State: runtime.ComponentState{State: client.UnitStateFailed}, - Component: component.Component{ - ID: "test-component3", - InputSpec: &component.InputRuntimeSpec{ - BinaryName: "testbeat", - }, - }, - }, - }, - }, - }, - expectedCode: 503, - liveness: true, - failon: "heartbeat", - }, - { - name: "degraded-coordinator-down", - coord: mockCoordinator{ - isUp: false, - state: coordinator.State{ - Components: []runtime.ComponentComponentState{ - { - LegacyPID: "5", - State: runtime.ComponentState{State: client.UnitStateDegraded}, - Component: component.Component{ - ID: "test-component3", - InputSpec: &component.InputRuntimeSpec{ - BinaryName: "testbeat", - }, - }, - }, - }, - }, - }, - expectedCode: 503, - liveness: true, - failon: "degraded", - }, - { - name: "unhealthy-coordinator-down", - coord: mockCoordinator{ - isUp: false, - state: coordinator.State{ - Components: []runtime.ComponentComponentState{ - { - LegacyPID: "5", - State: runtime.ComponentState{State: client.UnitStateFailed}, - Component: component.Component{ - ID: "test-component3", - InputSpec: &component.InputRuntimeSpec{ - BinaryName: "testbeat", - }, - }, - }, - }, - }, - }, - expectedCode: 503, - liveness: true, - failon: "degraded", - }, - { - name: "healthy-coordinator-down", - coord: mockCoordinator{ - isUp: false, - state: coordinator.State{ - Components: []runtime.ComponentComponentState{ - { - LegacyPID: "5", - State: runtime.ComponentState{State: client.UnitStateHealthy}, - Component: component.Component{ - ID: "test-component3", - InputSpec: &component.InputRuntimeSpec{ - BinaryName: "testbeat", - }, - }, - }, - }, - }, - }, - expectedCode: 503, - liveness: true, - failon: "degraded", - }, - { - name: "healthy-liveness-off", - coord: mockCoordinator{ - isUp: true, - state: coordinator.State{ - Components: []runtime.ComponentComponentState{ - { - LegacyPID: "5", - State: runtime.ComponentState{State: client.UnitStateHealthy}, - Component: component.Component{ - ID: "test-component3", - InputSpec: &component.InputRuntimeSpec{ - BinaryName: "testbeat", - }, - }, - }, - }, - }, - }, - expectedCode: 200, - liveness: false, - failon: "degraded", - }, - { - name: "degraded-and-healthy", - coord: mockCoordinator{ - isUp: true, - state: coordinator.State{ - Components: []runtime.ComponentComponentState{ - { - LegacyPID: "2", - State: runtime.ComponentState{State: client.UnitStateDegraded}, - Component: component.Component{ - ID: "test-component", - InputSpec: &component.InputRuntimeSpec{ - BinaryName: "testbeat", - }, - }, - }, - { - LegacyPID: "3", - State: runtime.ComponentState{State: client.UnitStateHealthy}, - Component: component.Component{ - ID: "test-component2", - InputSpec: &component.InputRuntimeSpec{ - BinaryName: "testbeat", - }, - }, - }, - }, - }, - }, - expectedCode: 500, - liveness: true, - failon: "degraded", - }, - } - - // test with processesHandler - for _, test := range testCases { - t.Run(test.name, func(t *testing.T) { - testSrv := httptest.NewServer(createHandler(livenessHandler(test.coord))) - defer testSrv.Close() - - path := fmt.Sprintf("%s?failon=%s", testSrv.URL, test.failon) - req, err := http.NewRequestWithContext(ctx, http.MethodGet, path, nil) - require.NoError(t, err) - res, err := http.DefaultClient.Do(req) - require.NoError(t, err) - res.Body.Close() - - }) - } - -} diff --git a/internal/pkg/agent/application/monitoring/process.go b/internal/pkg/agent/application/monitoring/process.go index 78267479c60..dc969553d09 100644 --- a/internal/pkg/agent/application/monitoring/process.go +++ b/internal/pkg/agent/application/monitoring/process.go @@ -16,6 +16,7 @@ import ( "github.com/gorilla/mux" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/coordinator" "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" "github.com/elastic/elastic-agent/internal/pkg/agent/errors" "github.com/elastic/elastic-agent/pkg/utils" @@ -43,7 +44,7 @@ var redirectableProcesses = []string{ profilingServicePrefix, } -func processHandler(coord CoordinatorState, statsHandler func(http.ResponseWriter, *http.Request) error, operatingSystem string) func(http.ResponseWriter, *http.Request) error { +func processHandler(coord *coordinator.Coordinator, statsHandler func(http.ResponseWriter, *http.Request) error, operatingSystem string) func(http.ResponseWriter, *http.Request) error { return func(w http.ResponseWriter, r *http.Request) error { w.Header().Set("Content-Type", "application/json; charset=utf-8") @@ -81,9 +82,8 @@ func processHandler(coord CoordinatorState, statsHandler func(http.ResponseWrite state := coord.State() - for iter, c := range state.Components { - // access the components array manually to avoid a memory aliasing error. This is fixed in go 1.22 - if matchesCloudProcessID(&state.Components[iter].Component, componentID) { + for _, c := range state.Components { + if matchesCloudProcessID(&c.Component, componentID) { data := struct { State string `json:"state"` Message string `json:"message"` diff --git a/internal/pkg/agent/application/monitoring/processes.go b/internal/pkg/agent/application/monitoring/processes.go index 41ce26723d9..c0628a40277 100644 --- a/internal/pkg/agent/application/monitoring/processes.go +++ b/internal/pkg/agent/application/monitoring/processes.go @@ -9,6 +9,8 @@ import ( "fmt" "net/http" "strings" + + "github.com/elastic/elastic-agent/internal/pkg/agent/application/coordinator" ) type source struct { @@ -38,7 +40,7 @@ func sourceFromComponentID(procID string) source { return s } -func processesHandler(coord CoordinatorState) func(http.ResponseWriter, *http.Request) error { +func processesHandler(coord *coordinator.Coordinator) func(http.ResponseWriter, *http.Request) error { return func(w http.ResponseWriter, r *http.Request) error { w.Header().Set("Content-Type", "application/json; charset=utf-8") @@ -46,11 +48,10 @@ func processesHandler(coord CoordinatorState) func(http.ResponseWriter, *http.Re state := coord.State() - for iter, c := range state.Components { + for _, c := range state.Components { if c.Component.InputSpec != nil { procs = append(procs, process{ - // access the components array manually to avoid a memory aliasing error. This is fixed in go 1.22 - ID: expectedCloudProcessID(&state.Components[iter].Component), + ID: expectedCloudProcessID(&c.Component), PID: c.LegacyPID, Binary: c.Component.InputSpec.BinaryName, Source: sourceFromComponentID(c.Component.ID), diff --git a/internal/pkg/agent/application/monitoring/reload/reload.go b/internal/pkg/agent/application/monitoring/reload/reload.go index f8e485af0ac..430b425d6c6 100644 --- a/internal/pkg/agent/application/monitoring/reload/reload.go +++ b/internal/pkg/agent/application/monitoring/reload/reload.go @@ -5,8 +5,6 @@ package reload import ( - "fmt" - "net" "sync/atomic" "github.com/elastic/elastic-agent/internal/pkg/agent/configuration" @@ -20,14 +18,13 @@ import ( type ServerController interface { Start() Stop() error - Addr() net.Addr } -type serverConstructor func(*monitoringCfg.MonitoringConfig) (ServerController, error) +type serverConstructor func() (ServerController, error) type ServerReloader struct { - srvController ServerController - log *logger.Logger - newServerFn serverConstructor + s ServerController + log *logger.Logger + newServerFn serverConstructor config *monitoringCfg.MonitoringConfig isServerRunning atomic.Bool @@ -44,26 +41,26 @@ func NewServerReloader(newServerFn serverConstructor, log *logger.Logger, mcfg * } func (sr *ServerReloader) Start() { - if sr.srvController != nil && sr.isServerRunning.Load() { + if sr.s != nil && sr.isServerRunning.Load() { // server is already running return } sr.log.Info("Starting server") var err error - sr.srvController, err = sr.newServerFn(sr.config) + sr.s, err = sr.newServerFn() if err != nil { sr.log.Errorf("Failed creating a server: %v", err) return } - sr.srvController.Start() + sr.s.Start() sr.log.Debugf("Server started") sr.isServerRunning.Store(true) } func (sr *ServerReloader) Stop() error { - if sr.srvController == nil { + if sr.s == nil { // stopping not started server sr.isServerRunning.Store(false) return nil @@ -71,24 +68,15 @@ func (sr *ServerReloader) Stop() error { sr.log.Info("Stopping server") sr.isServerRunning.Store(false) - if err := sr.srvController.Stop(); err != nil { + if err := sr.s.Stop(); err != nil { return err } sr.log.Debugf("Server stopped") - sr.srvController = nil + sr.s = nil return nil } -// Addr returns the address interface used by the underlying network listener -func (sr *ServerReloader) Addr() net.Addr { - if sr.srvController != nil { - return sr.srvController.Addr() - } - // just return a "bare" Addr so we don't have to return a nil - return &net.TCPAddr{Port: 0, IP: net.IP{}} -} - func (sr *ServerReloader) Reload(rawConfig *aConfig.Config) error { newConfig := configuration.DefaultConfiguration() if err := rawConfig.Unpack(&newConfig); err != nil { @@ -96,24 +84,17 @@ func (sr *ServerReloader) Reload(rawConfig *aConfig.Config) error { } sr.config = newConfig.Settings.MonitoringConfig - var err error - - if sr.config != nil && sr.config.Enabled { - if sr.isServerRunning.Load() { - err = sr.Stop() - if err != nil { - return fmt.Errorf("error stopping monitoring server: %w", err) - } - sr.isServerRunning.Store(false) - } + shouldRunMetrics := sr.config.Enabled + if shouldRunMetrics && !sr.isServerRunning.Load() { sr.Start() + sr.isServerRunning.Store(true) return nil } - if sr.config != nil && !sr.config.Enabled && sr.isServerRunning.Load() { - + if !shouldRunMetrics && sr.isServerRunning.Load() { + sr.isServerRunning.Store(false) return sr.Stop() } diff --git a/internal/pkg/agent/application/monitoring/reload/reload_test.go b/internal/pkg/agent/application/monitoring/reload/reload_test.go index 23f8b3466bb..e45eae4d006 100644 --- a/internal/pkg/agent/application/monitoring/reload/reload_test.go +++ b/internal/pkg/agent/application/monitoring/reload/reload_test.go @@ -5,7 +5,6 @@ package reload import ( - "net" "testing" "github.com/stretchr/testify/require" @@ -59,6 +58,14 @@ agent.monitoring.enabled: false `, false, false, true, }, + { + "do not stop when running, monitoring.metrics disabled", + true, true, true, + ` +agent.monitoring.metrics: false +`, + true, false, false, + }, { "stop stopped server", false, false, false, @@ -67,6 +74,14 @@ agent.monitoring.enabled: false `, false, false, false, }, + { + "start started server", + true, true, true, + ` +agent.monitoring.enabled: true +`, + true, false, false, + }, } for _, tc := range tcs { @@ -78,7 +93,7 @@ agent.monitoring.enabled: false MonitorMetrics: tc.currMetrics, } r := NewServerReloader( - func(mcfg *monitoringCfg.MonitoringConfig) (ServerController, error) { + func() (ServerController, error) { return fsc, nil }, log, @@ -86,7 +101,7 @@ agent.monitoring.enabled: false ) r.isServerRunning.Store(tc.currRunning) if tc.currRunning { - r.srvController = fsc + r.s = fsc } newCfg := aConfig.MustNewConfigFrom(tc.newConfig) @@ -113,7 +128,3 @@ func (fsc *fakeServerController) Reset() { fsc.startTriggered = false fsc.stopTriggered = false } - -func (fsc *fakeServerController) Addr() net.Addr { - return nil -} diff --git a/internal/pkg/agent/application/monitoring/server.go b/internal/pkg/agent/application/monitoring/server.go index 2d1f7970d17..bd809e83a3f 100644 --- a/internal/pkg/agent/application/monitoring/server.go +++ b/internal/pkg/agent/application/monitoring/server.go @@ -17,7 +17,9 @@ import ( "go.elastic.co/apm/module/apmgorilla" "github.com/elastic/elastic-agent-libs/api" + "github.com/elastic/elastic-agent-libs/config" "github.com/elastic/elastic-agent-libs/monitoring" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/coordinator" "github.com/elastic/elastic-agent/internal/pkg/agent/application/monitoring/reload" "github.com/elastic/elastic-agent/internal/pkg/agent/errors" monitoringCfg "github.com/elastic/elastic-agent/internal/pkg/core/monitoring/config" @@ -30,7 +32,8 @@ func NewServer( endpointConfig api.Config, ns func(string) *monitoring.Namespace, tracer *apm.Tracer, - coord CoordinatorState, + coord *coordinator.Coordinator, + enableProcessStats bool, operatingSystem string, mcfg *monitoringCfg.MonitoringConfig, ) (*reload.ServerReloader, error) { @@ -39,54 +42,47 @@ func NewServer( log.Warnf("failed to create monitoring drop: %v", err) } - return exposeMetricsEndpoint(log, ns, tracer, coord, operatingSystem, mcfg) + if strings.TrimSpace(endpointConfig.Host) == "" { + endpointConfig.Host = monitoringCfg.DefaultHost + } + + cfg, err := config.NewConfigFrom(endpointConfig) + if err != nil { + return nil, err + } + + return exposeMetricsEndpoint(log, cfg, ns, tracer, coord, enableProcessStats, operatingSystem, mcfg) } func exposeMetricsEndpoint( log *logger.Logger, + config *config.C, ns func(string) *monitoring.Namespace, tracer *apm.Tracer, - coord CoordinatorState, + coord *coordinator.Coordinator, + enableProcessStats bool, operatingSystem string, mcfg *monitoringCfg.MonitoringConfig, ) (*reload.ServerReloader, error) { + r := mux.NewRouter() + if tracer != nil { + r.Use(apmgorilla.Middleware(apmgorilla.WithTracer(tracer))) + } + statsHandler := statsHandler(ns("stats")) + r.Handle("/stats", createHandler(statsHandler)) + + if enableProcessStats { + r.Handle("/processes", createHandler(processesHandler(coord))) + r.Handle("/processes/{componentID}", createHandler(processHandler(coord, statsHandler, operatingSystem))) + r.Handle("/processes/{componentID}/", createHandler(processHandler(coord, statsHandler, operatingSystem))) + r.Handle("/processes/{componentID}/{metricsPath}", createHandler(processHandler(coord, statsHandler, operatingSystem))) + } - newServerFn := func(cfg *monitoringCfg.MonitoringConfig) (reload.ServerController, error) { - r := mux.NewRouter() - if tracer != nil { - r.Use(apmgorilla.Middleware(apmgorilla.WithTracer(tracer))) - } - - // This will probably only be nil in tests. - statNs := &monitoring.Namespace{} - if ns != nil { - statNs = ns("stats") - } - - statsHandler := statsHandler(statNs) - r.Handle("/stats", createHandler(statsHandler)) - - if isProcessStatsEnabled(cfg) { - r.Handle("/processes", createHandler(processesHandler(coord))) - r.Handle("/processes/{componentID}", createHandler(processHandler(coord, statsHandler, operatingSystem))) - r.Handle("/processes/{componentID}/", createHandler(processHandler(coord, statsHandler, operatingSystem))) - r.Handle("/processes/{componentID}/{metricsPath}", createHandler(processHandler(coord, statsHandler, operatingSystem))) - - r.Handle("/liveness", createHandler(livenessHandler(coord))) - } - - mux := http.NewServeMux() - mux.Handle("/", r) - - if strings.TrimSpace(cfg.HTTP.Host) == "" { - cfg.HTTP.Host = monitoringCfg.DefaultHost - } + mux := http.NewServeMux() + mux.Handle("/", r) - srvCfg := api.DefaultConfig() - srvCfg.Enabled = cfg.Enabled - srvCfg.Host = AgentMonitoringEndpoint(operatingSystem, cfg) - srvCfg.Port = cfg.HTTP.Port - apiServer, err := api.NewFromConfig(log, mux, srvCfg) + newServerFn := func() (reload.ServerController, error) { + apiServer, err := api.New(log, mux, config) if err != nil { return nil, errors.New(err, "failed to create api server") } @@ -125,7 +121,3 @@ func isHttpUrl(s string) bool { u, err := url.Parse(strings.TrimSpace(s)) return err == nil && (u.Scheme == "http" || u.Scheme == "https") && u.Host != "" } - -func isProcessStatsEnabled(cfg *monitoringCfg.MonitoringConfig) bool { - return cfg != nil && cfg.HTTP.Enabled -} diff --git a/internal/pkg/agent/application/monitoring/server_test.go b/internal/pkg/agent/application/monitoring/server_test.go index 5f6fd27017e..3c2f1c5aa92 100644 --- a/internal/pkg/agent/application/monitoring/server_test.go +++ b/internal/pkg/agent/application/monitoring/server_test.go @@ -5,89 +5,11 @@ package monitoring import ( - "context" - "fmt" - "net" - "net/http" "testing" - "time" "github.com/google/go-cmp/cmp" - "github.com/stretchr/testify/require" - - "github.com/elastic/elastic-agent-client/v7/pkg/client" - "github.com/elastic/elastic-agent-libs/api" - "github.com/elastic/elastic-agent-libs/logp" - "github.com/elastic/elastic-agent/internal/pkg/agent/application/coordinator" - "github.com/elastic/elastic-agent/internal/pkg/core/monitoring/config" - "github.com/elastic/elastic-agent/pkg/component" - "github.com/elastic/elastic-agent/pkg/component/runtime" ) -func TestBasicLivenessConfig(t *testing.T) { - _ = logp.DevelopmentSetup() - testAPIConfig := api.Config{} - mockCoord := mockCoordinator{ - isUp: true, - state: coordinator.State{ - Components: []runtime.ComponentComponentState{ - { - LegacyPID: "2", - State: runtime.ComponentState{State: client.UnitStateDegraded}, - Component: component.Component{ - ID: "test-component", - InputSpec: &component.InputRuntimeSpec{ - BinaryName: "testbeat", - }, - }, - }, - }, - }, - } - testConfig := config.MonitoringConfig{ - Enabled: true, - HTTP: &config.MonitoringHTTPConfig{ - Enabled: true, - Port: 0, - }, - } - serverReloader, err := NewServer(logp.L(), testAPIConfig, nil, nil, mockCoord, "linux", &testConfig) - require.NoError(t, err) - - t.Logf("starting server...") - serverReloader.Start() - - waitOnReturnCode(t, http.StatusInternalServerError, "?failon=degraded", serverReloader.Addr()) - - waitOnReturnCode(t, http.StatusOK, "?failon=failed", serverReloader.Addr()) - - t.Logf("stopping server...") - err = serverReloader.Stop() - require.NoError(t, err) - -} - -func waitOnReturnCode(t *testing.T, expectedReturnCode int, formValue string, addr net.Addr) { - ctx, cancel := context.WithTimeout(context.Background(), time.Second*30) - defer cancel() - client := &http.Client{} - path := fmt.Sprintf("http://%s/liveness%s", addr.String(), formValue) - t.Logf("checking %s", path) - req, err := http.NewRequestWithContext(ctx, "GET", path, nil) - require.NoError(t, err) - - require.Eventually(t, func() bool { - resp, err := client.Do(req) - if err != nil { - t.Logf("error fetching endpoint: %s", err) - return false - } - defer resp.Body.Close() - // should return 500 as we have one component set to UnitStateDegraded - return resp.StatusCode == expectedReturnCode - }, time.Second*30, time.Second*3) -} - func TestIsHTTPUrl(t *testing.T) { tests := []struct { diff --git a/internal/pkg/agent/cmd/run.go b/internal/pkg/agent/cmd/run.go index 7bd421afe3c..4bfbf369ee1 100644 --- a/internal/pkg/agent/cmd/run.go +++ b/internal/pkg/agent/cmd/run.go @@ -638,7 +638,7 @@ func setupMetrics( Host: monitoring.AgentMonitoringEndpoint(operatingSystem, cfg), } - s, err := monitoring.NewServer(logger, endpointConfig, monitoringLib.GetNamespace, tracer, coord, operatingSystem, cfg) + s, err := monitoring.NewServer(logger, endpointConfig, monitoringLib.GetNamespace, tracer, coord, isProcessStatsEnabled(cfg), operatingSystem, cfg) if err != nil { return nil, errors.New(err, "could not start the HTTP server for the API") } @@ -646,6 +646,10 @@ func setupMetrics( return s, nil } +func isProcessStatsEnabled(cfg *monitoringCfg.MonitoringConfig) bool { + return cfg != nil && cfg.HTTP.Enabled +} + // handleUpgrade checks if agent is being run as part of an // ongoing upgrade operation, i.e. being re-exec'd and performs // any upgrade-specific work, if needed. diff --git a/internal/pkg/core/monitoring/config/config.go b/internal/pkg/core/monitoring/config/config.go index 3e1417cc0de..cc6f345e123 100644 --- a/internal/pkg/core/monitoring/config/config.go +++ b/internal/pkg/core/monitoring/config/config.go @@ -39,7 +39,7 @@ type MonitoringConfig struct { type MonitoringHTTPConfig struct { Enabled bool `yaml:"enabled" config:"enabled"` Host string `yaml:"host" config:"host"` - Port int `yaml:"port" config:"port" validate:"min=0,max=65535"` + Port int `yaml:"port" config:"port" validate:"min=0,max=65535,nonzero"` Buffer *BufferConfig `yaml:"buffer" config:"buffer"` } @@ -49,7 +49,7 @@ func (c *MonitoringHTTPConfig) Unpack(cfg *c.C) error { tmp := struct { Enabled bool `yaml:"enabled" config:"enabled"` Host string `yaml:"host" config:"host"` - Port int `yaml:"port" config:"port" validate:"min=0,max=65535"` + Port int `yaml:"port" config:"port" validate:"min=0,max=65535,nonzero"` Buffer *BufferConfig `yaml:"buffer" config:"buffer"` }{ Enabled: c.Enabled, diff --git a/testing/integration/monitoring_probe_reload_test.go b/testing/integration/monitoring_probe_reload_test.go deleted file mode 100644 index 21455b39017..00000000000 --- a/testing/integration/monitoring_probe_reload_test.go +++ /dev/null @@ -1,180 +0,0 @@ -// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one -// or more contributor license agreements. Licensed under the Elastic License; -// you may not use this file except in compliance with the Elastic License. - -//go:build integration - -package integration - -import ( - "bytes" - "context" - "encoding/json" - "fmt" - "net/http" - "testing" - "time" - - "github.com/google/uuid" - "github.com/stretchr/testify/require" - "github.com/stretchr/testify/suite" - - "github.com/elastic/elastic-agent-libs/kibana" - "github.com/elastic/elastic-agent/pkg/control/v2/cproto" - atesting "github.com/elastic/elastic-agent/pkg/testing" - "github.com/elastic/elastic-agent/pkg/testing/define" - "github.com/elastic/elastic-agent/pkg/testing/tools" -) - -type MonitoringRunner struct { - suite.Suite - info *define.Info - agentFixture *atesting.Fixture - - ESHost string - - healthCheckTime time.Duration - healthCheckRefreshTime time.Duration - - policyID string - policyName string -} - -func TestMonitoringLivenessReloadable(t *testing.T) { - info := define.Require(t, define.Requirements{ - Group: "fleet", - Stack: &define.Stack{}, - Local: false, // requires Agent installation - Sudo: true, // requires Agent installation - OS: []define.OS{ - {Type: define.Linux}, - {Type: define.Windows}, - }, - }) - - suite.Run(t, &MonitoringRunner{info: info, healthCheckTime: time.Minute * 5, healthCheckRefreshTime: time.Second * 5}) -} - -func (runner *MonitoringRunner) SetupSuite() { - fixture, err := define.NewFixture(runner.T(), define.Version()) - require.NoError(runner.T(), err) - runner.agentFixture = fixture - - policyUUID := uuid.New().String() - basePolicy := kibana.AgentPolicy{ - Name: "test-policy-" + policyUUID, - Namespace: "default", - Description: "Test policy " + policyUUID, - MonitoringEnabled: []kibana.MonitoringEnabledOption{ - kibana.MonitoringEnabledLogs, - kibana.MonitoringEnabledMetrics, - }, - } - - installOpts := atesting.InstallOpts{ - NonInteractive: true, - Force: true, - Privileged: true, - } - - ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute) - defer cancel() - - policyResp, err := tools.InstallAgentWithPolicy(ctx, runner.T(), installOpts, runner.agentFixture, runner.info.KibanaClient, basePolicy) - require.NoError(runner.T(), err) - - runner.policyID = policyResp.ID - runner.policyName = basePolicy.Name - - _, err = tools.InstallPackageFromDefaultFile(ctx, runner.info.KibanaClient, "system", "1.53.1", "system_integration_setup.json", uuid.New().String(), policyResp.ID) - require.NoError(runner.T(), err) -} - -func (runner *MonitoringRunner) TestMonitoringLiveness() { - ctx, cancel := context.WithTimeout(context.Background(), time.Minute*10) - defer cancel() - - runner.AllComponentsHealthy(ctx) - - client := http.Client{Timeout: time.Second * 4} - endpoint := "http://localhost:6792/liveness" - // first stage: ensure the default behavior, http monitoring is off. This should return an error - req, err := http.NewRequestWithContext(ctx, "GET", endpoint, nil) - require.NoError(runner.T(), err) - - _, err = client.Do(req) - require.Error(runner.T(), err) - - // use the fleet override API to enable http monitoring. - // This tests both the http server itself, and tests that the agent reloader actually reloads the agent config. - override := map[string]interface{}{ - "name": runner.policyName, - "namespace": "default", - "overrides": map[string]interface{}{ - "agent": map[string]interface{}{ - "monitoring": map[string]interface{}{ - "http": map[string]interface{}{ - "enabled": true, - "host": "localhost", - "port": 6792, - }, - }, - }, - }, - } - - raw, err := json.Marshal(override) - require.NoError(runner.T(), err) - reader := bytes.NewBuffer(raw) - overrideEndpoint := fmt.Sprintf("/api/fleet/agent_policies/%s", runner.policyID) - statusCode, overrideResp, err := runner.info.KibanaClient.Request("PUT", overrideEndpoint, nil, nil, reader) - require.NoError(runner.T(), err) - require.Equal(runner.T(), http.StatusOK, statusCode, "non-200 status code; got response: %s", string(overrideResp)) - - runner.AllComponentsHealthy(ctx) - - // check to make sure that we now have a liveness probe response - req, err = http.NewRequestWithContext(ctx, "GET", endpoint, nil) - require.NoError(runner.T(), err) - - // second check: the /liveness endpoint should now be responding - runner.CheckResponse(ctx, endpoint) - - runner.CheckResponse(ctx, fmt.Sprintf("%s?failon=degraded", endpoint)) - - runner.CheckResponse(ctx, fmt.Sprintf("%s?failon=failed", endpoint)) - - runner.CheckResponse(ctx, fmt.Sprintf("%s?failon=heartbeat", endpoint)) -} - -// CheckResponse checks to see if the liveness probe returns a 200 -func (runner *MonitoringRunner) CheckResponse(ctx context.Context, endpoint string) { - req, err := http.NewRequestWithContext(ctx, "GET", endpoint, nil) - require.NoError(runner.T(), err) - - client := http.Client{Timeout: time.Second * 4} - - livenessResp, err := client.Do(req) - require.NoError(runner.T(), err) - defer livenessResp.Body.Close() - require.Equal(runner.T(), http.StatusOK, livenessResp.StatusCode) // this is effectively the check for the test -} - -// AllComponentsHealthy ensures all the beats and agent are healthy and working before we continue -func (runner *MonitoringRunner) AllComponentsHealthy(ctx context.Context) { - compDebugName := "" - require.Eventually(runner.T(), func() bool { - allHealthy := true - status, err := runner.agentFixture.ExecStatus(ctx) - - require.NoError(runner.T(), err) - for _, comp := range status.Components { - runner.T().Logf("component state: %s", comp.Message) - if comp.State != int(cproto.State_HEALTHY) { - compDebugName = comp.Name - allHealthy = false - } - } - return allHealthy - }, runner.healthCheckTime, runner.healthCheckRefreshTime, "install never became healthy: components did not return a healthy state: %s", compDebugName) -}