Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 17 additions & 28 deletions apiserver/controllers/controllers.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ import (
)

func NewAPIController(r *runner.Runner, authenticator *auth.Authenticator, hub *wsWriter.Hub) (*APIController, error) {
controllerInfo, err := r.GetControllerInfo(auth.GetAdminContext())
controllerInfo, err := r.GetControllerInfo(auth.GetAdminContext(context.Background()))
if err != nil {
return nil, errors.Wrap(err, "failed to get controller info")
}
Expand Down Expand Up @@ -95,19 +95,6 @@ func handleError(ctx context.Context, w http.ResponseWriter, err error) {
}
}

func (a *APIController) webhookMetricLabelValues(ctx context.Context, valid, reason string) []string {
controllerInfo, err := a.r.GetControllerInfo(auth.GetAdminContext())
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to get controller info")
// If labels are empty, not attempt will be made to record webhook.
return []string{}
}
return []string{
valid, reason,
controllerInfo.Hostname, controllerInfo.ControllerID.String(),
}
}

func (a *APIController) handleWorkflowJobEvent(ctx context.Context, w http.ResponseWriter, r *http.Request) {
defer r.Body.Close()
body, err := io.ReadAll(r.Body)
Expand All @@ -119,31 +106,33 @@ func (a *APIController) handleWorkflowJobEvent(ctx context.Context, w http.Respo
signature := r.Header.Get("X-Hub-Signature-256")
hookType := r.Header.Get("X-Github-Hook-Installation-Target-Type")

var labelValues []string
defer func() {
if len(labelValues) == 0 {
return
}
if err := metrics.RecordWebhookWithLabels(labelValues...); err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to record metric")
}
}()

if err := a.r.DispatchWorkflowJob(hookType, signature, body); err != nil {
if errors.Is(err, gErrors.ErrNotFound) {
labelValues = a.webhookMetricLabelValues(ctx, "false", "owner_unknown")
metrics.WebhooksReceived.WithLabelValues(
"false", // label: valid
"owner_unknown", // label: reason
).Inc()
slog.With(slog.Any("error", err)).ErrorContext(ctx, "got not found error from DispatchWorkflowJob. webhook not meant for us?")
return
} else if strings.Contains(err.Error(), "signature") { // TODO: check error type
labelValues = a.webhookMetricLabelValues(ctx, "false", "signature_invalid")
metrics.WebhooksReceived.WithLabelValues(
"false", // label: valid
"signature_invalid", // label: reason
).Inc()
} else {
labelValues = a.webhookMetricLabelValues(ctx, "false", "unknown")
metrics.WebhooksReceived.WithLabelValues(
"false", // label: valid
"unknown", // label: reason
).Inc()
}

handleError(ctx, w, err)
return
}
labelValues = a.webhookMetricLabelValues(ctx, "true", "")
metrics.WebhooksReceived.WithLabelValues(
"true", // label: valid
"", // label: reason
).Inc()
}

func (a *APIController) WebhookHandler(w http.ResponseWriter, r *http.Request) {
Expand Down
6 changes: 4 additions & 2 deletions auth/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -238,8 +238,10 @@ func UserID(ctx context.Context) string {

// GetAdminContext will return an admin context. This can be used internally
// when fetching users.
func GetAdminContext() context.Context {
ctx := context.Background()
func GetAdminContext(ctx context.Context) context.Context {
if ctx == nil {
ctx = context.Background()
}
ctx = SetUserID(ctx, "")
ctx = SetAdmin(ctx, true)
ctx = SetIsEnabled(ctx, true)
Expand Down
14 changes: 10 additions & 4 deletions cmd/garm/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ import (
"github.com/cloudbase/garm/database/common"
"github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/runner"
runnerMetrics "github.com/cloudbase/garm/runner/metrics"
garmUtil "github.com/cloudbase/garm/util"
"github.com/cloudbase/garm/util/appdefaults"
"github.com/cloudbase/garm/websocket"
Expand Down Expand Up @@ -214,13 +215,18 @@ func main() {

router := routers.NewAPIRouter(controller, jwtMiddleware, initMiddleware, instanceMiddleware, cfg.Default.EnableWebhookManagement)

// start the metrics collector
if cfg.Metrics.Enable {
slog.InfoContext(ctx, "registering prometheus metrics collectors")
if err := metrics.RegisterCollectors(runner); err != nil {
log.Fatal(err)
}
slog.InfoContext(ctx, "setting up metric routes")
router = routers.WithMetricsRouter(router, cfg.Metrics.DisableAuth, metricsMiddleware)

slog.InfoContext(ctx, "register metrics")
if err := metrics.RegisterMetrics(); err != nil {
log.Fatal(err)
}
Comment on lines +224 to +226
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

one downside of this implementation (register the metrics here instead of doing it in the init):

for any further metric we want to change, we have to do a e.g.

if cfg.Metrics.Enable {
       metrics.InstanceCreatedCount.WithLabelValues(
		bootstrapParams.PoolID, // label: pool_id
		e.cfg.Name,             // label: provider
	).Inc()
}

i'm either thinking of registering the metrics, no matter if cfg.Metrics.Enable is set or directly drop cfg.Metrics.Enable and always run garm with /metrics.

WDYT @gabriel-samfira ?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't we just add it to the RegisterMetrics() function and it gets registered here?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In essence, RegisterMetrics() would act like the init() function, but would be capable of returning an error we can potentially handle. So instead of using the init() function, we define our own and call it here to register the metrics.

If metrics are enabled, we register them and start the collector loop.


slog.InfoContext(ctx, "start metrics collection")
runnerMetrics.CollectObjectMetric(ctx, runner, cfg.Metrics.Duration())
}

if cfg.Default.DebugServer {
Expand Down
33 changes: 32 additions & 1 deletion config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -456,8 +456,39 @@ func (t *TLSConfig) Validate() error {
}

type Metrics struct {
// DisableAuth defines if the API endpoint will be protected by
// JWT authentication
DisableAuth bool `toml:"disable_auth" json:"disable-auth"`
Enable bool `toml:"enable" json:"enable"`
// Enable define if the API endpoint for metrics collection will
// be enabled
Enable bool `toml:"enable" json:"enable"`
// Period defines the internal period at which internal metrics are getting updated
// and propagated to the /metrics endpoint
Period time.Duration `toml:"period" json:"period"`
}

// ParseDuration parses the configured duration and returns a time.Duration of 0
// if the duration is invalid.
func (m *Metrics) ParseDuration() (time.Duration, error) {
duration, err := time.ParseDuration(fmt.Sprint(m.Period))
if err != nil {
return 0, err
}
return duration, nil
}

// Duration returns the configured duration or the default duration if no value
// is configured or the configured value is invalid.
func (m *Metrics) Duration() time.Duration {
duration, err := m.ParseDuration()
if err != nil {
slog.With(slog.Any("error", err)).Error(fmt.Sprintf("defined duration %s is invalid", m.Period))
}
if duration == 0 {
slog.Debug(fmt.Sprintf("using default duration %s for metrics update interval", appdefaults.DefaultMetricsUpdateInterval))
return appdefaults.DefaultMetricsUpdateInterval
}
return duration
}

// APIServer holds configuration for the API server
Expand Down
34 changes: 23 additions & 11 deletions doc/config_metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@ This is one of the features in GARM that I really love having. For one thing, it

## Common metrics

| Metric name | Type | Labels | Description |
|--------------------------|---------|-------------------------------------------------------------------|------------------------------------------------------------------------------------------------------|
| `garm_health` | Gauge | `controller_id`=&lt;controller id&gt; <br>`name`=&lt;hostname&gt; | This is a gauge that is set to 1 if GARM is healthy and 0 if it is not. This is useful for alerting. |
| `garm_webhooks_received` | Counter | `controller_id`=&lt;controller id&gt; <br>`name`=&lt;hostname&gt; | This is a counter that increments every time GARM receives a webhook from GitHub. |
| Metric name | Type | Labels | Description |
|--------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------|
| `garm_health` | Gauge | `controller_id`=&lt;controller id&gt; <br>`callback_url`=&lt;callback url&gt; <br>`controller_webhook_url`=&lt;controller webhook url&gt; <br>`metadata_url`=&lt;metadata url&gt; <br>`webhook_url`=&lt;webhook url&gt; <br>`name`=&lt;hostname&gt; | This is a gauge that is set to 1 if GARM is healthy and 0 if it is not. This is useful for alerting. |
| `garm_webhooks_received` | Counter | `valid`=&lt;valid request&gt; <br>`reason`=&lt;reason for invalid requests&gt; | This is a counter that increments every time GARM receives a webhook from GitHub. |

## Enterprise metrics

Expand Down Expand Up @@ -48,9 +48,9 @@ This is one of the features in GARM that I really love having. For one thing, it

## Runner metrics

| Metric name | Type | Labels | Description |
|----------------------|-------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------|
| `garm_runner_status` | Gauge | `controller_id`=&lt;controller id&gt; <br>`hostname`=&lt;hostname&gt; <br>`name`=&lt;runner name&gt; <br>`pool_owner`=&lt;owner name&gt; <br>`pool_type`=&lt;repository\|organization\|enterprise&gt; <br>`provider`=&lt;provider name&gt; <br>`runner_status`=&lt;running\|stopped\|error\|pending_delete\|deleting\|pending_create\|creating\|unknown&gt; <br>`status`=&lt;idle\|pending\|terminated\|installing\|failed\|active&gt; <br> | This is a gauge value that gives us details about the runners garm spawns |
| Metric name | Type | Labels | Description |
|----------------------|-------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------|
| `garm_runner_status` | Gauge | `name`=&lt;runner name&gt; <br>`pool_owner`=&lt;owner name&gt; <br>`pool_type`=&lt;repository\|organization\|enterprise&gt; <br>`provider`=&lt;provider name&gt; <br>`runner_status`=&lt;running\|stopped\|error\|pending_delete\|deleting\|pending_create\|creating\|unknown&gt; <br>`status`=&lt;idle\|pending\|terminated\|installing\|failed\|active&gt; <br> | This is a gauge value that gives us details about the runners garm spawns |

More metrics will be added in the future.

Expand All @@ -60,15 +60,27 @@ Metrics are disabled by default. To enable them, add the following to your confi

```toml
[metrics]
# Toggle metrics. If set to false, the API endpoint for metrics collection will
# be disabled.
enable = true

# Toggle to disable authentication (not recommended) on the metrics endpoint.
# If you do disable authentication, I encourage you to put a reverse proxy in front
# of garm and limit which systems can access that particular endpoint. Ideally, you
# would enable some kind of authentication using the reverse proxy, if the built-in auth
# is not sufficient for your needs.
disable_auth = false
#
# Default: false
disable_auth = true

# Toggle metrics. If set to false, the API endpoint for metrics collection will
# be disabled.
#
# Default: false
enable = true

# period is the time interval when the /metrics endpoint will update internal metrics about
# controller specific objects (e.g. runners, pools, etc.)
#
# Default: "60s"
period = "30s"
```

You can choose to disable authentication if you wish, however it's not terribly difficult to set up, so I generally advise against disabling it.
Expand Down
57 changes: 14 additions & 43 deletions metrics/enterprise.go
Original file line number Diff line number Diff line change
@@ -1,50 +1,21 @@
package metrics

import (
"log/slog"
"strconv"

"github.com/cloudbase/garm/auth"
"github.com/prometheus/client_golang/prometheus"
)

// CollectOrganizationMetric collects the metrics for the enterprise objects
func (c *GarmCollector) CollectEnterpriseMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
ctx := auth.GetAdminContext()

enterprises, err := c.runner.ListEnterprises(ctx)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing providers")
return
}

for _, enterprise := range enterprises {
var (
EnterpriseInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: metricsNamespace,
Subsystem: metricsEnterpriseSubsystem,
Name: "info",
Help: "Info of the enterprise",
}, []string{"name", "id"})

enterpriseInfo, err := prometheus.NewConstMetric(
c.enterpriseInfo,
prometheus.GaugeValue,
1,
enterprise.Name, // label: name
enterprise.ID, // label: id
)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect enterpriseInfo metric")
continue
}
ch <- enterpriseInfo

enterprisePoolManagerStatus, err := prometheus.NewConstMetric(
c.enterprisePoolManagerStatus,
prometheus.GaugeValue,
bool2float64(enterprise.PoolManagerStatus.IsRunning),
enterprise.Name, // label: name
enterprise.ID, // label: id
strconv.FormatBool(enterprise.PoolManagerStatus.IsRunning), // label: running
)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect enterprisePoolManagerStatus metric")
continue
}
ch <- enterprisePoolManagerStatus
}
}
EnterprisePoolManagerStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: metricsNamespace,
Subsystem: metricsEnterpriseSubsystem,
Name: "pool_manager_status",
Help: "Status of the enterprise pool manager",
}, []string{"name", "id", "running"})
)
23 changes: 7 additions & 16 deletions metrics/health.go
Original file line number Diff line number Diff line change
@@ -1,22 +1,13 @@
package metrics

import (
"log/slog"

"github.com/prometheus/client_golang/prometheus"
)

func (c *GarmCollector) CollectHealthMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
m, err := prometheus.NewConstMetric(
c.healthMetric,
prometheus.GaugeValue,
1,
hostname,
controllerID,
)
if err != nil {
slog.With(slog.Any("error", err)).Error("error on creating health metric")
return
}
ch <- m
}
var (
GarmHealth = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: metricsNamespace,
Name: "health",
Help: "Health of the garm",
}, []string{"metadata_url", "callback_url", "webhook_url", "controller_webhook_url", "controller_id"})
)
Loading