Skip to content
This repository has been archived by the owner on Mar 5, 2024. It is now read-only.

Commit

Permalink
Idiamond metrics (#131)
Browse files Browse the repository at this point in the history
* Add prometheus metrics and replace statsd
- Replace go-metrics Prometheus metrics with the Prometheus go client
library
- Replace go-metrics statsd with a client that does not perform in
memory aggregation
- Add additional logging as to the duration of requests
* Add metrics docs
* add documentation for grpc client/server metrics
  • Loading branch information
pingles authored Aug 3, 2018
1 parent 0aeed9b commit ecf1dd2
Show file tree
Hide file tree
Showing 102 changed files with 3,977 additions and 4,765 deletions.
28 changes: 10 additions & 18 deletions Gopkg.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 1 addition & 9 deletions Gopkg.toml
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,7 @@

[[constraint]]
name = "github.com/prometheus/client_golang"
version = "0.8.0"

[[constraint]]
branch = "master"
name = "github.com/pubnub/go-metrics-statsd"

[[constraint]]
branch = "master"
name = "github.com/rcrowley/go-metrics"
version = "v0.9.0-pre1"

[[constraint]]
name = "github.com/sirupsen/logrus"
Expand Down
29 changes: 16 additions & 13 deletions cmd/kiam/opts.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,12 @@ package main

import (
"context"
"net"
"fmt"
"time"

statsd "github.com/pubnub/go-metrics-statsd"
metrics "github.com/rcrowley/go-metrics"
log "github.com/sirupsen/logrus"
"github.com/uswitch/kiam/pkg/prometheus"
"github.com/uswitch/kiam/pkg/statsd"
)

type logOptions struct {
Expand Down Expand Up @@ -52,27 +51,31 @@ func (o *logOptions) configureLogger() {
}

type telemetryOptions struct {
statsd string
statsdInterval time.Duration
statsD string
statsDInterval time.Duration
statsDPrefix string
prometheusListen string
prometheusSync time.Duration
}

func (o *telemetryOptions) bind(parser parser) {
parser.Flag("statsd", "UDP address to publish StatsD metrics. e.g. 127.0.0.1:8125").Default("").StringVar(&o.statsd)
parser.Flag("statsd-interval", "Interval to publish to StatsD").Default("10s").DurationVar(&o.statsdInterval)
parser.Flag("statsd", "UDP address to publish StatsD metrics. e.g. 127.0.0.1:8125").Default("").StringVar(&o.statsD)
parser.Flag("statsd-prefix", "statsd namespace to use").Default("kiam").StringVar(&o.statsDPrefix)
parser.Flag("statsd-interval", "Interval to publish to StatsD").Default("100ms").DurationVar(&o.statsDInterval)

parser.Flag("prometheus-listen-addr", "Prometheus HTTP listen address. e.g. localhost:9620").StringVar(&o.prometheusListen)
parser.Flag("prometheus-sync-interval", "How frequently to update Prometheus metrics").Default("5s").DurationVar(&o.prometheusSync)
}

func (o telemetryOptions) start(ctx context.Context, identifier string) {
if o.statsd != "" {
addr, err := net.ResolveUDPAddr("udp", o.statsd)
if err != nil {
log.Fatal("error parsing statsd address:", err.Error())
}
go statsd.StatsD(metrics.DefaultRegistry, o.statsdInterval, "kiam."+identifier, addr)
err := statsd.New(
o.statsD,
fmt.Sprintf("%s.%s", o.statsDPrefix, identifier),
o.statsDInterval,
)

if err != nil {
log.Fatalf("Error initing statsd: %v", err)
}

if o.prometheusListen != "" {
Expand Down
72 changes: 72 additions & 0 deletions docs/metrics.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Metrics

Kiam exports both Prometheus and StatsD metrics to determine the health of the
system, check the timing of each RPC call, and monitor the size of the
credentials cache. By default, Prometheus metrics are exported on
`localhost:9620` and StatsD metrics are sent to `127.0.0.1:8125`. StatsD
metrics are flushed every 100ms but are not aggregated.

## Metrics configuration

- The `statsd` flag controls the address to which StatsD metrics are sent. This
is by default `127.0.0.1:8125`. If this is blank, StatsD metrics will be
silenced.
- The `statsd-prefix` flag controls the prefix that will be prepended to
Kiam's StatsD metric names. This is by default `kiam`.
- The `statsd-interval` flag controls how frequently the in-memory metrics
buffer will be flushed to the specified StatsD endpoint. Metrics are
not aggregated in this buffer and the raw counts will be flushed to the
underlying StatsD sink. This is by default `100ms`.
- The `prometheus-listen-addr` flag controls which address Kiam should create a
Prometheus endpoint on. This is by default `localhost:9620`. The metrics
themselves can be accessed at `<prometheus-listen-addr>/metrics`.
- The `prometheus-sync-interval` flag controls how frequently Prometheus
metrics should be updated. This is by default `5s`.

## Emitted Metrics

### Prometheus

#### Metadata Subsystem
- `handler_latency_milliseconds` - Bucketed histogram of handler timings. Tagged by handler
- `credential_fetch_errors_total` - Number of errors fetching the credentials for a pod
- `credential_encode_errors_total` - Number of errors encoding credentials for a pod
- `find_role_errors_total` - Number of errors finding the role for a pod
- `empty_role_total` - Number of empty roles returned
- `success_total` - Number of successful responses from a handler
- `responses_total` - Responses from mocked out metadata handlers

#### STS Subsystem
- `cache_hit_total` - Number of cache hits to the metadata cache
- `cache_miss_total` - Number of cache misses to the metadata cache
- `issuing_errors_total` - Number of errors issuing credentials
- `assumerole_timing_milliseconds` - Bucketed histogram of assumeRole timings
- `assumerole_current` - Number of assume role calls currently executing

#### K8s Subsystem
- `dropped_pods_total` - Number of dropped pods because of full buffer

#### gRPC Server (Kiam Server)
- `grpc_server_handled_total` - Total number of RPCs completed on the server, regardless of success or failure.
- `grpc_server_msg_received_total` - Total number of RPC stream messages received on the server.
- `grpc_server_msg_sent_total` - Total number of gRPC stream messages sent by the server.
- `grpc_server_started_total` - Total number of RPCs started on the server.

#### gRPC Client (Kiam Agent)
- `grpc_client_handled_total` - Total number of RPCs completed by the client, regardless of success or failure.
- `grpc_client_msg_received_total` - Total number of RPC stream messages received by the client.
- `grpc_client_msg_sent_total` - Total number of gRPC stream messages sent by the client.
- `grpc_client_started_total` - Total number of RPCs started on the client.

### StatsD Timing metrics
- `gateway.rpc.GetRole` - Observed client side latency of GetRole RPC
- `gateway.rpc.GetCredentials` - Observed client side latency of GetCredentials RPC
- `server.rpc.GetRoleCredentials` - Observed server side latency of GetRoleCredentials RPC
- `server.rpc.IsAllowedAssumeRole` - Observed server side latency of IsAllowedAssumeRole RPC
- `server.rpc.GetHealth` - Observed server side latency of GetHealth RPC
- `server.rpc.GetPodRole` - Observed server side latency of GetPodRole RPC
- `server.rpc.GetRoleCredentials` - Observed server side latency of GetRoleCredentials RPC
- `handler.role_name` - Observed latency of role_name handler
- `handler.health` - Observed latency of health handler
- `handler.credentials` - Observed latency of credentials handler
- `aws.assume_role` - Observed latency of aws assume role request
17 changes: 10 additions & 7 deletions pkg/aws/metadata/handler_credentials.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,14 @@ import (
"context"
"encoding/json"
"fmt"
"net/http"

"github.com/cenkalti/backoff"
"github.com/gorilla/mux"
"github.com/rcrowley/go-metrics"
"github.com/prometheus/client_golang/prometheus"
"github.com/uswitch/kiam/pkg/aws/sts"
"github.com/uswitch/kiam/pkg/server"
"net/http"
"time"
"github.com/uswitch/kiam/pkg/statsd"
)

type credentialsHandler struct {
Expand All @@ -36,9 +37,9 @@ func (c *credentialsHandler) Install(router *mux.Router) {
}

func (c *credentialsHandler) Handle(ctx context.Context, w http.ResponseWriter, req *http.Request) (int, error) {
credentialTimings := metrics.GetOrRegisterTimer("credentialsHandler", metrics.DefaultRegistry)
startTime := time.Now()
defer credentialTimings.UpdateSince(startTime)
timer := prometheus.NewTimer(handlerTimer.WithLabelValues("credentials"))
defer timer.ObserveDuration()
defer statsd.Client.NewTiming().Send("handler.credentials")

err := req.ParseForm()
if err != nil {
Expand All @@ -53,16 +54,18 @@ func (c *credentialsHandler) Handle(ctx context.Context, w http.ResponseWriter,
requestedRole := mux.Vars(req)["role"]
credentials, err := c.fetchCredentials(ctx, ip, requestedRole)
if err != nil {
credentialFetchError.WithLabelValues("credentials").Inc()
return http.StatusInternalServerError, fmt.Errorf("error fetching credentials: %s", err)
}

err = json.NewEncoder(w).Encode(credentials)
if err != nil {
credentialEncodeError.WithLabelValues("credentials").Inc()
return http.StatusInternalServerError, fmt.Errorf("error encoding credentials: %s", err.Error())
}

w.Header().Set("Content-Type", "application/json")
metrics.GetOrRegisterMeter("credentialsHandler.success", metrics.DefaultRegistry).Mark(1)
success.WithLabelValues("credentials").Inc()
return http.StatusOK, nil
}

Expand Down
5 changes: 5 additions & 0 deletions pkg/aws/metadata/handler_credentials_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"github.com/gorilla/mux"
"github.com/uswitch/kiam/pkg/aws/sts"
"github.com/uswitch/kiam/pkg/server"
"github.com/uswitch/kiam/pkg/statsd"
st "github.com/uswitch/kiam/pkg/testutil/server"
"net/http"
"net/http/httptest"
Expand All @@ -14,6 +15,10 @@ import (
"time"
)

// init runs once when this test package is loaded and calls statsd.New with an
// empty address, an empty prefix and a one-millisecond interval.
//
// NOTE(review): presumably this sets up the package-level statsd.Client that
// the handlers under test call into, and an empty address presumably yields a
// client that sends nothing — confirm against pkg/statsd.
func init() {
	// The value returned by statsd.New is deliberately discarded here.
	_ = statsd.New("", "", time.Millisecond)
}

func TestReturnsCredentials(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), time.Second*5)
defer cancel()
Expand Down
11 changes: 6 additions & 5 deletions pkg/aws/metadata/handler_health.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,21 @@ package metadata
import (
"context"
"fmt"
"github.com/rcrowley/go-metrics"
"io/ioutil"
"net/http"
"time"

"github.com/prometheus/client_golang/prometheus"
"github.com/uswitch/kiam/pkg/statsd"
)

type healthHandler struct {
endpoint string
}

func (h *healthHandler) Handle(ctx context.Context, w http.ResponseWriter, req *http.Request) (int, error) {
healthTimer := metrics.GetOrRegisterTimer("healthHandler", metrics.DefaultRegistry)
started := time.Now()
defer healthTimer.UpdateSince(started)
timer := prometheus.NewTimer(handlerTimer.WithLabelValues("health"))
defer timer.ObserveDuration()
defer statsd.Client.NewTiming().Send("handler.health")

req, err := http.NewRequest("GET", fmt.Sprintf("%s/latest/meta-data/instance-id", h.endpoint), nil)
if err != nil {
Expand Down
16 changes: 9 additions & 7 deletions pkg/aws/metadata/handler_role_name.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,14 @@ import (
"fmt"
"github.com/cenkalti/backoff"
"github.com/gorilla/mux"
"github.com/rcrowley/go-metrics"
log "github.com/sirupsen/logrus"
"github.com/uswitch/kiam/pkg/server"
"net/http"
"net/url"
"time"

"github.com/prometheus/client_golang/prometheus"
"github.com/uswitch/kiam/pkg/statsd"
)

type roleHandler struct {
Expand All @@ -50,9 +52,9 @@ func (h *roleHandler) Install(router *mux.Router) {
}

func (h *roleHandler) Handle(ctx context.Context, w http.ResponseWriter, req *http.Request) (int, error) {
roleNameTimings := metrics.GetOrRegisterTimer("roleNameHandler", metrics.DefaultRegistry)
startTime := time.Now()
defer roleNameTimings.UpdateSince(startTime)
timer := prometheus.NewTimer(handlerTimer.WithLabelValues("roleName"))
defer timer.ObserveDuration()
defer statsd.Client.NewTiming().Send("handler.role_name")

err := req.ParseForm()
if err != nil {
Expand All @@ -67,17 +69,17 @@ func (h *roleHandler) Handle(ctx context.Context, w http.ResponseWriter, req *ht
role, err := findRole(ctx, h.client, ip)

if err != nil {
metrics.GetOrRegisterMeter("roleNameHandler.findRoleError", metrics.DefaultRegistry).Mark(1)
findRoleError.WithLabelValues("roleName").Inc()
return http.StatusInternalServerError, err
}

if role == "" {
metrics.GetOrRegisterMeter("credentialsHandler.emptyRole", metrics.DefaultRegistry).Mark(1)
emptyRole.WithLabelValues("roleName").Inc()
return http.StatusNotFound, EmptyRoleError
}

fmt.Fprint(w, role)
metrics.GetOrRegisterMeter("roleNameHandler.success", metrics.DefaultRegistry).Mark(1)
success.WithLabelValues("roleName").Inc()

return http.StatusOK, nil
}
Expand Down
Loading

0 comments on commit ecf1dd2

Please sign in to comment.