Skip to content

Commit

Permalink
cilium: add metrics collection for signal package
Browse files Browse the repository at this point in the history
Add metrics in order to ease debugging/introspection of signal events.

Example output:

  # ./cilium/cilium metrics list | grep cilium_datapath_signals
  cilium_datapath_signals_handled_total              status="lost" data="" signal=""                                              1.000000
  cilium_datapath_signals_handled_total              signal="nat_fill_up" status="received" data="ipv4"                           4.000000

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
  • Loading branch information
borkmann authored and brb committed Oct 11, 2019
1 parent 617ba34 commit cf342e1
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 0 deletions.
25 changes: 25 additions & 0 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,12 @@ const (
// LabelProtocol marks the L4 protocol (TCP, ANY) for the metric.
LabelProtocol = "protocol"

// LabelSignalType marks the signal name
LabelSignalType = "signal"

// LabelSignalData marks the signal data
LabelSignalData = "data"

// LabelStatus the label from completed task
LabelStatus = "status"

Expand Down Expand Up @@ -296,6 +302,11 @@ var (
// ConntrackGCDuration the duration of the conntrack GC process in milliseconds.
ConntrackGCDuration = NoOpObserverVec

// Signals

// SignalsHandled is the number of datapath signals handled, labeled by signal type, data and status.
SignalsHandled = NoOpCounterVec

// Services

// ServicesCount number of services
Expand Down Expand Up @@ -410,6 +421,7 @@ type Configuration struct {
ConntrackGCKeyFallbacksEnabled bool
ConntrackGCSizeEnabled bool
ConntrackGCDurationEnabled bool
SignalsHandledEnabled bool
ServicesCountEnabled bool
ErrorsWarningsEnabled bool
ControllerRunsEnabled bool
Expand Down Expand Up @@ -462,6 +474,7 @@ func DefaultMetrics() map[string]struct{} {
Namespace + "_" + SubsystemDatapath + "_conntrack_gc_key_fallbacks_total": {},
Namespace + "_" + SubsystemDatapath + "_conntrack_gc_entries": {},
Namespace + "_" + SubsystemDatapath + "_conntrack_gc_duration_seconds": {},
Namespace + "_" + SubsystemDatapath + "_signals_handled_total": {},
Namespace + "_services_events_total": {},
Namespace + "_errors_warnings_total": {},
Namespace + "_controllers_runs_total": {},
Expand Down Expand Up @@ -820,6 +833,18 @@ func CreateConfiguration(metricsEnabled []string) (Configuration, []prometheus.C
collectors = append(collectors, ConntrackGCDuration)
c.ConntrackGCDurationEnabled = true

case Namespace + "_" + SubsystemDatapath + "_signals_handled_total":
SignalsHandled = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: Namespace,
Subsystem: SubsystemDatapath,
Name: "signals_handled_total",
Help: "Number of times that the datapath signal handler process was run " +
"labeled by signal type, data and completion status",
}, []string{LabelSignalType, LabelSignalData, LabelStatus})

collectors = append(collectors, SignalsHandled)
c.SignalsHandledEnabled = true

case Namespace + "_services_events_total":
ServicesCount = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: Namespace,
Expand Down
29 changes: 29 additions & 0 deletions pkg/signal/signal.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"github.com/cilium/cilium/pkg/byteorder"
"github.com/cilium/cilium/pkg/logging"
"github.com/cilium/cilium/pkg/logging/logfields"
"github.com/cilium/cilium/pkg/metrics"
)

const (
Expand All @@ -43,6 +44,7 @@ const (
SignalNatV4 = iota
// SignalNatV6 denotes NAT IPv6 table
SignalNatV6
SignalNatMax
)

// SignalData holds actual data the BPF program sent along with
Expand Down Expand Up @@ -70,6 +72,29 @@ var (
channels [SignalTypeMax]chan<- SignalData
)

// signalName holds, per signal type, the string reported as the signal
// label of the signals metric. Signal types without an entry map to "".
var signalName = [SignalTypeMax]string{
	SignalNatFillUp: "nat_fill_up",
}

// signalNatProto holds, per NAT signal data value, the string reported as
// the data label of the signals metric for SignalNatFillUp signals.
var signalNatProto = [SignalNatMax]string{
	SignalNatV4: "ipv4",
	SignalNatV6: "ipv6",
}

// signalCollectMetrics increments the SignalsHandled counter for one signal
// event.
//
// sig is the decoded signal message, or nil when no message is available
// (as for lost events and read errors); with a nil sig the signal and data
// labels are left empty. signalStatus is used verbatim as the status label.
//
// Both the signal type and the signal data are range-checked before they are
// used to index the name tables, so an unexpected value results in an empty
// label instead of an index-out-of-range panic.
func signalCollectMetrics(sig *SignalMsg, signalStatus string) {
	signalType := ""
	signalData := ""
	if sig != nil {
		// Convert to int once so the bounds checks work regardless of the
		// underlying integer type of the message fields.
		which := int(sig.Which)
		if which >= 0 && which < len(signalName) {
			signalType = signalName[which]
		}
		if sig.Which == SignalNatFillUp {
			// Only NAT fill-up signals carry a protocol in their data.
			data := int(sig.Data)
			if data >= 0 && data < len(signalNatProto) {
				signalData = signalNatProto[data]
			}
		}
	}
	metrics.SignalsHandled.WithLabelValues(signalType, signalData, signalStatus).Inc()
}

func signalReceive(msg *bpf.PerfEventSample, cpu int) {
sig := SignalMsg{}
if err := binary.Read(bytes.NewReader(msg.DataDirect()), byteorder.Native, &sig); err != nil {
Expand All @@ -78,15 +103,19 @@ func signalReceive(msg *bpf.PerfEventSample, cpu int) {
}
if channels[sig.Which] != nil {
channels[sig.Which] <- sig.Data
signalCollectMetrics(&sig, "received")
}
}

// signalLost accounts for lost signal perf events in the signal metrics.
// Neither lost nor cpu is inspected.
func signalLost(lost *bpf.PerfEventLost, cpu int) {
	// With the current set of signals a lost event is non-fatal, so apart
	// from counting it under the "lost" status nothing else is done.
	const status = "lost"
	signalCollectMetrics(nil, status)
}

// signalError counts a perf ring buffer read failure under the "error"
// status and then logs the event's debug output as a bug.
func signalError(err *bpf.PerfEvent) {
	const status = "error"
	signalCollectMetrics(nil, status)

	debugInfo := err.Debug()
	log.Errorf("BUG: Timeout while reading signal perf ring buffer: %s", debugInfo)
}

Expand Down

0 comments on commit cf342e1

Please sign in to comment.