Skip to content

Commit

Permalink
Adapt Prometheus scrape configs and rules and Plutono dashboards
Browse files Browse the repository at this point in the history
  • Loading branch information
rfranzke committed May 7, 2024
1 parent 44140d6 commit 6e2ae72
Show file tree
Hide file tree
Showing 13 changed files with 288 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ rules:
- mutatingwebhookconfigurations
- customresourcedefinitions
- networkpolicies
- statefulsets # TODO(rfranzke): Remove this after August 2024.
verbs:
- "*"
---
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
{{- if not .Values.gep19Monitoring }}
apiVersion: v1
kind: ConfigMap
metadata:
Expand Down Expand Up @@ -125,3 +126,4 @@ data:
expr: histogram_quantile(0.99,sum by(verb, method, path, le) (rate(cilium_agent_api_process_time_seconds_bucket[10m])))
labels:
quantile: "0.99"
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{{- if .Values.gep19Monitoring }}
# ConfigMap bundling the Cilium dashboards. It is only rendered when
# `.Values.gep19Monitoring` is set. Each data key carries the same name as the
# chart file whose content it embeds.
apiVersion: v1
kind: ConfigMap
metadata:
  name: cilium-dashboards
  namespace: {{ .Release.Namespace }}
  labels:
    dashboard.monitoring.gardener.cloud/shoot: "true"
data:
  # `{{-` trims the preceding newline and `nindent 4` re-adds it together with
  # the indentation required for the block scalar content.
  cilium-agent-metrics-dashboard.json: |-
    {{- .Files.Get "cilium-agent-metrics-dashboard.json" | nindent 4 }}

  cilium-operator-metrics-dashboard.json: |-
    {{- .Files.Get "cilium-operator-metrics-dashboard.json" | nindent 4 }}

  hubble-metrics-dashboard.json: |-
    {{- .Files.Get "hubble-metrics-dashboard.json" | nindent 4 }}
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{{- if .Values.gep19Monitoring }}
# PrometheusRule with the recording rule for the cilium agent API latency.
# Only rendered when `.Values.gep19Monitoring` is set.
kind: PrometheusRule
apiVersion: monitoring.coreos.com/v1
metadata:
  labels:
    prometheus: shoot
  name: shoot-cilium-agent
  namespace: {{ .Release.Namespace }}
spec:
  groups:
    - name: recording-rules.rules
      rules:
        # 0.99 quantile of the agent API processing time over a 10m rate
        # window, aggregated by verb, method and path.
        - record: cilium:api_latency
          labels:
            quantile: "0.99"
          expr: histogram_quantile(0.99,sum by(verb, method, path, le) (rate(cilium_agent_api_process_time_seconds_bucket[10m])))
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
{{- if .Values.gep19Monitoring }}
# ScrapeConfig for the cilium agent metrics (job `cilium-agent-metrics`).
# Only rendered when `.Values.gep19Monitoring` is set.
# Targets are discovered from the endpoints in `kube-system` and scraped
# through the kube-apiserver pod proxy on port 9090.
apiVersion: monitoring.coreos.com/v1alpha1
kind: ScrapeConfig
metadata:
  name: shoot-cilium-agent
  namespace: {{ .Release.Namespace }}
  labels:
    prometheus: shoot
spec:
  authorization:
    credentials:
      name: shoot-access-prometheus-shoot
      key: token
  scheme: HTTPS
  tlsConfig:
    # This is needed because the kubelets' certificates are not generated
    # for a specific pod IP
    insecureSkipVerify: true
  kubernetesSDConfigs:
    - apiServer: https://kube-apiserver
      authorization:
        credentials:
          name: shoot-access-prometheus-shoot
          key: token
      followRedirects: true
      namespaces:
        names:
          - kube-system
      role: endpoints
      tlsConfig:
        # This is needed because we do not fetch the correct cluster CA bundle right now
        insecureSkipVerify: true
  metricRelabelings:
    # Allow-list of metric names; everything else is dropped.
    # NOTE(review): `kvstore_operations_total` is the only entry without the
    # `cilium_` prefix - confirm the metric name.
    - sourceLabels:
        - __name__
      action: keep
      regex: ^(cilium_process_virtual_memory_bytes|cilium_process_resident_memory_bytes|cilium_process_open_fds|cilium_bpf_maps_virtual_memory_max_bytes|cilium_bpf_progs_virtual_memory_max_bytes|cilium_agent_api_process_time_seconds_count|cilium_agent_api_process_time_seconds_sum|cilium_agent_api_process_time_seconds_bucket|cilium_bpf_syscall_duration_seconds_count|cilium_bpf_syscall_duration_seconds_sum|cilium_bpf_map_ops_total|kvstore_operations_total|cilium_kvstore_operations_duration_seconds_sum|cilium_kvstore_events_queue_seconds_count|cilium_forward_count_total|cilium_forward_bytes_total|cilium_datapath_conntrack_gc_entries|cilium_ip_addresses|cilium_datapath_errors_total|cilium_services_events_total|cilium_unreachable_health_endpoints|cilium_unreachable_nodes|cilium_drop_count_total|cilium_nodes_all_events_received_total|cilium_drop_bytes_total|cilium_nodes_all_num|cilium_policy_l7_denied_total|cilium_policy_l7_forwarded_total|cilium_policy_l7_received_total|cilium_policy_l7_parse_errors_total|cilium_proxy_upstream_reply_seconds_sum|cilium_proxy_upstream_reply_seconds_count|cilium_triggers_policy_update_call_duration_seconds_sum|cilium_policy_endpoint_enforcement_status|cilium_proxy_redirects|cilium_triggers_policy_update_total|cilium_policy_count|cilium_policy_import_errors|cilium_policy_max_revision|cilium_endpoint_regenerations|cilium_endpoint_state|cilium_controllers_runs_total|cilium_controllers_failing|cilium_controllers_runs_duration_seconds_sum|cilium_controllers_runs_duration_seconds_count|cilium_k8s_client_api_latency_time_seconds_sum|cilium_k8s_client_api_latency_time_seconds_count|cilium_k8s_client_api_calls_counter|cilium_kubernetes_events_received_total|cilium_kubernetes_events_total|cilium_process_cpu_seconds_total|cilium_errors_warnings_total|cilium_endpoint_regeneration_time_stats_seconds_bucket)$
    # Keep only series whose `namespace` label is `kube-system`.
    - sourceLabels:
        - namespace
      action: keep
      regex: kube-system
  relabelings:
    - action: replace
      replacement: cilium-agent-metrics
      targetLabel: job
    # Keep only targets whose two source labels join to `cilium;true`.
    - sourceLabels: [__meta_kubernetes_pod_label_k8s_app,__meta_kubernetes_service_annotation_prometheus_io_scrape]
      separator: ;
      regex: cilium;true
      replacement: $1
      action: keep
    - action: labelmap
      regex: __meta_kubernetes_pod_label_(.+)
    - sourceLabels: [__meta_kubernetes_pod_name]
      separator: ;
      regex: (.*)
      targetLabel: pod
      replacement: $1
      action: replace
    # All scrapes go to the kube-apiserver, which proxies to the pod.
    - targetLabel: __address__
      replacement: kube-apiserver:443
    # Only the pod name is a source label here (the port is fixed to 9090), so
    # the regex must have a single capture group. A two-group `(.+);(.+)`
    # pattern can never match a lone pod name and would leave
    # `__metrics_path__` unchanged.
    - sourceLabels: [__meta_kubernetes_pod_name]
      separator: ;
      regex: (.+)
      targetLabel: __metrics_path__
      replacement: /api/v1/namespaces/kube-system/pods/${1}:9090/proxy/metrics
      action: replace
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
{{- if .Values.gep19Monitoring }}
# ScrapeConfig for the hubble metrics (job `hubble-metrics`).
# Only rendered when `.Values.gep19Monitoring` is set.
# Targets are discovered from the endpoints in `kube-system` and scraped
# through the kube-apiserver pod proxy on port 9091.
apiVersion: monitoring.coreos.com/v1alpha1
kind: ScrapeConfig
metadata:
  name: shoot-cilium-hubble
  namespace: {{ .Release.Namespace }}
  labels:
    prometheus: shoot
spec:
  authorization:
    credentials:
      name: shoot-access-prometheus-shoot
      key: token
  scheme: HTTPS
  tlsConfig:
    # This is needed because the kubelets' certificates are not generated
    # for a specific pod IP
    insecureSkipVerify: true
  kubernetesSDConfigs:
    - apiServer: https://kube-apiserver
      authorization:
        credentials:
          name: shoot-access-prometheus-shoot
          key: token
      followRedirects: true
      namespaces:
        names:
          - kube-system
      role: endpoints
      tlsConfig:
        # This is needed because we do not fetch the correct cluster CA bundle right now
        insecureSkipVerify: true
  metricRelabelings:
    # Allow-list of metric names; everything else is dropped.
    - sourceLabels:
        - __name__
      action: keep
      regex: ^(hubble_flows_processed_total|hubble_drop_total|hubble_port_distribution_total|hubble_tcp_flags_total|hubble_icmp_total)$
    # Keep only series whose `namespace` label is `kube-system`.
    - sourceLabels:
        - namespace
      action: keep
      regex: kube-system
  relabelings:
    - action: replace
      replacement: hubble-metrics
      targetLabel: job
    # Keep only targets whose two source labels join to `cilium;true`.
    - sourceLabels: [__meta_kubernetes_pod_label_k8s_app,__meta_kubernetes_service_annotation_prometheus_io_scrape]
      separator: ;
      regex: cilium;true
      replacement: $1
      action: keep
    - action: labelmap
      regex: __meta_kubernetes_pod_label_(.+)
    - sourceLabels: [__meta_kubernetes_pod_name]
      separator: ;
      regex: (.*)
      targetLabel: pod
      replacement: $1
      action: replace
    # All scrapes go to the kube-apiserver, which proxies to the pod.
    - targetLabel: __address__
      replacement: kube-apiserver:443
    # Only the pod name is a source label here (the port is fixed to 9091), so
    # the regex must have a single capture group. A two-group `(.+);(.+)`
    # pattern can never match a lone pod name and would leave
    # `__metrics_path__` unchanged.
    - sourceLabels: [__meta_kubernetes_pod_name]
      separator: ;
      regex: (.+)
      targetLabel: __metrics_path__
      replacement: /api/v1/namespaces/kube-system/pods/${1}:9091/proxy/metrics
      action: replace
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
{{- if .Values.gep19Monitoring }}
# ScrapeConfig for the cilium operator metrics (job `cilium-operator-metrics`).
# Only rendered when `.Values.gep19Monitoring` is set.
# Targets are discovered from the endpoints in `kube-system` and scraped
# through the kube-apiserver pod proxy, using the port taken from the
# `__meta_kubernetes_pod_annotation_prometheus_io_port` label.
kind: ScrapeConfig
apiVersion: monitoring.coreos.com/v1alpha1
metadata:
  labels:
    prometheus: shoot
  name: shoot-cilium-operator
  namespace: {{ .Release.Namespace }}
spec:
  scheme: HTTPS
  authorization:
    credentials:
      key: token
      name: shoot-access-prometheus-shoot
  tlsConfig:
    # Verification is skipped because the kubelets' certificates are not
    # generated for a specific pod IP.
    insecureSkipVerify: true
  kubernetesSDConfigs:
    - role: endpoints
      apiServer: https://kube-apiserver
      followRedirects: true
      namespaces:
        names:
          - kube-system
      authorization:
        credentials:
          key: token
          name: shoot-access-prometheus-shoot
      tlsConfig:
        # Verification is skipped because the correct cluster CA bundle is
        # not fetched right now.
        insecureSkipVerify: true
  metricRelabelings:
    # Allow-list of operator metric names; everything else is dropped.
    - action: keep
      sourceLabels:
        - __name__
      regex: ^(cilium_operator_process_cpu_seconds_total|cilium_operator_process_resident_memory_bytes)$
    # Keep only series whose `namespace` label is `kube-system`.
    - action: keep
      sourceLabels:
        - namespace
      regex: kube-system
  relabelings:
    # Static job name.
    - targetLabel: job
      action: replace
      replacement: cilium-operator-metrics
    # Keep only targets whose two source labels join to `operator;true`.
    - action: keep
      sourceLabels:
        - __meta_kubernetes_pod_label_io_cilium_app
        - __meta_kubernetes_pod_annotation_prometheus_io_scrape
      separator: ;
      regex: operator;true
      replacement: $1
    # Copy every pod label onto the target.
    - regex: __meta_kubernetes_pod_label_(.+)
      action: labelmap
    # Expose the pod name as the `pod` label.
    - action: replace
      sourceLabels:
        - __meta_kubernetes_pod_name
      separator: ;
      regex: (.*)
      replacement: $1
      targetLabel: pod
    # All scrapes go to the kube-apiserver, which proxies to the pod.
    - replacement: kube-apiserver:443
      targetLabel: __address__
    # Build the pod-proxy path from the pod name (${1}) and the port (${2}).
    - action: replace
      sourceLabels:
        - __meta_kubernetes_pod_name
        - __meta_kubernetes_pod_annotation_prometheus_io_port
      separator: ;
      regex: (.+);(.+)
      replacement: /api/v1/namespaces/kube-system/pods/${1}:${2}/proxy/metrics
      targetLabel: __metrics_path__
{{- end }}
3 changes: 3 additions & 0 deletions charts/internal/cilium-monitoring/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,6 @@ allowedMetrics:
ciliumOperator:
- cilium_operator_process_cpu_seconds_total
- cilium_operator_process_resident_memory_bytes

# TODO(rfranzke): Remove this field after August 2024.
gep19Monitoring: false
9 changes: 9 additions & 0 deletions cmd/gardener-extension-networking-cilium/app/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ import (
v1beta1constants "github.com/gardener/gardener/pkg/apis/core/v1beta1/constants"
"github.com/gardener/gardener/pkg/logger"
"github.com/pkg/errors"
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
monitoringv1alpha1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1alpha1"
"github.com/spf13/cobra"
corev1 "k8s.io/api/core/v1"
"k8s.io/component-base/version"
Expand Down Expand Up @@ -140,6 +142,13 @@ func NewControllerManagerCommand(ctx context.Context) *cobra.Command {
return fmt.Errorf("could not update manager scheme: %w", err)
}

if err := monitoringv1.AddToScheme(mgr.GetScheme()); err != nil {
return fmt.Errorf("could not update manager scheme: %w", err)
}
if err := monitoringv1alpha1.AddToScheme(mgr.GetScheme()); err != nil {
return fmt.Errorf("could not update manager scheme: %w", err)
}

reconcileOpts.Completed().Apply(&ciliumcontroller.DefaultAddOptions.IgnoreOperationAnnotation)
ciliumCtrlOpts.Completed().Apply(&ciliumcontroller.DefaultAddOptions.Controller)
configFileOpts.Completed().ApplyHealthCheckConfig(&healthcheck.AddOptions.HealthCheckConfig)
Expand Down
Loading

0 comments on commit 6e2ae72

Please sign in to comment.