Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GEP-19] Adapt monitoring configuration #307

Merged
merged 2 commits into from
May 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/renovate.json5
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
"separateMinorPatch": true,
"packageRules": [
{
// Group calico image updates in one PR.
// Group cilium image updates in one PR.
"groupName": "cilium images",
"matchDatasources": ["docker"],
"matchPackagePatterns": ["quay\\.io\/cilium\/.+"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ rules:
- admissionregistration.k8s.io
- apiextensions.k8s.io
- networking.k8s.io
- monitoring.coreos.com
resources:
- namespaces
- events
Expand All @@ -85,6 +86,9 @@ rules:
- mutatingwebhookconfigurations
- customresourcedefinitions
- networkpolicies
- scrapeconfigs
- prometheusrules
- statefulsets # TODO(rfranzke): Remove this after August 2024.
verbs:
- "*"
---
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"datasource": "-- Plutono --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
Expand Down Expand Up @@ -8327,4 +8327,4 @@
"title": "Cilium v1.12 Agent Metrics",
"uid": "dtas",
"version": 3
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"datasource": "-- Plutono --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
Expand Down Expand Up @@ -1005,4 +1005,4 @@
"title": "Cilium v1.12 Operator Metrics",
"uid": "fsafdsf",
"version": 1
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"datasource": "-- Plutono --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
Expand Down Expand Up @@ -3568,4 +3568,4 @@
"title": "Cilium v1.12 Hubble Metrics",
"uid": "seafadsfdsa",
"version": 1
}
}
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
{{- if not .Values.gep19Monitoring }}
apiVersion: v1
kind: ConfigMap
metadata:
Expand Down Expand Up @@ -125,3 +126,4 @@ data:
expr: histogram_quantile(0.99,sum by(verb, method, path, le) (rate(cilium_agent_api_process_time_seconds_bucket[10m])))
labels:
quantile: "0.99"
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{{- if .Values.gep19Monitoring }}
apiVersion: v1
kind: ConfigMap
metadata:
name: cilium-dashboards
namespace: {{ .Release.Namespace }}
labels:
dashboard.monitoring.gardener.cloud/shoot: "true"
data:
cilium-agent-metrics-dashboard.json: |-
{{- .Files.Get "cilium-agent-metrics-dashboard.json" | nindent 4 }}

cilium-operator-metrics--dashboard.json: |-
{{- .Files.Get "cilium-operator-metrics-dashboard.json" | nindent 4 }}

hubble-dashboard-metrics-dashboard.json: |-
{{- .Files.Get "hubble-metrics-dashboard.json" | nindent 4 }}
rfranzke marked this conversation as resolved.
Show resolved Hide resolved
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{{- if .Values.gep19Monitoring }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: shoot-cilium-agent
namespace: {{ .Release.Namespace }}
labels:
prometheus: shoot
spec:
groups:
- name: recording-rules.rules
rules:
- record: cilium:api_latency
expr: histogram_quantile(0.99,sum by(verb, method, path, le) (rate(cilium_agent_api_process_time_seconds_bucket[10m])))
labels:
quantile: "0.99"
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
{{- if .Values.gep19Monitoring }}
apiVersion: monitoring.coreos.com/v1alpha1
kind: ScrapeConfig
metadata:
name: shoot-cilium-agent
namespace: {{ .Release.Namespace }}
labels:
prometheus: shoot
spec:
authorization:
credentials:
name: shoot-access-prometheus-shoot
key: token
scheme: HTTPS
tlsConfig:
# This is needed because the kubelets' certificates are not generated
# for a specific pod IP
insecureSkipVerify: true
kubernetesSDConfigs:
- apiServer: https://kube-apiserver
authorization:
credentials:
name: shoot-access-prometheus-shoot
key: token
followRedirects: true
namespaces:
names:
- kube-system
role: endpoints
tlsConfig:
# This is needed because we do not fetch the correct cluster CA bundle right now
insecureSkipVerify: true
metricRelabelings:
- sourceLabels:
- __name__
action: keep
regex: ^(cilium_process_virtual_memory_bytes|cilium_process_resident_memory_bytes|cilium_process_open_fds|cilium_bpf_maps_virtual_memory_max_bytes|cilium_bpf_progs_virtual_memory_max_bytes|cilium_agent_api_process_time_seconds_count|cilium_agent_api_process_time_seconds_sum|cilium_agent_api_process_time_seconds_bucket|cilium_bpf_syscall_duration_seconds_count|cilium_bpf_syscall_duration_seconds_sum|cilium_bpf_map_ops_total|kvstore_operations_total|cilium_kvstore_operations_duration_seconds_sum|cilium_kvstore_events_queue_seconds_count|cilium_forward_count_total|cilium_forward_bytes_total|cilium_datapath_conntrack_gc_entries|cilium_ip_addresses|cilium_datapath_errors_total|cilium_services_events_total|cilium_unreachable_health_endpoints|cilium_unreachable_nodes|cilium_unreachable_health_endpoints|cilium_drop_count_total|cilium_nodes_all_events_received_total|cilium_drop_bytes_total|cilium_nodes_all_num|cilium_policy_l7_denied_total|cilium_policy_l7_forwarded_total|cilium_policy_l7_received_total|cilium_policy_l7_parse_errors_total|cilium_proxy_upstream_reply_seconds_sum|cilium_proxy_upstream_reply_seconds_count|cilium_triggers_policy_update_call_duration_seconds_sum|cilium_policy_endpoint_enforcement_status|cilium_proxy_redirects|cilium_triggers_policy_update_total|cilium_policy_count|cilium_policy_import_errors|cilium_policy_max_revision|cilium_endfpoint_regeneration_time_stats_seconds_bucket|cilium_endpoint_regenerations|cilium_endpoint_state|cilium_controllers_runs_total|cilium_controllers_failing|cilium_controllers_runs_duration_seconds_sum|cilium_controllers_runs_duration_seconds_count|cilium_k8s_client_api_latency_time_seconds_sum|cilium_k8s_client_api_latency_time_seconds_count|cilium_k8s_client_api_calls_counter|cilium_kubernetes_events_received_total|cilium_kubernetes_events_total|cilium_process_cpu_seconds_total|cilium_errors_warnings_total|cilium_endpoint_regeneration_time_stats_seconds_bucket)$
- sourceLabels:
- namespace
action: keep
regex: kube-system
relabelings:
- action: replace
replacement: cilium-agent-metrics
targetLabel: job
- sourceLabels: [__meta_kubernetes_pod_label_k8s_app,__meta_kubernetes_service_annotation_prometheus_io_scrape]
separator: ;
regex: cilium;true
replacement: $1
action: keep
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- sourceLabels: [__meta_kubernetes_pod_name]
separator: ;
regex: (.*)
targetLabel: pod
replacement: $1
action: replace
- targetLabel: __address__
replacement: kube-apiserver:443
# Rewrite the metrics path so the pod is scraped through the kube-apiserver proxy.
# Only the pod name is used as a source label, so the regex needs exactly one
# capture group; a two-group pattern such as "(.+);(.+)" never matches a single
# label value and would leave __metrics_path__ unchanged.
- sourceLabels: [__meta_kubernetes_pod_name]
separator: ;
regex: (.+)
targetLabel: __metrics_path__
replacement: /api/v1/namespaces/kube-system/pods/${1}:9090/proxy/metrics
action: replace
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
{{- if .Values.gep19Monitoring }}
apiVersion: monitoring.coreos.com/v1alpha1
kind: ScrapeConfig
metadata:
name: shoot-cilium-hubble
namespace: {{ .Release.Namespace }}
labels:
prometheus: shoot
spec:
authorization:
credentials:
name: shoot-access-prometheus-shoot
key: token
scheme: HTTPS
tlsConfig:
# This is needed because the kubelets' certificates are not generated
# for a specific pod IP
insecureSkipVerify: true
kubernetesSDConfigs:
- apiServer: https://kube-apiserver
authorization:
credentials:
name: shoot-access-prometheus-shoot
key: token
followRedirects: true
namespaces:
names:
- kube-system
role: endpoints
tlsConfig:
# This is needed because we do not fetch the correct cluster CA bundle right now
insecureSkipVerify: true
metricRelabelings:
- sourceLabels:
- __name__
action: keep
regex: ^(hubble_flows_processed_total|hubble_drop_total|hubble_port_distribution_total|hubble_tcp_flags_total|hubble_icmp_total)$
- sourceLabels:
- namespace
action: keep
regex: kube-system
relabelings:
- action: replace
replacement: hubble-metrics
targetLabel: job
- sourceLabels: [__meta_kubernetes_pod_label_k8s_app,__meta_kubernetes_service_annotation_prometheus_io_scrape]
separator: ;
regex: cilium;true
replacement: $1
action: keep
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- sourceLabels: [__meta_kubernetes_pod_name]
separator: ;
regex: (.*)
targetLabel: pod
replacement: $1
action: replace
- targetLabel: __address__
replacement: kube-apiserver:443
# Rewrite the metrics path so the pod is scraped through the kube-apiserver proxy.
# Only the pod name is used as a source label, so the regex needs exactly one
# capture group; a two-group pattern such as "(.+);(.+)" never matches a single
# label value and would leave __metrics_path__ unchanged.
- sourceLabels: [__meta_kubernetes_pod_name]
separator: ;
regex: (.+)
targetLabel: __metrics_path__
replacement: /api/v1/namespaces/kube-system/pods/${1}:9091/proxy/metrics
action: replace
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
{{- if .Values.gep19Monitoring }}
apiVersion: monitoring.coreos.com/v1alpha1
kind: ScrapeConfig
metadata:
name: shoot-cilium-operator
namespace: {{ .Release.Namespace }}
labels:
prometheus: shoot
spec:
authorization:
credentials:
name: shoot-access-prometheus-shoot
key: token
scheme: HTTPS
tlsConfig:
# This is needed because the kubelets' certificates are not generated
# for a specific pod IP
insecureSkipVerify: true
kubernetesSDConfigs:
- apiServer: https://kube-apiserver
authorization:
credentials:
name: shoot-access-prometheus-shoot
key: token
followRedirects: true
namespaces:
names:
- kube-system
role: endpoints
tlsConfig:
# This is needed because we do not fetch the correct cluster CA bundle right now
insecureSkipVerify: true
metricRelabelings:
- sourceLabels:
- __name__
action: keep
regex: ^(cilium_operator_process_cpu_seconds_total|cilium_operator_process_resident_memory_bytes)$
- sourceLabels:
- namespace
action: keep
regex: kube-system
relabelings:
- action: replace
replacement: cilium-operator-metrics
targetLabel: job
- sourceLabels: [__meta_kubernetes_pod_label_io_cilium_app,__meta_kubernetes_pod_annotation_prometheus_io_scrape]
separator: ;
regex: operator;true
replacement: $1
action: keep
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- sourceLabels: [__meta_kubernetes_pod_name]
separator: ;
regex: (.*)
targetLabel: pod
replacement: $1
action: replace
- targetLabel: __address__
replacement: kube-apiserver:443
- sourceLabels: [__meta_kubernetes_pod_name,__meta_kubernetes_pod_annotation_prometheus_io_port]
separator: ;
regex: (.+);(.+)
targetLabel: __metrics_path__
replacement: /api/v1/namespaces/kube-system/pods/${1}:${2}/proxy/metrics
action: replace
{{- end }}
3 changes: 3 additions & 0 deletions charts/internal/cilium-monitoring/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,6 @@ allowedMetrics:
ciliumOperator:
- cilium_operator_process_cpu_seconds_total
- cilium_operator_process_resident_memory_bytes

# TODO(rfranzke): Remove this field after August 2024.
gep19Monitoring: false
9 changes: 9 additions & 0 deletions cmd/gardener-extension-networking-cilium/app/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ import (
v1beta1constants "github.com/gardener/gardener/pkg/apis/core/v1beta1/constants"
"github.com/gardener/gardener/pkg/logger"
"github.com/pkg/errors"
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
monitoringv1alpha1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1alpha1"
"github.com/spf13/cobra"
corev1 "k8s.io/api/core/v1"
"k8s.io/component-base/version"
Expand Down Expand Up @@ -140,6 +142,13 @@ func NewControllerManagerCommand(ctx context.Context) *cobra.Command {
return fmt.Errorf("could not update manager scheme: %w", err)
}

if err := monitoringv1.AddToScheme(mgr.GetScheme()); err != nil {
return fmt.Errorf("could not update manager scheme: %w", err)
}
if err := monitoringv1alpha1.AddToScheme(mgr.GetScheme()); err != nil {
return fmt.Errorf("could not update manager scheme: %w", err)
}

reconcileOpts.Completed().Apply(&ciliumcontroller.DefaultAddOptions.IgnoreOperationAnnotation)
ciliumCtrlOpts.Completed().Apply(&ciliumcontroller.DefaultAddOptions.Controller)
configFileOpts.Completed().ApplyHealthCheckConfig(&healthcheck.AddOptions.HealthCheckConfig)
Expand Down
Loading