Skip to content

Commit

Permalink
Improve observability (#21)
Browse files Browse the repository at this point in the history
* Add k8sattributes processor to otel collector

* Fix monitoring part of kubernetes internals

* Fix adding k8s metadata to observability

* Add k8s metadata to telemetry

* fix collector affinity

* WIP relabel prometheus metrics

* Align metadata naming

* Fix linking traces to logs

* Monitor jupyter kernels directly - auto-instrumentation does not work

* Add dashboards

* Add dashboards (broken due to GitHub auth and branch)

---------

Co-authored-by: Frédéric Collonval <fcollonval@users.noreply.github.com>
  • Loading branch information
fcollonval and fcollonval authored Aug 9, 2024
1 parent a924b4d commit 664f10c
Show file tree
Hide file tree
Showing 10 changed files with 6,609 additions and 110 deletions.
1,183 changes: 1,183 additions & 0 deletions charts/datalayer-observer/dashboards/datalayer-service.json

Large diffs are not rendered by default.

2,796 changes: 2,796 additions & 0 deletions charts/datalayer-observer/dashboards/remote-kernels-details.json

Large diffs are not rendered by default.

2,317 changes: 2,317 additions & 0 deletions charts/datalayer-observer/dashboards/remote-kernels-overview.json

Large diffs are not rendered by default.

44 changes: 44 additions & 0 deletions charts/datalayer-observer/templates/collector-servicemonitor.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
{{- if .Values.collector.serviceMonitor.enabled }}
# ServiceMonitor for the OpenTelemetry collector.
# Rendered only when collector.serviceMonitor.enabled is set.
# It declares two scrape endpoints on the collector service:
#   * "metrics"    - first endpoint, no relabeling
#   * "prometheus" - second endpoint, relabeled so the original labels of the
#                    collected metrics replace the scrape-target labels
#
# NOTE(review): sprig `default` returns its last argument whenever that
# argument is non-empty, so the namespace expression used below resolves to
# the release namespace unless it is empty - confirm this precedence is intended.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: {{ default .Values.observer.namespace .Release.Namespace }}-opentelemetry-collector
  namespace: {{ default .Values.observer.namespace .Release.Namespace }}
  labels:
    app: opentelemetry-collector
    monitoring.datalayer.io/enabled: "true"
    monitoring.datalayer.io/instance: observer
    {{- with .Values.collector.serviceMonitor.additionalLabels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
spec:
  # Only look for the service in the observer namespace.
  namespaceSelector:
    matchNames:
      - {{ default .Values.observer.namespace .Release.Namespace }}
  selector:
    matchLabels:
      app.kubernetes.io/component: opentelemetry-collector
      app.kubernetes.io/name: datalayer-collector-collector
  endpoints:
    # Collector metrics monitoring
    - port: metrics
      {{- if .Values.collector.serviceMonitor.interval }}
      interval: {{ .Values.collector.serviceMonitor.interval }}
      {{- end }}
    # Scraping of collected metrics
    - port: prometheus
      {{- if .Values.collector.serviceMonitor.interval }}
      interval: {{ .Values.collector.serviceMonitor.interval }}
      {{- end }}
      # Drop the target labels that would describe the collector itself.
      relabelings:
        - action: labeldrop
          regex: '(container|endpoint|namespace|pod|service)'
      metricRelabelings:
        - action: labeldrop
          regex: '(job|instance)'
        # Restore original label (hide the collector proxy):
        # copy every exported_<name> label to <name>, then drop the exported_ copy.
        - action: labelmap
          regex: 'exported_(.*)'
        - action: labeldrop
          regex: 'exported_(.*)'
{{- end }}
113 changes: 107 additions & 6 deletions charts/datalayer-observer/templates/collector.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,18 @@ metadata:
affinity: {{- toYaml . | nindent 8 }}
{{- end }}
spec:
# The opentelemetry-collector chart is a good place to look for good configuration
# Reference: https://github.com/open-telemetry/opentelemetry-helm-charts/tree/main/charts/opentelemetry-collector
mode: deployment # This configuration is omittable.
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: role.datalayer.io/system
operator: In
values:
- "true"
config:
receivers:
otlp:
Expand All @@ -22,12 +33,102 @@ spec:
# https://github.com/open-telemetry/opentelemetry-collector/tree/main/processor/batchprocessor
batch: {}
# https://github.com/open-telemetry/opentelemetry-collector/tree/main/processor/memorylimiterprocessor
# Default memory limiter configuration for the collector based on k8s resource limits.
memory_limiter:
check_interval: 1s
limit_mib: 4000
spike_limit_mib: 800
# check_interval is the time between measurements of memory usage.
check_interval: 5s
# By default limit_mib is set to 80% of ".Values.resources.limits.memory"
limit_percentage: 80
# By default spike_limit_mib is set to 25% of ".Values.resources.limits.memory"
spike_limit_percentage: 25

# https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/k8sattributesprocessor
k8sattributes: {}
k8sattributes:
  extract:
    # Kubernetes metadata attached as resource attributes to the telemetry.
    metadata:
      - k8s.namespace.name
      - k8s.deployment.name
      - k8s.statefulset.name
      - k8s.daemonset.name
      - k8s.cronjob.name
      - k8s.job.name
      - k8s.node.name
      - k8s.pod.name
      - k8s.pod.uid
      - k8s.pod.start_time
    # Pod labels copied to resource attributes (tag_name is the attribute name).
    labels:
      # Must be named datalayer.app.name: that is the attribute read by the
      # `resource` and `transform` processors to populate `app`. It was
      # previously emitted as app.name, which nothing consumed.
      - tag_name: datalayer.app.name
        key: datalayer.io/app
        from: pod
      - tag_name: datalayer.pool.name
        key: jupyterpool.datalayer.io/name
        from: pod
      - tag_name: datalayer.pool.status
        key: jupyterpool.datalayer.io/pod-status
        from: pod
      - tag_name: datalayer.pool.user
        key: jupyterpool.datalayer.io/user-uid
        from: pod
      - tag_name: datalayer.pool.type
        key: jupyterpool.datalayer.io/kernel-type
        from: pod
      - tag_name: datalayer.pool.reservation
        key: jupyterpool.datalayer.io/reservation-id
        from: pod
  passthrough: false
  # Ways to match incoming telemetry to a pod, listed in order:
  # pod IP attribute, pod UID attribute, then the connection itself.
  pod_association:
    - sources:
        - from: resource_attribute
          name: k8s.pod.ip
    - sources:
        - from: resource_attribute
          name: k8s.pod.uid
    - sources:
        - from: connection

# https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/resourceprocessor
resource:
  # Actions are listed in the order they are meant to run: the copies below
  # read k8s.* attributes, so the final delete has to stay last.
  attributes:
    # Short attribute names matching the label names used on the Prometheus metrics
    - action: upsert
      key: cluster
      value: '{{ .Values.observer.env.DATALAYER_RUN_HOST }}'
    - action: upsert
      key: namespace
      from_attribute: k8s.namespace.name
    - action: upsert
      key: pod
      from_attribute: k8s.pod.name
    - action: upsert
      key: instance
      from_attribute: k8s.pod.uid
    - action: upsert
      key: app
      from_attribute: datalayer.app.name
    # OpenTelemetry service resource attributes
    # https://opentelemetry.io/docs/specs/semconv/resource/#service
    # (service.name is not set here: the code instrumentation already sets it in traces)
    - action: upsert
      key: service.namespace
      from_attribute: k8s.namespace.name
    - action: upsert
      key: service.instance.id
      from_attribute: k8s.pod.uid
    # Remove the k8s.* attributes now that they have been copied, to limit duplication
    - action: delete
      pattern: 'k8s\..*'

transform:
  metric_statements:
    # Copy identifying resource attributes onto every datapoint so they become
    # metric labels (the original rationale: prometheus does not handle
    # resource attributes properly).
    - context: datapoint
      statements:
        # Static cluster name taken from the chart values
        - set(attributes["cluster"], "{{ .Values.observer.env.DATALAYER_RUN_HOST }}")
        # Kubernetes identity of the emitting pod
        - set(attributes["namespace"], resource.attributes["k8s.namespace.name"])
        - set(attributes["pod"], resource.attributes["k8s.pod.name"])
        - set(attributes["instance"], resource.attributes["k8s.pod.uid"])
        # Application and service identity
        - set(attributes["app"], resource.attributes["datalayer.app.name"])
        - set(attributes["service_name"], resource.attributes["service.name"])

exporters:
# https://github.com/open-telemetry/opentelemetry-collector/tree/main/exporter/debugexporter
Expand Down Expand Up @@ -58,10 +159,10 @@ spec:
pipelines:
traces:
receivers: [otlp]
processors: [memory_limiter, k8sattributes, batch]
processors: [memory_limiter, k8sattributes, resource, batch]
exporters: [otlp]
metrics:
receivers: [otlp]
processors: [memory_limiter, k8sattributes, batch]
processors: [memory_limiter, k8sattributes, transform, batch]
exporters: [prometheus]
{{- end }}
16 changes: 0 additions & 16 deletions charts/datalayer-observer/templates/kernels-instrumentation.yaml

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: {{ default .Values.observer.namespace .Release.Namespace }}-opentelemetry-collector
name: {{ default .Values.observer.namespace .Release.Namespace }}-jupyter-kernels
namespace: {{ default .Values.observer.namespace .Release.Namespace }}
labels:
app: opentelemetry-collector
app: jupyter-kernels
monitoring.datalayer.io/enabled: "true"
monitoring.datalayer.io/instance: "observer"
{{- with .Values.collector.serviceMonitor.additionalLabels }}
Expand All @@ -14,14 +14,14 @@ metadata:
spec:
selector:
matchLabels:
app.kubernetes.io/component: opentelemetry-collector
app.kubernetes.io/name: datalayer-collector-collector
datalayer.io/app: jupyterpool
endpoints:
- port: prometheus
# Jupyter kernels metrics monitoring
- targetPort: 2300
{{- if .Values.collector.serviceMonitor.interval }}
interval: {{ .Values.collector.serviceMonitor.interval }}
{{- end }}
namespaceSelector:
matchNames:
- {{ default .Values.observer.namespace .Release.Namespace }}
- datalayer-jupyter
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{{- if index .Values "opentelemetry-operator" "enabled" }}
# Cluster-wide read-only access (get/list/watch) to pods and replicasets.
# Rendered only when the "opentelemetry-operator" values section is enabled.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: datalayer-logs-collector
rules:
  # Pods (core API group)
  - apiGroups:
      - ""
    resources:
      - pods
    verbs:
      - get
      - watch
      - list
  # ReplicaSets, in both the "apps" and "extensions" API groups
  - apiGroups:
      - apps
    resources:
      - replicasets
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - extensions
    resources:
      - replicasets
    verbs:
      - get
      - list
      - watch
---
# Grants the ClusterRole above to the datalayer-logs-collector-collector
# service account in the observer namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: datalayer-logs-collector
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: datalayer-logs-collector
subjects:
  - kind: ServiceAccount
    name: datalayer-logs-collector-collector
    namespace: {{ default .Values.observer.namespace .Release.Namespace }}
{{- end }}
Loading

0 comments on commit 664f10c

Please sign in to comment.