Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve observability #21

Merged
merged 12 commits into from
Aug 9, 2024
1,183 changes: 1,183 additions & 0 deletions charts/datalayer-observer/dashboards/datalayer-service.json

Large diffs are not rendered by default.

2,796 changes: 2,796 additions & 0 deletions charts/datalayer-observer/dashboards/remote-kernels-details.json

Large diffs are not rendered by default.

2,317 changes: 2,317 additions & 0 deletions charts/datalayer-observer/dashboards/remote-kernels-overview.json

Large diffs are not rendered by default.

44 changes: 44 additions & 0 deletions charts/datalayer-observer/templates/collector-servicemonitor.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
{{- if .Values.collector.serviceMonitor.enabled }}
# ServiceMonitor for the OpenTelemetry collector.
# Two scrape targets are declared:
#   - "metrics":    the collector's own operational metrics (self-monitoring)
#   - "prometheus": the application metrics the collector has gathered and
#                   re-exposes through its Prometheus exporter
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: {{ default .Values.observer.namespace .Release.Namespace }}-opentelemetry-collector
  namespace: {{ default .Values.observer.namespace .Release.Namespace }}
  labels:
    app: opentelemetry-collector
    monitoring.datalayer.io/enabled: "true"
    monitoring.datalayer.io/instance: "observer"
    {{- with .Values.collector.serviceMonitor.additionalLabels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
spec:
  selector:
    matchLabels:
      # Labels set by the OpenTelemetry operator on the collector Service.
      app.kubernetes.io/component: opentelemetry-collector
      app.kubernetes.io/name: datalayer-collector-collector
  endpoints:
    # Collector metrics monitoring
    - port: metrics
      {{- if .Values.collector.serviceMonitor.interval }}
      interval: {{ .Values.collector.serviceMonitor.interval }}
      {{- end }}
    # Scraping of collected metrics
    - port: prometheus
      {{- if .Values.collector.serviceMonitor.interval }}
      interval: {{ .Values.collector.serviceMonitor.interval }}
      {{- end }}
      # Drop target-discovery labels that describe the collector proxy,
      # not the workload the metrics originated from.
      relabelings:
        - regex: (container|endpoint|namespace|pod|service)
          action: labeldrop
      # NOTE: relabeling rules are applied in order — drop the proxy's
      # job/instance first, then restore the original labels that Prometheus
      # prefixed with "exported_", then drop the now-redundant exported_* copies.
      metricRelabelings:
        - regex: (job|instance)
          action: labeldrop
        # Restore original label (hide the collector proxy)
        - regex: "exported_(.*)"
          action: "labelmap"
        - regex: "exported_(.*)"
          action: "labeldrop"
  namespaceSelector:
    matchNames:
      - {{ default .Values.observer.namespace .Release.Namespace }}
{{- end }}
113 changes: 107 additions & 6 deletions charts/datalayer-observer/templates/collector.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,18 @@ metadata:
affinity: {{- toYaml . | nindent 8 }}
{{- end }}
spec:
# The opentelemetry-collector chart is a good place to look for good configuration
# Reference: https://github.com/open-telemetry/opentelemetry-helm-charts/tree/main/charts/opentelemetry-collector
mode: deployment # This configuration is omittable.
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: role.datalayer.io/system
operator: In
values:
- "true"
config:
receivers:
otlp:
Expand All @@ -22,12 +33,102 @@ spec:
# https://github.com/open-telemetry/opentelemetry-collector/tree/main/processor/batchprocessor
batch: {}
# https://github.com/open-telemetry/opentelemetry-collector/tree/main/processor/memorylimiterprocessor
# Default memory limiter configuration for the collector based on k8s resource limits.
memory_limiter:
check_interval: 1s
limit_mib: 4000
spike_limit_mib: 800
# check_interval is the time between measurements of memory usage.
check_interval: 5s
# By default limit_mib is set to 80% of ".Values.resources.limits.memory"
limit_percentage: 80
# By default spike_limit_mib is set to 25% of ".Values.resources.limits.memory"
spike_limit_percentage: 25

# https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/k8sattributesprocessor
k8sattributes: {}
k8sattributes:
extract:
metadata:
- k8s.namespace.name
- k8s.deployment.name
- k8s.statefulset.name
- k8s.daemonset.name
- k8s.cronjob.name
- k8s.job.name
- k8s.node.name
- k8s.pod.name
- k8s.pod.uid
- k8s.pod.start_time
labels:
- tag_name: app.name
key: datalayer.io/app
from: pod
- tag_name: datalayer.pool.name
key: jupyterpool.datalayer.io/name
from: pod
- tag_name: datalayer.pool.status
key: jupyterpool.datalayer.io/pod-status
from: pod
- tag_name: datalayer.pool.user
key: jupyterpool.datalayer.io/user-uid
from: pod
- tag_name: datalayer.pool.type
key: jupyterpool.datalayer.io/kernel-type
from: pod
- tag_name: datalayer.pool.reservation
key: jupyterpool.datalayer.io/reservation-id
from: pod
passthrough: false
pod_association:
- sources:
- from: resource_attribute
name: k8s.pod.ip
- sources:
- from: resource_attribute
name: k8s.pod.uid
- sources:
- from: connection

# https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/resourceprocessor
resource:
attributes:
# Align resource attributes names with prometheus metrics attributes names
- key: cluster
action: upsert
value: '{{ .Values.observer.env.DATALAYER_RUN_HOST }}'
- key: namespace
action: upsert
from_attribute: k8s.namespace.name
- key: pod
action: upsert
from_attribute: k8s.pod.name
- key: instance
action: upsert
from_attribute: k8s.pod.uid
- key: app
action: upsert
from_attribute: datalayer.app.name
# Opentelemetry service specification attributes
# https://opentelemetry.io/docs/specs/semconv/resource/#service
# service.name is already set by the code instrumentation in traces
- key: service.namespace
action: upsert
from_attribute: k8s.namespace.name
- key: service.instance.id
action: upsert
from_attribute: k8s.pod.uid
# Reduce the attributes duplication
- action: delete
pattern: k8s\..*

transform:
metric_statements:
# Set metric labels from resource attribute as prometheus does not handle them properly
- context: datapoint
statements:
- set(attributes["cluster"], "{{ .Values.observer.env.DATALAYER_RUN_HOST }}")
- set(attributes["namespace"], resource.attributes["k8s.namespace.name"])
- set(attributes["pod"], resource.attributes["k8s.pod.name"])
- set(attributes["instance"], resource.attributes["k8s.pod.uid"])
- set(attributes["app"], resource.attributes["datalayer.app.name"])
- set(attributes["service_name"], resource.attributes["service.name"])

exporters:
# https://github.com/open-telemetry/opentelemetry-collector/tree/main/exporter/debugexporter
Expand Down Expand Up @@ -58,10 +159,10 @@ spec:
pipelines:
traces:
receivers: [otlp]
processors: [memory_limiter, k8sattributes, batch]
processors: [memory_limiter, k8sattributes, resource, batch]
exporters: [otlp]
metrics:
receivers: [otlp]
processors: [memory_limiter, k8sattributes, batch]
processors: [memory_limiter, k8sattributes, transform, batch]
exporters: [prometheus]
{{- end }}
16 changes: 0 additions & 16 deletions charts/datalayer-observer/templates/kernels-instrumentation.yaml

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: {{ default .Values.observer.namespace .Release.Namespace }}-opentelemetry-collector
name: {{ default .Values.observer.namespace .Release.Namespace }}-jupyter-kernels
namespace: {{ default .Values.observer.namespace .Release.Namespace }}
labels:
app: opentelemetry-collector
app: jupyter-kernels
monitoring.datalayer.io/enabled: "true"
monitoring.datalayer.io/instance: "observer"
{{- with .Values.collector.serviceMonitor.additionalLabels }}
Expand All @@ -14,14 +14,14 @@ metadata:
spec:
selector:
matchLabels:
app.kubernetes.io/component: opentelemetry-collector
app.kubernetes.io/name: datalayer-collector-collector
datalayer.io/app: jupyterpool
endpoints:
- port: prometheus
# Collector metrics monitoring
- targetPort: 2300
{{- if .Values.collector.serviceMonitor.interval }}
interval: {{ .Values.collector.serviceMonitor.interval }}
{{- end }}
namespaceSelector:
matchNames:
- {{ default .Values.observer.namespace .Release.Namespace }}
- datalayer-jupyter
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{{- if index .Values "opentelemetry-operator" "enabled" }}
# RBAC for the logs collector: read-only access to pods and replicasets
# cluster-wide so the collector can resolve Kubernetes metadata
# (pod -> replicaset -> deployment ownership) for the log records it ships.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: datalayer-logs-collector
rules:
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get", "watch", "list"]
  - apiGroups: ["apps"]
    resources: ["replicasets"]
    verbs: ["get", "list", "watch"]
  # NOTE(review): the "extensions" API group was removed in Kubernetes 1.16;
  # this rule is harmless but only needed for very old clusters — confirm
  # whether it can be dropped.
  - apiGroups: ["extensions"]
    resources: ["replicasets"]
    verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: datalayer-logs-collector
subjects:
  - kind: ServiceAccount
    # Service account created by the OpenTelemetry operator for the
    # "datalayer-logs-collector" collector (operator appends "-collector").
    name: datalayer-logs-collector-collector
    namespace: {{ default .Values.observer.namespace .Release.Namespace }}
roleRef:
  kind: ClusterRole
  name: datalayer-logs-collector
  apiGroup: rbac.authorization.k8s.io
{{- end }}
Loading
Loading