Skip to content

Commit 118dc45

Browse files
committed
add prometheus server config
1 parent b53fc14 commit 118dc45

File tree

7 files changed

+361
-2
lines changed

7 files changed

+361
-2
lines changed

deploy/kubernetes/daemonset.yaml

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ apiVersion: apps/v1
22
kind: DaemonSet
33
metadata:
44
name: nvidia-gpu-monitoring-daemonset
5+
namespace: monitoring
56
labels:
67
k8s-app: nvidia-gpu-monitoring-daemonset
78
spec:
@@ -12,11 +13,14 @@ spec:
1213
metadata:
1314
labels:
1415
k8s-app: nvidia-gpu-monitoring-daemonset
16+
annotations:
17+
prometheus.io/extension: 'true'
18+
prometheus.io/port: '8080'
1519
spec:
1620
containers:
1721
- image: "gcr.io/dashpole-kubernetes-test/gpu-monitor:v2"
1822
name: gpu-monitor
19-
args: ["--socket=/podresources/kubelet.sock"]
23+
args: ["--socket=/podresources/kubelet.sock", "--v=10"]
2024
volumeMounts:
2125
- name: kubelet-podresources
2226
mountPath: /podresources
@@ -26,13 +30,17 @@ spec:
2630
mountPath: /home/kubernetes/bin/nvidia/lib64/
2731
securityContext:
2832
privileged: true
33+
ports:
34+
- name: http
35+
containerPort: 8080
36+
protocol: TCP
2937
env:
3038
- name: LD_LIBRARY_PATH
3139
value: "/home/kubernetes/bin/nvidia/lib64/"
3240
volumes:
3341
- name: kubelet-podresources
3442
hostPath:
35-
path: /var/lib/kubelet/pod-resources/
43+
path: /var/lib/kubelet/
3644
- name: dev
3745
hostPath:
3846
path: /dev

deploy/kubernetes/namespace.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
apiVersion: v1
2+
kind: Namespace
3+
metadata:
4+
name: monitoring
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Usage
2+
`# extra step for GKE`
3+
`EMAIL=your.google.cloud.email@example.org`
4+
`kubectl create clusterrolebinding prometheus-admin --clusterrole=cluster-admin --user=$EMAIL`
5+
6+
`# view prometheus console through service, after externalIP is created`
7+
`kubectl get svc --namespace=monitoring -w`
8+
9+
`# navigate to externalIP:9090`
Lines changed: 239 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,239 @@
1+
apiVersion: v1
2+
kind: ConfigMap
3+
data:
4+
prometheus.yaml: |
5+
# A scrape configuration for running Prometheus on a Kubernetes cluster.
6+
# This uses separate scrape configs for cluster components (i.e. API server, node)
7+
# and services to allow each to use different authentication configs.
8+
#
9+
# Kubernetes labels will be added as Prometheus labels on metrics via the
10+
# `labelmap` relabeling action.
11+
#
12+
# If you are using Kubernetes 1.7.2 or earlier, please take note of the comments
13+
# for the kubernetes-cadvisor job; you will need to edit or remove this job.
14+
15+
# Scrape config for API servers.
16+
#
17+
# Kubernetes exposes API servers as endpoints to the default/kubernetes
18+
# service so this uses the `endpoints` role and uses relabeling to only keep
19+
# the endpoints associated with the default/kubernetes service using the
20+
# default named port `https`. This works for single API server deployments as
21+
# well as HA API server deployments.
22+
scrape_configs:
23+
- job_name: 'kubernetes-apiservers'
24+
25+
kubernetes_sd_configs:
26+
- role: endpoints
27+
28+
# Default to scraping over https. If required, just disable this or change to
29+
# `http`.
30+
scheme: https
31+
32+
# This TLS & bearer token file config is used to connect to the actual scrape
33+
# endpoints for cluster components. This is separate to discovery auth
34+
# configuration because discovery & scraping are two separate concerns in
35+
# Prometheus. The discovery auth config is automatic if Prometheus runs inside
36+
# the cluster. Otherwise, more config options have to be provided within the
37+
# <kubernetes_sd_config>.
38+
tls_config:
39+
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
40+
# If your node certificates are self-signed or use a different CA to the
41+
# master CA, then disable certificate verification below. Note that
42+
# certificate verification is an integral part of a secure infrastructure
43+
# so this should only be disabled in a controlled environment. You can
44+
# disable certificate verification by uncommenting the line below.
45+
#
46+
# insecure_skip_verify: true
47+
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
48+
49+
# Keep only the default/kubernetes service endpoints for the https port. This
50+
# will add targets for each API server for which Kubernetes adds an endpoint to
51+
# the default/kubernetes service.
52+
relabel_configs:
53+
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
54+
action: keep
55+
regex: default;kubernetes;https
56+
57+
# Scrape config for nodes (kubelet).
58+
#
59+
# Rather than connecting directly to the node, the scrape is proxied through the
60+
# Kubernetes apiserver. This means it will work if Prometheus is running out of
61+
# cluster, or can't connect to nodes for some other reason (e.g. because of
62+
# firewalling).
63+
#- job_name: 'kubernetes-nodes'
64+
65+
# Default to scraping over https. If required, just disable this or change to
66+
# `http`.
67+
# scheme: https
68+
69+
# This TLS & bearer token file config is used to connect to the actual scrape
70+
# endpoints for cluster components. This is separate to discovery auth
71+
# configuration because discovery & scraping are two separate concerns in
72+
# Prometheus. The discovery auth config is automatic if Prometheus runs inside
73+
# the cluster. Otherwise, more config options have to be provided within the
74+
# <kubernetes_sd_config>.
75+
#tls_config:
76+
# ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
77+
#bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
78+
79+
#kubernetes_sd_configs:
80+
#- role: node
81+
82+
#relabel_configs:
83+
#- action: labelmap
84+
#regex: __meta_kubernetes_node_label_(.+)
85+
# - target_label: __address__
86+
#replacement: kubernetes.default.svc:443
87+
#- source_labels: [__meta_kubernetes_node_name]
88+
#regex: (.+)
89+
#target_label: __metrics_path__
90+
#replacement: /api/v1/nodes/${1}/proxy/metrics
91+
92+
# Scrape config for kubernetes monitoring extensions.
93+
#
94+
- job_name: 'kubernetes-monitoring-extension'
95+
96+
kubernetes_sd_configs:
97+
- role: pod
98+
99+
relabel_configs:
100+
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_extension]
101+
action: keep
102+
regex: true
103+
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
104+
action: replace
105+
target_label: __metrics_path__
106+
regex: (.+)
107+
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
108+
action: replace
109+
regex: ([^:]+)(?::\d+)?;(\d+)
110+
replacement: $1:$2
111+
target_label: __address__
112+
- source_labels: [__meta_kubernetes_pod_node_name]
113+
action: replace
114+
target_label: kubernetes_pod_node_name
115+
- source_labels: [__meta_kubernetes_namespace]
116+
action: keep
117+
regex: ^monitoring$
118+
119+
metric_relabel_configs:
120+
- source_labels: [pod_namespace, pod_name, container_name]
121+
action: keep
122+
regex: '.+;.+;.+'
123+
124+
# Scrape config for service endpoints.
125+
#
126+
# The relabeling allows the actual service scrape endpoint to be configured
127+
# via the following annotations:
128+
#
129+
# * `prometheus.io/scrape`: Only scrape services that have a value of `true`
130+
# * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
131+
# to set this to `https` & most likely set the `tls_config` of the scrape config.
132+
# * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
133+
# * `prometheus.io/port`: If the metrics are exposed on a different port to the
134+
# service then set this appropriately.
135+
- job_name: 'kubernetes-service-endpoints'
136+
137+
kubernetes_sd_configs:
138+
- role: endpoints
139+
140+
relabel_configs:
141+
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
142+
action: keep
143+
regex: true
144+
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
145+
action: replace
146+
target_label: __scheme__
147+
regex: (https?)
148+
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
149+
action: replace
150+
target_label: __metrics_path__
151+
regex: (.+)
152+
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
153+
action: replace
154+
target_label: __address__
155+
regex: ([^:]+)(?::\d+)?;(\d+)
156+
replacement: $1:$2
157+
- action: labelmap
158+
regex: __meta_kubernetes_service_label_(.+)
159+
- source_labels: [__meta_kubernetes_namespace]
160+
action: replace
161+
target_label: kubernetes_namespace
162+
- source_labels: [__meta_kubernetes_service_name]
163+
action: replace
164+
target_label: kubernetes_name
165+
166+
# Example scrape config for probing services via the Blackbox Exporter.
167+
#
168+
# The relabeling allows the actual service scrape endpoint to be configured
169+
# via the following annotations:
170+
#
171+
# * `prometheus.io/probe`: Only probe services that have a value of `true`
172+
- job_name: 'kubernetes-services'
173+
174+
metrics_path: /probe
175+
params:
176+
module: [http_2xx]
177+
178+
kubernetes_sd_configs:
179+
- role: service
180+
181+
relabel_configs:
182+
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
183+
action: keep
184+
regex: true
185+
- source_labels: [__address__]
186+
target_label: __param_target
187+
- target_label: __address__
188+
replacement: blackbox-exporter.example.com:9115
189+
- source_labels: [__param_target]
190+
target_label: instance
191+
- action: labelmap
192+
regex: __meta_kubernetes_service_label_(.+)
193+
- source_labels: [__meta_kubernetes_namespace]
194+
target_label: kubernetes_namespace
195+
- source_labels: [__meta_kubernetes_service_name]
196+
target_label: kubernetes_name
197+
198+
# Example scrape config for pods
199+
#
200+
# The relabeling allows the actual pod scrape endpoint to be configured via the
201+
# following annotations:
202+
#
203+
# * `prometheus.io/scrape`: Only scrape pods that have a value of `true`
204+
# * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
205+
# * `prometheus.io/port`: Scrape the pod on the indicated port instead of the
206+
# pod's declared ports (default is a port-free target if none are declared).
207+
- job_name: 'kubernetes-pods'
208+
209+
kubernetes_sd_configs:
210+
- role: pod
211+
212+
relabel_configs:
213+
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
214+
action: keep
215+
regex: true
216+
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
217+
action: replace
218+
target_label: __metrics_path__
219+
regex: (.+)
220+
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
221+
action: replace
222+
regex: ([^:]+)(?::\d+)?;(\d+)
223+
replacement: $1:$2
224+
target_label: __address__
225+
- action: labelmap
226+
regex: __meta_kubernetes_pod_label_(.+)
227+
- source_labels: [__meta_kubernetes_namespace]
228+
action: replace
229+
target_label: kubernetes_namespace
230+
- source_labels: [__meta_kubernetes_pod_name]
231+
action: replace
232+
target_label: kubernetes_pod_name
233+
- source_labels: [__meta_kubernetes_container_name]
234+
action: replace
235+
target_label: kubernetes_container_name
236+
metadata:
237+
creationTimestamp: null
238+
name: prometheus-core
239+
namespace: monitoring
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
apiVersion: extensions/v1beta1
2+
kind: Deployment
3+
metadata:
4+
name: prometheus-core
5+
namespace: monitoring
6+
labels:
7+
app: prometheus
8+
component: core
9+
spec:
10+
replicas: 1
11+
template:
12+
metadata:
13+
name: prometheus-main
14+
labels:
15+
app: prometheus
16+
component: core
17+
spec:
18+
serviceAccountName: prometheus-k8s
19+
containers:
20+
- name: prometheus
21+
image: prom/prometheus:v1.7.0
22+
args:
23+
- '-storage.local.retention=12h'
24+
- '-storage.local.memory-chunks=500000'
25+
- '-config.file=/etc/prometheus/prometheus.yaml'
26+
ports:
27+
- name: webui
28+
containerPort: 9090
29+
resources:
30+
requests:
31+
cpu: 500m
32+
memory: 500M
33+
limits:
34+
cpu: 500m
35+
memory: 500M
36+
volumeMounts:
37+
- name: config-volume
38+
mountPath: /etc/prometheus
39+
volumes:
40+
- name: config-volume
41+
configMap:
42+
name: prometheus-core
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
---
2+
apiVersion: rbac.authorization.k8s.io/v1beta1
3+
kind: ClusterRoleBinding
4+
metadata:
5+
name: prometheus
6+
roleRef:
7+
apiGroup: rbac.authorization.k8s.io
8+
kind: ClusterRole
9+
name: prometheus
10+
subjects:
11+
- kind: ServiceAccount
12+
name: prometheus-k8s
13+
namespace: monitoring
14+
---
15+
apiVersion: rbac.authorization.k8s.io/v1beta1
16+
kind: ClusterRole
17+
metadata:
18+
name: prometheus
19+
rules:
20+
- apiGroups: [""]
21+
resources:
22+
- nodes
23+
- services
24+
- endpoints
25+
- pods
26+
verbs: ["get", "list", "watch"]
27+
- apiGroups: [""]
28+
resources:
29+
- configmaps
30+
verbs: ["get"]
31+
- nonResourceURLs: ["/metrics"]
32+
verbs: ["get"]
33+
---
34+
apiVersion: v1
35+
kind: ServiceAccount
36+
metadata:
37+
name: prometheus-k8s
38+
namespace: monitoring
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
apiVersion: v1
2+
kind: Service
3+
metadata:
4+
name: prometheus
5+
namespace: monitoring
6+
labels:
7+
app: prometheus
8+
component: core
9+
annotations:
10+
prometheus.io/scrape: 'true'
11+
spec:
12+
type: LoadBalancer
13+
selector:
14+
app: prometheus
15+
component: core
16+
ports:
17+
- port: 9090
18+
protocol: TCP
19+
targetPort: 9090

0 commit comments

Comments
 (0)