Collects GPU capacity, reserved, and free metrics from Kubernetes nodes and KubeVirt VMIs.
This repository is provided for hobby and educational purposes only. If you plan to use it in a production environment, it will most likely require significant customization, testing, and security audits. Please fork and build your own application.
- Dynamic GPU type detection via node labels
- Metrics for total capacity, reserved, and free GPUs per node
- Flexible GPU device matching (can adjust mapping)
Add the following labels to GPU nodes:
When a node is labeled with
gpu-workload=false, it will be returned as 0.0.
kubectl label node node01 gpu-workload=true
kubectl label node node01 nvidia.com/NVIDIA-H200-SXM=8
kubectl label node node02 gpu-workload=true
kubectl label node node02 amd.com/INSTINCT-MI300X=8
kubectl label node node03 gpu-workload=true
kubectl label node node03 intel.com/DCGPU-MAX1550=8
kubevirt_gpu_capacity{gpu_type="nvidia.com/NVIDIA-H200-SXM",node="node01"} 8.0
kubevirt_gpu_capacity{gpu_type="amd.com/INSTINCT-MI300X",node="node02"} 8.0
kubevirt_gpu_capacity{gpu_type="intel.com/DCGPU-MAX1550",node="node03"} 8.0
kubevirt_gpu_total_cluster_capacity{gpu_type="nvidia.com/NVIDIA-H200-SXM"} 8.0
kubevirt_gpu_total_cluster_capacity{gpu_type="amd.com/INSTINCT-MI300X"} 8.0
kubevirt_gpu_total_cluster_capacity{gpu_type="intel.com/DCGPU-MAX1550"} 8.0
kubevirt_gpu_reserved{gpu_type="nvidia.com/NVIDIA-H200-SXM",node="node01"} 4.0
kubevirt_gpu_reserved{gpu_type="amd.com/INSTINCT-MI300X",node="node02"} 2.0
kubevirt_gpu_reserved{gpu_type="intel.com/DCGPU-MAX1550",node="node03"} 1.0
kubevirt_gpu_instance{gpu_type="nvidia.com/NVIDIA-H200-SXM",node="node01",namespace="production",instance="vm-gpu01",address="10.244.2.88"} 4.0
kubevirt_gpu_instance{gpu_type="amd.com/INSTINCT-MI300X",node="node02",namespace="development",instance="vm-gpu02",address="10.244.2.99"} 2.0
kubevirt_gpu_instance{gpu_type="intel.com/DCGPU-MAX1550",node="node03",namespace="testing",instance="vm-gpu03",address="10.244.2.66"} 1.0
kubevirt_gpu_free{gpu_type="nvidia.com/NVIDIA-H200-SXM",node="node01"} 4.0
kubevirt_gpu_free{gpu_type="amd.com/INSTINCT-MI300X",node="node02"} 6.0
kubevirt_gpu_free{gpu_type="intel.com/DCGPU-MAX1550",node="node03"} 7.0
kubectl apply -f manifests/rbac.yaml
kubectl apply -f manifests/service.yaml
kubectl apply -f manifests/deployment.yaml
kubectl apply -f manifests/serviceMonitor.yaml
Expose port 9100 to Prometheus.