diff --git a/.gitignore b/.gitignore index e660fd9..0fe0276 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ bin/ +go/ +*.snap diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..78aab39 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,64 @@ +## Changelog + +Full commit history per tag: https://github.com/vpenso/prometheus-slurm-exporter/commits/{tag number} + +* **0.19** + - Merge PR#50 + +* **0.18** + - Add CPU/Memory info per node (see PR#47) + +* **0.17** + - Add fair share collector + +* **0.16** + - Export more data per account/partition, fix squeue for pending jobs + - Merge PR#34 + +* **0.15** + - CPU allocation status per partition + +* **0.14** + - add stats about jobs per account/per user + +* **0.13** + - Merge pull request #32 from pdtpartners/faster-node-metrics + +* **0.12** + - Merge pull request #30 from omnivector-solutions/add_snap_packaging + +* **0.11** + - Merge PR#29 + - Add more backfill stats (see PR#27) + +* **0.10** + - Scheduler: keep track of the DBD agent queue size + +* **0.9** + - README: update to fix build problem raised with issue #26 + +* **0.8** + - Merge pull request #21 from cleargray/command-paths + +* **0.7** + - Update scheduler.go (fix issue #18) + +* **0.6** + - Merge pull request #13 from rug-cit-hpc/master + +* **0.5** + - [BUG]: count all job states (issue #9) + +* **0.4** + - Merge pull request #8 from MatMaul/pending-dep + +* **0.3** + - Fix issue #4 + +* **0.2** + - Fix issue #3 + +* **0.1** + - Basic prototype + - Merge PR#2 + - Add Grafana dashboard diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md deleted file mode 100644 index ed1a43b..0000000 --- a/CONTRIBUTORS.md +++ /dev/null @@ -1,5 +0,0 @@ -# List of Contributors - -* [Victor Penso](https://github.com/vpenso) -* [Matteo Dessalvi](https://github.com/mtds) - diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md new file mode 100644 index 0000000..e6b3b93 --- /dev/null +++ b/DEVELOPMENT.md @@ -0,0 +1,56 @@ +# Development + +Setup the development 
environment on a node with access to the Slurm user +command-line interface, in particular with the `sinfo`, `squeue`, and `sdiag` +commands. + +## Install Go from the official binary release + +```bash +export VERSION=1.15 OS=linux ARCH=amd64 +wget https://dl.google.com/go/go$VERSION.$OS-$ARCH.tar.gz +tar -xzvf go$VERSION.$OS-$ARCH.tar.gz +export PATH=$PWD/go/bin:$PATH +``` + +_Alternatively install Go using the packaging system of your Linux distribution._ + +## Clone this repository and build + +Use Git to clone the source code of the exporter, run all the tests and build the binary: + +```bash +# clone the source code +git clone https://github.com/vpenso/prometheus-slurm-exporter.git +cd prometheus-slurm-exporter +make +``` + +To just run the tests: + +```bash +make test +``` + +Start the exporter (foreground), and query all metrics: + +```bash +./bin/prometheus-slurm-exporter +``` + +If you wish to run the exporter on a different port, or the default port (8080) is already in use, run with the following argument: + +```bash +./bin/prometheus-slurm-exporter --listen-address="0.0.0.0:<port>" +... 
+ +# query all metrics (default port) +curl http://localhost:8080/metrics +``` + +## References + +* [GOlang Package Documentation](https://godoc.org/github.com/prometheus/client_golang/prometheus) +* [Metric Types](https://prometheus.io/docs/concepts/metric_types/) +* [Writing Exporters](https://prometheus.io/docs/instrumenting/writing_exporters/) +* [Available Exporters](https://prometheus.io/docs/instrumenting/exporters/) diff --git a/Makefile b/Makefile index 1c2ac7e..d7d24bc 100644 --- a/Makefile +++ b/Makefile @@ -1,20 +1,28 @@ PROJECT_NAME = prometheus-slurm-exporter -ifndef GOPATH - GOPATH=$(shell pwd):/usr/share/gocode -endif -GOFILES=cpus.go main.go nodes.go queue.go scheduler.go -GOBIN=bin/$(PROJECT_NAME) +SHELL := $(shell which bash) -eu -o pipefail -build: - mkdir -p $(shell pwd)/bin - @echo "Build $(GOFILES) to $(GOBIN)" - @GOPATH=$(GOPATH) go build -o $(GOBIN) $(GOFILES) +GOPATH := $(shell pwd)/go/modules +GOBIN := bin/$(PROJECT_NAME) +GOFILES := $(shell ls *.go) -test: - @GOPATH=$(GOPATH) go test -v *.go +.PHONY: build +build: test $(GOBIN) -run: - @GOPATH=$(GOPATH) go run $(GOFILES) +$(GOBIN): go/modules/pkg/mod $(GOFILES) + mkdir -p bin + @echo "Building $(GOBIN)" + go build -v -o $(GOBIN) + +go/modules/pkg/mod: go.mod + go mod download + +.PHONY: test +test: go/modules/pkg/mod $(GOFILES) + go test -v + +run: $(GOBIN) + $(GOBIN) clean: - if [ -f ${GOBIN} ] ; then rm -f ${GOBIN} ; fi + go clean -modcache + rm -fr bin/ go/ diff --git a/README.md b/README.md index 1d1938b..5bfe42a 100644 --- a/README.md +++ b/README.md @@ -11,9 +11,26 @@ Prometheus collector and exporter for metrics extracted from the [Slurm](https:/ * **Other**: CPUs which are unavailable for use at the moment. * **Total**: total number of CPUs. -- [Information extracted from the SLURM **sinfo** command](https://slurm.schedmd.com/sinfo.html) +- Information extracted from the SLURM [**sinfo**](https://slurm.schedmd.com/sinfo.html) command. 
- [Slurm CPU Management User and Administrator Guide](https://slurm.schedmd.com/cpu_management.html) +### State of the GPUs + +* **Allocated**: GPUs which have been allocated to a job. +* **Other**: GPUs which are unavailable for use at the moment. +* **Total**: total number of GPUs. +* **Utilization**: total GPU utilization on the cluster. + +- Information extracted from the SLURM [**sinfo**](https://slurm.schedmd.com/sinfo.html) and [**sacct**](https://slurm.schedmd.com/sacct.html) command. +- [Slurm GRES scheduling](https://slurm.schedmd.com/gres.html) + +**NOTE**: since version **0.19**, GPU accounting has to be **explicitly** enabled by adding the _-gpus-acct_ option to the command line otherwise it will not be activated. + +Be aware that: + +* According to issue #38, users reported that newer versions of Slurm provide slightly different output and thus GPUs accounting may not work properly. +* Users who do not have GPUs and/or do not have accounting activated may want to keep GPUs accounting **off** (see issue #45). + ### State of the Nodes * **Allocated**: nodes which has been allocated to one or more jobs. @@ -29,12 +46,22 @@ Prometheus collector and exporter for metrics extracted from the [Slurm](https:/ * **Mixed**: nodes which have some of their CPUs ALLOCATED while others are IDLE. * **Resv**: these nodes are in an advanced reservation and not generally available. -[Information extracted from the SLURM **sinfo** command](https://slurm.schedmd.com/sinfo.html) +- Information extracted from the SLURM [**sinfo**](https://slurm.schedmd.com/sinfo.html) command. + +#### Additional info about node usage + +Since version **0.18**, the following information is also extracted and exported for **every** node known by Slurm: + +* CPUs: how many are _allocated_, _idle_, _other_ and in _total_. +* Memory: _allocated_ and in _total_. +* Labels: hostname and its Slurm status (e.g. _idle_, _mix_, _allocated_, _draining_, etc.). 
+ +See the related [test data](https://github.com/vpenso/prometheus-slurm-exporter/blob/master/test_data/sinfo_mem.txt) to check the format of the information extracted from Slurm. ### Status of the Jobs * **PENDING**: Jobs awaiting for resource allocation. -* **PENDING_DEPENDENCY**: Jobs awaiting because of a unexecuted job dependency. +* **PENDING_DEPENDENCY**: Jobs awaiting because of an unexecuted job dependency. * **RUNNING**: Jobs currently allocated. * **SUSPENDED**: Job has an allocation but execution has been suspended and CPUs have been released for other jobs. * **CANCELLED**: Jobs which were explicitly cancelled by the user or system administrator. @@ -46,11 +73,23 @@ Prometheus collector and exporter for metrics extracted from the [Slurm](https:/ * **PREEMPTED**: Jobs terminated due to preemption. * **NODE_FAIL**: Jobs terminated due to failure of one or more allocated nodes. -[Information extracted from the SLURM **squeue** command](https://slurm.schedmd.com/squeue.html) +- Information extracted from the SLURM [**squeue**](https://slurm.schedmd.com/squeue.html) command. + +### State of the Partitions + +* Running/suspended Jobs per partition, divided between Slurm accounts and users. +* CPUs total/allocated/idle per partition plus used CPU per user ID. + +### Jobs information per Account and User + +The following information about jobs is also extracted via [squeue](https://slurm.schedmd.com/squeue.html): + +* **Running/Pending/Suspended** jobs per SLURM Account. +* **Running/Pending/Suspended** jobs per SLURM User. ### Scheduler Information -* **Server Thread count**: The number of current active ``slurmctld`` threads. +* **Server Thread count**: The number of current active ``slurmctld`` threads. * **Queue size**: The length of the scheduler queue. * **DBD Agent queue size**: The length of the message queue for _SlurmDBD_. * **Last cycle**: Time in microseconds for last scheduling cycle. 
@@ -59,185 +98,32 @@ Prometheus collector and exporter for metrics extracted from the [Slurm](https:/ * **(Backfill) Last cycle**: Time in microseconds of last backfilling cycle. * **(Backfill) Mean cycle**: Mean of backfilling scheduling cycles in microseconds since last reset. * **(Backfill) Depth mean**: Mean of processed jobs during backfilling scheduling cycles since last reset. +* **(Backfill) Total Backfilled Jobs** (since last slurm start): number of jobs started thanks to backfilling since last Slurm start. +* **(Backfill) Total Backfilled Jobs** (since last stats cycle start): number of jobs started thanks to backfilling since last time stats were reset. +* **(Backfill) Total backfilled heterogeneous Job components**: number of heterogeneous job components started thanks to backfilling since last Slurm start. -[Information extracted from the SLURM **sdiag** command](https://slurm.schedmd.com/sdiag.html) +- Information extracted from the SLURM [**sdiag**](https://slurm.schedmd.com/sdiag.html) command. *DBD Agent queue size*: it is particularly important to keep track of it, since an increasing number of messages counted with this parameter almost always indicates three issues: -* the _SlurmDBD_ daemon is down; +* the _SlurmDBD_ daemon is down; * the database is either down or unreachable; * the status of the Slurm accounting DB may be inconsistent (e.g. ``sreport`` missing data, weird utilization of the cluster, etc.). -## How to build an RPM package from the relases - -Consult the [following document](packaging/rpm/README.md) under the ``packaging/rpm`` subdirectory. - -## How to build the exporter from the sources - -### Debian - -Install the Prometheus [Go client library](https://github.com/prometheus/client_golang) - - >>> apt install golang-github-prometheus-client-golang-dev - -Use the [Makefile](Makefile) to build and test the code. 
- -**Debian Jessie**: in this release, the Prometheus client library package was available only through the backport archives but the Debian maintainers discontinued it, as explained [here](https://lists.debian.org/debian-backports-announce/2018/07/msg00000.html). Now only __Debian Stretch__ is supported with the previous build method. - -### CentOS +### Share Information -Under CentOS not all the GOlang dependencies are available as packages. +Collect _share_ statistics for every Slurm account. Refer to the [manpage of the sshare command](https://slurm.schedmd.com/sshare.html) to get more information. -**GOPATH**: Since ``go`` version _1.13_ it is better to host the modules in a separate directory otherwise this will generate an error message: _$GOPATH/go.mod exists but should not_ - -In order to use the [Makefile](Makefile) provided with this repository you can proceed as follows: - -1. Install the Golang compiler plus GIT and make: -```bash -yum install git golang-bin make -``` - -2. Clone this repo and change into the source directory: -```bash -git clone https://github.com/vpenso/prometheus-slurm-exporter.git -cd prometheus-slurm-exporter -``` - -3. 
Build a module cache to host the necessary Golang dependencies using the [Go modules](https://blog.golang.org/using-go-modules): -```bash -GOPATH=/tmp/go-modules-cache go mod download -go: finding github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 -go: finding github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4 -go: finding github.com/beorn7/perks v1.0.1 -go: finding github.com/cespare/xxhash/v2 v2.1.0 -go: finding github.com/davecgh/go-spew v1.1.1 -go: finding github.com/go-kit/kit v0.9.0 -go: finding github.com/go-logfmt/logfmt v0.4.0 -go: finding github.com/go-stack/stack v1.8.0 -go: finding github.com/gogo/protobuf v1.1.1 -go: finding github.com/golang/protobuf v1.3.2 -go: finding github.com/google/go-cmp v0.3.0 -go: finding github.com/google/gofuzz v1.0.0 -go: finding github.com/json-iterator/go v1.1.7 -go: finding github.com/julienschmidt/httprouter v1.2.0 -go: finding github.com/konsorten/go-windows-terminal-sequences v1.0.1 -go: finding github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515 -go: finding github.com/matttproud/golang_protobuf_extensions v1.0.1 -go: finding github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd -go: finding github.com/modern-go/reflect2 v1.0.1 -go: finding github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223 -go: finding github.com/pkg/errors v0.8.1 -go: finding github.com/pmezard/go-difflib v1.0.0 -go: finding github.com/prometheus/client_golang v1.2.1 -go: finding github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4 -go: finding github.com/prometheus/common v0.7.0 -go: finding github.com/prometheus/procfs v0.0.5 -go: finding github.com/sirupsen/logrus v1.4.2 -go: finding github.com/stretchr/objx v0.1.1 -go: finding github.com/stretchr/testify v1.3.0 -go: finding golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2 -go: finding golang.org/x/net v0.0.0-20190613194153-d28f0bde5980 -go: finding golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4 -go: 
finding golang.org/x/sys v0.0.0-20191010194322-b09406accb47 -go: finding golang.org/x/text v0.3.0 -go: finding gopkg.in/alecthomas/kingpin.v2 v2.2.6 -go: finding gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 -go: finding gopkg.in/yaml.v2 v2.2.2 -``` - -4. Build the executable binary: -```bash -go build -go: downloading github.com/prometheus/client_golang v1.2.1 -go: downloading github.com/prometheus/common v0.7.0 -go: extracting github.com/prometheus/common v0.7.0 -go: downloading github.com/sirupsen/logrus v1.4.2 -go: downloading gopkg.in/alecthomas/kingpin.v2 v2.2.6 -go: extracting github.com/prometheus/client_golang v1.2.1 -go: downloading github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4 -go: downloading github.com/beorn7/perks v1.0.1 -go: downloading github.com/prometheus/procfs v0.0.5 -go: downloading github.com/cespare/xxhash/v2 v2.1.0 -go: downloading github.com/golang/protobuf v1.3.2 -go: downloading github.com/matttproud/golang_protobuf_extensions v1.0.1 -go: extracting github.com/beorn7/perks v1.0.1 -go: extracting gopkg.in/alecthomas/kingpin.v2 v2.2.6 -go: downloading github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4 -go: downloading github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 -go: extracting github.com/sirupsen/logrus v1.4.2 -go: extracting github.com/cespare/xxhash/v2 v2.1.0 -go: downloading golang.org/x/sys v0.0.0-20191010194322-b09406accb47 -go: extracting github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4 -go: extracting github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4 -go: extracting github.com/matttproud/golang_protobuf_extensions v1.0.1 -go: extracting github.com/prometheus/procfs v0.0.5 -go: extracting github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 -go: extracting github.com/golang/protobuf v1.3.2 -go: extracting golang.org/x/sys v0.0.0-20191010194322-b09406accb47 -go: finding github.com/prometheus/client_golang v1.2.1 -go: finding 
github.com/prometheus/common v0.7.0 -go: finding github.com/sirupsen/logrus v1.4.2 -go: finding gopkg.in/alecthomas/kingpin.v2 v2.2.6 -go: finding github.com/beorn7/perks v1.0.1 -go: finding github.com/cespare/xxhash/v2 v2.1.0 -go: finding github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4 -go: finding github.com/golang/protobuf v1.3.2 -go: finding github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 -go: finding golang.org/x/sys v0.0.0-20191010194322-b09406accb47 -go: finding github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4 -go: finding github.com/prometheus/procfs v0.0.5 -go: finding github.com/matttproud/golang_protobuf_extensions v1.0.1 -``` - -5. Run the test ( **optional** ): if Slurm command line tools (``sinfo``, ``squeue``, etc.) are not available the test will fail! -```bash -GOPATH=/tmp/gopath-for-cache make test -=== RUN TestCPUsMetrics ---- PASS: TestCPUsMetrics (0.00s) - cpus_test.go:29: &{alloc:5725 idle:877 other:34 total:6636} -=== RUN TestCPUssGetMetrics ---- PASS: TestCPUssGetMetrics (0.01s) - cpus_test.go:33: &{alloc:18956 idle:7852 other:12408 total:39216} -=== RUN TestNodesMetrics ---- PASS: TestNodesMetrics (0.03s) - nodes_test.go:29: &{alloc:250 comp:0 down:67 drain:28 err:0 fail:1 idle:319 maint:0 mix:44 resv:0} -=== RUN TestNodesGetMetrics ---- PASS: TestNodesGetMetrics (0.10s) - nodes_test.go:33: &{alloc:328 comp:0 down:230 drain:66 err:0 fail:0 idle:53 maint:0 mix:71 resv:0} -=== RUN TestParseQueueMetrics ---- PASS: TestParseQueueMetrics (0.01s) - queue_test.go:29: &{pending:4 pending_dep:0 running:28 suspended:1 cancelled:1 completing:2 completed:1 configuring:1 failed:1 timeout:1 preempted:1 node_fail:1} -=== RUN TestQueueGetMetrics ---- PASS: TestQueueGetMetrics (0.28s) - queue_test.go:33: &{pending:8280 pending_dep:3 running:7132 suspended:0 cancelled:1 completing:0 completed:180 configuring:0 failed:245 timeout:2 preempted:0 node_fail:0} -=== RUN TestSchedulerMetrics ---- PASS: 
TestSchedulerMetrics (0.02s) - scheduler_test.go:29: &{threads:3 queue_size:0 last_cycle:97209 mean_cycle:74593 cycle_per_minute:63 backfill_last_cycle:1.94289e+06 backfill_mean_cycle:1.96082e+06 backfill_depth_mean:29324} -=== RUN TestSchedulerGetMetrics ---- PASS: TestSchedulerGetMetrics (0.03s) - scheduler_test.go:33: &{threads:3 queue_size:0 last_cycle:20982 mean_cycle:32874 cycle_per_minute:23 backfill_last_cycle:991389 backfill_mean_cycle:1.7385e+06 backfill_depth_mean:11320} -PASS -ok github.com/vpenso/prometheus-slurm-exporter 0.495s -``` - -## Command line options +## Installation -The following is the list of the command line options available on this exporter: +* Read [DEVELOPMENT.md](DEVELOPMENT.md) in order to build the Prometheus Slurm Exporter. After a successful build copy the executable +`bin/prometheus-slurm-exporter` to a node with access to the Slurm command-line interface. -```bash -:~$ prometheus-slurm-exporter -h -Usage of ./prometheus-slurm-exporter: - -listen-address string - The address to listen on for HTTP requests. (default ":8080") - -log.format value - Set the log target and format. Example: "logger:syslog?appname=bob&local=7" or "logger:stdout?json=true" (default "logger:stderr") - -log.level value - Only log messages with the given severity or above. Valid levels: [debug, info, warn, error, fatal] (default "info") -``` +* A [Systemd Unit][sdu] file to run the executable as service is available in [lib/systemd/prometheus-slurm-exporter.service](lib/systemd/prometheus-slurm-exporter.service). -## Installation +* (**optional**) Distribute the exporter as a Snap package: consult the [following document](packages/snap/README.md). **NOTE**: this method requires the use of [Snap](https://snapcraft.io), which is built by [Canonical](https://canonical.com). -After successfully ran ``make``, you will have a binary called ``prometheus-slurm-exporter`` under the ``bin/`` subdirectory in your local copy of this repository. 
You can now copy this binary wherever you have installed the Slurm utilities (sinfo,squeue, sdiag) and then put it into execution, either interactively or through a Systemd unit (an example is available [here](lib/systemd/prometheus-slurm-exporter.service)). +[sdu]: https://www.freedesktop.org/software/systemd/man/systemd.service.html ## Prometheus Configuration for the SLURM exporter @@ -248,7 +134,7 @@ scrape_configs: # # SLURM resource manager: -# +# - job_name: 'my_slurm_exporter' scrape_interval: 30s @@ -262,7 +148,7 @@ scrape_configs: * **scrape_interval**: a 30 seconds interval will avoid possible 'overloading' on the SLURM master due to frequent calls of sdiag/squeue/sinfo commands through the exporter. * **scrape_timeout**: on a busy SLURM master a too short scraping timeout will abort the communication from the Prometheus server toward the exporter, thus generating a ``context_deadline_exceeded`` error. -The previous configuration file can be immediately used with a fresh installation of Promethues. At the same time, we highly recommend to include at least the ``global`` section into the configuration. Official documentation about __configuring Prometheus__ is [available here](https://prometheus.io/docs/prometheus/latest/configuration/configuration/). +The previous configuration file can be immediately used with a fresh installation of Prometheus. At the same time, we highly recommend to include at least the ``global`` section into the configuration. Official documentation about __configuring Prometheus__ is [available here](https://prometheus.io/docs/prometheus/latest/configuration/configuration/). **NOTE**: the Prometheus server is using __YAML__ as format for its configuration file, thus **indentation** is really important. 
Before reloading the Prometheus server it would be better to check the syntax: @@ -276,30 +162,22 @@ Checking prometheus.yml ## Grafana Dashboard -A [dashboard](https://grafana.com/dashboards/4323) is available in order to visualize the exported metrics through [Grafana](https://grafana.com). - -The following are screenshots of the dashboard: +A [dashboard](https://grafana.com/dashboards/4323) is available in order to +visualize the exported metrics through [Grafana](https://grafana.com): ![Status of the Nodes](images/Node_Status.png) -![Status of the Jobs](images/Job_Status.png) -![SLURM Scheduler Information](images/Scheduler_Info.png) -## Prometheus references +![Status of the Jobs](images/Job_Status.png) -* [GOlang Package Documentation](https://godoc.org/github.com/prometheus/client_golang/prometheus) -* [Metric Types](https://prometheus.io/docs/concepts/metric_types/) -* [Writing Exporters](https://prometheus.io/docs/instrumenting/writing_exporters/) -* [Available Exporters](https://prometheus.io/docs/instrumenting/exporters/) +![SLURM Scheduler Information](images/Scheduler_Info.png) ## License -Copyright 2017 Victor Penso, Matteo Dessalvi +Copyright 2017-2020 Victor Penso, Matteo Dessalvi This is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/. 
- - diff --git a/accounts.go b/accounts.go new file mode 100644 index 0000000..2bc4660 --- /dev/null +++ b/accounts.go @@ -0,0 +1,121 @@ +/* Copyright 2020 Victor Penso + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . */ + +package main + +import ( + "io/ioutil" + "os/exec" + "log" + "strings" + "strconv" + "regexp" + "github.com/prometheus/client_golang/prometheus" +) + +func AccountsData() []byte { + cmd := exec.Command("squeue","-a","-r","-h","-o %A|%a|%T|%C") + stdout, err := cmd.StdoutPipe() + if err != nil { + log.Fatal(err) + } + if err := cmd.Start(); err != nil { + log.Fatal(err) + } + out, _ := ioutil.ReadAll(stdout) + if err := cmd.Wait(); err != nil { + log.Fatal(err) + } + return out +} + +type JobMetrics struct { + pending float64 + running float64 + running_cpus float64 + suspended float64 +} + +func ParseAccountsMetrics(input []byte) map[string]*JobMetrics { + accounts := make(map[string]*JobMetrics) + lines := strings.Split(string(input), "\n") + for _, line := range lines { + if strings.Contains(line,"|") { + account := strings.Split(line,"|")[1] + _,key := accounts[account] + if !key { + accounts[account] = &JobMetrics{0,0,0,0} + } + state := strings.Split(line,"|")[2] + state = strings.ToLower(state) + cpus,_ := strconv.ParseFloat(strings.Split(line,"|")[3],64) + pending := regexp.MustCompile(`^pending`) + running := regexp.MustCompile(`^running`) + suspended := regexp.MustCompile(`^suspended`) + switch { + 
case pending.MatchString(state) == true: + accounts[account].pending++ + case running.MatchString(state) == true: + accounts[account].running++ + accounts[account].running_cpus += cpus + case suspended.MatchString(state) == true: + accounts[account].suspended++ + } + } + } + return accounts +} + +type AccountsCollector struct { + pending *prometheus.Desc + running *prometheus.Desc + running_cpus *prometheus.Desc + suspended *prometheus.Desc +} + +func NewAccountsCollector() *AccountsCollector { + labels := []string{"account"} + return &AccountsCollector{ + pending: prometheus.NewDesc("slurm_account_jobs_pending", "Pending jobs for account", labels, nil), + running: prometheus.NewDesc("slurm_account_jobs_running", "Running jobs for account", labels, nil), + running_cpus: prometheus.NewDesc("slurm_account_cpus_running", "Running cpus for account", labels, nil), + suspended: prometheus.NewDesc("slurm_account_jobs_suspended", "Suspended jobs for account", labels, nil), + } +} + +func (ac *AccountsCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- ac.pending + ch <- ac.running + ch <- ac.running_cpus + ch <- ac.suspended +} + +func (ac *AccountsCollector) Collect(ch chan<- prometheus.Metric) { + am := ParseAccountsMetrics(AccountsData()) + for a := range am { + if am[a].pending > 0 { + ch <- prometheus.MustNewConstMetric(ac.pending, prometheus.GaugeValue, am[a].pending, a) + } + if am[a].running > 0 { + ch <- prometheus.MustNewConstMetric(ac.running, prometheus.GaugeValue, am[a].running, a) + } + if am[a].running_cpus > 0 { + ch <- prometheus.MustNewConstMetric(ac.running_cpus, prometheus.GaugeValue, am[a].running_cpus, a) + } + if am[a].suspended > 0 { + ch <- prometheus.MustNewConstMetric(ac.suspended, prometheus.GaugeValue, am[a].suspended, a) + } + } +} diff --git a/go.mod b/go.mod index a0210a2..fb8cd67 100644 --- a/go.mod +++ b/go.mod @@ -5,4 +5,5 @@ go 1.12 require ( github.com/prometheus/client_golang v1.2.1 github.com/prometheus/common v0.7.0 + 
github.com/stretchr/testify v1.3.0 ) diff --git a/gpus.go b/gpus.go new file mode 100644 index 0000000..ca3bcaf --- /dev/null +++ b/gpus.go @@ -0,0 +1,141 @@ +/* Copyright 2020 Joeri Hermans, Victor Penso, Matteo Dessalvi + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . */ + +package main + +import ( + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/log" + "io/ioutil" + "os/exec" + "strings" + "strconv" +) + +type GPUsMetrics struct { + alloc float64 + idle float64 + total float64 + utilization float64 +} + +func GPUsGetMetrics() *GPUsMetrics { + return ParseGPUsMetrics() +} + +func ParseAllocatedGPUs() float64 { + var num_gpus = 0.0 + + args := []string{"-a", "-X", "--format=Allocgres", "--state=RUNNING", "--noheader", "--parsable2"} + output := string(Execute("sacct", args)) + if len(output) > 0 { + for _, line := range strings.Split(output, "\n") { + if len(line) > 0 { + line = strings.Trim(line, "\"") + descriptor := strings.TrimPrefix(line, "gpu:") + job_gpus, _ := strconv.ParseFloat(descriptor, 64) + num_gpus += job_gpus + } + } + } + + return num_gpus +} + +func ParseTotalGPUs() float64 { + var num_gpus = 0.0 + + args := []string{"-h", "-o \"%n %G\""} + output := string(Execute("sinfo", args)) + if len(output) > 0 { + for _, line := range strings.Split(output, "\n") { + if len(line) > 0 { + line = strings.Trim(line, "\"") + descriptor := strings.Fields(line)[1] + descriptor = 
strings.TrimPrefix(descriptor, "gpu:") + descriptor = strings.Split(descriptor, "(")[0] + node_gpus, _ := strconv.ParseFloat(descriptor, 64) + num_gpus += node_gpus + } + } + } + + return num_gpus +} + +func ParseGPUsMetrics() *GPUsMetrics { + var gm GPUsMetrics + total_gpus := ParseTotalGPUs() + allocated_gpus := ParseAllocatedGPUs() + gm.alloc = allocated_gpus + gm.idle = total_gpus - allocated_gpus + gm.total = total_gpus + gm.utilization = allocated_gpus / total_gpus + return &gm +} + +// Execute the sinfo command and return its output +func Execute(command string, arguments []string) []byte { + cmd := exec.Command(command, arguments...) + stdout, err := cmd.StdoutPipe() + if err != nil { + log.Fatal(err) + } + if err := cmd.Start(); err != nil { + log.Fatal(err) + } + out, _ := ioutil.ReadAll(stdout) + if err := cmd.Wait(); err != nil { + log.Fatal(err) + } + return out +} + +/* + * Implement the Prometheus Collector interface and feed the + * Slurm scheduler metrics into it. + * https://godoc.org/github.com/prometheus/client_golang/prometheus#Collector + */ + +func NewGPUsCollector() *GPUsCollector { + return &GPUsCollector{ + alloc: prometheus.NewDesc("slurm_gpus_alloc", "Allocated GPUs", nil, nil), + idle: prometheus.NewDesc("slurm_gpus_idle", "Idle GPUs", nil, nil), + total: prometheus.NewDesc("slurm_gpus_total", "Total GPUs", nil, nil), + utilization: prometheus.NewDesc("slurm_gpus_utilization", "Total GPU utilization", nil, nil), + } +} + +type GPUsCollector struct { + alloc *prometheus.Desc + idle *prometheus.Desc + total *prometheus.Desc + utilization *prometheus.Desc +} + +// Send all metric descriptions +func (cc *GPUsCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- cc.alloc + ch <- cc.idle + ch <- cc.total + ch <- cc.utilization +} +func (cc *GPUsCollector) Collect(ch chan<- prometheus.Metric) { + cm := GPUsGetMetrics() + ch <- prometheus.MustNewConstMetric(cc.alloc, prometheus.GaugeValue, cm.alloc) + ch <- 
prometheus.MustNewConstMetric(cc.idle, prometheus.GaugeValue, cm.idle) + ch <- prometheus.MustNewConstMetric(cc.total, prometheus.GaugeValue, cm.total) + ch <- prometheus.MustNewConstMetric(cc.utilization, prometheus.GaugeValue, cm.utilization) +} diff --git a/main.go b/main.go index 2e45d2e..48291fc 100644 --- a/main.go +++ b/main.go @@ -1,4 +1,4 @@ -/* Copyright 2017 Victor Penso, Matteo Dessalvi +/* Copyright 2017-2020 Victor Penso, Matteo Dessalvi, Joeri Hermans This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -25,10 +25,15 @@ import ( func init() { // Metrics have to be registered to be exposed - prometheus.MustRegister(NewSchedulerCollector()) // from scheduler.go - prometheus.MustRegister(NewQueueCollector()) // from queue.go - prometheus.MustRegister(NewNodesCollector()) // from nodes.go - prometheus.MustRegister(NewCPUsCollector()) // from cpus.go + prometheus.MustRegister(NewAccountsCollector()) // from accounts.go + prometheus.MustRegister(NewCPUsCollector()) // from cpus.go + prometheus.MustRegister(NewNodesCollector()) // from nodes.go + prometheus.MustRegister(NewNodeCollector()) // from node.go + prometheus.MustRegister(NewPartitionsCollector()) // from partitions.go + prometheus.MustRegister(NewQueueCollector()) // from queue.go + prometheus.MustRegister(NewSchedulerCollector()) // from scheduler.go + prometheus.MustRegister(NewFairShareCollector()) // from sshare.go + prometheus.MustRegister(NewUsersCollector()) // from users.go } var listenAddress = flag.String( @@ -36,11 +41,23 @@ var listenAddress = flag.String( ":8080", "The address to listen on for HTTP requests.") +var gpuAcct = flag.Bool( + "gpus-acct", + false, + "Enable GPUs accounting") + func main() { flag.Parse() + + // Turn on GPUs accounting only if the corresponding command line option is set to true. 
+ if *gpuAcct { + prometheus.MustRegister(NewGPUsCollector()) // from gpus.go + } + // The Handler function provides a default handler to expose metrics // via an HTTP server. "/metrics" is the usual endpoint for that. log.Infof("Starting Server: %s", *listenAddress) + log.Infof("GPUs Accounting: %t", *gpuAcct) http.Handle("/metrics", promhttp.Handler()) log.Fatal(http.ListenAndServe(*listenAddress, nil)) } diff --git a/node.go b/node.go new file mode 100644 index 0000000..bf2f759 --- /dev/null +++ b/node.go @@ -0,0 +1,137 @@ +/* Copyright 2021 Chris Read + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . 
*/ + +package main + +import ( + "log" + "os/exec" + "sort" + "strconv" + "strings" + + "github.com/prometheus/client_golang/prometheus" +) + +// NodeMetrics stores metrics for each node +type NodeMetrics struct { + memAlloc uint64 + memTotal uint64 + cpuAlloc uint64 + cpuIdle uint64 + cpuOther uint64 + cpuTotal uint64 + nodeStatus string +} + +func NodeGetMetrics() map[string]*NodeMetrics { + return ParseNodeMetrics(NodeData()) +} + +// ParseNodeMetrics takes the output of sinfo with node data +// It returns a map of metrics per node +func ParseNodeMetrics(input []byte) map[string]*NodeMetrics { + nodes := make(map[string]*NodeMetrics) + lines := strings.Split(string(input), "\n") + + // Sort and remove all the duplicates from the 'sinfo' output + sort.Strings(lines) + linesUniq := RemoveDuplicates(lines) + + for _, line := range linesUniq { + node := strings.Fields(line) + nodeName := node[0] + nodeStatus := node[4] // mixed, allocated, etc. + + nodes[nodeName] = &NodeMetrics{0, 0, 0, 0, 0, 0, ""} + + memAlloc, _ := strconv.ParseUint(node[1], 10, 64) + memTotal, _ := strconv.ParseUint(node[2], 10, 64) + + + cpuInfo := strings.Split(node[3], "/") + cpuAlloc, _ := strconv.ParseUint(cpuInfo[0], 10, 64) + cpuIdle, _ := strconv.ParseUint(cpuInfo[1], 10, 64) + cpuOther, _ := strconv.ParseUint(cpuInfo[2], 10, 64) + cpuTotal, _ := strconv.ParseUint(cpuInfo[3], 10, 64) + + nodes[nodeName].memAlloc = memAlloc + nodes[nodeName].memTotal = memTotal + nodes[nodeName].cpuAlloc = cpuAlloc + nodes[nodeName].cpuIdle = cpuIdle + nodes[nodeName].cpuOther = cpuOther + nodes[nodeName].cpuTotal = cpuTotal + nodes[nodeName].nodeStatus = nodeStatus + } + + return nodes +} + +// NodeData executes the sinfo command to get data for each node +// It returns the output of the sinfo command +func NodeData() []byte { + cmd := exec.Command("sinfo", "-h", "-N", "-O", "NodeList,AllocMem,Memory,CPUsState,StateLong") + out, err := cmd.Output() + if err != nil { + log.Fatal(err) + } + return out +} 
+ +type NodeCollector struct { + cpuAlloc *prometheus.Desc + cpuIdle *prometheus.Desc + cpuOther *prometheus.Desc + cpuTotal *prometheus.Desc + memAlloc *prometheus.Desc + memTotal *prometheus.Desc +} + +// NewNodeCollector creates a Prometheus collector to keep all our stats in +// It returns a set of collections for consumption +func NewNodeCollector() *NodeCollector { + labels := []string{"node","status"} + + return &NodeCollector{ + cpuAlloc: prometheus.NewDesc("slurm_node_cpu_alloc", "Allocated CPUs per node", labels, nil), + cpuIdle: prometheus.NewDesc("slurm_node_cpu_idle", "Idle CPUs per node", labels, nil), + cpuOther: prometheus.NewDesc("slurm_node_cpu_other", "Other CPUs per node", labels, nil), + cpuTotal: prometheus.NewDesc("slurm_node_cpu_total", "Total CPUs per node", labels, nil), + memAlloc: prometheus.NewDesc("slurm_node_mem_alloc", "Allocated memory per node", labels, nil), + memTotal: prometheus.NewDesc("slurm_node_mem_total", "Total memory per node", labels, nil), + } +} + +// Send all metric descriptions +func (nc *NodeCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- nc.cpuAlloc + ch <- nc.cpuIdle + ch <- nc.cpuOther + ch <- nc.cpuTotal + ch <- nc.memAlloc + ch <- nc.memTotal +} + +func (nc *NodeCollector) Collect(ch chan<- prometheus.Metric) { + nodes := NodeGetMetrics() + for node := range nodes { + ch <- prometheus.MustNewConstMetric(nc.cpuAlloc, prometheus.GaugeValue, float64(nodes[node].cpuAlloc), node, nodes[node].nodeStatus) + ch <- prometheus.MustNewConstMetric(nc.cpuIdle, prometheus.GaugeValue, float64(nodes[node].cpuIdle), node, nodes[node].nodeStatus) + ch <- prometheus.MustNewConstMetric(nc.cpuOther, prometheus.GaugeValue, float64(nodes[node].cpuOther), node, nodes[node].nodeStatus) + ch <- prometheus.MustNewConstMetric(nc.cpuTotal, prometheus.GaugeValue, float64(nodes[node].cpuTotal), node, nodes[node].nodeStatus) + ch <- prometheus.MustNewConstMetric(nc.memAlloc, prometheus.GaugeValue, float64(nodes[node].memAlloc), 
node, nodes[node].nodeStatus) + ch <- prometheus.MustNewConstMetric(nc.memTotal, prometheus.GaugeValue, float64(nodes[node].memTotal), node, nodes[node].nodeStatus) + } +} diff --git a/node_test.go b/node_test.go new file mode 100644 index 0000000..b554ddc --- /dev/null +++ b/node_test.go @@ -0,0 +1,57 @@ +/* Copyright 2021 Chris Read + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . */ + +package main + +import ( + "io/ioutil" + "testing" + + "github.com/stretchr/testify/assert" +) + +/* +For this example data line (whitespace-separated, as printed by sinfo): + +a048 79384 193000 3/13/0/16 mix + +We want output that looks like: + +slurm_node_cpu_alloc{node="a048",status="mix"} 3 +slurm_node_cpu_idle{node="a048",status="mix"} 13 +slurm_node_cpu_other{node="a048",status="mix"} 0 +slurm_node_cpu_total{node="a048",status="mix"} 16 +slurm_node_mem_alloc{node="a048",status="mix"} 79384 +slurm_node_mem_total{node="a048",status="mix"} 193000 + +*/ + +func TestNodeMetrics(t *testing.T) { + // Read the input data from a file + data, err := ioutil.ReadFile("test_data/sinfo_mem.txt") + if err != nil { + t.Fatalf("Can not open test data: %v", err) + } + metrics := ParseNodeMetrics(data) + t.Logf("%+v", metrics) + + assert.Contains(t, metrics, "b001") + assert.Equal(t, uint64(327680), metrics["b001"].memAlloc) + assert.Equal(t, uint64(386000), metrics["b001"].memTotal) + assert.Equal(t, uint64(32), metrics["b001"].cpuAlloc) + assert.Equal(t, uint64(0), 
metrics["b001"].cpuIdle) + assert.Equal(t, uint64(0), metrics["b001"].cpuOther) + assert.Equal(t, uint64(32), metrics["b001"].cpuTotal) +} diff --git a/nodes.go b/nodes.go index f20b2c9..0a88a9a 100644 --- a/nodes.go +++ b/nodes.go @@ -22,6 +22,7 @@ import ( "os/exec" "regexp" "sort" + "strconv" "strings" ) @@ -49,8 +50,10 @@ func RemoveDuplicates(s []string) []string { // Walk through the slice 's' and for each value we haven't seen so far, append it to 't'. for _, v := range s { if _, seen := m[v]; !seen { - t = append(t, v) - m[v] = true + if len(v) > 0 { + t = append(t, v) + m[v] = true + } } } @@ -67,7 +70,9 @@ func ParseNodesMetrics(input []byte) *NodesMetrics { for _, line := range lines_uniq { if strings.Contains(line, ",") { - state := strings.Split(line, ",")[1] + split := strings.Split(line, ",") + count, _ := strconv.ParseFloat(strings.TrimSpace(split[0]), 64) + state := split[1] alloc := regexp.MustCompile(`^alloc`) comp := regexp.MustCompile(`^comp`) down := regexp.MustCompile(`^down`) @@ -80,25 +85,25 @@ func ParseNodesMetrics(input []byte) *NodesMetrics { resv := regexp.MustCompile(`^res`) switch { case alloc.MatchString(state) == true: - nm.alloc++ + nm.alloc += count case comp.MatchString(state) == true: - nm.comp++ + nm.comp += count case down.MatchString(state) == true: - nm.down++ + nm.down += count case drain.MatchString(state) == true: - nm.drain++ + nm.drain += count case fail.MatchString(state) == true: - nm.fail++ + nm.fail += count case err.MatchString(state) == true: - nm.err++ + nm.err += count case idle.MatchString(state) == true: - nm.idle++ + nm.idle += count case maint.MatchString(state) == true: - nm.maint++ + nm.maint += count case mix.MatchString(state) == true: - nm.mix++ + nm.mix += count case resv.MatchString(state) == true: - nm.resv++ + nm.resv += count } } } @@ -107,7 +112,7 @@ func ParseNodesMetrics(input []byte) *NodesMetrics { // Execute the sinfo command and return its output func NodesData() []byte { - cmd := 
exec.Command("sinfo", "-h", "-o %n,%T") + cmd := exec.Command("sinfo", "-h", "-o %D,%T") stdout, err := cmd.StdoutPipe() if err != nil { log.Fatal(err) diff --git a/packages/README.md b/packages/README.md new file mode 100644 index 0000000..2764bb5 --- /dev/null +++ b/packages/README.md @@ -0,0 +1,8 @@ +# Packages + +* Build RPM packages from + [rpm/prometheus-slurm-exporter.spec](rpm/prometheus-slurm-exporter.spec) + following documentation in [rpm/README.md](rpm/README.md). +* Build a [Snap](https://snapcraft.io) package from + [../snap/snapcraft.yaml](../snap/snapcraft.yaml) following documentation in + [snap/README.md](snap/README.md). diff --git a/packaging/rpm/README.md b/packages/rpm/README.md similarity index 96% rename from packaging/rpm/README.md rename to packages/rpm/README.md index 5db8e6e..a14d323 100644 --- a/packaging/rpm/README.md +++ b/packages/rpm/README.md @@ -31,7 +31,7 @@ cp lib/systemd/prometheus-slurm-exporter.service ~/rpmbuild/SOURCES 6. Copy the SPEC file in the proper directory: ```bash cd prometheus-slurm-exporter -cp packaging/rpm/*.spec ~/rpmbuild/SPECS +cp packages/rpm/*.spec ~/rpmbuild/SPECS ``` ### Build the RPM package diff --git a/packaging/rpm/prometheus-slurm-exporter.spec b/packages/rpm/prometheus-slurm-exporter.spec similarity index 100% rename from packaging/rpm/prometheus-slurm-exporter.spec rename to packages/rpm/prometheus-slurm-exporter.spec diff --git a/packages/snap/README.md b/packages/snap/README.md new file mode 100644 index 0000000..c4cef12 --- /dev/null +++ b/packages/snap/README.md @@ -0,0 +1,96 @@ +# Building the prometheus-slurm-exporter snap +Packaging and delivering the prometheus-slurm-exporter as a snap provides users of prometheus-slurm-exporter +a hardened, streamlined, and idempotent experience when consuming this software. See [snapcraft](https://snapcraft.io/) for more information on snaps. 
+ + +### Prereqs +* [snapcraft](https://snapcraft.io) + ```bash + sudo snap install snapcraft --classic + ``` +* [lxd](https://linuxcontainers.org/) + ```bash + sudo snap install lxd + ``` + +### Build +From the root of this project, build the snap: +```bash +snapcraft --use-lxd +``` +Once the snap build has completed, list the current working directory to see the resultant snap artifact. +```bash +$ ls -la *.snap +-rw-r--r-- 1 bdx bdx 5562368 Aug 16 18:19 prometheus-slurm-exporter_0.11-1-g01dd959_amd64.snap +``` + +### Install locally built snap +```bash +sudo snap install prometheus-slurm-exporter_`git describe --tags`_amd64.snap --classic --dangerous +``` +* `--classic` - this snap uses classic confinement to allow it to find the slurm commands in the system. +* `--dangerous` - because we are installing this snap from a local resource and sha can't be verified by the snapstore. + +### Verify install +Use `ps` to verify the process is running. +```bash +$ ps aux | grep prometheus | head -1 +root 2271391 0.0 0.0 1453596 14012 ? SLsl 18:32 0:00 /snap/prometheus-slurm-exporter/x1/bin/prometheus-slurm-exporter +``` + +Use `netstat` to verify that the installed `prometheus-slurm-exporter` snap process is listening on port 8080. +```bash +$ sudo netstat -peanut | grep prometheus +tcp6 0 0 :::8080 :::* LISTEN 0 15042010 2271391/prometheus-slurm-exporter +``` + +Lastly, curl the metrics endpoint. 
+```bash +$ curl 127.0.0.1:8080/metrics +# TYPE slurm_cpus_total gauge +slurm_cpus_total 8 +# HELP slurm_nodes_alloc Allocated nodes +# TYPE slurm_nodes_alloc gauge +slurm_nodes_alloc 0 +# HELP slurm_nodes_comp Completing nodes +# TYPE slurm_nodes_comp gauge +slurm_nodes_comp 0 +# HELP slurm_nodes_down Down nodes +# TYPE slurm_nodes_down gauge +slurm_nodes_down 0 +# HELP slurm_nodes_drain Drain nodes +# TYPE slurm_nodes_drain gauge +slurm_nodes_drain 0 +# HELP slurm_nodes_err Error nodes +# TYPE slurm_nodes_err gauge +slurm_nodes_err 0 +# HELP slurm_nodes_fail Fail nodes +# TYPE slurm_nodes_fail gauge +slurm_nodes_fail 0 +# HELP slurm_nodes_idle Idle nodes +# TYPE slurm_nodes_idle gauge +slurm_nodes_idle 1 + +... + +# TYPE slurm_scheduler_backfilled_jobs_since_start_total gauge +slurm_scheduler_backfilled_jobs_since_start_total 0 +# HELP slurm_scheduler_cycle_per_minute Information provided by the Slurm sdiag command, number scheduler cycles per minute +# TYPE slurm_scheduler_cycle_per_minute gauge +slurm_scheduler_cycle_per_minute 1 +# HELP slurm_scheduler_dbd_queue_size Information provided by the Slurm sdiag command, length of the DBD agent queue +# TYPE slurm_scheduler_dbd_queue_size gauge +slurm_scheduler_dbd_queue_size 0 +# HELP slurm_scheduler_last_cycle Information provided by the Slurm sdiag command, scheduler last cycle time in (microseconds) +# TYPE slurm_scheduler_last_cycle gauge +slurm_scheduler_last_cycle 40 +# HELP slurm_scheduler_mean_cycle Information provided by the Slurm sdiag command, scheduler mean cycle time in (microseconds) +# TYPE slurm_scheduler_mean_cycle gauge +slurm_scheduler_mean_cycle 481 +... 
+``` + +To uninstall the prometheus-slurm-exporter snap: +```bash +sudo snap remove prometheus-slurm-exporter +``` diff --git a/partitions.go b/partitions.go new file mode 100644 index 0000000..16c6b36 --- /dev/null +++ b/partitions.go @@ -0,0 +1,149 @@ +/* Copyright 2020 Victor Penso + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . */ + +package main + +import ( + "io/ioutil" + "os/exec" + "log" + "strings" + "strconv" + "github.com/prometheus/client_golang/prometheus" +) + +func PartitionsData() []byte { + cmd := exec.Command("sinfo", "-h", "-o%R,%C") + stdout, err := cmd.StdoutPipe() + if err != nil { + log.Fatal(err) + } + if err := cmd.Start(); err != nil { + log.Fatal(err) + } + out, _ := ioutil.ReadAll(stdout) + if err := cmd.Wait(); err != nil { + log.Fatal(err) + } + return out +} + +func PartitionsPendingJobsData() []byte { + cmd := exec.Command("squeue","-a","-r","-h","-o%P","--states=PENDING") + stdout, err := cmd.StdoutPipe() + if err != nil { + log.Fatal(err) + } + if err := cmd.Start(); err != nil { + log.Fatal(err) + } + out, _ := ioutil.ReadAll(stdout) + if err := cmd.Wait(); err != nil { + log.Fatal(err) + } + return out +} + +type PartitionMetrics struct { + allocated float64 + idle float64 + other float64 + pending float64 + total float64 +} + +func ParsePartitionsMetrics() map[string]*PartitionMetrics { + partitions := make(map[string]*PartitionMetrics) + lines := strings.Split(string(PartitionsData()), "\n") 
+ for _, line := range lines { + if strings.Contains(line,",") { + // name of a partition + partition := strings.Split(line,",")[0] + _,key := partitions[partition] + if !key { + partitions[partition] = &PartitionMetrics{0,0,0,0,0} + } + states := strings.Split(line,",")[1] + allocated,_ := strconv.ParseFloat(strings.Split(states,"/")[0],64) + idle,_ := strconv.ParseFloat(strings.Split(states,"/")[1],64) + other,_ := strconv.ParseFloat(strings.Split(states,"/")[2],64) + total,_ := strconv.ParseFloat(strings.Split(states,"/")[3],64) + partitions[partition].allocated = allocated + partitions[partition].idle = idle + partitions[partition].other = other + partitions[partition].total = total + } + } + // get list of pending jobs by partition name + list := strings.Split(string(PartitionsPendingJobsData()),"\n") + for _,partition := range list { + // accumulate the number of pending jobs + _,key := partitions[partition] + if key { + partitions[partition].pending += 1 + } + } + + + return partitions +} + +type PartitionsCollector struct { + allocated *prometheus.Desc + idle *prometheus.Desc + other *prometheus.Desc + pending *prometheus.Desc + total *prometheus.Desc +} + +func NewPartitionsCollector() *PartitionsCollector { + labels := []string{"partition"} + return &PartitionsCollector{ + allocated: prometheus.NewDesc("slurm_partition_cpus_allocated", "Allocated CPUs for partition", labels,nil), + idle: prometheus.NewDesc("slurm_partition_cpus_idle", "Idle CPUs for partition", labels,nil), + other: prometheus.NewDesc("slurm_partition_cpus_other", "Other CPUs for partition", labels,nil), + pending: prometheus.NewDesc("slurm_partition_jobs_pending", "Pending jobs for partition", labels,nil), + total: prometheus.NewDesc("slurm_partition_cpus_total", "Total CPUs for partition", labels,nil), + } +} + +func (pc *PartitionsCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- pc.allocated + ch <- pc.idle + ch <- pc.other + ch <- pc.pending + ch <- pc.total +} + +func (pc 
*PartitionsCollector) Collect(ch chan<- prometheus.Metric) { + pm := ParsePartitionsMetrics() + for p := range pm { + if pm[p].allocated > 0 { + ch <- prometheus.MustNewConstMetric(pc.allocated, prometheus.GaugeValue, pm[p].allocated, p) + } + if pm[p].idle > 0 { + ch <- prometheus.MustNewConstMetric(pc.idle, prometheus.GaugeValue, pm[p].idle, p) + } + if pm[p].other > 0 { + ch <- prometheus.MustNewConstMetric(pc.other, prometheus.GaugeValue, pm[p].other, p) + } + if pm[p].pending > 0 { + ch <- prometheus.MustNewConstMetric(pc.pending, prometheus.GaugeValue, pm[p].pending, p) + } + if pm[p].total > 0 { + ch <- prometheus.MustNewConstMetric(pc.total, prometheus.GaugeValue, pm[p].total, p) + } + } +} diff --git a/snap/snapcraft.yaml b/snap/snapcraft.yaml new file mode 100644 index 0000000..c9a5112 --- /dev/null +++ b/snap/snapcraft.yaml @@ -0,0 +1,27 @@ +name: prometheus-slurm-exporter +summary: Prometheus Slurm Exporter +description: | + Prometheus collector and exporter for metrics extracted from the Slurm resource scheduling system. + +adopt-info: prometheus-slurm-exporter + +grade: stable +confinement: classic + +base: core20 + +apps: + prometheus-slurm-exporter: + daemon: simple + environment: + PATH: $PATH:/snap/bin + command: bin/prometheus-slurm-exporter + +parts: + prometheus-slurm-exporter: + source: https://github.com/vpenso/prometheus-slurm-exporter.git + plugin: go + go-channel: 1.14/stable + override-build: | + snapcraftctl build + snapcraftctl set-version `git describe --tags` diff --git a/source_me.sh b/source_me.sh deleted file mode 100644 index 9640e02..0000000 --- a/source_me.sh +++ /dev/null @@ -1,43 +0,0 @@ -# -# Copyright 2012-2017 Victor Penso -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. 
-# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# - -# Find the correct path even if dereferenced by a link -__source=$0 - -if [[ "$__source" == *bash* ]]; then - __source=${BASH_SOURCE[0]} -fi - -__dir="$( dirname $__source )" -while [ -h $__source ] -do - __source="$( readlink "$__source" )" - [[ $__source != /* ]] && __source="$__dir/$__source" - __dir="$( cd -P "$( dirname "$__source" )" && pwd )" -done -__dir="$( cd -P "$( dirname "$__source" )" && pwd )" - -export SCRIPTS=$__dir - -unset __dir -unset __source - -export GOPATH=$SCRIPTS:/usr/share/gocode -export PATH=$SCRIPTS/bin:$PATH - -PATH=$(echo "$PATH" | awk -v RS=':' -v ORS=":" '!a[$1]++{if (NR > 1) printf ORS; printf $a[$1]}') - diff --git a/sshare.go b/sshare.go new file mode 100644 index 0000000..ecbaa69 --- /dev/null +++ b/sshare.go @@ -0,0 +1,86 @@ +/* Copyright 2021 Victor Penso + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . 
*/ + +package main + +import ( + "io/ioutil" + "os/exec" + "log" + "strings" + "strconv" + "github.com/prometheus/client_golang/prometheus" +) + +func FairShareData() []byte { + cmd := exec.Command( "sshare", "-n", "-P", "-o", "account,fairshare" ) + stdout, err := cmd.StdoutPipe() + if err != nil { + log.Fatal(err) + } + if err := cmd.Start(); err != nil { + log.Fatal(err) + } + out, _ := ioutil.ReadAll(stdout) + if err := cmd.Wait(); err != nil { + log.Fatal(err) + } + return out +} + +type FairShareMetrics struct { + fairshare float64 +} + +func ParseFairShareMetrics() map[string]*FairShareMetrics { + accounts := make(map[string]*FairShareMetrics) + lines := strings.Split(string(FairShareData()), "\n") + for _, line := range lines { + if ! strings.HasPrefix(line," ") { + if strings.Contains(line,"|") { + account := strings.Trim(strings.Split(line,"|")[0]," ") + _,key := accounts[account] + if !key { + accounts[account] = &FairShareMetrics{0} + } + fairshare,_ := strconv.ParseFloat(strings.Split(line,"|")[1],64) + accounts[account].fairshare = fairshare + } + } + } + return accounts +} + +type FairShareCollector struct { + fairshare *prometheus.Desc +} + +func NewFairShareCollector() *FairShareCollector { + labels := []string{"account"} + return &FairShareCollector{ + fairshare: prometheus.NewDesc("slurm_account_fairshare","FairShare for account" , labels,nil), + } +} + +func (fsc *FairShareCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- fsc.fairshare +} + +func (fsc *FairShareCollector) Collect(ch chan<- prometheus.Metric) { + fsm := ParseFairShareMetrics() + for f := range fsm { + ch <- prometheus.MustNewConstMetric(fsc.fairshare, prometheus.GaugeValue, fsm[f].fairshare, f) + } +} diff --git a/test_data/sinfo_mem.txt b/test_data/sinfo_mem.txt new file mode 100644 index 0000000..88f170c --- /dev/null +++ b/test_data/sinfo_mem.txt @@ -0,0 +1,21 @@ +a048 163840 193000 16/0/0/16 mixed +a048 163840 193000 16/0/0/16 mixed +a048 163840 193000 16/0/0/16 idle 
+a048 163840 193000 16/0/0/16 idle +a049 163840 193000 16/0/0/16 idle +a049 163840 193000 16/0/0/16 idle +a049 163840 193000 16/0/0/16 idle +a049 163840 193000 16/0/0/16 idle +a050 163840 193000 16/0/0/16 idle +a050 163840 193000 16/0/0/16 idle +a050 163840 193000 16/0/0/16 idle +a051 163840 193000 16/0/0/16 idle +a051 163840 193000 16/0/0/16 idle +a051 163840 193000 16/0/0/16 idle +a052 0 193000 0/16/0/16 idle +b001 327680 386000 32/0/0/32 down +b001 327680 386000 32/0/0/32 down +b002 327680 386000 32/0/0/32 down +b002 327680 386000 32/0/0/32 idle +b003 296960 386000 29/3/0/32 down +b003 296960 386000 29/3/0/32 idle diff --git a/users.go b/users.go new file mode 100644 index 0000000..2b0e85e --- /dev/null +++ b/users.go @@ -0,0 +1,122 @@ +/* Copyright 2020 Victor Penso + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . 
*/ + +package main + +import ( + "io/ioutil" + "os/exec" + "log" + "strings" + "strconv" + "regexp" + "github.com/prometheus/client_golang/prometheus" +) + +func UsersData() []byte { + cmd := exec.Command("squeue","-a","-r","-h","-o %A|%u|%T|%C") + stdout, err := cmd.StdoutPipe() + if err != nil { + log.Fatal(err) + } + if err := cmd.Start(); err != nil { + log.Fatal(err) + } + out, _ := ioutil.ReadAll(stdout) + if err := cmd.Wait(); err != nil { + log.Fatal(err) + } + return out +} + +type UserJobMetrics struct { + pending float64 + running float64 + running_cpus float64 + suspended float64 +} + +func ParseUsersMetrics(input []byte) map[string]*UserJobMetrics { + users := make(map[string]*UserJobMetrics) + lines := strings.Split(string(input), "\n") + for _, line := range lines { + if strings.Contains(line,"|") { + user := strings.Split(line,"|")[1] + _,key := users[user] + if !key { + users[user] = &UserJobMetrics{0,0,0,0} + } + state := strings.Split(line,"|")[2] + state = strings.ToLower(state) + cpus,_ := strconv.ParseFloat(strings.Split(line,"|")[3],64) + pending := regexp.MustCompile(`^pending`) + running := regexp.MustCompile(`^running`) + suspended := regexp.MustCompile(`^suspended`) + switch { + case pending.MatchString(state) == true: + users[user].pending++ + case running.MatchString(state) == true: + users[user].running++ + users[user].running_cpus += cpus + case suspended.MatchString(state) == true: + users[user].suspended++ + } + } + } + return users +} + +type UsersCollector struct { + pending *prometheus.Desc + running *prometheus.Desc + running_cpus *prometheus.Desc + suspended *prometheus.Desc +} + +func NewUsersCollector() *UsersCollector { + labels := []string{"user"} + return &UsersCollector { + pending: prometheus.NewDesc("slurm_user_jobs_pending", "Pending jobs for user", labels, nil), + running: prometheus.NewDesc("slurm_user_jobs_running", "Running jobs for user", labels, nil), + running_cpus: prometheus.NewDesc("slurm_user_cpus_running", 
"Running cpus for user", labels, nil), + suspended: prometheus.NewDesc("slurm_user_jobs_suspended", "Suspended jobs for user", labels, nil), + } +} + +func (uc *UsersCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- uc.pending + ch <- uc.running + ch <- uc.running_cpus + ch <- uc.suspended +} + +func (uc *UsersCollector) Collect(ch chan<- prometheus.Metric) { + um := ParseUsersMetrics(UsersData()) + for u := range um { + if um[u].pending > 0 { + ch <- prometheus.MustNewConstMetric(uc.pending, prometheus.GaugeValue, um[u].pending, u) + } + if um[u].running > 0 { + ch <- prometheus.MustNewConstMetric(uc.running, prometheus.GaugeValue, um[u].running, u) + } + if um[u].running_cpus > 0 { + ch <- prometheus.MustNewConstMetric(uc.running_cpus, prometheus.GaugeValue, um[u].running_cpus, u) + } + if um[u].suspended > 0 { + ch <- prometheus.MustNewConstMetric(uc.suspended, prometheus.GaugeValue, um[u].suspended, u) + } + } +} +