Skip to content

Commit cd6de29

Browse files
refactor: Improve slurm collector organization
* Remove unnecessary job props and labels.
* Use a slice of metrics to avoid using a sync lock.
* Move regex patterns to a separate file so they can be used in different collectors.
* Update e2e test outputs and docs.

Signed-off-by: Mahendra Paipuri <mahendra.paipuri@gmail.com>
1 parent 87c9c23 commit cd6de29

20 files changed

+790
-719
lines changed

Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ ifeq ($(CGO_BUILD), 0)
139139
test-e2e: build pkg/collector/testdata/sys/.unpacked pkg/collector/testdata/proc/.unpacked
140140
@echo ">> running end-to-end tests"
141141
./scripts/e2e-test.sh -s exporter-cgroups-v1
142+
./scripts/e2e-test.sh -s exporter-cgroups-v1-memory-subsystem
142143
./scripts/e2e-test.sh -s exporter-cgroups-v2-nvidia-ipmiutil
143144
./scripts/e2e-test.sh -s exporter-cgroups-v2-amd-ipmitool
144145
./scripts/e2e-test.sh -s exporter-cgroups-v2-nogpu
@@ -187,6 +188,7 @@ ifeq ($(CGO_BUILD), 0)
187188
test-e2e-update: build pkg/collector/testdata/sys/.unpacked pkg/collector/testdata/proc/.unpacked
188189
@echo ">> updating end-to-end tests outputs"
189190
./scripts/e2e-test.sh -s exporter-cgroups-v1 -u || true
191+
./scripts/e2e-test.sh -s exporter-cgroups-v1-memory-subsystem -u || true
190192
./scripts/e2e-test.sh -s exporter-cgroups-v2-nvidia-ipmiutil -u || true
191193
./scripts/e2e-test.sh -s exporter-cgroups-v2-amd-ipmitool -u || true
192194
./scripts/e2e-test.sh -s exporter-cgroups-v2-nogpu -u || true

etc/slurm/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ An example [systemd service file](https://github.com/mahendrapaipuri/ceems/blob/
1313
is also provided in the repo that can be used along with these prolog and epilog scripts.
1414

1515
> [!IMPORTANT]
16-
> The CLI arguments `--collector.slurm.job-props-path` and `--collector.slurm.gpu-job-map-path`
17-
are hidden and cannot be seen in `ceems_exporter --help` output. However, these arguments
16+
> The CLI argument `--collector.slurm.gpu-job-map-path`
17+
is hidden and cannot be seen in `ceems_exporter --help` output. However, this argument
1818
exists in the exporter and can be used.
1919

2020
Even with such prolog and epilog scripts, operators should grant the user running CEEMS

etc/slurm/epilog.d/slurmjobprops.sh

Lines changed: 0 additions & 6 deletions
This file was deleted.

etc/slurm/prolog.d/slurmjobprops.sh

Lines changed: 0 additions & 9 deletions
This file was deleted.

init/systemd/ceems_exporter_no_privs.service

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@ Type=simple
77
User=ceems
88
Group=ceems
99
ExecStart=/usr/local/bin/ceems_exporter \
10-
--collector.slurm.gpu.type=nvidia \
11-
--collector.slurm.job.props.path="/run/slurmjobprops" \
1210
--collector.slurm.gpu.job.map.path="/run/gpujobmap" \
1311
--collector.ipmi.dcmi.cmd="sudo /usr/sbin/ipmi-dcmi --get-system-power-statistics" \
1412
--log.level=debug

pkg/collector/helper.go

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ var (
2525
reParens = regexp.MustCompile(`\((.*)\)`)
2626
)
2727

28-
// Check if file exists.
28+
// fileExists reports whether filename exists and is not a directory.
2929
func fileExists(filename string) bool {
3030
info, err := os.Stat(filename)
3131
if os.IsNotExist(err) {
@@ -35,17 +35,17 @@ func fileExists(filename string) bool {
3535
return !info.IsDir()
3636
}
3737

38-
// Find named matches in regex groups and return a map.
39-
func findNamedMatches(regex *regexp.Regexp, str string) map[string]string {
40-
match := regex.FindStringSubmatch(str)
38+
// // Find named matches in regex groups and return a map.
39+
// func findNamedMatches(regex *regexp.Regexp, str string) map[string]string {
40+
// match := regex.FindStringSubmatch(str)
4141

42-
results := map[string]string{}
43-
for i, name := range match {
44-
results[regex.SubexpNames()[i]] = name
45-
}
42+
// results := map[string]string{}
43+
// for i, name := range match {
44+
// results[regex.SubexpNames()[i]] = name
45+
// }
4646

47-
return results
48-
}
47+
// return results
48+
// }
4949

5050
// SanitizeMetricName sanitize the given metric name by replacing invalid characters by underscores.
5151
//

pkg/collector/perf.go

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,6 @@ import (
2121

2222
const perfCollectorSubsystem = "perf"
2323

24-
// slurm related regexes.
25-
var (
26-
slurmCgroupIDRegex = regexp.MustCompile("^.*/(?:.+?)job_([0-9]+)(?:.*$)")
27-
slurmIgnoreProcsRegex = regexp.MustCompile("slurmstepd:(.*)|sleep ([0-9]+)|/bin/bash (.*)/slurm_script")
28-
)
29-
3024
var (
3125
perfHardwareProfilerMap = map[string]perf.HardwareProfilerType{
3226
"CpuCycles": perf.CpuCyclesProfiler,
@@ -173,7 +167,7 @@ func NewPerfCollector(logger log.Logger) (Collector, error) {
173167

174168
if *collectorState[slurmCollectorSubsystem] {
175169
collector.manager = "slurm"
176-
collector.cgroupIDRegex = slurmCgroupIDRegex
170+
collector.cgroupIDRegex = slurmCgroupPathRegex
177171
collector.filterProcCmdRegex = slurmIgnoreProcsRegex
178172
}
179173

pkg/collector/perf_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ func TestDiscoverProcess(t *testing.T) {
7070
collector := perfCollector{
7171
logger: log.NewNopLogger(),
7272
envVar: "ENABLE_PROFILING",
73-
cgroupIDRegex: slurmCgroupIDRegex,
73+
cgroupIDRegex: slurmCgroupPathRegex,
7474
filterProcCmdRegex: slurmIgnoreProcsRegex,
7575
perfHwProfilersEnabled: true,
7676
perfSwProfilersEnabled: true,
@@ -121,7 +121,7 @@ func TestNewProfilers(t *testing.T) {
121121

122122
collector := perfCollector{
123123
logger: log.NewNopLogger(),
124-
cgroupIDRegex: slurmCgroupIDRegex,
124+
cgroupIDRegex: slurmCgroupPathRegex,
125125
filterProcCmdRegex: slurmIgnoreProcsRegex,
126126
perfHwProfilersEnabled: true,
127127
perfSwProfilersEnabled: true,

pkg/collector/regexp.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
package collector
2+
3+
import "regexp"
4+
5+
// Regular expressions of cgroup paths for different resource managers
6+
/*
7+
For cgroups v1, possible paths are /cpuacct/slurm/uid_1000/job_211
8+
/memory/slurm/uid_1000/job_211
9+
10+
For cgroups v2, possible paths are /system.slice/slurmstepd.scope/job_211
11+
/system.slice/slurmstepd.scope/job_211/step_interactive
12+
/system.slice/slurmstepd.scope/job_211/step_extern/user/task_0
13+
*/
14+
var (
15+
slurmCgroupPathRegex = regexp.MustCompile("^.*/slurm(?:.*?)/job_([0-9]+)(?:.*$)")
16+
slurmIgnoreProcsRegex = regexp.MustCompile("slurmstepd:(.*)|sleep ([0-9]+)|/bin/bash (.*)/slurm_script")
17+
)

0 commit comments

Comments
 (0)