
Commit 31017d2

Allow nodes to appear in more than one group/partition (#174)
* fix duplicated nodename
* Nodes may appear in more than one group

Co-authored-by: jianguo-ukaea <jianguo.rao@ukaea.uk>
1 parent 02d913e commit 31017d2

6 files changed (+93, -4 lines)

README.md

Lines changed: 1 addition & 1 deletion

@@ -73,7 +73,7 @@ For each group (if used) or partition any nodes in an ansible inventory group `<
 - Nodes may have arbitrary hostnames but these should be lowercase to avoid a mismatch between inventory and actual hostname.
 - Nodes in a group are assumed to be homogenous in terms of processor and memory.
 - An inventory group may be empty or missing, but if it is not then the play must contain at least one node from it (used to set processor information).
-- Nodes may not appear in more than one group.
+

 `openhpc_job_maxtime`: Maximum job time limit, default `'60-0'` (60 days). See [slurm.conf](https://slurm.schedmd.com/slurm.conf.html) parameter `MaxTime` for format. The default is 60 days. The value should be quoted to avoid Ansible conversions.

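With that restriction removed, one inventory group can now back more than one partition. A minimal sketch of the relevant role vars (the same shape as the new molecule/test15/converge.yml below; the names are the test's, nothing here is new API):

    openhpc_slurm_partitions:
      - name: compute        # nodes from inventory group testohpc_compute
      - name: beta
        groups:
          - name: compute    # the same nodes, exposed as a second partition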

molecule/README.md

Lines changed: 1 addition & 0 deletions

@@ -22,6 +22,7 @@ test11 | 1 | N | As for #5 but then deletes a n
 test12 | 1 | N | As for #5 but enabling job completion and testing `sacct -c`
 test13 | 1 | N | As for #5 but tests `openhpc_config` variable.
 test14 | 1 | N | As for #5 but also tests `extra_nodes` via State=DOWN nodes.
+test15 | 1 | Y | As for #5 but also tests partitions with different names but with the same `NodeName`.


 # Local Installation & Running

molecule/test15/converge.yml

Lines changed: 27 additions & 0 deletions

@@ -0,0 +1,27 @@
+---
+- name: Converge
+  hosts: all
+  vars:
+    openhpc_enable:
+      control: "{{ inventory_hostname in groups['testohpc_login'] }}"
+      batch: "{{ inventory_hostname in groups['testohpc_compute'] }}"
+      runtime: true
+    openhpc_slurm_control_host: "{{ groups['testohpc_login'] | first }}"
+    openhpc_slurm_partitions:
+      - name: "compute"
+        partition_params:
+          PreemptMode: requeue
+      - name: beta
+        groups:
+          - name: "compute"
+        partition_params:
+          PreemptMode: 'OFF'
+          Priority: 1000
+          Default: false
+          AllowAccounts: Group_own_thePartition
+    openhpc_cluster_name: testohpc
+    openhpc_slurm_configless: true
+  tasks:
+    - name: "Include ansible-role-openhpc"
+      include_role:
+        name: "{{ lookup('env', 'MOLECULE_PROJECT_DIRECTORY') | basename }}"

molecule/test15/molecule.yml

Lines changed: 44 additions & 0 deletions

@@ -0,0 +1,44 @@
+---
+driver:
+  name: podman
+platforms:
+  - name: testohpc-login-0
+    image: ${MOLECULE_IMAGE}
+    pre_build_image: true
+    groups:
+      - testohpc_login
+    command: /sbin/init
+    tmpfs:
+      - /run
+      - /tmp
+    volumes:
+      - /sys/fs/cgroup:/sys/fs/cgroup:ro
+    network: net1
+  - name: testohpc-compute-0
+    image: ${MOLECULE_IMAGE}
+    pre_build_image: true
+    groups:
+      - testohpc_compute
+    command: /sbin/init
+    tmpfs:
+      - /run
+      - /tmp
+    volumes:
+      - /sys/fs/cgroup:/sys/fs/cgroup:ro
+    network: net1
+  - name: testohpc-compute-1
+    image: ${MOLECULE_IMAGE}
+    pre_build_image: true
+    groups:
+      - testohpc_compute
+    command: /sbin/init
+    tmpfs:
+      - /run
+      - /tmp
+    volumes:
+      - /sys/fs/cgroup:/sys/fs/cgroup:ro
+    network: net1
+provisioner:
+  name: ansible
+verifier:
+  name: ansible

molecule/test15/verify.yml

Lines changed: 16 additions & 0 deletions

@@ -0,0 +1,16 @@
+---
+
+- name: Check slurm hostlist
+  hosts: testohpc_login
+  vars:
+    expected_sinfo: | # NB compute is default (*)
+      'compute*,up,60-00:00:00,2,idle,testohpc-compute-[0-1]'
+      'beta,up,60-00:00:00,2,idle,testohpc-compute-[0-1]'
+  tasks:
+    - name: Get slurm partition info
+      command: sinfo --noheader --format="%P,%a,%l,%D,%t,%N" # using --format ensures we control whitespace
+      register: sinfo
+    - name:
+      assert:
+        that: "sinfo.stdout.split() == expected_sinfo.split()"
+        fail_msg: "FAILED - got {{ sinfo.stdout.split() }} expected {{ expected_sinfo.split() }}"
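Both expected rows report the identical hostlist `testohpc-compute-[0-1]`, once per partition. This is exactly the layout that previously rendered a duplicated `NodeName` definition in slurm.conf; the `donehosts` guard in the template change below now prevents that duplication.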

templates/slurm.conf.j2

Lines changed: 4 additions & 3 deletions

@@ -150,10 +150,10 @@ NodeName={{ node }}
 # OpenHPC default configuration
 PropagateResourceLimitsExcept=MEMLOCK
 Epilog=/etc/slurm/slurm.epilog.clean
+{% set donehosts = [] %}
 {% for part in openhpc_slurm_partitions %}
 {% set nodelist = [] %}
 {% for group in part.get('groups', [part]) %}
-
 {% set group_name = group.cluster_name|default(openhpc_cluster_name) ~ '_' ~ group.name %}
 # openhpc_slurm_partitions group: {{ group_name }}
 {% set inventory_group_hosts = groups.get(group_name, []) %}
@@ -164,9 +164,11 @@ Epilog=/etc/slurm/slurm.epilog.clean
 {% set ram_mb = (first_host_hv['ansible_memory_mb']['real']['total'] * (group.ram_multiplier | default(openhpc_ram_multiplier))) | int %}
 {% for hostlist in (inventory_group_hosts | hostlist_expression) %}
 {% set gres = ' Gres=%s' % (','.join(group.gres | map(attribute='conf') )) if 'gres' in group else '' %}
-
+{% if hostlist not in donehosts %}
 NodeName={{ hostlist }} State=UNKNOWN RealMemory={{ group.get('ram_mb', ram_mb) }} Sockets={{first_host_hv['ansible_processor_count']}} CoresPerSocket={{ first_host_hv['ansible_processor_cores'] }} ThreadsPerCore={{ first_host_hv['ansible_processor_threads_per_core'] }}{{ gres }}
+{% endif %}
 {% set _ = nodelist.append(hostlist) %}
+{% set _ = donehosts.append(hostlist) %}
 {% endfor %}{# nodes #}
 {% endif %}{# inventory_group_hosts #}
 {% for extra_node_defn in group.get('extra_nodes', []) %}
@@ -184,5 +186,4 @@ PartitionName={{part.name}} Default={{ part.get('default', 'YES') }} MaxTime={{
 NodeName=nonesuch

 {% if openhpc_slurm_configless %}SlurmctldParameters=enable_configless{% endif %}
-
 ReturnToService=2
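Sketched effect on the rendered slurm.conf for the test15 layout (hardware-derived values such as RealMemory and Sockets come from gathered facts and are elided here as "..."): the shared hostlist gets a single NodeName definition, while both partitions still list the hosts:

    NodeName=testohpc-compute-[0-1] State=UNKNOWN ...
    PartitionName=compute ... Nodes=testohpc-compute-[0-1]
    PartitionName=beta ... Nodes=testohpc-compute-[0-1]

Note that `nodelist.append` stays outside the new `{% if %}` guard, so suppressing the duplicate NodeName line does not drop the hosts from the second partition's definition.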
