From ad4735c88f0b50b3368242d37d5c5682941823e6 Mon Sep 17 00:00:00 2001
From: "Dong H. Ahn"
Date: Mon, 23 Nov 2020 23:11:35 -0800
Subject: [PATCH] testsuite: add hwloc GPU Id remapping test

Use a Sierra hwloc XML which has 4 GPUs. Get a nested allocation
using flux mini alloc with 2 GPUs and run

- flux ion-resource ns-info 0 gpu 0
- flux ion-resource ns-info 0 gpu 1

to check how the self-discovered GPU Ids 0 and 1 are remapped.
When we use the high-Id-first match policy, they should map to 2
and 3 because the Fluxion scheduler will select GPUs 2 and 3 (i.e.,
CUDA_VISIBLE_DEVICES=2,3 passed into the nested instance). Similarly,
when we use the low-Id-first match policy, they should map to 0 and 1.
---
 t/t1016-nest-namespace.t | 91 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)
 create mode 100755 t/t1016-nest-namespace.t

diff --git a/t/t1016-nest-namespace.t b/t/t1016-nest-namespace.t
new file mode 100755
index 000000000..4fdd7c298
--- /dev/null
+++ b/t/t1016-nest-namespace.t
@@ -0,0 +1,91 @@
+#!/bin/sh
+
+test_description='Test Id Namespace Remapping for Nested Instances'
+
+. `dirname $0`/sharness.sh
+
+hwloc_basepath=`readlink -e ${SHARNESS_TEST_SRCDIR}/data/hwloc-data`
+# 1 broker: 1 node, 2 sockets, 44 cores, 4 gpus
+excl_1N1B="${hwloc_basepath}/001N/exclusive/01-brokers-sierra2"
+
+export FLUX_SCHED_MODULE=none
+
+test_under_flux 1
+
+test_expect_success 'namespace: load test resources' '
+	load_test_resources ${excl_1N1B}
+'
+
+test_expect_success 'namespace: loading resource and qmanager modules works' '
+	load_resource load-allowlist=cluster,node,gpu,core policy=high &&
+	load_qmanager
+'
+
+test_expect_success 'namespace: gpu id remapping works with hwloc (pol=hi)' '
+	cat >nest.sh <<-EOF &&
+	#!/bin/sh
+	flux module load sched-fluxion-resource load-allowlist=cluster,node,gpu,core
+	flux module load sched-fluxion-qmanager
+	flux resource list
+	flux ion-resource ns-info 0 gpu 0
+	flux ion-resource ns-info 0 gpu 1
+	echo \${CUDA_VISIBLE_DEVICES}
+EOF
+	cat >exp1 <<-EOF &&
+	2
+	3
+	2-3
+EOF
+	chmod u+x nest.sh &&
+	jobid=$(flux mini batch --output=kvs -n1 -N1 -c22 -g2 ./nest.sh) &&
+	flux job wait-event -t10 ${jobid} release &&
+	flux job attach ${jobid} > out1.a &&
+	tail -3 out1.a > out1.a.fin &&
+	diff out1.a.fin exp1
+'
+
+test_expect_success 'namespace: parent CUDA_VISIBLE_DEVICES has no effect' '
+	export CUDA_VISIBLE_DEVICES="0,1,2,3" &&
+	jobid=$(flux mini batch --output=kvs -n1 -N1 -c22 -g2 ./nest.sh) &&
+	flux job wait-event -t10 ${jobid} release &&
+	flux job attach ${jobid} > out1.b &&
+	tail -3 out1.b > out1.b.fin &&
+	diff out1.b.fin exp1
+'
+
+test_expect_success 'namespace: removing resource and qmanager modules' '
+	remove_resource
+'
+
+test_expect_success 'namespace: loading resource and qmanager modules works' '
+	load_resource load-allowlist=cluster,node,gpu,core policy=low &&
+	load_qmanager
+'
+
+test_expect_success 'namespace: gpu id remapping works with hwloc (pol=low)' '
+	cat >exp2 <<-EOF &&
+	0
+	1
+	0-1
+EOF
+	jobid=$(flux mini batch --output=kvs -n1 -N1 -c22 -g2 ./nest.sh) &&
+	flux job wait-event -t10 ${jobid} release &&
+	flux job attach ${jobid} > out2.a &&
+	tail -3 out2.a > out2.a.fin &&
+	diff out2.a.fin exp2
+'
+
+test_expect_success 'namespace: parent CUDA_VISIBLE_DEVICES has no effect' '
+	export CUDA_VISIBLE_DEVICES=-1 &&
+	jobid=$(flux mini batch --output=kvs -n1 -N1 -c22 -g2 ./nest.sh) &&
+	flux job wait-event -t10 ${jobid} release &&
+	flux job attach ${jobid} > out2.b &&
+	tail -3 out2.b > out2.b.fin &&
+	diff out2.b.fin exp2
+'
+
+test_expect_success 'namespace: removing resource and qmanager modules' '
+	remove_resource
+'
+
+test_done
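
For reference, a by-hand version of the remapping check that nest.sh
automates. This is a minimal sketch assuming the same 4-GPU Sierra hwloc
configuration and the Fluxion modules loaded with policy=high in the
parent instance (the test itself drives nest.sh via flux mini batch
rather than an interactive alloc); the expected values are the ones the
test's exp1 file asserts:

    # Get a nested allocation with 2 GPUs; under the high policy,
    # Fluxion selects GPUs 2 and 3 of the 4-GPU node.
    flux mini alloc -n1 -N1 -c22 -g2

    # Inside the nested instance, load the Fluxion modules so that
    # ns-info can resolve the Id namespace, then query the remapping
    # of the self-discovered GPU Ids 0 and 1:
    flux module load sched-fluxion-resource load-allowlist=cluster,node,gpu,core
    flux module load sched-fluxion-qmanager
    flux ion-resource ns-info 0 gpu 0   # expect: 2
    flux ion-resource ns-info 0 gpu 1   # expect: 3
    echo ${CUDA_VISIBLE_DEVICES}        # expect: 2-3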