Add multi-GPU unit tests

nandnor93 · nandnor93 · commit 209ab792a3e7 · 2021-06-10T23:45:09.000+09:00
If multiple GPUs are detected, the tests are run on each device.
This can detect bugs related to cudaSetDevice().

Additionally, the test data size is enlarged, as CUDA memory errors
tend to stay latent with small data.
diff --git a/test/test_device.py b/test/test_device.py
@@ -16,53 +16,56 @@ def error(self, cuda_t, cpu_t):
     def test_fixed_point(self):
         for wl, fl in [(5, 4), (3, 2)]:
             for rounding in ["nearest"]:
-                t_max = 1 - (2 ** (-fl))
-                to_quantize_cuda = torch.linspace(
-                    -t_max, t_max, steps=20, device="cuda"
-                )
-                to_quantize_cpu = to_quantize_cuda.clone().to("cpu")
-                fixed_quantized_cuda = fixed_point_quantize(
-                    to_quantize_cuda, wl=wl, fl=fl, rounding=rounding
-                )
-                fixed_quantized_cpu = fixed_point_quantize(
-                    to_quantize_cpu, wl=wl, fl=fl, rounding=rounding
-                )
-                mse = self.error(fixed_quantized_cuda, fixed_quantized_cpu)
-                self.assertTrue(mse < 1e-15)
-                # self.assertTrue(torch.eq(fixed_quantized_cuda.cpu(), fixed_quantized_cpu).all().item())
-
-    def test_block_floating_point(self):
-        for wl in [5, 3]:
-            for rounding in ["nearest"]:
-                for dim in [-1, 0, 1]:
-                    t_max = 1 - (2 ** (-4))
+                for device in [("cuda:%d" % d) for d in range(torch.cuda.device_count())]:
+                    t_max = 1 - (2 ** (-fl))
                     to_quantize_cuda = torch.linspace(
-                        -t_max, t_max, steps=20, device="cuda"
+                        -t_max, t_max, steps=1200, device=torch.device(device)
                     )
                     to_quantize_cpu = to_quantize_cuda.clone().to("cpu")
-                    block_quantized_cuda = block_quantize(
-                        to_quantize_cuda, wl=wl, rounding=rounding
+                    fixed_quantized_cuda = fixed_point_quantize(
+                        to_quantize_cuda, wl=wl, fl=fl, rounding=rounding
                     )
-                    block_quantized_cpu = block_quantize(
-                        to_quantize_cpu, wl=wl, rounding=rounding
+                    fixed_quantized_cpu = fixed_point_quantize(
+                        to_quantize_cpu, wl=wl, fl=fl, rounding=rounding
                     )
-                    mse = self.error(block_quantized_cuda, block_quantized_cpu)
-                    self.assertTrue(mse < 1e-15)
-                    # self.assertTrue(torch.eq(block_quantized_cuda.cpu(), block_quantized_cpu).all().item())
+                    mse = self.error(fixed_quantized_cuda, fixed_quantized_cpu)
+                    self.assertTrue(mse < 1e-15, msg="%.2e MSE on device '%s'" % (mse, device))
+                    # self.assertTrue(torch.eq(fixed_quantized_cuda.cpu(), fixed_quantized_cpu).all().item())
+
+    def test_block_floating_point(self):
+        for wl in [5, 3]:
+            for rounding in ["nearest"]:
+                for dim in [-1, 0, 1]:
+                    for device in [("cuda:%d" % d) for d in range(torch.cuda.device_count())]:
+                        t_max = 1 - (2 ** (-4))
+                        to_quantize_cuda = torch.linspace(
+                            -t_max, t_max, steps=1200, device=torch.device(device)
+                        )
+                        to_quantize_cpu = to_quantize_cuda.clone().to("cpu")
+                        block_quantized_cuda = block_quantize(
+                            to_quantize_cuda, wl=wl, rounding=rounding
+                        )
+                        block_quantized_cpu = block_quantize(
+                            to_quantize_cpu, wl=wl, rounding=rounding
+                        )
+                        mse = self.error(block_quantized_cuda, block_quantized_cpu)
+                        self.assertTrue(mse < 1e-15, msg="%.2e MSE on device '%s'" % (mse, device))
+                        # self.assertTrue(torch.eq(block_quantized_cuda.cpu(), block_quantized_cpu).all().item())
 
     def test_floating_point(self):
         for man, exp in [(2, 5), (6, 9)]:
             for rounding in ["nearest"]:
-                to_quantize_cuda = torch.rand(20).cuda()
-                to_quantize_cpu = to_quantize_cuda.clone().to("cpu")
-                float_quantized_cuda = float_quantize(
-                    to_quantize_cuda, man=man, exp=exp, rounding=rounding
-                )
-                float_quantized_cpu = float_quantize(
-                    to_quantize_cpu, man=man, exp=exp, rounding=rounding
-                )
-                mse = self.error(float_quantized_cuda, float_quantized_cpu)
-                self.assertTrue(mse < 1e-15)
+                for device in [("cuda:%d" % d) for d in range(torch.cuda.device_count())]:
+                    to_quantize_cuda = torch.rand(1200).to(torch.device(device))
+                    to_quantize_cpu = to_quantize_cuda.clone().to("cpu")
+                    float_quantized_cuda = float_quantize(
+                        to_quantize_cuda, man=man, exp=exp, rounding=rounding
+                    )
+                    float_quantized_cpu = float_quantize(
+                        to_quantize_cpu, man=man, exp=exp, rounding=rounding
+                    )
+                    mse = self.error(float_quantized_cuda, float_quantized_cpu)
+                    self.assertTrue(mse < 1e-15, msg="%.2e MSE on device '%s'" % (mse, device))
 
 
 if __name__ == "__main__":