Commit cb240c3

Parallel test with pytest-xdist (pytorch#518)
* try pytest-xdist
* run multi_gpu tests separately
* each worker runs 1 file
* use loadscope to balance load better (for test_integration.py)
* change back to loadfile
* per-test schedule
* per-test sharding; avoid name collisions
1 parent f1c92f8 commit cb240c3

6 files changed: +14 -4 lines changed

.github/workflows/regression_test.yml

Lines changed: 2 additions & 1 deletion

@@ -68,4 +68,5 @@ jobs:
 pip install ${{ matrix.torch-spec }}
 pip install -r dev-requirements.txt
 pip install .
-pytest test --verbose -s
+pytest test --verbose -s -m "not multi_gpu" --dist load --tx popen//env:CUDA_VISIBLE_DEVICES=0 --tx popen//env:CUDA_VISIBLE_DEVICES=1 --tx popen//env:CUDA_VISIBLE_DEVICES=2 --tx popen//env:CUDA_VISIBLE_DEVICES=3
+pytest test --verbose -s -m "multi_gpu"
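
Note: the --tx popen//env:... specs above spawn one pytest-xdist worker per listed GPU, each with its own CUDA_VISIBLE_DEVICES, while --dist load hands tests out one at a time to idle workers; multi_gpu tests are excluded from this pass and run serially afterwards. A minimal sketch of a sanity check one could write under this setup (the test name and assertions are illustrative, not part of this commit):

import os

import torch


def test_worker_sees_single_gpu():
    # Each xdist worker above is spawned with a single GPU index in
    # CUDA_VISIBLE_DEVICES, so at most one device should be visible here.
    assert "," not in os.environ.get("CUDA_VISIBLE_DEVICES", "")
    if torch.cuda.is_available():
        assert torch.cuda.device_count() == 1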

dev-requirements.txt

Lines changed: 1 addition & 0 deletions

@@ -7,6 +7,7 @@ transformers
 hypothesis # Avoid test derandomization warning
 sentencepiece # for gpt-fast tokenizer
 expecttest
+pytest-xdist
 
 # For prototype features and benchmarks
 bitsandbytes #needed for testing triton quant / dequant ops for 8-bit optimizers

pytest.ini

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+[pytest]
+markers =
+    multi_gpu: marks tests as require multi GPUs (deselect with '-m "not multi_gpu"')
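
For context, a minimal sketch of how the new marker is consumed in a test file (the test body below is hypothetical; the decorator and the deselection flag mirror the marker definition above):

import pytest
import torch


@pytest.mark.multi_gpu
def test_requires_two_gpus():
    # Deselected in the parallel CI pass via `-m "not multi_gpu"` and run
    # in the separate serial pass via `-m "multi_gpu"`.
    if torch.cuda.device_count() < 2:
        pytest.skip("requires at least 2 GPUs")
    ...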

test/dtypes/test_nf4.py

Lines changed: 1 addition & 0 deletions

@@ -486,6 +486,7 @@ class TestQLoRA(FSDPTest):
     def world_size(self) -> int:
         return 2
 
+    @pytest.mark.multi_gpu
     @pytest.mark.skipif(
         version.parse(torch.__version__).base_version < "2.4.0",
         reason="torch >= 2.4 required",

test/integration/test_integration.py

Lines changed: 6 additions & 3 deletions

@@ -985,7 +985,10 @@ def forward(self, x):
         # save quantized state_dict
         api(model)
 
-        torch.save(model.state_dict(), "test.pth")
+        # unique filename to avoid collision in parallel tests
+        ckpt_name = f"{api.__name__}_{test_device}_{test_dtype}_test.pth"
+
+        torch.save(model.state_dict(), ckpt_name)
         # get quantized reference
         model_qc = torch.compile(model, mode="max-autotune")
         ref_q = model_qc(x).detach()
@@ -998,8 +1001,8 @@ def forward(self, x):
         api(model)
 
         # load quantized state_dict
-        state_dict = torch.load("test.pth", mmap=True)
-        os.remove("test.pth")
+        state_dict = torch.load(ckpt_name, mmap=True)
+        os.remove(ckpt_name)
 
         model.load_state_dict(state_dict, assign=True)
         model = model.to(device=test_device, dtype=test_dtype).eval()
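
The idea behind this change, as a standalone sketch (the helper name below is illustrative, not from the repo): deriving the checkpoint path from the test's own parameters means two xdist workers running different parametrizations never touch the same file.

import os

import torch


def _state_dict_roundtrip(model, api_name, device, dtype):
    # Filename is unique per (api, device, dtype) parametrization, so
    # concurrent xdist workers cannot clobber each other's checkpoint.
    ckpt_name = f"{api_name}_{device}_{dtype}_test.pth"
    torch.save(model.state_dict(), ckpt_name)
    try:
        return torch.load(ckpt_name, mmap=True)
    finally:
        os.remove(ckpt_name)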

test/prototype/test_low_bit_optim.py

Lines changed: 1 addition & 0 deletions

@@ -163,6 +163,7 @@ class TestFSDP2(FSDPTest):
     def world_size(self) -> int:
         return 2
 
+    @pytest.mark.multi_gpu
     @pytest.mark.skipif(not TORCH_VERSION_AFTER_2_4, reason="torch >= 2.4 required")
     @skip_if_lt_x_gpu(2)
     def test_fsdp2(self):
