Skip to content

Commit

Permalink
split out multi gpu tests from single gpu (NVIDIA-Merlin#999)
Browse files Browse the repository at this point in the history
* run only tests needing multiple GPUs on the 2GPU runners

* add config files

* remove multigpu marker

* eof fixes

* fix indenting

---------

Co-authored-by: Karl Higley <karlb@nvidia.com>
  • Loading branch information
nv-alaiacano and karlhigley authored Jun 2, 2023
1 parent 97ead97 commit 725b031
Show file tree
Hide file tree
Showing 9 changed files with 125 additions and 70 deletions.
4 changes: 4 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[flake8]
per-file-ignores =
# E501: line too long
tests/unit/examples/test_scaling_criteo_merlin_models_hugectr.py: E501,
18 changes: 17 additions & 1 deletion .github/workflows/gpu-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ on:
jobs:
gpu-ci:
runs-on: 1GPU

steps:
- uses: actions/checkout@v3
with:
Expand All @@ -28,3 +27,20 @@ jobs:
branch=${raw/origin\/}
fi
cd ${{ github.workspace }}; tox -e test-gpu -- $branch
# Multi-GPU job: runs on a `2GPU` runner and invokes the `test-gpu-multigpu` tox environment.
gpu-ci-multigpu:
runs-on: 2GPU
steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0
# The branch argument passed to tox defaults to `main`. When the ref type starts
# with "tag", it is instead taken from the remote branch listing that contains
# the tag, with the first `origin/` removed.
- name: Run tests
run: |
ref_type=${{ github.ref_type }}
branch=main
if [[ $ref_type == "tag"* ]]
then
raw=$(git branch -r --contains ${{ github.ref_name }})
branch=${raw/origin\/}
fi
cd ${{ github.workspace }}; tox -e test-gpu-multigpu -- $branch
4 changes: 4 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[pytest]
markers =
multigpu: Tests that run only in multi-GPU environments
singlegpu: Optional marker for tests that should also run in single-GPU environments. Usually combined with multigpu on tests that run in both single- and multi-GPU environments.
48 changes: 26 additions & 22 deletions tests/unit/examples/test_getting_started_hugectr.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
import os

from testbook import testbook
import pandas as pd
import os
import shutil
import numpy as np

import numpy as np
import pandas as pd
import pytest
from merlin.systems.triton.utils import run_triton_server
from testbook import testbook

from tests.conftest import REPO_ROOT
import pytest

pytest.importorskip("hugectr")
# flake8: noqa
Expand All @@ -18,8 +17,7 @@ def test_func():
INPUT_DATA_DIR = "/tmp/input/getting_started/"
MODEL_DIR = os.path.join(INPUT_DATA_DIR, "model/movielens_hugectr")
with testbook(
REPO_ROOT
/ "examples/getting-started-movielens/01-Download-Convert.ipynb",
REPO_ROOT / "examples/getting-started-movielens/01-Download-Convert.ipynb",
execute=False,
) as tb1:
tb1.cells.pop(7)
Expand All @@ -33,24 +31,27 @@ def test_func():
shutil.rmtree(INPUT_DATA_DIR)
os.makedirs(f"{INPUT_DATA_DIR}ml-25m", exist_ok=True)
pd.DataFrame(
data={'movieId': list(range(56632)), 'genres': ['abcdefghijkl'[i] for i in np.random.randint(0, 12, 56632)] ,'title': ['_'] * 56632}
).to_csv(f'{INPUT_DATA_DIR}ml-25m/movies.csv', index=False)
data={
"movieId": list(range(56632)),
"genres": ["abcdefghijkl"[i] for i in np.random.randint(0, 12, 56632)],
"title": ["_"] * 56632,
}
).to_csv(f"{INPUT_DATA_DIR}ml-25m/movies.csv", index=False)
pd.DataFrame(
data={
'userId': np.random.randint(0, 162542, 1_000_000),
'movieId': np.random.randint(0, 56632, 1_000_000),
'rating': np.random.rand(1_000_000) * 5,
'timestamp': ['_'] * 1_000_000
}
).to_csv(f'{INPUT_DATA_DIR}ml-25m/ratings.csv', index=False)
"userId": np.random.randint(0, 162542, 1_000_000),
"movieId": np.random.randint(0, 56632, 1_000_000),
"rating": np.random.rand(1_000_000) * 5,
"timestamp": ["_"] * 1_000_000,
}
).to_csv(f"{INPUT_DATA_DIR}ml-25m/ratings.csv", index=False)
tb1.execute()
assert os.path.isfile("/tmp/input/getting_started/movies_converted.parquet")
assert os.path.isfile("/tmp/input/getting_started/train.parquet")
assert os.path.isfile("/tmp/input/getting_started/valid.parquet")

with testbook(
REPO_ROOT
/ "examples/getting-started-movielens/02-ETL-with-NVTabular.ipynb",
REPO_ROOT / "examples/getting-started-movielens/02-ETL-with-NVTabular.ipynb",
execute=False,
) as tb2:
tb2.inject(
Expand All @@ -65,8 +66,7 @@ def test_func():
assert os.path.isdir("/tmp/input/getting_started/workflow")

with testbook(
REPO_ROOT
/ "examples/getting-started-movielens/03-Training-with-HugeCTR.ipynb",
REPO_ROOT / "examples/getting-started-movielens/03-Training-with-HugeCTR.ipynb",
execute=False,
) as tb3:
tb3.inject(
Expand All @@ -77,7 +77,7 @@ def test_func():
)
tb3.execute_cell(list(range(0, 21)))
os.environ["INPUT_DATA_DIR"] = INPUT_DATA_DIR
os.system('python train_hugeCTR.py')
os.system("python train_hugeCTR.py")
tb3.execute_cell(list(range(21, len(tb3.cells))))

with testbook(
Expand All @@ -92,5 +92,9 @@ def test_func():
"""
)
tb4.execute_cell(list(range(0, 13)))
with run_triton_server(os.path.join(INPUT_DATA_DIR, "model"), grpc_port=8001, backend_config=f'hugectr,ps={MODEL_DIR}/ps.json'):
with run_triton_server(
os.path.join(INPUT_DATA_DIR, "model"),
grpc_port=8001,
backend_config=f"hugectr,ps={MODEL_DIR}/ps.json",
):
tb4.execute_cell(list(range(13, len(tb4.cells))))
39 changes: 19 additions & 20 deletions tests/unit/examples/test_getting_started_pytorch.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
import os

from testbook import testbook
import pandas as pd
import os
import shutil

import numpy as np
import pandas as pd
import pytest
from testbook import testbook

from merlin.systems.triton.utils import run_triton_server
from tests.conftest import REPO_ROOT
import pytest

pytest.importorskip("torch")
# flake8: noqa
Expand All @@ -17,8 +15,7 @@
def test_func():
INPUT_DATA_DIR = "/tmp/input/getting_started/"
with testbook(
REPO_ROOT
/ "examples/getting-started-movielens/01-Download-Convert.ipynb",
REPO_ROOT / "examples/getting-started-movielens/01-Download-Convert.ipynb",
execute=False,
) as tb1:
tb1.cells.pop(7)
Expand All @@ -32,24 +29,27 @@ def test_func():
shutil.rmtree(INPUT_DATA_DIR)
os.makedirs(f"{INPUT_DATA_DIR}ml-25m", exist_ok=True)
pd.DataFrame(
data={'movieId': list(range(56632)), 'genres': ['abcdefghijkl'[i] for i in np.random.randint(0, 12, 56632)] ,'title': ['_'] * 56632}
).to_csv(f'{INPUT_DATA_DIR}ml-25m/movies.csv', index=False)
data={
"movieId": list(range(56632)),
"genres": ["abcdefghijkl"[i] for i in np.random.randint(0, 12, 56632)],
"title": ["_"] * 56632,
}
).to_csv(f"{INPUT_DATA_DIR}ml-25m/movies.csv", index=False)
pd.DataFrame(
data={
'userId': np.random.randint(0, 162542, 1_000_000),
'movieId': np.random.randint(0, 56632, 1_000_000),
'rating': np.random.rand(1_000_000) * 5,
'timestamp': ['_'] * 1_000_000
}
).to_csv(f'{INPUT_DATA_DIR}ml-25m/ratings.csv', index=False)
"userId": np.random.randint(0, 162542, 1_000_000),
"movieId": np.random.randint(0, 56632, 1_000_000),
"rating": np.random.rand(1_000_000) * 5,
"timestamp": ["_"] * 1_000_000,
}
).to_csv(f"{INPUT_DATA_DIR}ml-25m/ratings.csv", index=False)
tb1.execute()
assert os.path.isfile("/tmp/input/getting_started/movies_converted.parquet")
assert os.path.isfile("/tmp/input/getting_started/train.parquet")
assert os.path.isfile("/tmp/input/getting_started/valid.parquet")

with testbook(
REPO_ROOT
/ "examples/getting-started-movielens/02-ETL-with-NVTabular.ipynb",
REPO_ROOT / "examples/getting-started-movielens/02-ETL-with-NVTabular.ipynb",
execute=False,
) as tb2:
tb2.inject(
Expand All @@ -64,8 +64,7 @@ def test_func():
assert os.path.isdir("/tmp/input/getting_started/workflow")

with testbook(
REPO_ROOT
/ "examples/getting-started-movielens/03-Training-with-PyTorch.ipynb",
REPO_ROOT / "examples/getting-started-movielens/03-Training-with-PyTorch.ipynb",
execute=False,
) as tb3:
tb3.inject(
Expand Down
47 changes: 25 additions & 22 deletions tests/unit/examples/test_getting_started_tensorflow.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,22 @@
import os

from testbook import testbook
import pandas as pd
import os
import shutil
import numpy as np

import numpy as np
import pandas as pd
import pytest
from merlin.systems.triton.utils import run_triton_server
from testbook import testbook

from tests.conftest import REPO_ROOT
import pytest

pytest.importorskip("tensorflow")
# flake8: noqa


def test_func():
INPUT_DATA_DIR = "/tmp/input/getting_started/"
MODEL_DIR = os.path.join(INPUT_DATA_DIR, "model")
with testbook(
REPO_ROOT
/ "examples/getting-started-movielens/01-Download-Convert.ipynb",
REPO_ROOT / "examples/getting-started-movielens/01-Download-Convert.ipynb",
execute=False,
) as tb1:
tb1.cells.pop(7)
Expand All @@ -33,24 +30,27 @@ def test_func():
shutil.rmtree(INPUT_DATA_DIR)
os.makedirs(f"{INPUT_DATA_DIR}ml-25m", exist_ok=True)
pd.DataFrame(
data={'movieId': list(range(56632)), 'genres': ['abcdefghijkl'[i] for i in np.random.randint(0, 12, 56632)] ,'title': ['_'] * 56632}
).to_csv(f'{INPUT_DATA_DIR}ml-25m/movies.csv', index=False)
data={
"movieId": list(range(56632)),
"genres": ["abcdefghijkl"[i] for i in np.random.randint(0, 12, 56632)],
"title": ["_"] * 56632,
}
).to_csv(f"{INPUT_DATA_DIR}ml-25m/movies.csv", index=False)
pd.DataFrame(
data={
'userId': np.random.randint(0, 162542, 1_000_000),
'movieId': np.random.randint(0, 56632, 1_000_000),
'rating': np.random.rand(1_000_000) * 5,
'timestamp': ['_'] * 1_000_000
}
).to_csv(f'{INPUT_DATA_DIR}ml-25m/ratings.csv', index=False)
"userId": np.random.randint(0, 162542, 1_000_000),
"movieId": np.random.randint(0, 56632, 1_000_000),
"rating": np.random.rand(1_000_000) * 5,
"timestamp": ["_"] * 1_000_000,
}
).to_csv(f"{INPUT_DATA_DIR}ml-25m/ratings.csv", index=False)
tb1.execute()
assert os.path.isfile("/tmp/input/getting_started/movies_converted.parquet")
assert os.path.isfile("/tmp/input/getting_started/train.parquet")
assert os.path.isfile("/tmp/input/getting_started/valid.parquet")

with testbook(
REPO_ROOT
/ "examples/getting-started-movielens/02-ETL-with-NVTabular.ipynb",
REPO_ROOT / "examples/getting-started-movielens/02-ETL-with-NVTabular.ipynb",
execute=False,
) as tb2:
tb2.inject(
Expand All @@ -65,8 +65,7 @@ def test_func():
assert os.path.isdir("/tmp/input/getting_started/workflow")

with testbook(
REPO_ROOT
/ "examples/getting-started-movielens/03-Training-with-TF.ipynb",
REPO_ROOT / "examples/getting-started-movielens/03-Training-with-TF.ipynb",
execute=False,
) as tb3:
tb3.inject(
Expand All @@ -89,5 +88,9 @@ def test_func():
os.environ["INPUT_DATA_DIR"] = "{INPUT_DATA_DIR}"
"""
)
with run_triton_server(os.path.join(MODEL_DIR, "ensemble"), grpc_port=8001, backend_config=f'tensorflow,version=2'):
with run_triton_server(
os.path.join(MODEL_DIR, "ensemble"),
grpc_port=8001,
backend_config="tensorflow,version=2",
):
tb4.execute()
2 changes: 2 additions & 0 deletions tests/unit/examples/test_scaling_criteo_merlin_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@

import pytest
from testbook import testbook

from tests.conftest import REPO_ROOT

pytest.importorskip("tensorflow")


@pytest.mark.multigpu
def test_func():
with testbook(
REPO_ROOT / "examples" / "scaling-criteo" / "02-ETL-with-NVTabular.ipynb",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
# import os

import pytest

# from testbook import testbook
# from tests.conftest import REPO_ROOT

pytest.importorskip("hugectr")


# @pytest.mark.multigpu
# def test_test_scaling_criteo_merlin_models_hugectr():
# with testbook(
# REPO_ROOT / "examples" / "scaling-criteo" / "02-ETL-with-NVTabular.ipynb",
Expand All @@ -20,7 +21,7 @@
# os.environ["INPUT_DATA_DIR"] = "/tmp/test_merlin_criteo_hugectr/input/criteo/"
# os.environ["OUTPUT_DATA_DIR"] = "/tmp/test_merlin_criteo_hugectr/output/criteo/"
# os.environ["USE_HUGECTR"] = "True"

# os.system("mkdir -p /tmp/test_merlin_criteo_hugectr/input/criteo")
# os.system("mkdir -p /tmp/test_merlin_criteo_hugectr/output/criteo/")

Expand Down Expand Up @@ -76,15 +77,15 @@
# import shutil
# from merlin.systems.triton.utils import run_triton_server, send_triton_request
# outputs = ["OUTPUT0"]

# with run_triton_server(
# "/tmp/test_merlin_criteo_hugectr/output/criteo/model_inference/",
# backend_config='hugectr,ps=/tmp/test_merlin_criteo_hugectr/output/criteo/model_inference/ps.json'
# ) as client:
# response = send_triton_request(
# input_schema, batch.fillna(0), outputs, client=client, triton_model="criteo_ens"
# )

# response = response["OUTPUT0"]
# """
# )
Expand Down
24 changes: 23 additions & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,29 @@ commands =
python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/core.git
python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/dataloader.git

python -m pytest --cov-report term --cov merlin -rxs tests/unit
python -m pytest -m "not multigpu" --cov-report term --cov merlin -rxs tests/unit
python -m pytest -m "singlegpu" --cov-report term --cov merlin -rxs tests/unit

[testenv:test-gpu-multigpu]
passenv =
OPAL_PREFIX
setenv =
TF_GPU_ALLOCATOR=cuda_malloc_async
sitepackages=true
; Runs in: multi-GPU GitHub Actions runners
; Runs only the GPU-based tests marked with `pytest.mark.multigpu`
deps =
pytest
pytest-cov
commands =
python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/systems.git
python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/models.git
python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/NVTabular.git
python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/core.git
python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/dataloader.git

python -m pytest -m "multigpu" --cov-report term --cov merlin -rxs tests/unit


[testenv:docs]
; Runs in: Github Actions
Expand Down

0 comments on commit 725b031

Please sign in to comment.