ci/tests: cleaning standalone script #19141

Merged · 20 commits · Dec 13, 2023
18 changes: 8 additions & 10 deletions .azure/gpu-tests-fabric.yml
@@ -49,6 +49,7 @@ jobs:
DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
FREEZE_REQUIREMENTS: "1"
PIP_CACHE_DIR: "/var/tmp/pip"
PL_RUN_CUDA_TESTS: "1"
container:
image: $(image)
# default shm size is 64m. Increase it to avoid:
@@ -126,19 +127,16 @@ jobs:
condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'fabric'))
displayName: "Adjust tests & examples"

- bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest -v --durations=50
workingDirectory: tests/tests_fabric
env:
PL_RUN_CUDA_TESTS: "1"
- bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest . -v --durations=50
workingDirectory: tests/tests_fabric/
displayName: "Testing: fabric standard"
timeoutInMinutes: "10"

- bash: bash run_standalone_tests.sh
workingDirectory: tests/tests_fabric
- bash: bash ../run_standalone_tests.sh "."
workingDirectory: tests/tests_fabric/
env:
PL_RUN_CUDA_TESTS: "1"
PL_STANDALONE_TESTS_SOURCE: $(COVERAGE_SOURCE)
displayName: "Testing: fabric standalone tests"
displayName: "Testing: fabric standalone"
timeoutInMinutes: "10"

- bash: |
@@ -152,12 +150,12 @@ jobs:
./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
--flags=gpu,pytest,${COVERAGE_SOURCE} --name="GPU-coverage" --env=linux,azure
ls -l
workingDirectory: tests/tests_fabric
workingDirectory: tests/tests_fabric/
displayName: "Statistics"

- script: |
set -e
bash run_fabric_examples.sh --accelerator=cuda --devices=1
bash run_fabric_examples.sh --accelerator=cuda --devices=2 --strategy ddp
workingDirectory: examples
workingDirectory: examples/
displayName: "Testing: fabric examples"
7 changes: 2 additions & 5 deletions .azure/gpu-tests-pytorch.yml
@@ -59,6 +59,7 @@ jobs:
DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
FREEZE_REQUIREMENTS: "1"
PIP_CACHE_DIR: "/var/tmp/pip"
PL_RUN_CUDA_TESTS: "1"
container:
image: $(image)
# default shm size is 64m. Increase it to avoid:
@@ -154,16 +155,13 @@ jobs:

- bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest -v --durations=50
workingDirectory: tests/tests_pytorch
env:
PL_RUN_CUDA_TESTS: "1"
displayName: "Testing: PyTorch standard"
timeoutInMinutes: "35"

- bash: bash run_standalone_tests.sh
- bash: bash ../run_standalone_tests.sh "."
workingDirectory: tests/tests_pytorch
env:
PL_USE_MOCKED_MNIST: "1"
PL_RUN_CUDA_TESTS: "1"
PL_STANDALONE_TESTS_SOURCE: $(COVERAGE_SOURCE)
displayName: "Testing: PyTorch standalone tests"
timeoutInMinutes: "35"
@@ -172,7 +170,6 @@
workingDirectory: tests/tests_pytorch
env:
PL_USE_MOCKED_MNIST: "1"
PL_RUN_CUDA_TESTS: "1"
displayName: "Testing: PyTorch standalone tasks"
timeoutInMinutes: "10"

3 changes: 2 additions & 1 deletion pyproject.toml
@@ -269,7 +269,8 @@ addopts = [
"--ignore=legacy/checkpoints",
]
markers = [
"cloud:Run the cloud tests for example",
"cloud: Run the cloud tests for example",
"standalone: Run the test in single pytest execution",
]
filterwarnings = [
"error::FutureWarning",
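
Note: the newly registered "standalone" marker is what the cleaned-up CI flow keys on. In the test suite it is normally set through the repo's RunIf helper rather than a bare pytest.mark, as the profiler test further down illustrates. A minimal sketch of how a test opts in; the import path is an assumption, not taken from this diff:

# Hedged sketch: how a test typically opts into the new "standalone" marker.
# Assumption: RunIf is provided by the test helpers at this path and expands
# to a pytest skipif marker that carries its kwargs.
from tests_pytorch.helpers.runif import RunIf  # assumed import path


@RunIf(min_cuda_gpus=2, standalone=True)
def test_runs_in_its_own_process():
    # Skipped in a regular pytest run; executed by run_standalone_tests.sh,
    # which launches one pytest process per collected parametrization.
    assert True
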
4 changes: 2 additions & 2 deletions src/lightning/pytorch/utilities/testing/_runif.py
@@ -16,7 +16,7 @@
from lightning_utilities.core.imports import RequirementCache

from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0
from lightning.fabric.utilities.testing import _runif_reasons as FabricRunIf
from lightning.fabric.utilities.testing import _runif_reasons as fabric_run_if
from lightning.pytorch.accelerators.cpu import _PSUTIL_AVAILABLE
from lightning.pytorch.callbacks.progress.rich_progress import _RICH_AVAILABLE
from lightning.pytorch.core.module import _ONNX_AVAILABLE
@@ -68,7 +68,7 @@ def _runif_reasons(

"""

reasons, kwargs = FabricRunIf(
reasons, kwargs = fabric_run_if(
min_cuda_gpus=min_cuda_gpus,
min_torch=min_torch,
max_torch=max_torch,
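
For context on why the conftest changes below can filter on marker kwargs: _runif_reasons returns the skip reasons together with the raw kwargs, and the RunIf decorator attaches both to a skipif marker. A rough sketch of that wiring, inferred from the conftest checks rather than shown in this diff:

# Hedged sketch of how (reasons, kwargs) from _runif_reasons is typically consumed.
# Assumes _runif_reasons from this module is in scope.
import pytest


def RunIf(**conditions):
    reasons, marker_kwargs = _runif_reasons(**conditions)
    return pytest.mark.skipif(
        condition=len(reasons) > 0,
        reason=f"Requires: [{' + '.join(reasons)}]",
        **marker_kwargs,  # e.g. standalone=True, inspected by pytest_collection_modifyitems
    )
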
tests/run_standalone_tests.sh
@@ -23,25 +23,19 @@ source="${PL_STANDALONE_TESTS_SOURCE:-"lightning"}"
# this environment variable allows special tests to run
export PL_RUN_STANDALONE_TESTS=1
# python arguments
defaults="-m coverage run --source ${source} --append -m pytest --no-header -v -s --timeout 120"
defaults=" -m coverage run --source ${source} --append -m pytest --no-header -v -s --timeout 120 "
echo "Using defaults: ${defaults}"

# find tests marked as `@RunIf(standalone=True)`. done manually instead of with pytest because it is faster
grep_output=$(grep --recursive --word-regexp . --regexp 'standalone=True' --include '*.py')
# get the testing location as the first argument
test_path=$1
printf "source path: $test_path\n"

# file paths, remove duplicates
files=$(echo "$grep_output" | cut -f1 -d: | sort | uniq)

# get the list of parametrizations. we need to call them separately. the last two lines are removed.
# note: if there's a syntax error, this will fail with some garbled output
if [[ "$OSTYPE" == "darwin"* ]]; then
parametrizations=$(python3 -m pytest $files --collect-only --quiet "$@" | tail -r | sed -e '1,3d' | tail -r)
else
parametrizations=$(python3 -m pytest $files --collect-only --quiet "$@" | head -n -2)
fi
# remove the "tests/tests_pytorch/" path suffixes
path_suffix=$(basename "$(dirname "$(pwd)")")/$(basename "$(pwd)")"/" # https://stackoverflow.com/a/8223345
parametrizations=${parametrizations//$path_suffix/}
# collect all tests with parametrization based filtering with PL_RUN_STANDALONE_TESTS
standalone_tests=$(python -m pytest $test_path -q --collect-only --pythonwarnings ignore)
printf "Collected tests: \n $standalone_tests"
# match only lines with tests
parametrizations=$(grep -oP '\S+::test_\S+' <<< "$standalone_tests")
# convert the list to be array
parametrizations_arr=($parametrizations)

report=''
@@ -61,30 +55,25 @@ function show_batched_output {
}
trap show_batched_output EXIT # show the output on exit

# remove the "tests/tests_pytorch/" path suffixes
path_prefix=$(basename "$(dirname "$(pwd)")")/$(basename "$(pwd)")"/" # https://stackoverflow.com/a/8223345
for i in "${!parametrizations_arr[@]}"; do
parametrization=${parametrizations_arr[$i]}
parametrization=${parametrizations_arr[$i]//$path_prefix/}
prefix="$((i+1))/${#parametrizations_arr[@]}"

# check blocklist
if [[ "${parametrization}" == *"test_pytorch_profiler_nested_emit_nvtx"* ]]; then
echo "$prefix: Skipping $parametrization"
report+="Skipped\t$parametrization\n"
# do not continue the loop because we might need to wait for batched jobs
else
echo "$prefix: Running $parametrization"
echo "$prefix: Running $parametrization"

# fix the port to avoid race condition when batched distributed tests select the port randomly
export MASTER_PORT=$((29500 + $i % $test_batch_size))
# fix the port to avoid race condition when batched distributed tests select the port randomly
export MASTER_PORT=$((29500 + $i % $test_batch_size))

# execute the test in the background
# redirect to a log file that buffers test output. since the tests will run in the background, we cannot let them
# output to std{out,err} because the outputs would be garbled together
python3 ${defaults} "$parametrization" &>> standalone_test_output.txt &
# save the PID in an array
pids[${i}]=$!
# add row to the final report
report+="Ran\t$parametrization\n"
fi
# execute the test in the background
# redirect to a log file that buffers test output. since the tests will run in the background, we cannot let them
# output to std{out,err} because the outputs would be garbled together
python ${defaults} "$parametrization" &>> standalone_test_output.txt &
# save the PID in an array
pids[${i}]=$!
# add row to the final report
report+="Ran\t$parametrization\n"

if ((($i + 1) % $test_batch_size == 0)); then
# wait for running tests
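
The rewritten script no longer greps the source tree for "standalone=True"; it asks pytest itself which standalone tests exist (the conftest filtering below keeps only those when PL_RUN_STANDALONE_TESTS=1) and then pattern-matches the node ids. A rough Python equivalent of that collection step, for illustration only; the script itself stays in bash:

# Hedged sketch: what the new collection pipeline in run_standalone_tests.sh does.
# `pytest -q --collect-only` prints one node id per line, e.g.
#   strategies/test_ddp.py::test_multi_gpu[params0]
# and the script keeps only tokens of the form "<path>::test_<name>".
import os
import re
import subprocess


def collect_standalone_tests(test_path: str = ".") -> list[str]:
    env = {**os.environ, "PL_RUN_STANDALONE_TESTS": "1"}  # conftest then drops non-standalone tests
    out = subprocess.run(
        ["python", "-m", "pytest", test_path, "-q", "--collect-only", "--pythonwarnings", "ignore"],
        capture_output=True, text=True, env=env, check=False,
    ).stdout
    # mirrors: grep -oP '\S+::test_\S+'
    return re.findall(r"\S+::test_\S+", out)
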
33 changes: 17 additions & 16 deletions tests/tests_fabric/conftest.py
@@ -192,22 +192,23 @@ def pytest_collection_modifyitems(items: List[pytest.Function], config: pytest.C

for kwarg, env_var in options.items():
# this will compute the intersection of all tests selected per environment variable
if os.getenv(env_var, "0") == "1":
conditions.append(env_var)
for i, test in reversed(list(enumerate(items))): # loop in reverse, since we are going to pop items
already_skipped = any(marker.name == "skip" for marker in test.own_markers)
if already_skipped:
# the test was going to be skipped anyway, filter it out
items.pop(i)
skipped += 1
continue
has_runif_with_kwarg = any(
marker.name == "skipif" and marker.kwargs.get(kwarg) for marker in test.own_markers
)
if not has_runif_with_kwarg:
# the test has `@RunIf(kwarg=True)`, filter it out
items.pop(i)
filtered += 1
if os.getenv(env_var, "0") != "1":
continue
conditions.append(env_var)
for i, test in reversed(list(enumerate(items))): # loop in reverse, since we are going to pop items
already_skipped = any(marker.name == "skip" for marker in test.own_markers)
if already_skipped:
# the test was going to be skipped anyway, filter it out
items.pop(i)
skipped += 1
continue
has_runif_with_kwarg = any(
marker.name == "skipif" and marker.kwargs.get(kwarg) for marker in test.own_markers
)
if not has_runif_with_kwarg:
# the test has `@RunIf(kwarg=True)`, filter it out
items.pop(i)
filtered += 1

if config.option.verbose >= 0 and (filtered or skipped):
writer = config.get_terminal_writer()
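
The options mapping that drives this loop sits above the visible hunk; roughly, it maps RunIf kwargs to the environment variables the CI jobs now export at the job level (e.g. PL_RUN_CUDA_TESTS in the .azure files above). A sketch of the idea, with the exact entries treated as assumptions:

# Hedged sketch: the kind of mapping pytest_collection_modifyitems iterates over.
# The real dict lives above the visible hunk; these entries are assumptions.
options = {
    "min_cuda_gpus": "PL_RUN_CUDA_TESTS",
    "standalone": "PL_RUN_STANDALONE_TESTS",
    "tpu": "PL_RUN_TPU_TESTS",
}
# With PL_RUN_CUDA_TESTS=1 exported for the whole Azure job, only tests whose
# skipif marker carries the matching kwarg survive collection; the rest are popped.
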
1 change: 0 additions & 1 deletion tests/tests_fabric/run_standalone_tests.sh

This file was deleted.

6 changes: 3 additions & 3 deletions tests/tests_fabric/run_tpu_tests.sh
@@ -30,12 +30,12 @@ python3 -c "from lightning.fabric.accelerators.xla import _XLA_AVAILABLE; print(
python3 -c "from lightning.fabric.accelerators import XLAAccelerator; assert XLAAccelerator.is_available()"
echo "Sanity check passed!"

echo "--- Running Fabric tests ---"
echo "--- Running Regular tests ---"
cd tests/tests_fabric
PL_RUN_TPU_TESTS=1 python3 -m coverage run --source=lightning -m pytest -vv --durations=0 --timeout 60 ./

echo "--- Running standalone Fabric tests ---"
PL_RUN_TPU_TESTS=1 PL_STANDALONE_TESTS_BATCH_SIZE=1 bash run_standalone_tests.sh
echo "--- Running Standalone tests ---"
PL_RUN_TPU_TESTS=1 PL_STANDALONE_TESTS_BATCH_SIZE=1 bash ../run_standalone_tests.sh "."

echo "--- Generating coverage ---"
python3 -m coverage xml
36 changes: 18 additions & 18 deletions tests/tests_pytorch/conftest.py
@@ -159,8 +159,7 @@ def thread_police_duuu_daaa_duuu_daaa():
continue
elif thread.name == "fsspecIO":
continue
else:
raise AssertionError(f"Test left zombie thread: {thread}")
raise AssertionError(f"Test left zombie thread: {thread}")


def mock_cuda_count(monkeypatch, n: int) -> None:
@@ -318,22 +317,23 @@ def pytest_collection_modifyitems(items: List[pytest.Function], config: pytest.C

for kwarg, env_var in options.items():
# this will compute the intersection of all tests selected per environment variable
if os.getenv(env_var, "0") == "1":
conditions.append(env_var)
for i, test in reversed(list(enumerate(items))): # loop in reverse, since we are going to pop items
already_skipped = any(marker.name == "skip" for marker in test.own_markers)
if already_skipped:
# the test was going to be skipped anyway, filter it out
items.pop(i)
skipped += 1
continue
has_runif_with_kwarg = any(
marker.name == "skipif" and marker.kwargs.get(kwarg) for marker in test.own_markers
)
if not has_runif_with_kwarg:
# the test has `@RunIf(kwarg=True)`, filter it out
items.pop(i)
filtered += 1
if os.getenv(env_var, "0") != "1":
continue
conditions.append(env_var)
for i, test in reversed(list(enumerate(items))): # loop in reverse, since we are going to pop items
already_skipped = any(marker.name == "skip" for marker in test.own_markers)
if already_skipped:
# the test was going to be skipped anyway, filter it out
items.pop(i)
skipped += 1
continue
has_runif_with_kwarg = any(
marker.name == "skipif" and marker.kwargs.get(kwarg) for marker in test.own_markers
)
if not has_runif_with_kwarg:
# the test has `@RunIf(kwarg=True)`, filter it out
items.pop(i)
filtered += 1

if config.option.verbose >= 0 and (filtered or skipped):
writer = config.get_terminal_writer()
29 changes: 15 additions & 14 deletions tests/tests_pytorch/profilers/test_profiler.py
@@ -471,20 +471,21 @@ def look_for_trace(trace_dir):
assert look_for_trace(tmpdir / "lightning_logs" / "version_0")


@RunIf(min_cuda_gpus=1, standalone=True)
def test_pytorch_profiler_nested_emit_nvtx():
"""This test check emit_nvtx is correctly supported."""
profiler = PyTorchProfiler(use_cuda=True, emit_nvtx=True)
model = BoringModel()
trainer = Trainer(
fast_dev_run=True,
profiler=profiler,
accelerator="gpu",
devices=1,
enable_progress_bar=False,
enable_model_summary=False,
)
trainer.fit(model)
# @RunIf(min_cuda_gpus=1, standalone=True)
# @pytest.mark.skipif(torch.cuda.get_device_capability()[0] >= 8)
# def test_pytorch_profiler_nested_emit_nvtx():
# """This test check emit_nvtx is correctly supported."""
# profiler = PyTorchProfiler(use_cuda=True, emit_nvtx=True)
# model = BoringModel()
# trainer = Trainer(
# fast_dev_run=True,
# profiler=profiler,
# accelerator="gpu",
# devices=1,
# enable_progress_bar=False,
# enable_model_summary=False,
# )
# trainer.fit(model)


def test_register_record_function(tmpdir):
15 changes: 10 additions & 5 deletions tests/tests_pytorch/run_standalone_tasks.sh
@@ -18,11 +18,16 @@ set -e
# this environment variable allows special tests to run
export PL_RUN_STANDALONE_TESTS=1

can_run_nvprof=$(python -c "import torch; print(torch.cuda.is_available() and torch.cuda.get_device_capability()[0] < 8)")
if [[ $can_run_nvprof == "True" ]]; then
echo "Running profilers/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx"
nvprof --profile-from-start off -o trace_name.prof -- python -m coverage run --source lightning.pytorch --append -m pytest --no-header profilers/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx
fi
#can_run_nvprof=$(python -c "import torch; print(torch.cuda.is_available() and torch.cuda.get_device_capability()[0] < 8)")
Borda (author): note, this was not run in any of our CI.

#if [[ $can_run_nvprof == "True" ]]; then
# echo "Running profilers/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx"
# nvprof --profile-from-start off \
# -o trace_name.prof \
# -- python -m coverage run \
# --source lightning.pytorch \
# --append -m pytest \
# --no-header profilers/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx
#fi

# test that a user can manually launch individual processes
echo "Running manual ddp launch test"
2 changes: 1 addition & 1 deletion tests/tests_pytorch/run_tpu_tests.sh
@@ -34,7 +34,7 @@ cd tests/tests_pytorch
PL_RUN_TPU_TESTS=1 python3 -m coverage run --source=lightning -m pytest -vv --durations=0 --timeout 60 ./

echo "--- Running standalone PL tests ---"
PL_RUN_TPU_TESTS=1 PL_STANDALONE_TESTS_BATCH_SIZE=1 bash run_standalone_tests.sh
PL_RUN_TPU_TESTS=1 PL_STANDALONE_TESTS_BATCH_SIZE=1 bash ../run_standalone_tests.sh "."

echo "--- Generating coverage ---"
python3 -m coverage xml