Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adaptive stress tests when GPU memory capacity is insufficient #3916

Merged
merged 7 commits into from
Jul 1, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions python/cuml/test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,12 @@

import pytest
import os
import subprocess

import numpy as np
import cupy as cp

from math import ceil
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import fetch_california_housing
from sklearn.datasets import make_classification as skl_make_clas
Expand All @@ -30,6 +32,9 @@

def pytest_configure(config):
    """Pytest hook: set up session-wide GPU attributes.

    Disables the cupy memory-pool allocator and records two globals on the
    ``pytest`` module for use by memory-hungry stress tests:

    * ``pytest.max_gpu_memory`` — capacity of the GPU memory in GB.
    * ``pytest.adapt_stress_test`` — True when the
      ``CUML_ADAPT_STRESS_TESTS`` environment variable is set.
    """
    cp.cuda.set_allocator(None)
    pytest.max_gpu_memory = get_gpu_memory()
    pytest.adapt_stress_test = os.environ.get('CUML_ADAPT_STRESS_TESTS') is not None


@pytest.fixture(scope="module")
Expand Down Expand Up @@ -146,3 +151,20 @@ def exact_shap_classification_dataset():
test_size=3,
random_state_generator=42,
random_state_train_test_split=42)


def get_gpu_memory():
    """Return the memory capacity of the largest visible GPU, in GB.

    Queries ``nvidia-smi`` for ``memory.total`` of every GPU, parses the
    MiB values from the CSV output, and returns the largest one converted
    to GB (rounded up with ``ceil``).

    Returns
    -------
    int
        Capacity of the largest GPU in GB.

    Raises
    ------
    RuntimeError
        If no GPU memory value could be parsed from the nvidia-smi output.
    subprocess.CalledProcessError
        If ``nvidia-smi`` exits with a non-zero status.
    """
    # Pass the command as an argument list with the default shell=False:
    # no shell interpretation is needed for a fixed command.
    output = subprocess.check_output(
        ["nvidia-smi", "--query-gpu=memory.total", "--format=csv"]
    ).decode("utf-8")
    # First line is the CSV header (e.g. "memory.total [MiB]"); skip it.
    lines = output.split("\n")[1:]
    gpus_memory = []
    for line in lines:
        # Data lines look like "16384 MiB"; the first token is the value.
        tokens = line.split(" ")
        if len(tokens) > 1:
            gpus_memory.append(int(tokens[0]))
    if not gpus_memory:
        # Fail loudly with a clear message instead of an IndexError.
        raise RuntimeError("Could not parse GPU memory from nvidia-smi output")
    # nvidia-smi reports MiB; divide by 1024 and round up to whole GB.
    return ceil(max(gpus_memory) / 1024)
10 changes: 9 additions & 1 deletion python/cuml/test/dask/test_tsvd.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019, NVIDIA CORPORATION.
# Copyright (c) 2019-2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -30,6 +30,14 @@
def test_pca_fit(data_info, input_type, client):

nrows, ncols, n_parts = data_info
if nrows == int(9e6) and pytest.max_gpu_memory < 48:
if pytest.adapt_stress_test:
nrows = nrows * pytest.max_gpu_memory // 256
ncols = ncols * pytest.max_gpu_memory // 256
else:
pytest.skip("Insufficient GPU memory for this test."
"Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

from cuml.dask.decomposition import TruncatedSVD as daskTPCA
from sklearn.decomposition import TruncatedSVD

Expand Down
14 changes: 14 additions & 0 deletions python/cuml/test/test_dbscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,13 @@
stress_param("int32")])
def test_dbscan(datatype, use_handle, nrows, ncols,
max_mbytes_per_batch, out_dtype):
if nrows == 500000 and pytest.max_gpu_memory < 32:
if pytest.adapt_stress_test:
nrows = nrows * pytest.max_gpu_memory // 32
else:
pytest.skip("Insufficient GPU memory for this test. "
"Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

n_samples = nrows
n_feats = ncols
X, y = make_blobs(n_samples=n_samples, cluster_std=0.01,
Expand Down Expand Up @@ -117,6 +124,13 @@ def test_dbscan_precomputed(datatype, nrows, max_mbytes_per_batch, out_dtype):
# Vary the eps to get a range of core point counts
@pytest.mark.parametrize('eps', [0.05, 0.1, 0.5])
def test_dbscan_sklearn_comparison(name, nrows, eps):
if nrows == 500000 and name == 'blobs' and pytest.max_gpu_memory < 32:
if pytest.adapt_stress_test:
nrows = nrows * pytest.max_gpu_memory // 32
else:
pytest.skip("Insufficient GPU memory for this test."
"Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

default_base = {'quantile': .2,
'eps': eps,
'damping': .9,
Expand Down
10 changes: 8 additions & 2 deletions python/cuml/test/test_lars.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -21,7 +21,7 @@
array_equal,
unit_param,
quality_param,
stress_param,
stress_param
)

from sklearn.datasets import load_boston
Expand Down Expand Up @@ -115,6 +115,12 @@ def test_lars_model(datatype, nrows, column_info, precompute, normalize):
@pytest.mark.parametrize("precompute", [True, False])
def test_lars_collinear(datatype, nrows, column_info, precompute):
ncols, n_info = column_info
if nrows == 500000 and ncols == 1000 and pytest.max_gpu_memory < 32:
if pytest.adapt_stress_test:
nrows = nrows * pytest.max_gpu_memory // 32
else:
pytest.skip("Insufficient GPU memory for this test."
"Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

X_train, X_test, y_train, y_test = make_regression_dataset(
datatype, nrows, ncols, n_info
Expand Down
9 changes: 8 additions & 1 deletion python/cuml/test/test_mbsgd_regressor.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019, NVIDIA CORPORATION.
# Copyright (c) 2019-2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -37,6 +37,13 @@
'500000-1000-500-f32', '500000-1000-500-f64'])
def make_dataset(request):
nrows, ncols, n_info, datatype = request.param
if nrows == 500000 and datatype == np.float64 and \
pytest.max_gpu_memory < 32:
if pytest.adapt_stress_test:
nrows = nrows * pytest.max_gpu_memory // 32
else:
pytest.skip("Insufficient GPU memory for this test."
"Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")
X, y = make_regression(n_samples=nrows, n_informative=n_info,
n_features=ncols, random_state=0)
X = cp.array(X).astype(datatype)
Expand Down
25 changes: 23 additions & 2 deletions python/cuml/test/test_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,16 @@ def test_pca_defaults(n_samples, n_features, sparse):
stress_param('blobs')])
def test_pca_fit_then_transform(datatype, input_type,
name, use_handle):
blobs_n_samples = 500000
if name == 'blobs' and pytest.max_gpu_memory < 32:
if pytest.adapt_stress_test:
blobs_n_samples = int(blobs_n_samples * pytest.max_gpu_memory / 32)
else:
pytest.skip("Insufficient GPU memory for this test."
"Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

if name == 'blobs':
X, y = make_blobs(n_samples=500000,
X, y = make_blobs(n_samples=blobs_n_samples,
n_features=1000, random_state=0)

elif name == 'iris':
Expand Down Expand Up @@ -154,9 +161,17 @@ def test_pca_fit_then_transform(datatype, input_type,
stress_param('blobs')])
def test_pca_fit_transform(datatype, input_type,
name, use_handle):
blobs_n_samples = 500000

if name == 'blobs' and pytest.max_gpu_memory < 32:
if pytest.adapt_stress_test:
blobs_n_samples = int(blobs_n_samples * pytest.max_gpu_memory / 32)
else:
pytest.skip("Insufficient GPU memory for this test."
"Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

if name == 'blobs':
X, y = make_blobs(n_samples=500000,
X, y = make_blobs(n_samples=blobs_n_samples,
n_features=1000, random_state=0)

elif name == 'iris':
Expand Down Expand Up @@ -223,6 +238,12 @@ def test_pca_inverse_transform(datatype, input_type,
@pytest.mark.parametrize('return_sparse', [True, False])
@pytest.mark.parametrize('cupy_input', [True, False])
def test_sparse_pca_inputs(nrows, ncols, whiten, return_sparse, cupy_input):
if ncols == 20000 and pytest.max_gpu_memory < 48:
if pytest.adapt_stress_test:
ncols = int(ncols * pytest.max_gpu_memory / 48)
else:
pytest.skip("Insufficient GPU memory for this test."
"Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

if return_sparse:
pytest.skip("Loss of information in converting to cupy sparse csr")
Expand Down
31 changes: 31 additions & 0 deletions python/cuml/test/test_pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,16 @@ def assert_model(pickled_model, X_test):
stress_param([500000, 1000, 500])])
@pytest.mark.parametrize('fit_intercept', [True, False])
def test_regressor_pickle(tmpdir, datatype, keys, data_size, fit_intercept):
if data_size[0] == 500000 and datatype == np.float64 and \
("LogisticRegression" in keys or "Ridge" in keys) and \
pytest.max_gpu_memory < 32:
if pytest.adapt_stress_test:
data_size[0] = data_size[0] * pytest.max_gpu_memory // 640
data_size[1] = data_size[1] * pytest.max_gpu_memory // 640
data_size[2] = data_size[2] * pytest.max_gpu_memory // 640
else:
pytest.skip("Insufficient GPU memory for this test."
"Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")
result = {}

def create_mod():
Expand Down Expand Up @@ -384,6 +394,14 @@ def test_unfit_clone(model_name):
@pytest.mark.parametrize('data_info', [unit_param([500, 20, 10, 5]),
stress_param([500000, 1000, 500, 50])])
def test_neighbors_pickle(tmpdir, datatype, keys, data_info):
if data_info[0] == 500000 and pytest.max_gpu_memory < 32 and \
("KNeighborsClassifier" in keys or "KNeighborsRegressor" in keys):
if pytest.adapt_stress_test:
data_info[0] = data_info[0] * pytest.max_gpu_memory // 32
else:
pytest.skip("Insufficient GPU memory for this test."
"Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

result = {}

def create_mod():
Expand Down Expand Up @@ -414,6 +432,13 @@ def assert_model(pickled_model, X_test):
50])])
@pytest.mark.parametrize('keys', k_neighbors_models.keys())
def test_k_neighbors_classifier_pickle(tmpdir, datatype, data_info, keys):
if data_info[0] == 500000 and "NearestNeighbors" in keys and \
pytest.max_gpu_memory < 32:
if pytest.adapt_stress_test:
data_info[0] = data_info[0] * pytest.max_gpu_memory // 32
else:
pytest.skip("Insufficient GPU memory for this test."
"Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")
result = {}

def create_mod():
Expand Down Expand Up @@ -476,6 +501,12 @@ def assert_model(loaded_model, X):
@pytest.mark.parametrize('data_size', [unit_param([500, 20, 10]),
stress_param([500000, 1000, 500])])
def test_dbscan_pickle(tmpdir, datatype, keys, data_size):
if data_size[0] == 500000 and pytest.max_gpu_memory < 32:
if pytest.adapt_stress_test:
data_size[0] = data_size[0] * pytest.max_gpu_memory // 32
else:
pytest.skip("Insufficient GPU memory for this test."
"Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")
result = {}

def create_mod():
Expand Down