Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Inference for float64 random forests using FIL #4739

Merged
merged 2 commits into from
May 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 2 additions & 13 deletions python/cuml/ensemble/randomforest_common.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -298,8 +298,8 @@ class BaseRandomForestModel(Base):
check_rows=self.n_rows, check_cols=1)

if self.dtype == np.float64:
warnings.warn("To use pickling or GPU-based prediction first "
"train using float32 data to fit the estimator")
warnings.warn("To use pickling first train using float32 data "
"to fit the estimator")

max_feature_val = self._get_max_feat_val()
if type(self.min_samples_leaf) == float:
Expand Down Expand Up @@ -348,18 +348,7 @@ class BaseRandomForestModel(Base):
_, n_rows, n_cols, dtype = \
input_to_cuml_array(X, order='F',
check_cols=self.n_cols)

if dtype == np.float64 and not convert_dtype:
warnings.warn("GPU based predict only accepts "
"np.float32 data. The model was "
"trained on np.float64 data hence "
"cannot use GPU-based prediction! "
"\nDefaulting to CPU-based Prediction. "
"\nTo predict on float-64 data, set "
"parameter predict_model = 'CPU'")
return self._predict_model_on_cpu(X, convert_dtype=convert_dtype)
treelite_handle = self._obtain_treelite_handle()

storage_type = \
_check_fil_parameter_validity(depth=self.max_depth,
fil_sparse_format=fil_sparse_format,
Expand Down
26 changes: 2 additions & 24 deletions python/cuml/ensemble/randomforestclassifier.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -562,10 +562,7 @@ class RandomForestClassifier(BaseRandomForestModel,
----------
X : {}
predict_model : String (default = 'GPU')
'GPU' to predict using the GPU, 'CPU' otherwise. The 'GPU' can only
be used if the model was trained on float32 data and `X` is float32
or convert_dtype is set to True. Also the 'GPU' should only be
used for classification problems.
'GPU' to predict using the GPU, 'CPU' otherwise.
algo : string (default = ``'auto'``)
This is optional and required only while performing the
predict operation on the GPU.
Expand Down Expand Up @@ -605,16 +602,6 @@ class RandomForestClassifier(BaseRandomForestModel,
if predict_model == "CPU":
preds = self._predict_model_on_cpu(X,
convert_dtype=convert_dtype)
elif self.dtype == np.float64:
warnings.warn("GPU based predict only accepts "
"np.float32 data. The model was "
"trained on np.float64 data hence "
"cannot use GPU-based prediction! "
"\nDefaulting to CPU-based Prediction. "
"\nTo predict on float-64 data, set "
"parameter predict_model = 'CPU'")
preds = self._predict_model_on_cpu(X,
convert_dtype=convert_dtype)
else:
preds = \
self._predict_model_on_gpu(X=X, output_class=True,
Expand All @@ -633,8 +620,7 @@ class RandomForestClassifier(BaseRandomForestModel,
fil_sparse_format='auto') -> CumlArray:
"""
Predicts class probabilities for X. This function uses the GPU
implementation of predict. Therefore, data with 'dtype = np.float32'
should be used with this function.
implementation of predict.

Parameters
----------
Expand Down Expand Up @@ -671,14 +657,6 @@ class RandomForestClassifier(BaseRandomForestModel,
-------
y : {}
"""
if self.dtype == np.float64:
raise TypeError("GPU based predict only accepts np.float32 data. \
In order use the GPU predict the model should \
also be trained using a np.float32 dataset. \
If you would like to use np.float64 dtype \
then please use the CPU based predict by \
setting predict_model = 'CPU'")

preds_proba = \
self._predict_model_on_gpu(X, output_class=True,
algo=algo,
Expand Down
14 changes: 1 addition & 13 deletions python/cuml/ensemble/randomforestregressor.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -544,9 +544,7 @@ class RandomForestRegressor(BaseRandomForestModel,
----------
X : {}
predict_model : String (default = 'GPU')
'GPU' to predict using the GPU, 'CPU' otherwise. The GPU can only
be used if the model was trained on float32 data and `X` is float32
or convert_dtype is set to True.
'GPU' to predict using the GPU, 'CPU' otherwise.
algo : string (default = 'auto')
This is optional and required only while performing the
predict operation on the GPU.
Expand Down Expand Up @@ -582,16 +580,6 @@ class RandomForestRegressor(BaseRandomForestModel,
"""
if predict_model == "CPU":
preds = self._predict_model_on_cpu(X, convert_dtype)
elif self.dtype == np.float64:
warnings.warn("GPU based predict only accepts "
"np.float32 data. The model was "
"trained on np.float64 data hence "
"cannot use GPU-based prediction! "
"\nDefaulting to CPU-based Prediction. "
"\nTo predict on float-64 data, set "
"parameter predict_model = 'CPU'")
preds = self._predict_model_on_cpu(X,
convert_dtype=convert_dtype)
else:
preds = self._predict_model_on_gpu(
X=X,
Expand Down
77 changes: 21 additions & 56 deletions python/cuml/tests/test_random_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ def test_tweedie_convergence(max_depth, split_criterion):
@pytest.mark.parametrize(
"max_samples", [unit_param(1.0), quality_param(0.90), stress_param(0.95)]
)
@pytest.mark.parametrize("datatype", [np.float32])
@pytest.mark.parametrize("datatype", [np.float32, np.float64])
@pytest.mark.parametrize("max_features", [1.0, "auto", "log2", "sqrt"])
def test_rf_classification(small_clf, datatype, max_samples, max_features):
use_handle = True
Expand Down Expand Up @@ -310,7 +310,7 @@ def test_rf_classification(small_clf, datatype, max_samples, max_features):
@pytest.mark.parametrize(
"max_samples", [unit_param(1.0), quality_param(0.90), stress_param(0.95)]
)
@pytest.mark.parametrize("datatype", [np.float32])
@pytest.mark.parametrize("datatype", [np.float32, np.float64])
@pytest.mark.parametrize(
"max_features,n_bins",
[
Expand Down Expand Up @@ -379,7 +379,7 @@ def test_rf_regression(
assert fil_r2 >= (cu_r2 - 0.02)


@pytest.mark.parametrize("datatype", [np.float32])
@pytest.mark.parametrize("datatype", [np.float32, np.float64])
def test_rf_classification_seed(small_clf, datatype):

X, y = small_clf
Expand Down Expand Up @@ -455,30 +455,13 @@ def test_rf_classification_float64(small_clf, datatype, convert_dtype):
assert cu_acc >= (sk_acc - 0.07)

# predict using cuML's GPU based prediction
if datatype[0] == np.float32 and convert_dtype:
fil_preds = cuml_model.predict(
X_test, predict_model="GPU", convert_dtype=convert_dtype
)
fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
fil_preds = cuml_model.predict(
X_test, predict_model="GPU", convert_dtype=convert_dtype
)
fil_preds = np.reshape(fil_preds, np.shape(cu_preds))

fil_acc = accuracy_score(y_test, fil_preds)
assert fil_acc >= (cu_acc - 0.07) # to be changed to 0.02. see issue #3910: https://github.com/rapidsai/cuml/issues/3910 # noqa
# if GPU predict cannot be used, display warning and use CPU predict
elif datatype[1] == np.float64:
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
fil_preds = cuml_model.predict(
X_test, predict_model="GPU",
convert_dtype=convert_dtype
)
assert("GPU based predict only accepts "
"np.float32 data. The model was "
"trained on np.float64 data hence "
"cannot use GPU-based prediction! "
"\nDefaulting to CPU-based Prediction. "
"\nTo predict on float-64 data, set "
"parameter predict_model = 'CPU'"
in str(w[-1].message))
fil_acc = accuracy_score(y_test, fil_preds)
assert fil_acc >= (cu_acc - 0.07) # to be changed to 0.02. see issue #3910: https://github.com/rapidsai/cuml/issues/3910 # noqa


@pytest.mark.parametrize(
Expand Down Expand Up @@ -513,30 +496,12 @@ def test_rf_regression_float64(large_reg, datatype):
assert cu_r2 >= (sk_r2 - 0.09)

# predict using cuML's GPU based prediction
if datatype[0] == np.float32:
fil_preds = cuml_model.predict(
X_test, predict_model="GPU", convert_dtype=True
)
fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype[0])
assert fil_r2 >= (cu_r2 - 0.02)

# because datatype[0] != np.float32 or datatype[0] != datatype[1]
# display warning when GPU-predict cannot be used and revert to CPU-predict
elif datatype[1] == np.float64:
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
fil_preds = cuml_model.predict(
X_test, predict_model="GPU"
)
assert("GPU based predict only accepts "
"np.float32 data. The model was "
"trained on np.float64 data hence "
"cannot use GPU-based prediction! "
"\nDefaulting to CPU-based Prediction. "
"\nTo predict on float-64 data, set "
"parameter predict_model = 'CPU'"
in str(w[-1].message))
fil_preds = cuml_model.predict(
X_test, predict_model="GPU", convert_dtype=True
)
fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype[0])
assert fil_r2 >= (cu_r2 - 0.02)


def check_predict_proba(test_proba, baseline_proba, y_test, rel_err):
Expand Down Expand Up @@ -624,13 +589,13 @@ def rf_classification(
check_predict_proba(cu_proba_gpu, sk_proba, y_test, 0.1)


@pytest.mark.parametrize("datatype", [(np.float32, np.float32)])
@pytest.mark.parametrize("datatype", [(np.float32, np.float64)])
@pytest.mark.parametrize("array_type", ["dataframe", "numpy"])
def test_rf_classification_multi_class(mclass_clf, datatype, array_type):
rf_classification(datatype, array_type, 1.0, 1.0, mclass_clf)


@pytest.mark.parametrize("datatype", [(np.float32, np.float32)])
@pytest.mark.parametrize("datatype", [(np.float32, np.float64)])
@pytest.mark.parametrize("max_samples", [unit_param(1.0), stress_param(0.95)])
@pytest.mark.parametrize("max_features", [1.0, "auto", "log2", "sqrt"])
def test_rf_classification_proba(
Expand All @@ -639,7 +604,7 @@ def test_rf_classification_proba(
rf_classification(datatype, "numpy", max_features, max_samples, small_clf)


@pytest.mark.parametrize("datatype", [np.float32])
@pytest.mark.parametrize("datatype", [np.float32, np.float64])
@pytest.mark.parametrize(
"fil_sparse_format", ["not_supported", True, "auto", False]
)
Expand Down Expand Up @@ -727,7 +692,7 @@ def test_rf_classification_sparse(
assert fil_acc >= (sk_acc - 0.07)


@pytest.mark.parametrize("datatype", [np.float32])
@pytest.mark.parametrize("datatype", [np.float32, np.float64])
@pytest.mark.parametrize(
"fil_sparse_format", ["not_supported", True, "auto", False]
)
Expand Down Expand Up @@ -817,12 +782,12 @@ def test_rf_regression_sparse(special_reg, datatype, fil_sparse_format, algo):

@pytest.mark.xfail(reason="Need rapidsai/rmm#415 to detect memleak robustly")
@pytest.mark.memleak
@pytest.mark.parametrize("datatype", [np.float32, np.float64])
@pytest.mark.parametrize("fil_sparse_format", [True, False, "auto"])
@pytest.mark.parametrize(
"n_iter", [unit_param(5), quality_param(30), stress_param(80)]
)
def test_rf_memory_leakage(small_clf, fil_sparse_format, n_iter):
datatype = np.float32
def test_rf_memory_leakage(small_clf, datatype, fil_sparse_format, n_iter):
use_handle = True

X, y = small_clf
Expand Down