Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add copy_X parameter to LinearRegression #5495

Merged
merged 14 commits into from
Jul 31, 2023
36 changes: 33 additions & 3 deletions python/cuml/linear_model/linear_regression.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,10 @@ class LinearRegression(LinearPredictMixin,
fit_intercept : boolean (default = True)
If True, LinearRegression tries to correct for the global mean of y.
If False, the model expects that you have centered the data.
copy_X : bool, default=True
If True, it is guaranteed that a copy of X is created, leaving the
original X unchanged. However, if set to False, X may be modified
directly, which would reduce the memory usage of the estimator.
normalize : boolean (default = False)
This parameter is ignored when `fit_intercept` is set to False.
If True, the predictors in X will be normalized by dividing by the
Expand Down Expand Up @@ -253,14 +257,22 @@ class LinearRegression(LinearPredictMixin,

For an additional example see `the OLS notebook
<https://github.com/rapidsai/cuml/blob/main/notebooks/linear_regression_demo.ipynb>`__.

.. note:: Starting from version 23.08, the new 'copy_X' parameter defaults
to 'True', ensuring a copy of X is created after passing it to
fit(), preventing any changes to the input, but with increased
memory usage. This represents a change in behavior from previous
versions. With `copy_X=False` a copy might still be created if
necessary.
"""

_cpu_estimator_import_path = 'sklearn.linear_model.LinearRegression'
coef_ = CumlArrayDescriptor(order='F')
intercept_ = CumlArrayDescriptor(order='F')

@device_interop_preparation
def __init__(self, *, algorithm='eig', fit_intercept=True, normalize=False,
def __init__(self, *, algorithm='eig', fit_intercept=True,
copy_X=None, normalize=False,
handle=None, verbose=False, output_type=None):
if handle is None and algorithm == 'eig':
# if possible, create two streams, so that eigenvalue decomposition
Expand All @@ -284,6 +296,17 @@ class LinearRegression(LinearPredictMixin,
raise TypeError(msg.format(algorithm))

self.intercept_value = 0.0
if copy_X is None:
warnings.warn(
"Starting from version 23.08, the new 'copy_X' parameter defaults "
"to 'True', ensuring a copy of X is created after passing it to "
"fit(), preventing any changes to the input, but with increased "
"memory usage. This represents a change in behavior from previous "
"versions. With `copy_X=False` a copy might still be created if "
"necessary. Explicitly set 'copy_X' to either True or False to "
"suppress this warning.", UserWarning)
copy_X = True
self.copy_X = copy_X

def _get_algorithm_int(self, algorithm):
return {
Expand All @@ -303,8 +326,15 @@ class LinearRegression(LinearPredictMixin,

"""
cdef uintptr_t X_ptr, y_ptr, sample_weight_ptr

need_explicit_copy = self.copy_X and hasattr(X, "__cuda_array_interface__") \
and (len(X.shape) < 2 or X.shape[1] == 1)
Copy link
Contributor Author

@viclafargue viclafargue Jul 27, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does the issue only appear when the input is a vector?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only when this condition is False do we not already create a copy. So checking this ensures that we only create an explicit copy when copy_X is True and none of the other conditions are met which would have triggered a copy anyways.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I responded to a previous version of your question. Yes, an implicit copy is always created when the input is not a vector.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it not make more sense to have this logic be part of the input_to_cuml_array utility function?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In principle yes, but that would potentially change the behavior of many more classes and would require more thorough investigation and testing.


X_m, n_rows, self.n_features_in_, self.dtype = \
input_to_cuml_array(X, check_dtype=[np.float32, np.float64])
input_to_cuml_array(X,
check_dtype=[np.float32, np.float64],
deepcopy=need_explicit_copy)

X_ptr = X_m.ptr
self.feature_names_in_ = X_m.index

Expand Down Expand Up @@ -461,7 +491,7 @@ class LinearRegression(LinearPredictMixin,

def get_param_names(self):
    """Return the list of hyperparameter names accepted by this estimator.

    Extends the parent class's parameter names with the parameters
    specific to this estimator, including the ``copy_X`` parameter
    introduced by this change.
    """
    # NOTE(review): the scraped diff showed both the pre-change list
    # (without 'copy_X') and the post-change list; the merged version
    # below is the post-change one, consistent with the new __init__
    # signature and the copy_X handling added to fit().
    return super().get_param_names() + \
        ['algorithm', 'fit_intercept', 'copy_X', 'normalize']

def get_attr_names(self):
    """Return the names of the fitted attributes of this estimator."""
    # Fixed set of attributes produced by fit(); returned as a fresh list.
    fitted_attrs = ('coef_', 'intercept_', 'n_features_in_', 'feature_names_in_')
    return list(fitted_attrs)
Expand Down
41 changes: 41 additions & 0 deletions python/cuml/tests/test_linear_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from contextlib import nullcontext
from distutils.version import LooseVersion
from functools import lru_cache

Expand Down Expand Up @@ -138,6 +139,11 @@ def cuml_compatible_dataset(X_train, X_test, y_train, _=None):
)


# Solver algorithms accepted by cuML's LinearRegression (see the
# `algorithm` constructor parameter).
_ALGORITHMS = ["svd", "eig", "qr", "svd-qr", "svd-jacobi"]

# Hypothesis strategy drawing one of the supported solver names;
# used by the property-based tests below.
algorithms = st.sampled_from(_ALGORITHMS)


@pytest.mark.parametrize("ntargets", [1, 2])
@pytest.mark.parametrize("datatype", [np.float32, np.float64])
@pytest.mark.parametrize("algorithm", ["eig", "svd"])
Expand Down Expand Up @@ -972,3 +978,38 @@ def test_elasticnet_solvers_eq(datatype, alpha, l1_ratio, nrows, column_info):
assert qn.score(X_test, cd_res) > 0.95
# coefficients of the two models should be close
assert np.corrcoef(cd.coef_, qn.coef_)[0, 1] > 0.98


@given(
    dataset=standard_regression_datasets(
        n_features=st.integers(min_value=1, max_value=10),
        dtypes=floating_dtypes(sizes=(32, 64)),
    ),
    algorithm=algorithms,
    xp=st.sampled_from([np, cp]),
    # `...` (Ellipsis) is used as a sentinel meaning "copy_X not passed at
    # all", distinct from an explicit copy_X=None.
    copy=st.sampled_from((True, False, None, ...)),
)
# Explicit examples pinning the interesting corners: single-feature device
# (CuPy) input — the only case where fit() could otherwise reuse X in
# place — plus host (NumPy) and multi-feature variants, across the
# copy_X settings.
@example(make_regression(n_features=1), "svd", cp, True)
@example(make_regression(n_features=1), "svd", cp, False)
@example(make_regression(n_features=1), "svd", cp, None)
@example(make_regression(n_features=1), "svd", cp, ...)
@example(make_regression(n_features=1), "svd", np, False)
@example(make_regression(n_features=2), "svd", cp, False)
@example(make_regression(n_features=2), "eig", np, False)
def test_linear_regression_input_copy(dataset, algorithm, xp, copy):
    """Verify that LinearRegression.fit() leaves X unmodified unless
    copy_X=False is combined with the one input form that fit() can
    consume in place (single-feature CuPy array)."""
    X, y = dataset
    X, y = xp.asarray(X), xp.asarray(y)
    # Snapshot of the input to compare against after fitting.
    X_copy = X.copy()

    # Leaving copy_X unset (None, or omitted via the `...` sentinel) must
    # emit the UserWarning announcing the new copy_X default; explicit
    # True/False must be silent.
    with (pytest.warns(UserWarning) if copy in (None, ...) else nullcontext()):
        if copy is ...:  # no argument
            cuLR = cuLinearRegression(algorithm=algorithm)
        else:
            cuLR = cuLinearRegression(algorithm=algorithm, copy_X=copy)

    cuLR.fit(X, y)

    # Only a single-column device array with copy_X=False skips the copy
    # (host/NumPy input and multi-feature input are converted — and hence
    # copied — anyway); every other combination must leave X untouched.
    if (X.shape[1] == 1 and xp is cp) and copy is False:
        assert not array_equal(X, X_copy)
    else:
        assert array_equal(X, X_copy)