Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add copy_X parameter to LinearRegression #5495

Merged
merged 14 commits into from
Jul 31, 2023
36 changes: 33 additions & 3 deletions python/cuml/linear_model/linear_regression.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,10 @@ class LinearRegression(LinearPredictMixin,
fit_intercept : boolean (default = True)
If True, LinearRegression tries to correct for the global mean of y.
If False, the model expects that you have centered the data.
copy_X : bool, default=True
If True, it is guaranteed that a copy of X is created, leaving the
original X unchanged. However, if set to False, X may be modified
directly, which would reduce the memory usage of the estimator.
normalize : boolean (default = False)
This parameter is ignored when `fit_intercept` is set to False.
If True, the predictors in X will be normalized by dividing by the
Expand Down Expand Up @@ -253,14 +257,22 @@ class LinearRegression(LinearPredictMixin,

For an additional example see `the OLS notebook
<https://github.com/rapidsai/cuml/blob/main/notebooks/linear_regression_demo.ipynb>`__.

.. note:: Starting from version 23.08, the new 'copy_X' parameter defaults
to 'True', ensuring a copy of X is created after passing it to
fit(), preventing any changes to the input, but with increased
memory usage. This represents a change in behavior from previous
versions. With `copy_X=False` a copy might still be created if
necessary.
"""

_cpu_estimator_import_path = 'sklearn.linear_model.LinearRegression'
coef_ = CumlArrayDescriptor(order='F')
intercept_ = CumlArrayDescriptor(order='F')

@device_interop_preparation
def __init__(self, *, algorithm='eig', fit_intercept=True, normalize=False,
def __init__(self, *, algorithm='eig', fit_intercept=True,
copy_X=None, normalize=False,
handle=None, verbose=False, output_type=None):
if handle is None and algorithm == 'eig':
# if possible, create two streams, so that eigenvalue decomposition
Expand All @@ -284,6 +296,17 @@ class LinearRegression(LinearPredictMixin,
raise TypeError(msg.format(algorithm))

self.intercept_value = 0.0
if copy_X is None:
warnings.warn(
"Starting from version 23.08, the new 'copy_X' parameter defaults "
"to 'True', ensuring a copy of X is created after passing it to "
"fit(), preventing any changes to the input, but with increased "
"memory usage. This represents a change in behavior from previous "
"versions. With `copy_X=False` a copy might still be created if "
"necessary. Explicitly set 'copy_X' to either True or False to "
"suppress this warning.", UserWarning)
copy_X = True
self.copy_X = copy_X

def _get_algorithm_int(self, algorithm):
return {
Expand All @@ -303,8 +326,15 @@ class LinearRegression(LinearPredictMixin,

"""
cdef uintptr_t X_ptr, y_ptr, sample_weight_ptr

need_explicit_copy = self.copy_X and hasattr(X, "__cuda_array_interface__") \
and (len(X.shape) < 2 or X.shape[1] == 1)
Copy link
Contributor Author

@viclafargue viclafargue Jul 27, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does the issue only appear when the input is a vector?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only when this condition is False do we not already create a copy. So checking this ensures that we only create an explicit copy when copy_X is True and none of the other conditions are met which would have triggered a copy anyways.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I responded to a previous version of your question. Yes, an implicit copy is always created when the input is not a vector.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it not make more sense to have this logic be part of the input_to_cuml_array utility function?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In principle yes, but that would potentially change the behavior of many more classes and would require more thorough investigation and testing.


X_m, n_rows, self.n_features_in_, self.dtype = \
input_to_cuml_array(X, check_dtype=[np.float32, np.float64])
input_to_cuml_array(X,
check_dtype=[np.float32, np.float64],
deepcopy=need_explicit_copy)

X_ptr = X_m.ptr
self.feature_names_in_ = X_m.index

Expand Down Expand Up @@ -461,7 +491,7 @@ class LinearRegression(LinearPredictMixin,

def get_param_names(self):
    """Return the list of hyperparameter names accepted by this estimator.

    Extends the parent class's parameter names with the parameters
    specific to this estimator, including the ``copy_X`` parameter
    introduced by this change.
    """
    # NOTE(review): the scraped diff showed both the pre-change list
    # (without 'copy_X') and the post-change list; the merged version
    # below is the post-change one, consistent with the new __init__
    # signature and the copy_X handling added to fit().
    return super().get_param_names() + \
        ['algorithm', 'fit_intercept', 'copy_X', 'normalize']

def get_attr_names(self):
    """Return the names of the fitted attributes of this estimator."""
    # Fixed set of attributes produced by fit(); returned as a fresh list.
    fitted_attrs = ('coef_', 'intercept_', 'n_features_in_', 'feature_names_in_')
    return list(fitted_attrs)
Expand Down
41 changes: 41 additions & 0 deletions python/cuml/tests/test_linear_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from contextlib import nullcontext
from distutils.version import LooseVersion
from functools import lru_cache

Expand Down Expand Up @@ -138,6 +139,11 @@ def cuml_compatible_dataset(X_train, X_test, y_train, _=None):
)


# Solver algorithms accepted by cuML's LinearRegression (see the
# `algorithm` constructor parameter).
_ALGORITHMS = ["svd", "eig", "qr", "svd-qr", "svd-jacobi"]

# Hypothesis strategy drawing one of the supported solver names;
# used by the property-based tests below.
algorithms = st.sampled_from(_ALGORITHMS)


@pytest.mark.parametrize("ntargets", [1, 2])
@pytest.mark.parametrize("datatype", [np.float32, np.float64])
@pytest.mark.parametrize("algorithm", ["eig", "svd"])
Expand Down Expand Up @@ -972,3 +978,38 @@ def test_elasticnet_solvers_eq(datatype, alpha, l1_ratio, nrows, column_info):
assert qn.score(X_test, cd_res) > 0.95
# coefficients of the two models should be close
assert np.corrcoef(cd.coef_, qn.coef_)[0, 1] > 0.98


@given(
    dataset=standard_regression_datasets(
        n_features=st.integers(min_value=1, max_value=10),
        dtypes=floating_dtypes(sizes=(32, 64)),
    ),
    algorithm=algorithms,
    xp=st.sampled_from([np, cp]),
    # `...` (Ellipsis) is used as a sentinel meaning "copy_X not passed at
    # all", distinct from an explicit copy_X=None.
    copy=st.sampled_from((True, False, None, ...)),
)
# Explicit examples pinning the interesting corners: single-feature device
# (CuPy) input — the only case where fit() could otherwise reuse X in
# place — plus host (NumPy) and multi-feature variants, across the
# copy_X settings.
@example(make_regression(n_features=1), "svd", cp, True)
@example(make_regression(n_features=1), "svd", cp, False)
@example(make_regression(n_features=1), "svd", cp, None)
@example(make_regression(n_features=1), "svd", cp, ...)
@example(make_regression(n_features=1), "svd", np, False)
@example(make_regression(n_features=2), "svd", cp, False)
@example(make_regression(n_features=2), "eig", np, False)
def test_linear_regression_input_copy(dataset, algorithm, xp, copy):
    """Verify that LinearRegression.fit() leaves X unmodified unless
    copy_X=False is combined with the one input form that fit() can
    consume in place (single-feature CuPy array)."""
    X, y = dataset
    X, y = xp.asarray(X), xp.asarray(y)
    # Snapshot of the input to compare against after fitting.
    X_copy = X.copy()

    # Leaving copy_X unset (None, or omitted via the `...` sentinel) must
    # emit the UserWarning announcing the new copy_X default; explicit
    # True/False must be silent.
    with (pytest.warns(UserWarning) if copy in (None, ...) else nullcontext()):
        if copy is ...:  # no argument
            cuLR = cuLinearRegression(algorithm=algorithm)
        else:
            cuLR = cuLinearRegression(algorithm=algorithm, copy_X=copy)

    cuLR.fit(X, y)

    # Only a single-column device array with copy_X=False skips the copy
    # (host/NumPy input and multi-feature input are converted — and hence
    # copied — anyway); every other combination must leave X untouched.
    if (X.shape[1] == 1 and xp is cp) and copy is False:
        assert not array_equal(X, X_copy)
    else:
        assert array_equal(X, X_copy)