[python] avoid data copy where possible #2383

Merged · 10 commits · Sep 26, 2019
27 changes: 14 additions & 13 deletions python-package/lightgbm/basic.py
@@ -80,10 +80,10 @@ def list_to_1d_numpy(data, dtype=np.float32, name='list'):
    elif isinstance(data, Series):
        if _get_bad_pandas_dtypes([data.dtypes]):
            raise ValueError('Series.dtypes must be int, float or bool')
        if hasattr(data.values, 'values'):  # SparseArray
Collaborator Author:
Fix FutureWarning:

FutureWarning: The SparseArray.values attribute is deprecated and will be removed in a future version. You can use `np.asarray(...)` or the `.to_dense()` method instead.
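
For context, a minimal sketch of the deprecated access versus the replacement, assuming a pandas version that still exposes pd.SparseArray:

import numpy as np
import pandas as pd

s = pd.Series(pd.SparseArray([0, 0, 1, 0]))
s.values.values  # deprecated SparseArray.values access; emits the FutureWarning above
np.asarray(s)    # replacement: returns a dense ndarray without the deprecated attribute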

            return data.values.values.astype(dtype)
        if data.dtype == np.float32 or data.dtype == np.float64:
            return np.array(data, dtype=data.dtype, copy=False)
Collaborator Author:
@guolinke According to this note

* \brief Set vector to a content in info.
* \note
* - \a monotone_constraints only works for ``C_API_DTYPE_INT8``;
* - \a group only works for ``C_API_DTYPE_INT32``;
* - \a label and \a weight only work for ``C_API_DTYPE_FLOAT32``;
* - \a init_score and \a feature_penalty only work for ``C_API_DTYPE_FLOAT64``.

it seems that we cannot allow these memory savings, right?
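
For context, a rough sketch of what the dtype constraint implies, where raw_label is a hypothetical stand-in for user input: since label must reach the C API as float32, a float64 input has to be converted (and copied) regardless, so copy=False only helps when the caller already passes float32.

import numpy as np

raw_label = [0.0, 1.0, 1.0]  # hypothetical user input
label = np.array(raw_label, dtype=np.float32, copy=False)  # skips the copy only for float32 ndarray input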

Collaborator:
Sorry for missing this comment.
Yes, the fields in Dataset don't allow arbitrary data types.

Collaborator:
I think we could add a parameter, maybe called new_type, with a default value of None. When it is not None, convert the data to that type if it isn't already.
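
A hypothetical sketch of that idea, covering just the coercion logic (the new_type name and the function name are illustrative; this variant was never merged):

import numpy as np

def list_to_1d_numpy_sketch(data, dtype=np.float32, new_type=None):
    arr = np.array(data, dtype=dtype, copy=False)
    if new_type is not None and arr.dtype != new_type:
        arr = arr.astype(new_type)  # convert (and copy) only when a real conversion is needed
    return arr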

Collaborator Author:
@guolinke It seems we always need to force the type, except for training/test data, which doesn't go through this function. So I can simply remove that if statement.

Collaborator Author:
ping @guolinke

        else:
            return data.values.astype(dtype)
        return np.array(data, dtype=dtype, copy=False)  # SparseArray should be supported as well
Collaborator:
Whenever I see a comment like this, I think "that should definitely be a unit test". Do we already have a unit test in the Python package that covers the case where the input data is a sparse array? cd py-pkg && git grep -i sparse didn't return any test files.

@StrikerRUS I think either this test should be added as part of accepting this PR or we should create an issue documenting it. Otherwise we may reintroduce a bug in the behavior in the future without knowing it. What do you think?

Collaborator Author:
@jameslamb

Do we already have a unit test in the Python package that covers that case where the input data is a sparse array?

Yep! Even 2! 😄

@unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed')
def test_pandas_sparse(self):
    import pandas as pd
    X = pd.DataFrame({"A": pd.SparseArray(np.random.permutation([0, 1, 2] * 100)),
                      "B": pd.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1, 0.2] * 60)),
                      "C": pd.SparseArray(np.random.permutation([True, False] * 150))})
    y = pd.Series(pd.SparseArray(np.random.permutation([0, 1] * 150)))
    X_test = pd.DataFrame({"A": pd.SparseArray(np.random.permutation([0, 2] * 30)),
                           "B": pd.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1] * 15)),
                           "C": pd.SparseArray(np.random.permutation([True, False] * 30))})
    if pd.__version__ >= '0.24.0':
        for dtype in pd.concat([X.dtypes, X_test.dtypes, pd.Series(y.dtypes)]):
            self.assertTrue(pd.api.types.is_sparse(dtype))
    params = {
        'objective': 'binary',
        'verbose': -1
    }
    lgb_train = lgb.Dataset(X, y)
    gbm = lgb.train(params, lgb_train, num_boost_round=10)
    pred_sparse = gbm.predict(X_test, raw_score=True)
    if hasattr(X_test, 'sparse'):
        pred_dense = gbm.predict(X_test.sparse.to_dense(), raw_score=True)
    else:
        pred_dense = gbm.predict(X_test.to_dense(), raw_score=True)
    np.testing.assert_allclose(pred_sparse, pred_dense)

@unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed')
def test_pandas_sparse(self):
    import pandas as pd
    X = pd.DataFrame({"A": pd.SparseArray(np.random.permutation([0, 1, 2] * 100)),
                      "B": pd.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1, 0.2] * 60)),
                      "C": pd.SparseArray(np.random.permutation([True, False] * 150))})
    y = pd.Series(pd.SparseArray(np.random.permutation([0, 1] * 150)))
    X_test = pd.DataFrame({"A": pd.SparseArray(np.random.permutation([0, 2] * 30)),
                           "B": pd.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1] * 15)),
                           "C": pd.SparseArray(np.random.permutation([True, False] * 30))})
    if pd.__version__ >= '0.24.0':
        for dtype in pd.concat([X.dtypes, X_test.dtypes, pd.Series(y.dtypes)]):
            self.assertTrue(pd.api.types.is_sparse(dtype))
    gbm = lgb.sklearn.LGBMClassifier().fit(X, y)
    pred_sparse = gbm.predict(X_test, raw_score=True)
    if hasattr(X_test, 'sparse'):
        pred_dense = gbm.predict(X_test.sparse.to_dense(), raw_score=True)
    else:
        pred_dense = gbm.predict(X_test.to_dense(), raw_score=True)
    np.testing.assert_allclose(pred_sparse, pred_dense)

Collaborator:
Ha, oh OK 😊 That tells you how much I've worked on the Python side of the project... I assumed the tests would live in python-package (that's how many Python packages are set up), not in a separate folder off the repo root. That's what I get for blindly git grep-ing without actually looking at the repo. I am ashamed haha

Thanks!

Collaborator:
approved ✅

Collaborator Author:
The tests were already there when I joined the project, so I can't say much about that... I guess the convention of keeping tests inside the package mainly applies to mono-language repos.

    else:
        raise TypeError("Wrong type({0}) for {1}.\n"
                        "It should be list, numpy 1-D array or pandas Series".format(type(data).__name__, name))
@@ -296,7 +296,9 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica
raise ValueError("DataFrame.dtypes for data must be int, float or bool.\n"
"Did not expect the data types in the following fields: "
+ ', '.join(data.columns[bad_indices]))
data = data.values.astype('float')
data = data.values
if data.dtype != np.float32 and data.dtype != np.float64:
Collaborator Author:
Why not simply 'float'? Because in NumPy, 'float' is an alias for float64, so a float32 array fails the comparison:

import numpy as np

arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=np.float32)
arr.dtype == 'float'  # False
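np.dtype('float') == np.float64  # True: 'float' is an alias for float64, not float32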

            data = data.astype(np.float32)
    else:
        if feature_name == 'auto':
            feature_name = None
@@ -311,7 +313,9 @@ def _label_from_pandas(label):
            raise ValueError('DataFrame for label cannot have multiple columns')
        if _get_bad_pandas_dtypes(label.dtypes):
            raise ValueError('DataFrame.dtypes for label must be int, float or bool')
        label = label.values.astype('float').flatten()
        label = np.ravel(label.values)
        if label.dtype != np.float32 and label.dtype != np.float64:
            label = label.astype(np.float32)
    return label
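
A side note on the switch from .flatten() to np.ravel: ravel returns a view (no copy) whenever the input is contiguous, while flatten always copies. A quick check:

import numpy as np

a = np.arange(6).reshape(2, 3)
np.ravel(a).base is a  # True: a view over the same buffer
a.flatten().base is a  # False: flatten always allocates a new array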


@@ -534,8 +538,7 @@ def __pred_for_np2d(self, mat, num_iteration, predict_type):
        def inner_predict(mat, num_iteration, predict_type, preds=None):
            if mat.dtype == np.float32 or mat.dtype == np.float64:
                data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
            else:
                """change non-float data to float data, need to copy"""
            else:  # change non-float data to float data, need to copy
                data = np.array(mat.reshape(mat.size), dtype=np.float32)
            ptr_data, type_ptr_data, _ = c_float_array(data)
            n_preds = self.__get_num_preds(num_iteration, mat.shape[0], predict_type)
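
The float32/float64 branch is cheap because, under NumPy 1.x semantics, np.array(..., copy=False) copies only when the dtype or layout actually requires it. A quick check:

import numpy as np

mat = np.zeros((3, 4), dtype=np.float32)
data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
data.base is mat  # True: same underlying buffer, no copy was made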
@@ -875,8 +878,7 @@ def __init_from_np2d(self, mat, params_str, ref_dataset):
        self.handle = ctypes.c_void_p()
        if mat.dtype == np.float32 or mat.dtype == np.float64:
            data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
        else:
            # change non-float data to float data, need to copy
        else:  # change non-float data to float data, need to copy
            data = np.array(mat.reshape(mat.size), dtype=np.float32)

        ptr_data, type_ptr_data, _ = c_float_array(data)
@@ -914,8 +916,7 @@ def __init_from_list_np2d(self, mats, params_str, ref_dataset):

            if mat.dtype == np.float32 or mat.dtype == np.float64:
                mats[i] = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
            else:
                # change non-float data to float data, need to copy
            else:  # change non-float data to float data, need to copy
                mats[i] = np.array(mat.reshape(mat.size), dtype=np.float32)

            chunk_ptr_data, chunk_type_ptr_data, holder = c_float_array(mats[i])
@@ -1011,7 +1012,7 @@ def construct(self):
                used_indices = list_to_1d_numpy(self.used_indices, np.int32, name='used_indices')
                assert used_indices.flags.c_contiguous
                if self.reference.group is not None:
                    group_info = np.array(self.reference.group).astype(int)
                    group_info = np.array(self.reference.group).astype(np.int32, copy=False)
                    _, self.group = np.unique(np.repeat(range_(len(group_info)), repeats=group_info)[self.used_indices],
                                              return_counts=True)
                self.handle = ctypes.c_void_p()
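
astype(np.int32, copy=False) is the no-copy fast path here: it returns the same array when the dtype already matches and copies only on a real conversion. For example:

import numpy as np

g = np.array([10, 20, 30], dtype=np.int32)
g.astype(np.int32, copy=False) is g  # True: dtype matches, no copy
g.astype(np.int64, copy=False) is g  # False: conversion forces a copy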
@@ -2508,7 +2509,7 @@ def feature_importance(self, importance_type='split', iteration=None):
            ctypes.c_int(importance_type_int),
            result.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
        if importance_type_int == 0:
            return result.astype(int)
            return result.astype(np.int32)
        else:
            return result
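
Presumably np.int32 is pinned here because NumPy's default integer is platform-dependent under NumPy 1.x (32-bit on Windows, 64-bit on most Linux/macOS builds), so astype(int) would give different dtypes on different systems:

import numpy as np

np.dtype(int).itemsize       # 8 on most 64-bit Linux/macOS builds, 4 on Windows (NumPy 1.x)
np.dtype(np.int32).itemsize  # always 4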

6 changes: 3 additions & 3 deletions python-package/lightgbm/engine.py
@@ -307,17 +307,17 @@ def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratifi
    if hasattr(folds, 'split'):
        group_info = full_data.get_group()
        if group_info is not None:
            group_info = np.array(group_info, dtype=int)
            group_info = np.array(group_info, dtype=np.int32, copy=False)
            flatted_group = np.repeat(range_(len(group_info)), repeats=group_info)
        else:
            flatted_group = np.zeros(num_data, dtype=int)
            flatted_group = np.zeros(num_data, dtype=np.int32)
        folds = folds.split(X=np.zeros(num_data), y=full_data.get_label(), groups=flatted_group)
    else:
        if 'objective' in params and params['objective'] == 'lambdarank':
            if not SKLEARN_INSTALLED:
                raise LightGBMError('Scikit-learn is required for lambdarank cv.')
            # lambdarank task, split according to groups
            group_info = np.array(full_data.get_group(), dtype=int)
            group_info = np.array(full_data.get_group(), dtype=np.int32, copy=False)
            flatted_group = np.repeat(range_(len(group_info)), repeats=group_info)
            group_kfold = _LGBMGroupKFold(n_splits=nfold)
            folds = group_kfold.split(X=np.zeros(num_data), groups=flatted_group)