Move test_numpy_regular_dtypes and test_pandas_nullable_dtypes from
tests/python_package_test/test_basic.py to
tests/python_package_test/test_engine.py, where they are added after
test_force_split_with_feature_fraction.

The added test bodies match the removed ones except for one comment in
test_pandas_nullable_dtypes: it said the NaN assignment turns "x3" into
object dtype, but the preceding line assigns into x4 and the following
line re-casts x4, so the added copy says "x4".

NOTE(review): line breaks and indentation were reconstructed from the
Python syntax; confirm context-line whitespace against the target files
before applying. The post-image hash on the test_engine.py index line
predates the comment correction.

diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py
index 9d5104fe0299..c4766eeac49f 100644
--- a/tests/python_package_test/test_basic.py
+++ b/tests/python_package_test/test_basic.py
@@ -612,92 +612,6 @@ def test_custom_objective_safety():
         bad_bst_multi.update(fobj=_bad_gradients)
 
 
-def test_numpy_regular_dtypes():
-    pd = pytest.importorskip('pandas')
-    uints = ['uint8', 'uint16', 'uint32', 'uint64']
-    ints = ['int8', 'int16', 'int32', 'int64']
-    bool_and_floats = ['bool', 'float16', 'float32', 'float64']
-    rng = np.random.RandomState(42)
-
-    n_samples = 100
-    # data as float64
-    df = pd.DataFrame({
-        'x1': rng.randint(0, 2, n_samples),
-        'x2': rng.randint(1, 3, n_samples),
-        'x3': 10 * rng.randint(1, 3, n_samples),
-        'x4': 100 * rng.randint(1, 3, n_samples),
-    })
-    df = df.astype(np.float64)
-    y = df['x1'] * (df['x2'] + df['x3'] + df['x4'])
-    ds = lgb.Dataset(df, y)
-    params = {'objective': 'l2', 'num_leaves': 31, 'min_child_samples': 1}
-    bst = lgb.train(params, ds, num_boost_round=5)
-    preds = bst.predict(df)
-
-    # test all features were used
-    assert bst.trees_to_dataframe()['split_feature'].nunique() == df.shape[1]
-    # test the score is better than predicting the mean
-    baseline = np.full_like(y, y.mean())
-    assert mean_squared_error(y, preds) < mean_squared_error(y, baseline)
-
-    # test all predictions are equal using different input dtypes
-    for target_dtypes in [uints, ints, bool_and_floats]:
-        df2 = df.astype({f'x{i}': dtype for i, dtype in enumerate(target_dtypes, start=1)})
-        assert df2.dtypes.tolist() == target_dtypes
-        ds2 = lgb.Dataset(df2, y)
-        bst2 = lgb.train(params, ds2, num_boost_round=5)
-        preds2 = bst2.predict(df2)
-        np.testing.assert_allclose(preds, preds2)
-
-
-def test_pandas_nullable_dtypes():
-    pd = pytest.importorskip('pandas')
-    rng = np.random.RandomState(0)
-    df = pd.DataFrame(
-        {
-            'x1': rng.randint(1, 3, size=100),
-            'x2': np.linspace(-1, 1, 100),
-            'x3': pd.arrays.SparseArray(rng.randint(0, 11, size=100)),
-            'x4': rng.rand(100) < 0.5,
-        }
-    )
-    # introduce some missing values
-    df.loc[1, 'x1'] = np.nan
-    df.loc[2, 'x2'] = np.nan
-    df.loc[3, 'x4'] = np.nan
-    # the previous line turns x3 into object dtype in recent versions of pandas
-    df['x4'] = df['x4'].astype(np.float64)
-    y = df['x1'] * df['x2'] + df['x3'] * (1 + df['x4'])
-    y = y.fillna(0)
-
-    # train with regular dtypes
-    params = {'objective': 'l2', 'num_leaves': 31, 'min_child_samples': 1}
-    ds = lgb.Dataset(df, y)
-    bst = lgb.train(params, ds, num_boost_round=5)
-    preds = bst.predict(df)
-
-    # convert to nullable dtypes
-    df2 = df.copy()
-    df2['x1'] = df2['x1'].astype('Int32')
-    df2['x2'] = df2['x2'].astype('Float64')
-    df2['x4'] = df2['x4'].astype('boolean')
-
-    # test training succeeds
-    ds_nullable_dtypes = lgb.Dataset(df2, y)
-    bst_nullable_dtypes = lgb.train(params, ds_nullable_dtypes, num_boost_round=5)
-    preds_nullable_dtypes = bst_nullable_dtypes.predict(df2)
-
-    trees_df = bst.trees_to_dataframe()
-    # test all features were used
-    assert trees_df['split_feature'].nunique() == df.shape[1]
-    # test the score is better than predicting the mean
-    baseline = np.full_like(y, y.mean())
-    assert mean_squared_error(y, preds) < mean_squared_error(y, baseline)
-
-    # test equal predictions
-    np.testing.assert_allclose(preds, preds_nullable_dtypes)
-
-
 @pytest.mark.parametrize('dtype', [np.float32, np.float64])
 def test_no_copy_when_single_float_dtype_dataframe(dtype):
     pd = pytest.importorskip('pandas')
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index a74056b2c948..cbc293fd3a4a 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -3224,3 +3224,89 @@ def test_force_split_with_feature_fraction(tmp_path):
     for tree in tree_info:
         tree_structure = tree["tree_structure"]
         assert tree_structure['split_feature'] == 0
+
+
+def test_numpy_regular_dtypes():
+    pd = pytest.importorskip('pandas')
+    uints = ['uint8', 'uint16', 'uint32', 'uint64']
+    ints = ['int8', 'int16', 'int32', 'int64']
+    bool_and_floats = ['bool', 'float16', 'float32', 'float64']
+    rng = np.random.RandomState(42)
+
+    n_samples = 100
+    # data as float64
+    df = pd.DataFrame({
+        'x1': rng.randint(0, 2, n_samples),
+        'x2': rng.randint(1, 3, n_samples),
+        'x3': 10 * rng.randint(1, 3, n_samples),
+        'x4': 100 * rng.randint(1, 3, n_samples),
+    })
+    df = df.astype(np.float64)
+    y = df['x1'] * (df['x2'] + df['x3'] + df['x4'])
+    ds = lgb.Dataset(df, y)
+    params = {'objective': 'l2', 'num_leaves': 31, 'min_child_samples': 1}
+    bst = lgb.train(params, ds, num_boost_round=5)
+    preds = bst.predict(df)
+
+    # test all features were used
+    assert bst.trees_to_dataframe()['split_feature'].nunique() == df.shape[1]
+    # test the score is better than predicting the mean
+    baseline = np.full_like(y, y.mean())
+    assert mean_squared_error(y, preds) < mean_squared_error(y, baseline)
+
+    # test all predictions are equal using different input dtypes
+    for target_dtypes in [uints, ints, bool_and_floats]:
+        df2 = df.astype({f'x{i}': dtype for i, dtype in enumerate(target_dtypes, start=1)})
+        assert df2.dtypes.tolist() == target_dtypes
+        ds2 = lgb.Dataset(df2, y)
+        bst2 = lgb.train(params, ds2, num_boost_round=5)
+        preds2 = bst2.predict(df2)
+        np.testing.assert_allclose(preds, preds2)
+
+
+def test_pandas_nullable_dtypes():
+    pd = pytest.importorskip('pandas')
+    rng = np.random.RandomState(0)
+    df = pd.DataFrame(
+        {
+            'x1': rng.randint(1, 3, size=100),
+            'x2': np.linspace(-1, 1, 100),
+            'x3': pd.arrays.SparseArray(rng.randint(0, 11, size=100)),
+            'x4': rng.rand(100) < 0.5,
+        }
+    )
+    # introduce some missing values
+    df.loc[1, 'x1'] = np.nan
+    df.loc[2, 'x2'] = np.nan
+    df.loc[3, 'x4'] = np.nan
+    # the previous line turns x4 into object dtype in recent versions of pandas
+    df['x4'] = df['x4'].astype(np.float64)
+    y = df['x1'] * df['x2'] + df['x3'] * (1 + df['x4'])
+    y = y.fillna(0)
+
+    # train with regular dtypes
+    params = {'objective': 'l2', 'num_leaves': 31, 'min_child_samples': 1}
+    ds = lgb.Dataset(df, y)
+    bst = lgb.train(params, ds, num_boost_round=5)
+    preds = bst.predict(df)
+
+    # convert to nullable dtypes
+    df2 = df.copy()
+    df2['x1'] = df2['x1'].astype('Int32')
+    df2['x2'] = df2['x2'].astype('Float64')
+    df2['x4'] = df2['x4'].astype('boolean')
+
+    # test training succeeds
+    ds_nullable_dtypes = lgb.Dataset(df2, y)
+    bst_nullable_dtypes = lgb.train(params, ds_nullable_dtypes, num_boost_round=5)
+    preds_nullable_dtypes = bst_nullable_dtypes.predict(df2)
+
+    trees_df = bst.trees_to_dataframe()
+    # test all features were used
+    assert trees_df['split_feature'].nunique() == df.shape[1]
+    # test the score is better than predicting the mean
+    baseline = np.full_like(y, y.mean())
+    assert mean_squared_error(y, preds) < mean_squared_error(y, baseline)
+
+    # test equal predictions
+    np.testing.assert_allclose(preds, preds_nullable_dtypes)