Commit
move tests that use lgb.train to test_engine
jmoralez committed Jan 25, 2022
1 parent 2db4d75 commit 8bf1617
Showing 2 changed files with 86 additions and 86 deletions.
86 changes: 0 additions & 86 deletions tests/python_package_test/test_basic.py
@@ -612,92 +612,6 @@ def test_custom_objective_safety():
        bad_bst_multi.update(fobj=_bad_gradients)


def test_numpy_regular_dtypes():
    pd = pytest.importorskip('pandas')
    uints = ['uint8', 'uint16', 'uint32', 'uint64']
    ints = ['int8', 'int16', 'int32', 'int64']
    bool_and_floats = ['bool', 'float16', 'float32', 'float64']
    rng = np.random.RandomState(42)

    n_samples = 100
    # data as float64
    df = pd.DataFrame({
        'x1': rng.randint(0, 2, n_samples),
        'x2': rng.randint(1, 3, n_samples),
        'x3': 10 * rng.randint(1, 3, n_samples),
        'x4': 100 * rng.randint(1, 3, n_samples),
    })
    df = df.astype(np.float64)
    y = df['x1'] * (df['x2'] + df['x3'] + df['x4'])
    ds = lgb.Dataset(df, y)
    params = {'objective': 'l2', 'num_leaves': 31, 'min_child_samples': 1}
    bst = lgb.train(params, ds, num_boost_round=5)
    preds = bst.predict(df)

    # test all features were used
    assert bst.trees_to_dataframe()['split_feature'].nunique() == df.shape[1]
    # test the score is better than predicting the mean
    baseline = np.full_like(y, y.mean())
    assert mean_squared_error(y, preds) < mean_squared_error(y, baseline)

    # test all predictions are equal using different input dtypes
    for target_dtypes in [uints, ints, bool_and_floats]:
        df2 = df.astype({f'x{i}': dtype for i, dtype in enumerate(target_dtypes, start=1)})
        assert df2.dtypes.tolist() == target_dtypes
        ds2 = lgb.Dataset(df2, y)
        bst2 = lgb.train(params, ds2, num_boost_round=5)
        preds2 = bst2.predict(df2)
        np.testing.assert_allclose(preds, preds2)


def test_pandas_nullable_dtypes():
    pd = pytest.importorskip('pandas')
    rng = np.random.RandomState(0)
    df = pd.DataFrame(
        {
            'x1': rng.randint(1, 3, size=100),
            'x2': np.linspace(-1, 1, 100),
            'x3': pd.arrays.SparseArray(rng.randint(0, 11, size=100)),
            'x4': rng.rand(100) < 0.5,
        }
    )
    # introduce some missing values
    df.loc[1, 'x1'] = np.nan
    df.loc[2, 'x2'] = np.nan
    df.loc[3, 'x4'] = np.nan
    # the previous line turns x4 into object dtype in recent versions of pandas
    df['x4'] = df['x4'].astype(np.float64)
    y = df['x1'] * df['x2'] + df['x3'] * (1 + df['x4'])
    y = y.fillna(0)

    # train with regular dtypes
    params = {'objective': 'l2', 'num_leaves': 31, 'min_child_samples': 1}
    ds = lgb.Dataset(df, y)
    bst = lgb.train(params, ds, num_boost_round=5)
    preds = bst.predict(df)

    # convert to nullable dtypes
    df2 = df.copy()
    df2['x1'] = df2['x1'].astype('Int32')
    df2['x2'] = df2['x2'].astype('Float64')
    df2['x4'] = df2['x4'].astype('boolean')

    # test training succeeds
    ds_nullable_dtypes = lgb.Dataset(df2, y)
    bst_nullable_dtypes = lgb.train(params, ds_nullable_dtypes, num_boost_round=5)
    preds_nullable_dtypes = bst_nullable_dtypes.predict(df2)

    trees_df = bst.trees_to_dataframe()
    # test all features were used
    assert trees_df['split_feature'].nunique() == df.shape[1]
    # test the score is better than predicting the mean
    baseline = np.full_like(y, y.mean())
    assert mean_squared_error(y, preds) < mean_squared_error(y, baseline)

    # test equal predictions
    np.testing.assert_allclose(preds, preds_nullable_dtypes)


@pytest.mark.parametrize('dtype', [np.float32, np.float64])
def test_no_copy_when_single_float_dtype_dataframe(dtype):
    pd = pytest.importorskip('pandas')
86 changes: 86 additions & 0 deletions tests/python_package_test/test_engine.py
@@ -3224,3 +3224,89 @@ def test_force_split_with_feature_fraction(tmp_path):
    for tree in tree_info:
        tree_structure = tree["tree_structure"]
        assert tree_structure['split_feature'] == 0


def test_numpy_regular_dtypes():
    pd = pytest.importorskip('pandas')
    uints = ['uint8', 'uint16', 'uint32', 'uint64']
    ints = ['int8', 'int16', 'int32', 'int64']
    bool_and_floats = ['bool', 'float16', 'float32', 'float64']
    rng = np.random.RandomState(42)

    n_samples = 100
    # data as float64
    df = pd.DataFrame({
        'x1': rng.randint(0, 2, n_samples),
        'x2': rng.randint(1, 3, n_samples),
        'x3': 10 * rng.randint(1, 3, n_samples),
        'x4': 100 * rng.randint(1, 3, n_samples),
    })
    df = df.astype(np.float64)
    y = df['x1'] * (df['x2'] + df['x3'] + df['x4'])
    ds = lgb.Dataset(df, y)
    params = {'objective': 'l2', 'num_leaves': 31, 'min_child_samples': 1}
    bst = lgb.train(params, ds, num_boost_round=5)
    preds = bst.predict(df)

    # test all features were used
    assert bst.trees_to_dataframe()['split_feature'].nunique() == df.shape[1]
    # test the score is better than predicting the mean
    baseline = np.full_like(y, y.mean())
    assert mean_squared_error(y, preds) < mean_squared_error(y, baseline)

    # test all predictions are equal using different input dtypes
    for target_dtypes in [uints, ints, bool_and_floats]:
        df2 = df.astype({f'x{i}': dtype for i, dtype in enumerate(target_dtypes, start=1)})
        assert df2.dtypes.tolist() == target_dtypes
        ds2 = lgb.Dataset(df2, y)
        bst2 = lgb.train(params, ds2, num_boost_round=5)
        preds2 = bst2.predict(df2)
        np.testing.assert_allclose(preds, preds2)


def test_pandas_nullable_dtypes():
    pd = pytest.importorskip('pandas')
    rng = np.random.RandomState(0)
    df = pd.DataFrame(
        {
            'x1': rng.randint(1, 3, size=100),
            'x2': np.linspace(-1, 1, 100),
            'x3': pd.arrays.SparseArray(rng.randint(0, 11, size=100)),
            'x4': rng.rand(100) < 0.5,
        }
    )
    # introduce some missing values
    df.loc[1, 'x1'] = np.nan
    df.loc[2, 'x2'] = np.nan
    df.loc[3, 'x4'] = np.nan
    # the previous line turns x4 into object dtype in recent versions of pandas
    df['x4'] = df['x4'].astype(np.float64)
    y = df['x1'] * df['x2'] + df['x3'] * (1 + df['x4'])
    y = y.fillna(0)

    # train with regular dtypes
    params = {'objective': 'l2', 'num_leaves': 31, 'min_child_samples': 1}
    ds = lgb.Dataset(df, y)
    bst = lgb.train(params, ds, num_boost_round=5)
    preds = bst.predict(df)

    # convert to nullable dtypes
    df2 = df.copy()
    df2['x1'] = df2['x1'].astype('Int32')
    df2['x2'] = df2['x2'].astype('Float64')
    df2['x4'] = df2['x4'].astype('boolean')

    # test training succeeds
    ds_nullable_dtypes = lgb.Dataset(df2, y)
    bst_nullable_dtypes = lgb.train(params, ds_nullable_dtypes, num_boost_round=5)
    preds_nullable_dtypes = bst_nullable_dtypes.predict(df2)

    trees_df = bst.trees_to_dataframe()
    # test all features were used
    assert trees_df['split_feature'].nunique() == df.shape[1]
    # test the score is better than predicting the mean
    baseline = np.full_like(y, y.mean())
    assert mean_squared_error(y, preds) < mean_squared_error(y, baseline)

    # test equal predictions
    np.testing.assert_allclose(preds, preds_nullable_dtypes)
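
As a usage note for anyone exercising the relocated tests by hand, the following is a minimal standalone sketch, not part of this commit, of the behaviour test_pandas_nullable_dtypes checks: train once on regular numpy dtypes and once on the same data converted to pandas nullable dtypes, then assert that the predictions agree. It assumes lightgbm, numpy, and pandas versions recent enough to support nullable dtypes (which is exactly what the test verifies); the smaller two-feature frame is an illustrative simplification.

import lightgbm as lgb
import numpy as np
import pandas as pd

# small regression problem with regular numpy dtypes
rng = np.random.RandomState(0)
df = pd.DataFrame({'x1': rng.randint(1, 3, size=100), 'x2': np.linspace(-1, 1, 100)})
y = df['x1'] * df['x2']
params = {'objective': 'l2', 'num_leaves': 31, 'min_child_samples': 1}
preds_regular = lgb.train(params, lgb.Dataset(df, y), num_boost_round=5).predict(df)

# same data expressed with pandas nullable dtypes; predictions should match
df_nullable = df.astype({'x1': 'Int32', 'x2': 'Float64'})
preds_nullable = lgb.train(params, lgb.Dataset(df_nullable, y), num_boost_round=5).predict(df_nullable)

np.testing.assert_allclose(preds_regular, preds_nullable)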
