Skip to content

Commit

Permalink
Log warnings for number of bins of categorical features (#4448)
Browse files Browse the repository at this point in the history
* log warnings when number of bins of categorical features exceeds the configured maximum number of bins

* log only a single warning message for all categorical features

* Add #include <memory> for unique_ptr

* remove useless param description
  • Loading branch information
shiyu1994 authored Mar 27, 2022
1 parent 17d4e00 commit d163c2c
Show file tree
Hide file tree
Showing 4 changed files with 97 additions and 4 deletions.
11 changes: 11 additions & 0 deletions include/LightGBM/dataset_loader.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

#include <LightGBM/dataset.h>

#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
Expand Down Expand Up @@ -63,6 +64,16 @@ class DatasetLoader {
/*!
* \brief Check whether loading from a binary file is possible for the given filename.
* NOTE(review): the meaning of the returned string is not visible here — confirm against the definition.
*/
std::string CheckCanLoadFromBin(const char* filename);

/*! \brief Check the number of bins for categorical features.
* The number of bins of a categorical feature may exceed the configured maximum value.
* When at least one categorical feature does, a single fixed pair of warnings is logged,
* regardless of how many features are affected.
*
* The limit applied to the i-th feature is max_bin_by_feature[i] when max_bin_by_feature
* is non-empty, and max_bin otherwise.
*
* \param bin_mappers the bin_mappers of all features
* \param max_bin max_bin from Config, used for every feature when max_bin_by_feature is empty
* \param max_bin_by_feature max_bin_by_feature from Config, indexed by position in bin_mappers
*/
void CheckCategoricalFeatureNumBin(const std::vector<std::unique_ptr<BinMapper>>& bin_mappers, const int max_bin, const std::vector<int>& max_bin_by_feature) const;

const Config& config_;
/*! \brief Random generator*/
Random random_;
Expand Down
42 changes: 42 additions & 0 deletions src/io/dataset_loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -805,6 +805,7 @@ Dataset* DatasetLoader::ConstructFromSampleData(double** sample_values,
cp_ptr += bin_mappers[i]->SizesInByte();
}
}
CheckCategoricalFeatureNumBin(bin_mappers, config_.max_bin, config_.max_bin_by_feature);
auto dataset = std::unique_ptr<Dataset>(new Dataset(num_data));
dataset->Construct(&bin_mappers, num_total_features, forced_bin_bounds, sample_indices, sample_values, num_per_col, num_col, total_sample_size, config_);
if (dataset->has_raw()) {
Expand Down Expand Up @@ -1184,6 +1185,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
cp_ptr += bin_mappers[i]->SizesInByte();
}
}
CheckCategoricalFeatureNumBin(bin_mappers, config_.max_bin, config_.max_bin_by_feature);
dataset->Construct(&bin_mappers, dataset->num_total_features_, forced_bin_bounds, Common::Vector2Ptr<int>(&sample_indices).data(),
Common::Vector2Ptr<double>(&sample_values).data(),
Common::VectorSize<int>(sample_indices).data(), static_cast<int>(sample_indices.size()), sample_data.size(), config_);
Expand Down Expand Up @@ -1463,4 +1465,44 @@ std::vector<std::vector<double>> DatasetLoader::GetForcedBins(std::string forced
return forced_bins;
}

void DatasetLoader::CheckCategoricalFeatureNumBin(
const std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
const int max_bin, const std::vector<int>& max_bin_by_feature) const {
bool need_warning = false;
if (bin_mappers.size() < 1024) {
for (size_t i = 0; i < bin_mappers.size(); ++i) {
const int max_bin_for_this_feature = max_bin_by_feature.empty() ? max_bin : max_bin_by_feature[i];
if (bin_mappers[i]->bin_type() == BinType::CategoricalBin && bin_mappers[i]->num_bin() > max_bin_for_this_feature) {
need_warning = true;
break;
}
}
} else {
const int num_threads = OMP_NUM_THREADS();
std::vector<bool> thread_need_warning(num_threads, false);
Threading::For<size_t>(0, bin_mappers.size(), 1,
[&bin_mappers, &thread_need_warning, &max_bin_by_feature, max_bin] (int thread_index, size_t start, size_t end) {
for (size_t i = start; i < end; ++i) {
thread_need_warning[thread_index] = false;
const int max_bin_for_this_feature = max_bin_by_feature.empty() ? max_bin : max_bin_by_feature[i];
if (bin_mappers[i]->bin_type() == BinType::CategoricalBin && bin_mappers[i]->num_bin() > max_bin_for_this_feature) {
thread_need_warning[thread_index] = true;
break;
}
}
});
for (int thread_index = 0; thread_index < num_threads; ++thread_index) {
if (thread_need_warning[thread_index]) {
need_warning = true;
break;
}
}
}

if (need_warning) {
Log::Warning("Categorical features with more bins than the configured maximum bin number found.");
Log::Warning("For categorical features, max_bin and max_bin_by_feature may be ignored with a large number of categories.");
}
}

} // namespace LightGBM
24 changes: 22 additions & 2 deletions src/treelearner/cuda_tree_learner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -506,10 +506,30 @@ void CUDATreeLearner::InitGPU(int num_gpu) {
} else {
Log::Fatal("bin size %d cannot run on GPU", max_num_bin_);
}
if (max_num_bin_ == 65) {

// ignore the feature groups that contain categorical features when producing warnings about max_bin.
// these groups may contain larger number of bins due to categorical features, but not due to the setting of max_bin.
int max_num_bin_no_categorical = 0;
int cur_feature_group = 0;
bool categorical_feature_found = false;
for (int inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) {
const int feature_group = train_data_->Feature2Group(inner_feature_index);
const BinMapper* feature_bin_mapper = train_data_->FeatureBinMapper(inner_feature_index);
if (feature_bin_mapper->bin_type() == BinType::CategoricalBin) {
categorical_feature_found = true;
}
if (feature_group != cur_feature_group || inner_feature_index == num_features_ - 1) {
if (!categorical_feature_found) {
max_num_bin_no_categorical = std::max(max_num_bin_no_categorical, train_data_->FeatureGroupNumBin(cur_feature_group));
}
categorical_feature_found = false;
cur_feature_group = feature_group;
}
}
if (max_num_bin_no_categorical == 65) {
Log::Warning("Setting max_bin to 63 is suggested for best performance");
}
if (max_num_bin_ == 17) {
if (max_num_bin_no_categorical == 17) {
Log::Warning("Setting max_bin to 15 is suggested for best performance");
}

Expand Down
24 changes: 22 additions & 2 deletions src/treelearner/gpu_tree_learner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -719,10 +719,30 @@ void GPUTreeLearner::InitGPU(int platform_id, int device_id) {
} else {
Log::Fatal("bin size %d cannot run on GPU", max_num_bin_);
}
if (max_num_bin_ == 65) {

// ignore the feature groups that contain categorical features when producing warnings about max_bin.
// these groups may contain larger number of bins due to categorical features, but not due to the setting of max_bin.
int max_num_bin_no_categorical = 0;
int cur_feature_group = 0;
bool categorical_feature_found = false;
for (int inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) {
const int feature_group = train_data_->Feature2Group(inner_feature_index);
const BinMapper* feature_bin_mapper = train_data_->FeatureBinMapper(inner_feature_index);
if (feature_bin_mapper->bin_type() == BinType::CategoricalBin) {
categorical_feature_found = true;
}
if (feature_group != cur_feature_group || inner_feature_index == num_features_ - 1) {
if (!categorical_feature_found) {
max_num_bin_no_categorical = std::max(max_num_bin_no_categorical, train_data_->FeatureGroupNumBin(cur_feature_group));
}
categorical_feature_found = false;
cur_feature_group = feature_group;
}
}
if (max_num_bin_no_categorical == 65) {
Log::Warning("Setting max_bin to 63 is suggested for best performance");
}
if (max_num_bin_ == 17) {
if (max_num_bin_no_categorical == 17) {
Log::Warning("Setting max_bin to 15 is suggested for best performance");
}
ctx_ = boost::compute::context(dev_);
Expand Down

0 comments on commit d163c2c

Please sign in to comment.