Commit d163c2c

Log warnings for number of bins of categorical features (#4448)

* Log warnings when the number of bins of categorical features exceeds the configured maximum number of bins
* Log only one warning message covering all categorical features
* Add #include <memory> for unique_ptr
* Remove a useless param description

1 parent 17d4e00 commit d163c2c

File tree: 4 files changed (+97 −4 lines)
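
The warning matters because, for a categorical feature, LightGBM keeps roughly one bin per distinct category, so the bin count is driven by cardinality rather than by max_bin. The following standalone sketch (hypothetical values; the real BinMapper construction also filters rare categories, so this is only an approximation) shows how a high-cardinality column ends up needing more bins than the configured maximum:

// Standalone illustration, not LightGBM code.
#include <cstdio>
#include <set>
#include <vector>

int main() {
  const int max_bin = 255;  // the configured maximum number of bins
  std::vector<int> categorical_column;
  for (int i = 0; i < 10000; ++i) {
    categorical_column.push_back(i % 1000);  // 1000 distinct categories
  }
  // A categorical feature gets roughly one bin per distinct category.
  const std::set<int> categories(categorical_column.begin(), categorical_column.end());
  const int approx_num_bin = static_cast<int>(categories.size());
  if (approx_num_bin > max_bin) {
    std::printf("categorical feature needs ~%d bins, which exceeds max_bin=%d\n",
                approx_num_bin, max_bin);
  }
  return 0;
}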

include/LightGBM/dataset_loader.h (+11)

@@ -7,6 +7,7 @@
 
 #include <LightGBM/dataset.h>
 
+#include <memory>
 #include <string>
 #include <unordered_set>
 #include <vector>
@@ -63,6 +64,16 @@ class DatasetLoader {
   /*! \brief Check can load from binary file */
   std::string CheckCanLoadFromBin(const char* filename);
 
+  /*! \brief Check the number of bins for categorical features.
+   * The number of bins for categorical features may exceed the configured maximum value.
+   * Log warnings when such cases happen.
+   *
+   * \param bin_mappers the bin_mappers of all features
+   * \param max_bin max_bin from Config
+   * \param max_bin_by_feature max_bin_by_feature from Config
+   */
+  void CheckCategoricalFeatureNumBin(const std::vector<std::unique_ptr<BinMapper>>& bin_mappers, const int max_bin, const std::vector<int>& max_bin_by_feature) const;
+
   const Config& config_;
   /*! \brief Random generator*/
   Random random_;

src/io/dataset_loader.cpp (+42)

@@ -805,6 +805,7 @@ Dataset* DatasetLoader::ConstructFromSampleData(double** sample_values,
       cp_ptr += bin_mappers[i]->SizesInByte();
     }
   }
+  CheckCategoricalFeatureNumBin(bin_mappers, config_.max_bin, config_.max_bin_by_feature);
   auto dataset = std::unique_ptr<Dataset>(new Dataset(num_data));
   dataset->Construct(&bin_mappers, num_total_features, forced_bin_bounds, sample_indices, sample_values, num_per_col, num_col, total_sample_size, config_);
   if (dataset->has_raw()) {
@@ -1184,6 +1185,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
       cp_ptr += bin_mappers[i]->SizesInByte();
     }
   }
+  CheckCategoricalFeatureNumBin(bin_mappers, config_.max_bin, config_.max_bin_by_feature);
   dataset->Construct(&bin_mappers, dataset->num_total_features_, forced_bin_bounds, Common::Vector2Ptr<int>(&sample_indices).data(),
                      Common::Vector2Ptr<double>(&sample_values).data(),
                      Common::VectorSize<int>(sample_indices).data(), static_cast<int>(sample_indices.size()), sample_data.size(), config_);
@@ -1463,4 +1465,44 @@ std::vector<std::vector<double>> DatasetLoader::GetForcedBins(std::string forced
   return forced_bins;
 }
 
+void DatasetLoader::CheckCategoricalFeatureNumBin(
+    const std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
+    const int max_bin, const std::vector<int>& max_bin_by_feature) const {
+  bool need_warning = false;
+  if (bin_mappers.size() < 1024) {
+    for (size_t i = 0; i < bin_mappers.size(); ++i) {
+      const int max_bin_for_this_feature = max_bin_by_feature.empty() ? max_bin : max_bin_by_feature[i];
+      if (bin_mappers[i]->bin_type() == BinType::CategoricalBin && bin_mappers[i]->num_bin() > max_bin_for_this_feature) {
+        need_warning = true;
+        break;
+      }
+    }
+  } else {
+    const int num_threads = OMP_NUM_THREADS();
+    std::vector<bool> thread_need_warning(num_threads, false);
+    Threading::For<size_t>(0, bin_mappers.size(), 1,
+      [&bin_mappers, &thread_need_warning, &max_bin_by_feature, max_bin] (int thread_index, size_t start, size_t end) {
+        for (size_t i = start; i < end; ++i) {
+          thread_need_warning[thread_index] = false;
+          const int max_bin_for_this_feature = max_bin_by_feature.empty() ? max_bin : max_bin_by_feature[i];
+          if (bin_mappers[i]->bin_type() == BinType::CategoricalBin && bin_mappers[i]->num_bin() > max_bin_for_this_feature) {
+            thread_need_warning[thread_index] = true;
+            break;
+          }
+        }
+      });
+    for (int thread_index = 0; thread_index < num_threads; ++thread_index) {
+      if (thread_need_warning[thread_index]) {
+        need_warning = true;
+        break;
+      }
+    }
+  }
+
+  if (need_warning) {
+    Log::Warning("Categorical features with more bins than the configured maximum bin number found.");
+    Log::Warning("For categorical features, max_bin and max_bin_by_feature may be ignored with a large number of categories.");
+  }
+}
+
 }  // namespace LightGBM
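
The parallel branch above uses LightGBM's internal Threading::For helper and gives each thread its own warning flag, so no synchronization is needed; the 1024-feature threshold presumably keeps the serial early-exit scan for small feature counts, where spinning up threads would not pay off. A minimal standalone sketch of the same per-thread-flag pattern, written with plain OpenMP and a simplified stand-in type instead of BinMapper (and without the early break, which an OpenMP loop does not allow), could look like this:

// Sketch only: FakeFeature and NeedCategoricalBinWarning are illustrative names,
// not part of LightGBM.
#include <omp.h>

#include <cstdio>
#include <vector>

struct FakeFeature {
  bool is_categorical;
  int num_bin;
};

bool NeedCategoricalBinWarning(const std::vector<FakeFeature>& features, int max_bin) {
  const int num_threads = omp_get_max_threads();
  // one flag per thread; char (not vector<bool>) so each thread owns a distinct byte
  std::vector<char> thread_need_warning(num_threads, 0);
  #pragma omp parallel for schedule(static)
  for (int i = 0; i < static_cast<int>(features.size()); ++i) {
    if (features[i].is_categorical && features[i].num_bin > max_bin) {
      thread_need_warning[omp_get_thread_num()] = 1;
    }
  }
  // reduce the per-thread flags into a single answer
  for (char flag : thread_need_warning) {
    if (flag) return true;
  }
  return false;
}

int main() {
  std::vector<FakeFeature> features(2000, FakeFeature{false, 255});
  features[1500] = FakeFeature{true, 1000};  // one high-cardinality categorical feature
  std::printf("need warning: %s\n", NeedCategoricalBinWarning(features, 255) ? "yes" : "no");
  return 0;
}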

src/treelearner/cuda_tree_learner.cpp (+22, −2)

@@ -506,10 +506,30 @@ void CUDATreeLearner::InitGPU(int num_gpu) {
   } else {
     Log::Fatal("bin size %d cannot run on GPU", max_num_bin_);
   }
-  if (max_num_bin_ == 65) {
+
+  // ignore the feature groups that contain categorical features when producing warnings about max_bin.
+  // these groups may contain larger number of bins due to categorical features, but not due to the setting of max_bin.
+  int max_num_bin_no_categorical = 0;
+  int cur_feature_group = 0;
+  bool categorical_feature_found = false;
+  for (int inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) {
+    const int feature_group = train_data_->Feature2Group(inner_feature_index);
+    const BinMapper* feature_bin_mapper = train_data_->FeatureBinMapper(inner_feature_index);
+    if (feature_bin_mapper->bin_type() == BinType::CategoricalBin) {
+      categorical_feature_found = true;
+    }
+    if (feature_group != cur_feature_group || inner_feature_index == num_features_ - 1) {
+      if (!categorical_feature_found) {
+        max_num_bin_no_categorical = std::max(max_num_bin_no_categorical, train_data_->FeatureGroupNumBin(cur_feature_group));
+      }
+      categorical_feature_found = false;
+      cur_feature_group = feature_group;
+    }
+  }
+  if (max_num_bin_no_categorical == 65) {
     Log::Warning("Setting max_bin to 63 is suggested for best performance");
   }
-  if (max_num_bin_ == 17) {
+  if (max_num_bin_no_categorical == 17) {
     Log::Warning("Setting max_bin to 15 is suggested for best performance");
   }
 

src/treelearner/gpu_tree_learner.cpp (+22, −2)

@@ -719,10 +719,30 @@ void GPUTreeLearner::InitGPU(int platform_id, int device_id) {
   } else {
     Log::Fatal("bin size %d cannot run on GPU", max_num_bin_);
   }
-  if (max_num_bin_ == 65) {
+
+  // ignore the feature groups that contain categorical features when producing warnings about max_bin.
+  // these groups may contain larger number of bins due to categorical features, but not due to the setting of max_bin.
+  int max_num_bin_no_categorical = 0;
+  int cur_feature_group = 0;
+  bool categorical_feature_found = false;
+  for (int inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) {
+    const int feature_group = train_data_->Feature2Group(inner_feature_index);
+    const BinMapper* feature_bin_mapper = train_data_->FeatureBinMapper(inner_feature_index);
+    if (feature_bin_mapper->bin_type() == BinType::CategoricalBin) {
+      categorical_feature_found = true;
+    }
+    if (feature_group != cur_feature_group || inner_feature_index == num_features_ - 1) {
+      if (!categorical_feature_found) {
+        max_num_bin_no_categorical = std::max(max_num_bin_no_categorical, train_data_->FeatureGroupNumBin(cur_feature_group));
+      }
+      categorical_feature_found = false;
+      cur_feature_group = feature_group;
+    }
+  }
+  if (max_num_bin_no_categorical == 65) {
     Log::Warning("Setting max_bin to 63 is suggested for best performance");
   }
-  if (max_num_bin_ == 17) {
+  if (max_num_bin_no_categorical == 17) {
     Log::Warning("Setting max_bin to 15 is suggested for best performance");
   }
   ctx_ = boost::compute::context(dev_);
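
Both tree learners apply the same change: the largest bin count is taken only over feature groups that contain no categorical feature, so a group that is wide because of a high-cardinality categorical feature can no longer trigger the max_bin performance hint. Below is a standalone sketch of that idea, with simplified stand-ins for the Feature2Group/FeatureBinMapper/FeatureGroupNumBin lookups and a look-ahead boundary test instead of the commit's cur_feature_group bookkeeping:

// Sketch only: FakeFeature bundles the per-feature lookups used above.
// Features are assumed to be ordered by group, as in the learner loop.
#include <algorithm>
#include <cstdio>
#include <vector>

struct FakeFeature {
  int group;           // feature group index
  bool is_categorical;
  int group_num_bin;   // total bin count of the group this feature belongs to
};

int MaxNumBinNoCategorical(const std::vector<FakeFeature>& features) {
  int max_num_bin_no_categorical = 0;
  bool categorical_found = false;
  for (size_t i = 0; i < features.size(); ++i) {
    if (features[i].is_categorical) categorical_found = true;
    // a group ends at the last feature or when the next feature belongs to another group
    const bool group_ends =
        (i + 1 == features.size()) || (features[i + 1].group != features[i].group);
    if (group_ends) {
      if (!categorical_found) {
        max_num_bin_no_categorical = std::max(max_num_bin_no_categorical, features[i].group_num_bin);
      }
      categorical_found = false;  // reset for the next group
    }
  }
  return max_num_bin_no_categorical;
}

int main() {
  // group 0 is purely numerical (65 bins); group 1 contains a categorical feature (300 bins)
  const std::vector<FakeFeature> features = {
      {0, false, 65}, {0, false, 65}, {1, false, 300}, {1, true, 300}};
  // prints 65: the categorical group is ignored, so the "max_bin 63" hint can still fire
  std::printf("max bins ignoring categorical groups: %d\n", MaxNumBinNoCategorical(features));
  return 0;
}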
