diff --git a/include/LightGBM/dataset_loader.h b/include/LightGBM/dataset_loader.h index 27bea113b052..e4c93e182d96 100644 --- a/include/LightGBM/dataset_loader.h +++ b/include/LightGBM/dataset_loader.h @@ -7,6 +7,7 @@ #include +#include #include #include #include @@ -63,6 +64,16 @@ class DatasetLoader { /*! \brief Check can load from binary file */ std::string CheckCanLoadFromBin(const char* filename); + /*! \brief Check the number of bins for categorical features. + * The number of bins for categorical features may exceed the configured maximum value. + * Log warnings when such cases happen. + * + * \param bin_mappers the bin_mappers of all features + * \param max_bin max_bin from Config + * \param max_bin_by_feature max_bin_by_feature from Config + */ + void CheckCategoricalFeatureNumBin(const std::vector>& bin_mappers, const int max_bin, const std::vector& max_bin_by_feature) const; + const Config& config_; /*! \brief Random generator*/ Random random_; diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index a4e22956cbeb..a2d73a79bcd5 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -805,6 +805,7 @@ Dataset* DatasetLoader::ConstructFromSampleData(double** sample_values, cp_ptr += bin_mappers[i]->SizesInByte(); } } + CheckCategoricalFeatureNumBin(bin_mappers, config_.max_bin, config_.max_bin_by_feature); auto dataset = std::unique_ptr(new Dataset(num_data)); dataset->Construct(&bin_mappers, num_total_features, forced_bin_bounds, sample_indices, sample_values, num_per_col, num_col, total_sample_size, config_); if (dataset->has_raw()) { @@ -1184,6 +1185,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, cp_ptr += bin_mappers[i]->SizesInByte(); } } + CheckCategoricalFeatureNumBin(bin_mappers, config_.max_bin, config_.max_bin_by_feature); dataset->Construct(&bin_mappers, dataset->num_total_features_, forced_bin_bounds, Common::Vector2Ptr(&sample_indices).data(), Common::Vector2Ptr(&sample_values).data(), Common::VectorSize(sample_indices).data(), static_cast(sample_indices.size()), sample_data.size(), config_); @@ -1463,4 +1465,44 @@ std::vector> DatasetLoader::GetForcedBins(std::string forced return forced_bins; } +void DatasetLoader::CheckCategoricalFeatureNumBin( + const std::vector>& bin_mappers, + const int max_bin, const std::vector& max_bin_by_feature) const { + bool need_warning = false; + if (bin_mappers.size() < 1024) { + for (size_t i = 0; i < bin_mappers.size(); ++i) { + const int max_bin_for_this_feature = max_bin_by_feature.empty() ? max_bin : max_bin_by_feature[i]; + if (bin_mappers[i]->bin_type() == BinType::CategoricalBin && bin_mappers[i]->num_bin() > max_bin_for_this_feature) { + need_warning = true; + break; + } + } + } else { + const int num_threads = OMP_NUM_THREADS(); + std::vector thread_need_warning(num_threads, false); + Threading::For(0, bin_mappers.size(), 1, + [&bin_mappers, &thread_need_warning, &max_bin_by_feature, max_bin] (int thread_index, size_t start, size_t end) { + for (size_t i = start; i < end; ++i) { + thread_need_warning[thread_index] = false; + const int max_bin_for_this_feature = max_bin_by_feature.empty() ? max_bin : max_bin_by_feature[i]; + if (bin_mappers[i]->bin_type() == BinType::CategoricalBin && bin_mappers[i]->num_bin() > max_bin_for_this_feature) { + thread_need_warning[thread_index] = true; + break; + } + } + }); + for (int thread_index = 0; thread_index < num_threads; ++thread_index) { + if (thread_need_warning[thread_index]) { + need_warning = true; + break; + } + } + } + + if (need_warning) { + Log::Warning("Categorical features with more bins than the configured maximum bin number found."); + Log::Warning("For categorical features, max_bin and max_bin_by_feature may be ignored with a large number of categories."); + } +} + } // namespace LightGBM diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 4fb9f9ff51fc..a6bd4c47ae06 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -506,10 +506,30 @@ void CUDATreeLearner::InitGPU(int num_gpu) { } else { Log::Fatal("bin size %d cannot run on GPU", max_num_bin_); } - if (max_num_bin_ == 65) { + + // ignore the feature groups that contain categorical features when producing warnings about max_bin. + // these groups may contain larger number of bins due to categorical features, but not due to the setting of max_bin. + int max_num_bin_no_categorical = 0; + int cur_feature_group = 0; + bool categorical_feature_found = false; + for (int inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) { + const int feature_group = train_data_->Feature2Group(inner_feature_index); + const BinMapper* feature_bin_mapper = train_data_->FeatureBinMapper(inner_feature_index); + if (feature_bin_mapper->bin_type() == BinType::CategoricalBin) { + categorical_feature_found = true; + } + if (feature_group != cur_feature_group || inner_feature_index == num_features_ - 1) { + if (!categorical_feature_found) { + max_num_bin_no_categorical = std::max(max_num_bin_no_categorical, train_data_->FeatureGroupNumBin(cur_feature_group)); + } + categorical_feature_found = false; + cur_feature_group = feature_group; + } + } + if (max_num_bin_no_categorical == 65) { Log::Warning("Setting max_bin to 63 is suggested for best performance"); } - if (max_num_bin_ == 17) { + if (max_num_bin_no_categorical == 17) { Log::Warning("Setting max_bin to 15 is suggested for best performance"); } diff --git a/src/treelearner/gpu_tree_learner.cpp b/src/treelearner/gpu_tree_learner.cpp index 9ab996372964..ead5027a95e4 100644 --- a/src/treelearner/gpu_tree_learner.cpp +++ b/src/treelearner/gpu_tree_learner.cpp @@ -719,10 +719,30 @@ void GPUTreeLearner::InitGPU(int platform_id, int device_id) { } else { Log::Fatal("bin size %d cannot run on GPU", max_num_bin_); } - if (max_num_bin_ == 65) { + + // ignore the feature groups that contain categorical features when producing warnings about max_bin. + // these groups may contain larger number of bins due to categorical features, but not due to the setting of max_bin. + int max_num_bin_no_categorical = 0; + int cur_feature_group = 0; + bool categorical_feature_found = false; + for (int inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) { + const int feature_group = train_data_->Feature2Group(inner_feature_index); + const BinMapper* feature_bin_mapper = train_data_->FeatureBinMapper(inner_feature_index); + if (feature_bin_mapper->bin_type() == BinType::CategoricalBin) { + categorical_feature_found = true; + } + if (feature_group != cur_feature_group || inner_feature_index == num_features_ - 1) { + if (!categorical_feature_found) { + max_num_bin_no_categorical = std::max(max_num_bin_no_categorical, train_data_->FeatureGroupNumBin(cur_feature_group)); + } + categorical_feature_found = false; + cur_feature_group = feature_group; + } + } + if (max_num_bin_no_categorical == 65) { Log::Warning("Setting max_bin to 63 is suggested for best performance"); } - if (max_num_bin_ == 17) { + if (max_num_bin_no_categorical == 17) { Log::Warning("Setting max_bin to 15 is suggested for best performance"); } ctx_ = boost::compute::context(dev_);