From ae4cce6d7c3da4b4c4d5879b9cfd695794a9bde4 Mon Sep 17 00:00:00 2001
From: Yu Shi
Date: Tue, 24 Dec 2024 11:04:42 +0000
Subject: [PATCH] use num_gpu instead of num_gpus

---
 R-package/tests/testthat/test_lgb.Booster.R   |  2 +-
 docs/Parameters.rst                           |  8 +--
 include/LightGBM/config.h                     |  5 +-
 include/LightGBM/cuda/cuda_nccl_topology.hpp  | 64 +++++++++----------
 .../LightGBM/cuda/cuda_objective_function.hpp |  2 +-
 src/application/application.cpp               |  6 +-
 src/boosting/boosting.cpp                     |  6 +-
 src/boosting/cuda/nccl_gbdt.cpp               |  2 +-
 src/c_api.cpp                                 |  2 +-
 src/io/config_auto.cpp                        | 13 ++--
 tests/python_package_test/test_engine.py      |  2 +-
 11 files changed, 53 insertions(+), 59 deletions(-)

diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R
index 38d0cae762b0..9197fd7226af 100644
--- a/R-package/tests/testthat/test_lgb.Booster.R
+++ b/R-package/tests/testthat/test_lgb.Booster.R
@@ -1107,7 +1107,7 @@ test_that("all parameters are stored correctly with save_model_to_string()", {
     , "[gpu_platform_id: -1]"
     , "[gpu_device_id: -1]"
     , "[gpu_use_dp: 0]"
-    , "[num_gpus: 1]"
+    , "[num_gpu: 1]"
   )

   all_param_entries <- c(non_default_param_entries, default_param_entries)
diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index ceaf746846f5..b1d72d4009a5 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -1371,12 +1371,6 @@ GPU Parameters

    -  **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details

--  ``num_gpus`` :raw-html:`🔗︎`, default = ``1``, type = int
-
-   -  Number of GPUs to use for training, used with device_type=cuda
-
-   -  When <= 0, only 1 GPU will be used
-
 -  ``gpu_device_id_list`` :raw-html:`🔗︎`, default = ``""``, type = string

    -  List of CUDA device IDs used when device_type=cuda
@@ -1395,6 +1389,8 @@

    -  **Note**: can be used only in CUDA implementation (``device_type="cuda"``)

+   -  When <= 0, only 1 GPU will be used
+
 .. end params list

 Others
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index e4a2ade69c01..d419c32adb89 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -1125,10 +1125,6 @@ struct Config {
   // desc = **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details
   int gpu_device_id = -1;

-  // desc = Number of GPUs to use for training, used with device_type=cuda
-  // desc = When <= 0, only 1 GPU will be used
-  int num_gpus = 1;
-
   // desc = List of CUDA device IDs used when device_type=cuda
   // desc = When empty, the devices with the smallest IDs will be used
   std::string gpu_device_id_list = "";
@@ -1140,6 +1136,7 @@ struct Config {
   // check = >0
   // desc = number of GPUs
   // desc = **Note**: can be used only in CUDA implementation (``device_type="cuda"``)
+  // desc = When <= 0, only 1 GPU will be used
   int num_gpu = 1;

 #ifndef __NVCC__
diff --git a/include/LightGBM/cuda/cuda_nccl_topology.hpp b/include/LightGBM/cuda/cuda_nccl_topology.hpp
index 840b68224f0e..d05f85231b8b 100644
--- a/include/LightGBM/cuda/cuda_nccl_topology.hpp
+++ b/include/LightGBM/cuda/cuda_nccl_topology.hpp
@@ -20,8 +20,8 @@ namespace LightGBM {

 class NCCLTopology {
  public:
-  NCCLTopology(const int master_gpu_device_id, const int num_gpus, const std::string& gpu_device_id_list, const data_size_t global_num_data) {
-    num_gpus_ = num_gpus;
+  NCCLTopology(const int master_gpu_device_id, const int num_gpu, const std::string& gpu_device_id_list, const data_size_t global_num_data) {
+    num_gpu_ = num_gpu;
     master_gpu_device_id_ = master_gpu_device_id;
     global_num_data_ = global_num_data;
     int max_num_gpu = 0;
@@ -42,10 +42,10 @@ class NCCLTopology {
         gpu_list_.push_back(gpu_id);
       }
     }
-    if (!gpu_list_.empty() && num_gpus_ != static_cast<int>(gpu_list_.size())) {
-      Log::Warning("num_gpus_ = %d is different from the number of valid device IDs in gpu_device_list (%d), using %d GPUs instead.", \
-        num_gpus_, static_cast<int>(gpu_list_.size()), static_cast<int>(gpu_list_.size()));
-      num_gpus_ = static_cast<int>(gpu_list_.size());
+    if (!gpu_list_.empty() && num_gpu_ != static_cast<int>(gpu_list_.size())) {
+      Log::Warning("num_gpu_ = %d is different from the number of valid device IDs in gpu_device_list (%d), using %d GPUs instead.", \
+        num_gpu_, static_cast<int>(gpu_list_.size()), static_cast<int>(gpu_list_.size()));
+      num_gpu_ = static_cast<int>(gpu_list_.size());
     }

     if (!gpu_list_.empty()) {
@@ -64,18 +64,18 @@
         master_gpu_index_ = 0;
       }
     } else {
-      if (num_gpus_ <= 0) {
-        num_gpus_ = 1;
-      } else if (num_gpus_ > max_num_gpu) {
+      if (num_gpu_ <= 0) {
+        num_gpu_ = 1;
+      } else if (num_gpu_ > max_num_gpu) {
         Log::Warning("Only %d GPUs available, using num_gpu = %d.", max_num_gpu, max_num_gpu);
-        num_gpus_ = max_num_gpu;
+        num_gpu_ = max_num_gpu;
       }
-      if (master_gpu_device_id_ < 0 || master_gpu_device_id_ >= num_gpus_) {
+      if (master_gpu_device_id_ < 0 || master_gpu_device_id_ >= num_gpu_) {
         Log::Warning("Invalid gpu_device_id = %d for master GPU index, using gpu_device_id = 0 instead.", master_gpu_device_id_);
         master_gpu_device_id_ = 0;
         master_gpu_index_ = 0;
       }
-      for (int i = 0; i < num_gpus_; ++i) {
+      for (int i = 0; i < num_gpu_; ++i) {
         gpu_list_.push_back(i);
       }
     }
@@ -83,18 +83,18 @@
     Log::Info("Using GPU devices %s, and local master GPU device %d.", Common::Join(gpu_list_, ",").c_str(), master_gpu_device_id_);

     const int num_threads = OMP_NUM_THREADS();
-    if (num_gpus_ > num_threads) {
-      Log::Fatal("Number of GPUs %d is greater than the number of threads %d. Please use more threads.", num_gpus_, num_threads);
+    if (num_gpu_ > num_threads) {
+      Log::Fatal("Number of GPUs %d is greater than the number of threads %d. Please use more threads.", num_gpu_, num_threads);
     }

-    host_threads_.resize(num_gpus_);
+    host_threads_.resize(num_gpu_);
   }

   ~NCCLTopology() {}

   void InitNCCL() {
-    nccl_gpu_rank_.resize(num_gpus_, -1);
-    nccl_communicators_.resize(num_gpus_);
+    nccl_gpu_rank_.resize(num_gpu_, -1);
+    nccl_communicators_.resize(num_gpu_);
     ncclUniqueId nccl_unique_id;
     if (Network::num_machines() == 1 || Network::rank() == 0) {
       NCCLCHECK(ncclGetUniqueId(&nccl_unique_id));
@@ -113,15 +113,15 @@
     if (Network::num_machines() > 1) {
       node_rank_offset_.resize(Network::num_machines() + 1, 0);
       Network::Allgather(
-        reinterpret_cast<char*>(&num_gpus_),
+        reinterpret_cast<char*>(&num_gpu_),
         sizeof(int) / sizeof(char),
         reinterpret_cast<char*>(node_rank_offset_.data() + 1));
       for (int rank = 1; rank < Network::num_machines() + 1; ++rank) {
         node_rank_offset_[rank] += node_rank_offset_[rank - 1];
       }
-      CHECK_EQ(node_rank_offset_[Network::rank() + 1] - node_rank_offset_[Network::rank()], num_gpus_);
+      CHECK_EQ(node_rank_offset_[Network::rank() + 1] - node_rank_offset_[Network::rank()], num_gpu_);
       NCCLCHECK(ncclGroupStart());
-      for (int gpu_index = 0; gpu_index < num_gpus_; ++gpu_index) {
+      for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) {
         SetCUDADevice(gpu_list_[gpu_index], __FILE__, __LINE__);
         nccl_gpu_rank_[gpu_index] = gpu_index + node_rank_offset_[Network::rank()];
         NCCLCHECK(ncclCommInitRank(&nccl_communicators_[gpu_index], node_rank_offset_.back(), nccl_unique_id, nccl_gpu_rank_[gpu_index]));
@@ -129,10 +129,10 @@
       NCCLCHECK(ncclGroupEnd());
     } else {
       NCCLCHECK(ncclGroupStart());
-      for (int gpu_index = 0; gpu_index < num_gpus_; ++gpu_index) {
+      for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) {
         SetCUDADevice(gpu_list_[gpu_index], __FILE__, __LINE__);
         nccl_gpu_rank_[gpu_index] = gpu_index;
-        NCCLCHECK(ncclCommInitRank(&nccl_communicators_[gpu_index], num_gpus_, nccl_unique_id, gpu_index));
+        NCCLCHECK(ncclCommInitRank(&nccl_communicators_[gpu_index], num_gpu_, nccl_unique_id, gpu_index));
       }
       NCCLCHECK(ncclGroupEnd());
     }
@@ -143,8 +143,8 @@

   template <typename ARG_T>
   void RunPerDevice(const std::vector<std::unique_ptr<ARG_T>>& objs, const std::function<void(ARG_T*)>& func) {
-    #pragma omp parallel for schedule(static) num_threads(num_gpus_)
-    for (int i = 0; i < num_gpus_; ++i) {
+    #pragma omp parallel for schedule(static) num_threads(num_gpu_)
+    for (int i = 0; i < num_gpu_; ++i) {
       CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_list_[i]));
       func(objs[i].get());
     }
@@ -153,9 +153,9 @@

   template <typename RET_T>
   void InitPerDevice(std::vector<std::unique_ptr<RET_T>>* vec) {
-    vec->resize(num_gpus_);
-    #pragma omp parallel for schedule(static) num_threads(num_gpus_)
-    for (int i = 0; i < num_gpus_; ++i) {
+    vec->resize(num_gpu_);
+    #pragma omp parallel for schedule(static) num_threads(num_gpu_)
+    for (int i = 0; i < num_gpu_; ++i) {
       CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_list_[i]));
       RET_T* nccl_info = new RET_T();
       nccl_info->SetNCCLInfo(nccl_communicators_[i], nccl_gpu_rank_[i], i, gpu_list_[i], global_num_data_);
@@ -166,13 +166,13 @@

   template <typename ARG_T>
   void DispatchPerDevice(std::vector<std::unique_ptr<ARG_T>>* objs, const std::function<void(ARG_T*)>& func) {
-    for (int i = 0; i < num_gpus_; ++i) {
+    for (int i = 0; i < num_gpu_; ++i) {
       host_threads_[i] = std::thread([this, i, &func, objs] () {
         CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_list_[i]))
         func(objs->operator[](i).get());
       });
     }
-    for (int i = 0; i < num_gpus_; ++i) {
+    for (int i = 0; i < num_gpu_; ++i) {
       host_threads_[i].join();
     }
     CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_));
@@ -186,7 +186,7 @@

   template <typename ARG_T>
   void RunOnNonMasterDevice(const std::vector<std::unique_ptr<ARG_T>>& objs, const std::function<void(ARG_T*)>& func) {
-    for (int i = 0; i < num_gpus_; ++i) {
+    for (int i = 0; i < num_gpu_; ++i) {
       if (i != master_gpu_index_) {
         CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_list_[i]));
         func(objs[i].get());
@@ -195,7 +195,7 @@
     CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_));
   }

-  int num_gpus() const { return num_gpus_; }
+  int num_gpu() const { return num_gpu_; }

   int master_gpu_index() const { return master_gpu_index_; }

@@ -204,7 +204,7 @@
   const std::vector<int>& gpu_list() const { return gpu_list_; }

  private:
-  int num_gpus_;
+  int num_gpu_;
   int master_gpu_index_;
   int master_gpu_device_id_;
   std::vector<int> gpu_list_;
diff --git a/include/LightGBM/cuda/cuda_objective_function.hpp b/include/LightGBM/cuda/cuda_objective_function.hpp
index b31d6eb1d016..906a7851368d 100644
--- a/include/LightGBM/cuda/cuda_objective_function.hpp
+++ b/include/LightGBM/cuda/cuda_objective_function.hpp
@@ -22,7 +22,7 @@ template <typename HOST_OBJECTIVE>
 class CUDAObjectiveInterface: public HOST_OBJECTIVE, public NCCLInfo {
  public:
   explicit CUDAObjectiveInterface(const Config& config): HOST_OBJECTIVE(config) {
-    if (config.num_gpus <= 1) {
+    if (config.num_gpu <= 1) {
       const int gpu_device_id = config.gpu_device_id >= 0 ? config.gpu_device_id : 0;
       SetCUDADevice(gpu_device_id, __FILE__, __LINE__);
     }
diff --git a/src/application/application.cpp b/src/application/application.cpp
index 83ef1fe6a8f1..ddfe4352097d 100644
--- a/src/application/application.cpp
+++ b/src/application/application.cpp
@@ -181,7 +181,7 @@ void Application::InitTrain() {
   // create boosting
   boosting_.reset(
       Boosting::CreateBoosting(config_.boosting,
-                               config_.input_model.c_str(), config_.device_type, config_.num_gpus));
+                               config_.input_model.c_str(), config_.device_type, config_.num_gpu));
   // create objective function
   objective_fun_.reset(
       ObjectiveFunction::CreateObjectiveFunction(config_.objective,
@@ -274,13 +274,13 @@ void Application::Predict() {

 void Application::InitPredict() {
   boosting_.reset(
-      Boosting::CreateBoosting("gbdt", config_.input_model.c_str(), config_.device_type, config_.num_gpus));
+      Boosting::CreateBoosting("gbdt", config_.input_model.c_str(), config_.device_type, config_.num_gpu));
   Log::Info("Finished initializing prediction, total used %d iterations", boosting_->GetCurrentIteration());
 }

 void Application::ConvertModel() {
   boosting_.reset(
-      Boosting::CreateBoosting(config_.boosting, config_.input_model.c_str(), config_.device_type, config_.num_gpus));
+      Boosting::CreateBoosting(config_.boosting, config_.input_model.c_str(), config_.device_type, config_.num_gpu));
   boosting_->SaveModelToIfElse(-1, config_.convert_model.c_str());
 }

diff --git a/src/boosting/boosting.cpp b/src/boosting/boosting.cpp
index d8fbee0ceb81..4b5bf33f201d 100644
--- a/src/boosting/boosting.cpp
+++ b/src/boosting/boosting.cpp
@@ -42,13 +42,13 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename
 #endif  // USE_CUDA
                                    , const int
 #ifdef USE_CUDA
-                                   num_gpus
+                                   num_gpu
 #endif  // USE_CUDA
 ) {
   if (filename == nullptr || filename[0] == '\0') {
     if (type == std::string("gbdt")) {
 #ifdef USE_CUDA
-      if (device_type == std::string("cuda") && num_gpus > 1) {
+      if (device_type == std::string("cuda") && num_gpu > 1) {
         return new NCCLGBDT<GBDT>();
       } else {
 #endif  // USE_CUDA
@@ -70,7 +70,7 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename
   if (GetBoostingTypeFromModelFile(filename) == std::string("tree")) {
     if (type == std::string("gbdt")) {
 #ifdef USE_CUDA
-      if (device_type == std::string("cuda") && num_gpus > 1) {
+      if (device_type == std::string("cuda") && num_gpu > 1) {
         ret.reset(new NCCLGBDT<GBDT>());
       } else {
 #endif  // USE_CUDA
diff --git a/src/boosting/cuda/nccl_gbdt.cpp b/src/boosting/cuda/nccl_gbdt.cpp
index b2e2566f5ba9..af236a39f944 100644
--- a/src/boosting/cuda/nccl_gbdt.cpp
+++ b/src/boosting/cuda/nccl_gbdt.cpp
@@ -30,7 +30,7 @@ void NCCLGBDT<GBDT_T>::Init(

   this->tree_learner_.reset();

-  nccl_topology_.reset(new NCCLTopology(this->config_->gpu_device_id, this->config_->num_gpus, this->config_->gpu_device_id_list, train_data->num_data()));
+  nccl_topology_.reset(new NCCLTopology(this->config_->gpu_device_id, this->config_->num_gpu, this->config_->gpu_device_id_list, train_data->num_data()));

   nccl_topology_->InitNCCL();

diff --git a/src/c_api.cpp b/src/c_api.cpp
index 7d05121292ce..c289d9fdbabd 100644
--- a/src/c_api.cpp
+++ b/src/c_api.cpp
@@ -177,7 +177,7 @@ class Booster {
                 "please use continued train with input score");
     }

-    boosting_.reset(Boosting::CreateBoosting(config_.boosting, nullptr, config_.device_type, config_.num_gpus));
+    boosting_.reset(Boosting::CreateBoosting(config_.boosting, nullptr, config_.device_type, config_.num_gpu));

     train_data_ = train_data;
     CreateObjectiveAndMetrics();
diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp
index 2e95f2da6bd8..1bc96c38b926 100644
--- a/src/io/config_auto.cpp
+++ b/src/io/config_auto.cpp
@@ -320,9 +320,9 @@ const std::unordered_set<std::string>& Config::parameter_set() {
   "machines",
   "gpu_platform_id",
   "gpu_device_id",
-  "num_gpus",
   "gpu_device_id_list",
   "gpu_use_dp",
+  "num_gpu",
   });
   return params;
 }
@@ -664,11 +664,12 @@ void Config::GetMembersFromString(const std::unordered_map>& Config::paramet
   {"machines", {"workers", "nodes"}},
   {"gpu_platform_id", {}},
   {"gpu_device_id", {}},
-  {"num_gpus", {}},
   {"gpu_device_id_list", {}},
   {"gpu_use_dp", {}},
+  {"num_gpu", {}},
   });
   return map;
 }
@@ -1078,9 +1079,9 @@ const std::unordered_map<std::string, std::string>& Config::ParameterTypes() {
   {"machines", "string"},
   {"gpu_platform_id", "int"},
   {"gpu_device_id", "int"},
-  {"num_gpus", "int"},
   {"gpu_device_id_list", "string"},
   {"gpu_use_dp", "bool"},
+  {"num_gpu", "int"},
   });
   return map;
 }
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index b534e4da7f99..667cb86c1a14 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -1684,7 +1684,7 @@ def test_all_expected_params_are_written_out_to_model_text(tmp_path):
         "[machines: ]",
         "[gpu_platform_id: -1]",
         "[gpu_device_id: -1]",
-        "[num_gpus: 1]",
+        "[num_gpu: 1]",
     ]

     all_param_entries = non_default_param_entries + default_param_entries
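Usage sketch (illustrative, not part of the patch): after this rename, multi-GPU CUDA training is requested through the ``num_gpu`` parameter rather than ``num_gpus``. The Python snippet below is a minimal sketch only; it assumes a CUDA-enabled LightGBM build with NCCL multi-GPU support and at least two visible GPUs, and the synthetic data, feature count, and number of boosting rounds are arbitrary placeholders.

import numpy as np
import lightgbm as lgb

# Illustrative sketch only: assumes a CUDA build of LightGBM with NCCL
# multi-GPU support and at least two visible GPUs; the data is synthetic.
X = np.random.rand(10_000, 20)
y = np.random.rand(10_000)
train_set = lgb.Dataset(X, label=y)

params = {
    "objective": "regression",
    "device_type": "cuda",
    "num_gpu": 2,                   # renamed by this patch; previously "num_gpus"
    # "gpu_device_id_list": "0,1",  # optional: pick specific CUDA device IDs
}

booster = lgb.train(params, train_set, num_boost_round=10)

# The parameter is written to the model text under its new name, matching the
# assertions updated in the R and Python tests above.
print("[num_gpu: 2]" in booster.model_to_string())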