Skip to content

Commit

Permalink
[CUDA] CUDA Quantized Training (fixes microsoft#5606) (microsoft#5933)
Browse files Browse the repository at this point in the history
* add quantized training (first stage)

* add histogram construction functions for integer gradients

* add stochastic rounding

* update docs

* fix compilation errors by adding template instantiations

* update files for compilation

* fix compilation of gpu version

* initialize gradient discretizer before share states

* add a test case for quantized training

* add quantized training for data distributed training

* Delete origin.pred

* Delete ifelse.pred

* Delete LightGBM_model.txt

* remove useless changes

* fix lint error

* remove debug loggings

* fix mismatch of vector and allocator types

* remove changes in main.cpp

* fix bugs with uninitialized gradient discretizer

* initialize ordered gradients in gradient discretizer

* disable quantized training with gpu and cuda

fix msvc compilation errors and warnings

* fix bug in data parallel tree learner

* make quantized training test deterministic

* make quantized training in test case more accurate

* refactor test_quantized_training

* fix leaf splits initialization with quantized training

* check distributed quantized training result

* add cuda gradient discretizer

* add quantized training for CUDA version in tree learner

* remove cuda computability 6.1 and 6.2

* fix parts of gpu quantized training errors and warnings

* fix build-python.sh to install locally built version

* fix memory access bugs

* fix lint errors

* mark cuda quantized training on cuda with categorical features as unsupported

* rename cuda_utils.h to cuda_utils.hu

* enable quantized training with cuda

* fix cuda quantized training with sparse row data

* allow using global memory buffer in histogram construction with cuda quantized training

* recover build-python.sh

enlarge allowed package size to 100M
  • Loading branch information
shiyu1994 authored Oct 8, 2023
1 parent 3d9ada7 commit f901f47
Show file tree
Hide file tree
Showing 33 changed files with 1,912 additions and 259 deletions.
2 changes: 1 addition & 1 deletion .ci/check_python_dists.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ if [ $PY_MINOR_VER -gt 7 ]; then
pydistcheck \
--inspect \
--ignore 'compiled-objects-have-debug-symbols,distro-too-large-compressed' \
--max-allowed-size-uncompressed '70M' \
--max-allowed-size-uncompressed '100M' \
--max-allowed-files 800 \
${DIST_DIR}/* || exit -1
elif { test $(uname -m) = "aarch64"; }; then
Expand Down
2 changes: 1 addition & 1 deletion include/LightGBM/cuda/cuda_algorithms.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#include <stdio.h>

#include <LightGBM/bin.h>
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/utils/log.h>

#include <algorithm>
Expand Down
2 changes: 1 addition & 1 deletion include/LightGBM/cuda/cuda_column_data.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#define LIGHTGBM_CUDA_CUDA_COLUMN_DATA_HPP_

#include <LightGBM/config.h>
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/bin.h>
#include <LightGBM/utils/openmp_wrapper.h>

Expand Down
2 changes: 1 addition & 1 deletion include/LightGBM/cuda/cuda_metadata.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#ifndef LIGHTGBM_CUDA_CUDA_METADATA_HPP_
#define LIGHTGBM_CUDA_CUDA_METADATA_HPP_

#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/meta.h>

#include <vector>
Expand Down
2 changes: 1 addition & 1 deletion include/LightGBM/cuda/cuda_metric.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

#ifdef USE_CUDA

#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/metric.h>

namespace LightGBM {
Expand Down
2 changes: 1 addition & 1 deletion include/LightGBM/cuda/cuda_objective_function.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

#ifdef USE_CUDA

#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/objective_function.h>
#include <LightGBM/meta.h>

Expand Down
2 changes: 1 addition & 1 deletion include/LightGBM/cuda/cuda_row_data.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

#include <LightGBM/bin.h>
#include <LightGBM/config.h>
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/dataset.h>
#include <LightGBM/train_share_states.h>
#include <LightGBM/utils/openmp_wrapper.h>
Expand Down
2 changes: 2 additions & 0 deletions include/LightGBM/cuda/cuda_split_info.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,14 @@ class CUDASplitInfo {

double left_sum_gradients;
double left_sum_hessians;
int64_t left_sum_of_gradients_hessians;
data_size_t left_count;
double left_gain;
double left_value;

double right_sum_gradients;
double right_sum_hessians;
int64_t right_sum_of_gradients_hessians;
data_size_t right_count;
double right_gain;
double right_value;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,21 @@
#define LIGHTGBM_CUDA_CUDA_UTILS_H_

#ifdef USE_CUDA

#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>

#include <LightGBM/utils/log.h>

#include <algorithm>
#include <vector>
#include <cmath>

namespace LightGBM {

typedef unsigned long long atomic_add_long_t;

#define CUDASUCCESS_OR_FATAL(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
if (code != cudaSuccess) {
Expand Down Expand Up @@ -125,13 +131,19 @@ class CUDAVector {
T* new_data = nullptr;
AllocateCUDAMemory<T>(&new_data, size, __FILE__, __LINE__);
if (size_ > 0 && data_ != nullptr) {
CopyFromCUDADeviceToCUDADevice<T>(new_data, data_, size, __FILE__, __LINE__);
const size_t size_for_old_content = std::min<size_t>(size_, size);
CopyFromCUDADeviceToCUDADevice<T>(new_data, data_, size_for_old_content, __FILE__, __LINE__);
}
DeallocateCUDAMemory<T>(&data_, __FILE__, __LINE__);
data_ = new_data;
size_ = size;
}

void InitFromHostVector(const std::vector<T>& host_vector) {
Resize(host_vector.size());
CopyFromHostToCUDADevice(data_, host_vector.data(), host_vector.size(), __FILE__, __LINE__);
}

void Clear() {
if (size_ > 0 && data_ != nullptr) {
DeallocateCUDAMemory<T>(&data_, __FILE__, __LINE__);
Expand Down Expand Up @@ -171,6 +183,10 @@ class CUDAVector {
return data_;
}

void SetValue(int value) {
SetCUDAMemory<T>(data_, value, size_, __FILE__, __LINE__);
}

const T* RawDataReadOnly() const {
return data_;
}
Expand Down
2 changes: 1 addition & 1 deletion include/LightGBM/sample_strategy.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#ifndef LIGHTGBM_SAMPLE_STRATEGY_H_
#define LIGHTGBM_SAMPLE_STRATEGY_H_

#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/utils/random.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/threading.h>
Expand Down
2 changes: 1 addition & 1 deletion src/boosting/cuda/cuda_score_updater.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

#ifdef USE_CUDA

#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>

#include "../score_updater.hpp"

Expand Down
2 changes: 1 addition & 1 deletion src/cuda/cuda_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

#ifdef USE_CUDA

#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>

namespace LightGBM {

Expand Down
4 changes: 0 additions & 4 deletions src/io/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -389,10 +389,6 @@ void Config::CheckParamConflict() {
if (deterministic) {
Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic.");
}
if (use_quantized_grad) {
Log::Warning("Quantized training is not supported by CUDA tree learner. Switch to full precision training.");
use_quantized_grad = false;
}
}
// linear tree learner must be serial type and run on CPU device
if (linear_tree) {
Expand Down
2 changes: 1 addition & 1 deletion src/metric/cuda/cuda_binary_metric.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#ifdef USE_CUDA

#include <LightGBM/cuda/cuda_metric.hpp>
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>

#include <vector>

Expand Down
2 changes: 1 addition & 1 deletion src/metric/cuda/cuda_pointwise_metric.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#ifdef USE_CUDA

#include <LightGBM/cuda/cuda_metric.hpp>
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>

#include <vector>

Expand Down
2 changes: 1 addition & 1 deletion src/metric/cuda/cuda_regression_metric.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#ifdef USE_CUDA

#include <LightGBM/cuda/cuda_metric.hpp>
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>

#include <vector>

Expand Down
19 changes: 16 additions & 3 deletions src/treelearner/cuda/cuda_best_split_finder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ CUDABestSplitFinder::CUDABestSplitFinder(
select_features_by_node_(select_features_by_node),
cuda_hist_(cuda_hist) {
InitFeatureMetaInfo(train_data);
if (has_categorical_feature_ && config->use_quantized_grad) {
Log::Fatal("Quantized training on GPU with categorical features is not supported yet.");
}
cuda_leaf_best_split_info_ = nullptr;
cuda_best_split_info_ = nullptr;
cuda_best_split_info_buffer_ = nullptr;
Expand Down Expand Up @@ -326,13 +329,23 @@ void CUDABestSplitFinder::FindBestSplitsForLeaf(
const data_size_t num_data_in_smaller_leaf,
const data_size_t num_data_in_larger_leaf,
const double sum_hessians_in_smaller_leaf,
const double sum_hessians_in_larger_leaf) {
const double sum_hessians_in_larger_leaf,
const score_t* grad_scale,
const score_t* hess_scale,
const uint8_t smaller_num_bits_in_histogram_bins,
const uint8_t larger_num_bits_in_histogram_bins) {
const bool is_smaller_leaf_valid = (num_data_in_smaller_leaf > min_data_in_leaf_ &&
sum_hessians_in_smaller_leaf > min_sum_hessian_in_leaf_);
const bool is_larger_leaf_valid = (num_data_in_larger_leaf > min_data_in_leaf_ &&
sum_hessians_in_larger_leaf > min_sum_hessian_in_leaf_ && larger_leaf_index >= 0);
LaunchFindBestSplitsForLeafKernel(smaller_leaf_splits, larger_leaf_splits,
smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid);
if (grad_scale != nullptr && hess_scale != nullptr) {
LaunchFindBestSplitsDiscretizedForLeafKernel(smaller_leaf_splits, larger_leaf_splits,
smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid,
grad_scale, hess_scale, smaller_num_bits_in_histogram_bins, larger_num_bits_in_histogram_bins);
} else {
LaunchFindBestSplitsForLeafKernel(smaller_leaf_splits, larger_leaf_splits,
smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid);
}
global_timer.Start("CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel");
LaunchSyncBestSplitForLeafKernel(smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid);
SynchronizeCUDADevice(__FILE__, __LINE__);
Expand Down
Loading

0 comments on commit f901f47

Please sign in to comment.