Support both row-wise and col-wise multi-threading #2699

Merged · 33 commits · Feb 2, 2020

Commits (33):
c8883fc  commit (guolinke, Jan 20, 2020)
281dd32  fix a bug (guolinke, Jan 20, 2020)
ea718c2  fix bug (guolinke, Jan 21, 2020)
2ad4af5  reset to track changes (guolinke, Jan 30, 2020)
748c95a  refine the auto choose logic (guolinke, Jan 30, 2020)
0340ffd  sort the time stats output (guolinke, Jan 30, 2020)
d3434c7  fix include (guolinke, Jan 30, 2020)
8c4ea1a  change multi_val_bin_sparse_threshold (guolinke, Jan 30, 2020)
6cac288  add cmake (guolinke, Jan 30, 2020)
afdbf3c  add _mm_malloc and _mm_free for cross platform (guolinke, Jan 30, 2020)
210ac4b  fix cmake bug (guolinke, Jan 30, 2020)
ad2865d  timer for split (guolinke, Jan 30, 2020)
4c4a33b  try to fix cmake (guolinke, Jan 30, 2020)
2a33dcb  fix tests (guolinke, Jan 30, 2020)
256e6d9  refactor DataPartition::Split (guolinke, Jan 30, 2020)
a722b38  Merge remote-tracking branch 'origin/master' into sparse_bin_clean (guolinke, Jan 30, 2020)
7a59f19  fix test (guolinke, Jan 30, 2020)
1ac8283  typo (guolinke, Jan 30, 2020)
5b8de4f  formating (guolinke, Jan 30, 2020)
106c081  Revert "formating" (guolinke, Jan 31, 2020)
382e13e  add document (guolinke, Jan 31, 2020)
dec3d79  [R-package] Added tests on use of force_col_wise and force_row_wise i… (jameslamb, Jan 31, 2020)
d2fb9b3  naming (guolinke, Jan 31, 2020)
5db5d74  fix gpu code (guolinke, Jan 31, 2020)
7fda05a  Update include/LightGBM/bin.h (guolinke, Jan 31, 2020)
27a7209  Update src/treelearner/ocl/histogram16.cl (guolinke, Jan 31, 2020)
4623cd4  test: swap compilers for CI (StrikerRUS, Jan 31, 2020)
38d1e57  fix omp (guolinke, Feb 1, 2020)
8e27631  not avx2 (guolinke, Feb 1, 2020)
c86a479  no aligned for feature histogram (guolinke, Feb 1, 2020)
737e9c9  Revert "refactor DataPartition::Split" (guolinke, Feb 1, 2020)
ce5f66b  slightly refactor data partition (guolinke, Feb 1, 2020)
a123c47  reduce the memory cost (guolinke, Feb 2, 2020)

Changes shown are from 1 commit: 2ad4af5b89fa43f27a540c3ad89cb385634dd248 ("reset to track changes"), committed by guolinke on Jan 30, 2020.

include/LightGBM/bin.h (4 changes: 3 additions & 1 deletion)

@@ -477,7 +477,9 @@ class MultiValBin {

   virtual void FinishLoad() = 0;

-  static MultiValBin* CreateMultiValBin(data_size_t num_data, int num_bin);
+  virtual bool IsSparse() = 0;
+
+  static MultiValBin* CreateMultiValBin(data_size_t num_data, int num_bin, int num_feature, double sparse_rate);

   virtual MultiValBin* Clone() = 0;
 };
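
For orientation, a minimal sketch of a call site for the widened factory; all values are invented and the sketch only assumes the declaration above:

```cpp
#include <memory>
#include <LightGBM/bin.h>  // MultiValBin; data_size_t comes in via meta.h

void BuildMultiValBinSketch() {
  data_size_t num_data = 100000;  // rows in the training data
  int num_total_bin = 512;        // total bins across the bundled features
  int num_feature = 30;           // features packed into this multi-value bin
  double sparse_rate = 0.85;      // avg. fraction of rows at each feature's most-frequent bin
  std::unique_ptr<MultiValBin> bin(MultiValBin::CreateMultiValBin(
      num_data, num_total_bin, num_feature, sparse_rate));
  // ... PushOneRow(...) for every row, then:
  bin->FinishLoad();
}
```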

include/LightGBM/dataset.h (13 changes: 8 additions & 5 deletions)

@@ -516,11 +516,6 @@ class Dataset {
     return feature_groups_[group]->bin_mappers_[sub_feature].get();
   }

-  inline const Bin* FeatureBin(int i) const {
-    const int group = feature2group_[i];
-    return feature_groups_[group]->bin_data_.get();
-  }
-
   inline const Bin* FeatureGroupBin(int group) const {
     return feature_groups_[group]->bin_data_.get();
   }

@@ -531,6 +526,14 @@
     return feature_groups_[group]->SubFeatureIterator(sub_feature);
   }

+  inline BinIterator* FeatureGroupIterator(int group) const {
+    return feature_groups_[group]->FeatureGroupIterator();
+  }
+
+  inline bool IsMultiGroup(int i) const {
+    return feature_groups_[i]->is_multi_val_;
+  }
+
   inline double RealThreshold(int i, uint32_t threshold) const {
     const int group = feature2group_[i];
     const int sub_feature = feature2subfeature_[i];
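
A hypothetical sketch of the two new accessors working together (not code from the PR; `num_groups` is a stand-in for however the caller knows the group count, and note that IsMultiGroup takes a group index despite the `i` name):

```cpp
#include <memory>
#include <LightGBM/dataset.h>

void ScanGroupsSketch(const Dataset* ds, int num_groups) {
  for (int gid = 0; gid < num_groups; ++gid) {
    if (!ds->IsMultiGroup(gid)) {
      // Bundled single-value group: one iterator yields the whole group's bins.
      std::unique_ptr<BinIterator> it(ds->FeatureGroupIterator(gid));
      it->Reset(0);  // start scanning at row 0
      // ... consume it->Get(row) per row ...
    }
    // Multi-value groups are instead read feature-by-feature via
    // SubFeatureIterator, as GetMultiBinFromAllFeatures does below.
  }
}
```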

include/LightGBM/meta.h (3 changes: 0 additions & 3 deletions)

@@ -71,9 +71,6 @@ typedef void(*AllgatherFunction)(char* input, comm_size_t input_size, const comm

 #define NO_SPECIFIC (-1)

-// Prefetch size is usually 64 bytes
-const int kCacheLineSize = 64;
-
 const int kAlignedSize = 32;

 #define SIZE_ALIGNED(t) ((t) + kAlignedSize - 1) / kAlignedSize * kAlignedSize
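
With kCacheLineSize gone, kAlignedSize and SIZE_ALIGNED are the remaining alignment knobs here. A quick worked check of the macro (my example, not part of the diff):

```cpp
// SIZE_ALIGNED rounds a size up to the next multiple of kAlignedSize (32):
//   SIZE_ALIGNED(100) -> ((100 + 31) / 32) * 32 = (131 / 32) * 32 = 4 * 32 = 128
//   SIZE_ALIGNED(64)  -> ((64 + 31) / 32) * 32  = (95 / 32) * 32  = 2 * 32 = 64
static_assert(SIZE_ALIGNED(100) == 128, "rounds 100 up to 128");
static_assert(SIZE_ALIGNED(64) == 64, "already-aligned sizes are unchanged");
```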

include/LightGBM/utils/common.h (4 changes: 4 additions & 0 deletions)

@@ -948,6 +948,10 @@ inline bool CheckAllowedJSON(const std::string& s) {
   return true;
 }

+inline int RoundInt(double x) {
+  return static_cast<int>(x + 0.5f);
+}
+
 template <typename T, std::size_t N = 32>
 class AlignmentAllocator {
  public:
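
One observation on the new helper (mine, not a claim about the PR's intent): adding 0.5 and truncating rounds half-up only for non-negative inputs, which fits its likely use on counts and rates:

```cpp
// Behavior of RoundInt (my worked examples, not from the PR):
//   RoundInt(2.4)  == static_cast<int>(2.9)  == 2
//   RoundInt(2.5)  == static_cast<int>(3.0)  == 3
//   RoundInt(-2.5) == static_cast<int>(-2.0) == -2   (truncates toward zero,
//                                                     not round-half-down)
```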

src/io/bin.cpp (24 changes: 18 additions & 6 deletions)

@@ -16,6 +16,7 @@
 #include "dense_bin.hpp"
 #include "dense_nbits_bin.hpp"
 #include "multi_val_dense_bin.hpp"
+#include "multi_val_sparse_bin.hpp"
 #include "sparse_bin.hpp"

 namespace LightGBM {

@@ -663,13 +664,24 @@ namespace LightGBM {
   }
 }

-MultiValBin* MultiValBin::CreateMultiValBin(data_size_t num_data, int num_bin) {
-  if (num_bin <= 256) {
-    return new MultiValDenseBin<uint8_t>(num_data, num_bin);
-  } else if (num_bin <= 65536) {
-    return new MultiValDenseBin<uint16_t>(num_data, num_bin);
+MultiValBin* MultiValBin::CreateMultiValBin(data_size_t num_data, int num_bin, int num_feature, double sparse_rate) {
+  const double multi_val_bin_sparse_threshold = 0.2;
+  if (sparse_rate >= multi_val_bin_sparse_threshold) {
+    if (num_bin <= 256) {
+      return new MultiValSparseBin<uint8_t>(num_data, num_bin);
+    } else if (num_bin <= 65536) {
+      return new MultiValSparseBin<uint16_t>(num_data, num_bin);
+    } else {
+      return new MultiValSparseBin<uint32_t>(num_data, num_bin);
+    }
   } else {
-    return new MultiValDenseBin<uint32_t>(num_data, num_bin);
+    if (num_bin <= 256) {
+      return new MultiValDenseBin<uint8_t>(num_data, num_bin, num_feature);
+    } else if (num_bin <= 65536) {
+      return new MultiValDenseBin<uint16_t>(num_data, num_bin, num_feature);
+    } else {
+      return new MultiValDenseBin<uint32_t>(num_data, num_bin, num_feature);
+    }
   }
 }
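
Restating the factory's dispatch as a table (my summary of the code above; 0.2 is the hard-coded multi_val_bin_sparse_threshold, and only the dense constructors take num_feature):

                      sparse_rate >= 0.2             sparse_rate < 0.2
  num_bin <= 256      MultiValSparseBin<uint8_t>     MultiValDenseBin<uint8_t>
  num_bin <= 65536    MultiValSparseBin<uint16_t>    MultiValDenseBin<uint16_t>
  num_bin >  65536    MultiValSparseBin<uint32_t>    MultiValDenseBin<uint32_t>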

src/io/dataset.cpp (164 changes: 94 additions & 70 deletions)

@@ -55,7 +55,7 @@ int GetConfilctCount(const std::vector<bool>& mark, const int* indices, int num_
     if (mark[indices[i]]) {
       ++ret;
     }
-    if (ret >= max_cnt) {
+    if (ret > max_cnt) {
       return -1;
     }
   }

@@ -98,7 +98,7 @@ std::vector<std::vector<int>> FindGroups(const std::vector<std::unique_ptr<BinMa
     data_size_t total_sample_cnt,
     data_size_t num_data,
     bool is_use_gpu,
-    std::vector<bool>* multi_val_group) {
+    std::vector<int8_t>* multi_val_group) {
   const int max_search_group = 100;
   const int max_bin_per_group = 256;
   const data_size_t single_val_max_conflict_cnt = static_cast<data_size_t>(total_sample_cnt / 10000);

@@ -217,7 +217,7 @@ std::vector<std::vector<int>> FastFeatureBundling(const std::vector<std::unique_
     const std::vector<int>& used_features,
     data_size_t num_data,
     bool is_use_gpu,
-    std::vector<bool>* multi_val_group) {
+    std::vector<int8_t>* multi_val_group) {
   Common::FunctionTimer fun_timer("Dataset::FastFeatureBundling", global_timer);
   std::vector<size_t> feature_non_zero_cnt;
   feature_non_zero_cnt.reserve(used_features.size());

@@ -262,7 +262,7 @@ std::vector<std::vector<int>> FastFeatureBundling(const std::vector<std::unique_
       tmp_num_per_col[fidx] = num_per_col[fidx];
     }
   }
-  std::vector<bool> group_is_multi_val, group_is_multi_val2;
+  std::vector<int8_t> group_is_multi_val, group_is_multi_val2;
   auto features_in_group = FindGroups(bin_mappers, used_features, sample_indices, tmp_num_per_col.data(), num_sample_col, total_sample_cnt, num_data, is_use_gpu, &group_is_multi_val);
   auto group2 = FindGroups(bin_mappers, feature_order_by_cnt, sample_indices, tmp_num_per_col.data(), num_sample_col, total_sample_cnt, num_data, is_use_gpu, &group_is_multi_val2);

@@ -277,7 +277,7 @@ std::vector<std::vector<int>> FastFeatureBundling(const std::vector<std::unique_
     int j = tmp_rand.NextShort(i + 1, num_group);
     std::swap(features_in_group[i], features_in_group[j]);
     // Use std::swap for vector<bool> will cause the wrong result..
-    std::vector<bool>::swap(group_is_multi_val[i], group_is_multi_val[j]);
+    std::swap(group_is_multi_val[i], group_is_multi_val[j]);
   }
   *multi_val_group = group_is_multi_val;
   return features_in_group;
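
Background on the switch from std::vector<bool> to std::vector<int8_t> (my explanation of the comment retained above): the packed vector<bool> specialization returns proxy objects from operator[], so element-wise std::swap does not compile and the old code had to call the specialization's static swap(reference, reference). With int8_t, elements are real lvalues. A minimal sketch:

```cpp
#include <cstdint>
#include <utility>
#include <vector>

void SwapFlagDemo() {
  // With int8_t, operator[] returns a real reference, so std::swap works:
  std::vector<int8_t> flags = {0, 1};
  std::swap(flags[0], flags[1]);

  // std::vector<bool> is bit-packed; packed[i] is a proxy, not a bool&,
  // so std::swap(packed[0], packed[1]) would not compile. The specialization
  // provides a static swap(reference, reference) instead:
  std::vector<bool> packed = {false, true};
  std::vector<bool>::swap(packed[0], packed[1]);
}
```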

@@ -307,7 +307,7 @@ void Dataset::Construct(
     Log::Warning("There are no meaningful features, as all feature values are constant.");
   }
   auto features_in_group = NoGroup(used_features);
-  std::vector<bool> group_is_multi_val(used_features.size(), false);
+  std::vector<int8_t> group_is_multi_val(used_features.size(), 0);
   if (io_config.enable_bundle && !used_features.empty()) {
     features_in_group = FastFeatureBundling(*bin_mappers,
       sample_non_zero_indices, sample_values, num_per_col, num_sample_col, static_cast<data_size_t>(total_sample_cnt),

@@ -482,6 +482,66 @@ void Dataset::FinishLoad() {
   is_finish_load_ = true;
 }

+void PushDataToMultiValBin(int num_threads, data_size_t num_data, const std::vector<uint32_t> most_freq_bins,
+    const std::vector<uint32_t> offsets, std::vector<std::vector<std::unique_ptr<BinIterator>>>& iters, MultiValBin* ret) {
+  Common::FunctionTimer fun_time("Dataset::PushDataToMultiValBin", global_timer);
+  const data_size_t min_block_size = 4096;
+  const int n_block = std::min(num_threads, (num_data + min_block_size - 1) / min_block_size);
+  const data_size_t block_size = (num_data + n_block - 1) / n_block;
+  if (ret->IsSparse()) {
+#pragma omp parallel for schedule(static)
+    for (int tid = 0; tid < n_block; ++tid) {
+      std::vector<uint32_t> cur_data;
+      data_size_t start = tid * block_size;
+      data_size_t end = std::min(num_data, start + block_size);
+      for (size_t j = 0; j < most_freq_bins.size(); ++j) {
+        iters[tid][j]->Reset(start);
+      }
+      for (data_size_t i = start; i < end; ++i) {
+        cur_data.clear();
+        for (size_t j = 0; j < most_freq_bins.size(); ++j) {
+          auto cur_bin = iters[tid][j]->Get(i);
+          if (cur_bin == most_freq_bins[j]) {
+            continue;
+          }
+          cur_bin += offsets[j];
+          if (most_freq_bins[j] == 0) {
+            cur_bin -= 1;
+          }
+          cur_data.push_back(cur_bin);
+        }
+        ret->PushOneRow(tid, i, cur_data);
+      }
+    }
+  } else {
+#pragma omp parallel for schedule(static)
+    for (int tid = 0; tid < n_block; ++tid) {
+      std::vector<uint32_t> cur_data;
+      data_size_t start = tid * block_size;
+      data_size_t end = std::min(num_data, start + block_size);
+      for (size_t j = 0; j < most_freq_bins.size(); ++j) {
+        iters[tid][j]->Reset(start);
+      }
+      for (data_size_t i = start; i < end; ++i) {
+        cur_data.clear();
+        for (size_t j = 0; j < most_freq_bins.size(); ++j) {
+          auto cur_bin = iters[tid][j]->Get(i);
+          if (cur_bin == most_freq_bins[j]) {
+            cur_bin = 0;
+          } else {
+            cur_bin += offsets[j];
+            if (most_freq_bins[j] == 0) {
+              cur_bin -= 1;
+            }
+          }
+          cur_data.push_back(cur_bin);
+        }
+        ret->PushOneRow(tid, i, cur_data);
+      }
+    }
+  }
+}
+
 MultiValBin* Dataset::GetMultiBinFromSparseFeatures() const {
   Common::FunctionTimer fun_time("Dataset::GetMultiBinFromSparseFeatures", global_timer);
   int multi_group_id = -1;
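
The two branches of the new helper differ only in how a row's most-frequent ("skipped") bin is recorded: the sparse layout omits it, the dense layout stores slot 0. The arithmetic is easy to misread, so here is my worked example of the encoding and the block partitioning (invented numbers, not from the PR):

```cpp
// Bin encoding per feature j: encoded = raw + offsets[j], minus 1 when the
// feature's most frequent bin is bin 0 (its vacated slot is reclaimed).
//   offsets[j] = 17, most_freq_bins[j] = 0:
//     raw bin 3 -> 3 + 17 - 1 = 19
//     raw bin 0 -> skipped (sparse) or stored as 0 (dense)
//   offsets[k] = 42, most_freq_bins[k] = 5:
//     raw bin 3 -> 3 + 42 = 45
//
// Block partitioning: num_data = 10000, num_threads = 8, min_block_size = 4096:
//   n_block    = min(8, (10000 + 4095) / 4096) = min(8, 3) = 3
//   block_size = (10000 + 3 - 1) / 3 = 3334 rows per block
// i.e. small datasets get fewer, larger blocks rather than one per thread.
```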

@@ -508,44 +568,19 @@

   std::vector<std::vector<std::unique_ptr<BinIterator>>> iters(num_threads);
   std::vector<uint32_t> most_freq_bins;
+  double sum_sparse_rate = 0;
   for (int i = 0; i < num_feature; ++i) {
     for (int tid = 0; tid < num_threads; ++tid) {
       iters[tid].emplace_back(feature_groups_[multi_group_id]->SubFeatureIterator(i));
     }
     most_freq_bins.push_back(feature_groups_[multi_group_id]->bin_mappers_[i]->GetMostFreqBin());
+    sum_sparse_rate += feature_groups_[multi_group_id]->bin_mappers_[i]->sparse_rate();
   }
-
+  sum_sparse_rate /= num_feature;
+  Log::Debug("GetMultiBinFromSparseFeatures:: sparse rate %f", sum_sparse_rate);
   std::unique_ptr<MultiValBin> ret;
-  ret.reset(MultiValBin::CreateMultiValBin(num_data_, offsets.back()));
-
-  const data_size_t min_block_size = 4096;
-  const int n_block = std::min(num_threads, (num_data_ + min_block_size - 1) / min_block_size);
-  const data_size_t block_size = (num_data_ + n_block - 1) / n_block;
-
-#pragma omp parallel for schedule(static)
-  for (int tid = 0; tid < n_block; ++tid) {
-    std::vector<uint32_t> cur_data;
-    data_size_t start = tid * block_size;
-    data_size_t end = std::min(num_data_, start + block_size);
-    for (int j = 0; j < num_feature; ++j) {
-      iters[tid][j]->Reset(start);
-    }
-    for (data_size_t i = start; i < end; ++i) {
-      cur_data.clear();
-      for (int j = 0; j < num_feature; ++j) {
-        auto cur_bin = iters[tid][j]->Get(i);
-        if (cur_bin == most_freq_bins[j]) {
-          continue;
-        }
-        if (most_freq_bins[j] == 0) {
-          cur_bin -= 1;
-        }
-        cur_bin += offsets[j];
-        cur_data.push_back(cur_bin);
-      }
-      ret->PushOneRow(tid, i, cur_data);
-    }
-  }
+  ret.reset(MultiValBin::CreateMultiValBin(num_data_, offsets.back(), num_feature, sum_sparse_rate));
+  PushDataToMultiValBin(num_threads, num_data_, most_freq_bins, offsets, iters, ret.get());
   ret->FinishLoad();
   return ret.release();
 }

@@ -558,15 +593,19 @@ MultiValBin* Dataset::GetMultiBinFromAllFeatures() const {
   {
     num_threads = omp_get_num_threads();
   }
-  std::vector<int> offsets;
+  double sum_dense_ratio = 0;
+
+  std::unique_ptr<MultiValBin> ret;
   std::vector<std::vector<std::unique_ptr<BinIterator>>> iters(num_threads);
   std::vector<uint32_t> most_freq_bins;
+  std::vector<uint32_t> offsets;
   int num_total_bin = 1;
   offsets.push_back(num_total_bin);
   for (int gid = 0; gid < num_groups_; ++gid) {
     if (feature_groups_[gid]->is_multi_val_) {
       for (int fid = 0; fid < feature_groups_[gid]->num_feature_; ++fid) {
         const auto& bin_mapper = feature_groups_[gid]->bin_mappers_[fid];
+        sum_dense_ratio += 1.0f - bin_mapper->sparse_rate();
         most_freq_bins.push_back(bin_mapper->GetMostFreqBin());
         num_total_bin += bin_mapper->num_bin();
         if (most_freq_bins.back() == 0) {

@@ -584,39 +623,16 @@ MultiValBin* Dataset::GetMultiBinFromAllFeatures() const {
         iters[tid].emplace_back(feature_groups_[gid]->FeatureGroupIterator());
       }
       offsets.push_back(num_total_bin);
+      for (int fid = 0; fid < feature_groups_[gid]->num_feature_; ++fid) {
+        const auto& bin_mapper = feature_groups_[gid]->bin_mappers_[fid];
+        sum_dense_ratio += 1.0f - bin_mapper->sparse_rate();
+      }
     }
   }
-  std::unique_ptr<MultiValBin> ret;
-  ret.reset(MultiValBin::CreateMultiValBin(num_data_, offsets.back()));
-
-  const data_size_t min_block_size = 4096;
-  const int n_block = std::min(num_threads, (num_data_ + min_block_size - 1) / min_block_size);
-  const data_size_t block_size = (num_data_ + n_block - 1) / n_block;
-
-#pragma omp parallel for schedule(static)
-  for (int tid = 0; tid < n_block; ++tid) {
-    std::vector<uint32_t> cur_data;
-    data_size_t start = tid * block_size;
-    data_size_t end = std::min(num_data_, start + block_size);
-    for (size_t j = 0; j < most_freq_bins.size(); ++j) {
-      iters[tid][j]->Reset(start);
-    }
-    for (data_size_t i = start; i < end; ++i) {
-      cur_data.clear();
-      for (size_t j = 0; j < most_freq_bins.size(); ++j) {
-        auto cur_bin = iters[tid][j]->Get(i);
-        if (cur_bin == most_freq_bins[j]) {
-          continue;
-        }
-        if (most_freq_bins[j] == 0) {
-          cur_bin -= 1;
-        }
-        cur_bin += offsets[j];
-        cur_data.push_back(cur_bin);
-      }
-      ret->PushOneRow(tid, i, cur_data);
-    }
-  }
+  sum_dense_ratio /= static_cast<double>(most_freq_bins.size());
+  Log::Debug("GetMultiBinFromAllFeatures:: sparse rate %f", 1.0 - sum_dense_ratio);
+  ret.reset(MultiValBin::CreateMultiValBin(num_data_, num_total_bin, static_cast<int>(most_freq_bins.size()), 1.0 - sum_dense_ratio));
+  PushDataToMultiValBin(num_threads, num_data_, most_freq_bins, offsets, iters, ret.get());
   ret->FinishLoad();
   return ret.release();
 }
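
The dense-ratio bookkeeping above feeds the factory's sparse/dense decision. A worked example with invented numbers:

```cpp
// Three features with sparse_rate 0.95, 0.90, and 0.40:
//   sum_dense_ratio = 0.05 + 0.10 + 0.60 = 0.75
//   avg dense ratio = 0.75 / 3 = 0.25,  so sparse rate = 1.0 - 0.25 = 0.75
// 0.75 >= 0.2 (multi_val_bin_sparse_threshold in bin.cpp), so
// CreateMultiValBin would return a MultiValSparseBin for this data.
```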

@@ -627,13 +643,16 @@ MultiValBin* Dataset::TestMultiThreadingMethod(score_t* gradients, score_t* hess
   if (force_colwise && force_rowwise) {
     Log::Fatal("cannot set both `force_col_wise` and `force_row_wise` to `true`.");
   }
-  CHECK(num_groups_ > 0);
+  if (num_groups_ <= 0) {
+    return nullptr;
+  }
   if (force_colwise) {
     *is_hist_col_wise = true;
     return GetMultiBinFromSparseFeatures();
   } else if (force_rowwise) {
     *is_hist_col_wise = false;
-    return GetMultiBinFromAllFeatures();
+    auto ret = GetMultiBinFromAllFeatures();
+    return ret;
   } else {
     std::unique_ptr<MultiValBin> sparse_bin;
     std::unique_ptr<MultiValBin> all_bin;

@@ -654,6 +673,11 @@
     } else {
       *is_hist_col_wise = false;
       Log::Info("Use row-wise multi-threading, may increase memory usage. If memory is not enough, you can set `force_col_wise=true`.");
+      if (all_bin->IsSparse()) {
+        Log::Debug("Use Sparse Multi-Val Bin");
+      } else {
+        Log::Debug("Use Dense Multi-Val Bin");
+      }
       return all_bin.release();
     }
   }
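
The auto-selection middle of TestMultiThreadingMethod is collapsed in this diff view. A rough sketch of the shape the visible code implies (build both bins, time both paths, keep the winner); TimeColWise and TimeRowWise are hypothetical stand-ins, not functions from the PR:

```cpp
#include <memory>
#include <LightGBM/dataset.h>

// Illustration only: approximates the collapsed auto-selection branch.
MultiValBin* AutoChooseSketch(const Dataset* ds, bool* is_hist_col_wise) {
  std::unique_ptr<MultiValBin> sparse_bin(ds->GetMultiBinFromSparseFeatures());
  std::unique_ptr<MultiValBin> all_bin(ds->GetMultiBinFromAllFeatures());
  const double col_wise_time = TimeColWise(sparse_bin.get());  // hypothetical timer
  const double row_wise_time = TimeRowWise(all_bin.get());     // hypothetical timer
  if (col_wise_time <= row_wise_time) {
    *is_hist_col_wise = true;
    return sparse_bin.release();
  }
  *is_hist_col_wise = false;
  // Row-wise path; per the diff, all_bin may itself be sparse or dense,
  // which is what the IsSparse() debug log above reports.
  return all_bin.release();
}
```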