Skip to content

Commit

Permalink
change fastforest interface to separate FastForest data structure fro…
Browse files Browse the repository at this point in the history
…m i/o
  • Loading branch information
guitargeek committed Jun 15, 2020
1 parent 0fc69c8 commit 07984a3
Show file tree
Hide file tree
Showing 7 changed files with 88 additions and 81 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ add_subdirectory (test)
enable_testing ()
add_test (NAME fastforestTest COMMAND Test WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/test)

add_library (fastforest SHARED src/fastforest.cpp)
add_library (fastforest SHARED src/fastforest.cpp src/util.cpp)
target_link_libraries(fastforest stdc++fs)

set_target_properties(fastforest PROPERTIES VERSION ${PROJECT_VERSION})
Expand Down
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ The __FastForest__ library helps you to get your xgboost model into a C++ produc
The mission of this library is to be:
* __Easy__: deploying your xgboost model should be as painless as it can be
* __Fast__: thanks to efficient data structures for storing the trees, this library goes easy on your CPU and memory
* __Safe__: the FastForest objects are immutable, and therefore they are an excellent choice in multithreading
* __Safe__: the FastForest objects are not mutated when used, and therefore they are an excellent choice in multithreading
environments
* __Portable__: FastForest has no dependency other than the C++ standard library

Expand Down Expand Up @@ -56,7 +56,7 @@ In C++, you can now easily load the model into a `FastForest` and obtain predict
int main() {
std::vector<std::string> features{"f0", "f1", "f2", "f3", "f4"};

FastForest fastForest("model.txt", features);
const auto fastForest = fastforest::load_txt("model.txt", features);

std::vector<float> input{0.0, 0.2, 0.4, 0.6, 0.8};

Expand Down Expand Up @@ -99,13 +99,13 @@ The tests were performed on a Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz.
### Serialization

A FastForest can be serialized to its own binary format. The binary format exactly reflects the memory layout of the
FastForest class, so saving and loading is as fast as it can be. The serialization to file is done with the `save`
FastForest class, so saving and loading is as fast as it can be. The serialization to file is done with the `write_bin`
method.
```C++
fastForest.save("forest.bin");
fastForest.write_bin("forest.bin");
```
The serialized FastForest can be read back with the `fastforest::load_bin` function, which, unlike `load_txt`, does not
take a vector of feature names.
```C++
FastForest fastForest("forest.bin");
const auto fastForest = fastforest::load_bin("forest.bin");
```
2 changes: 1 addition & 1 deletion benchmark/benchmark-01.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
int main() {
std::vector<std::string> features{"f0", "f1", "f2", "f3", "f4"};

FastForest fastForest("model.txt", features);
const auto fastForest = fastforest::load_txt("model.txt", features);

const int n = 100000;

Expand Down
2 changes: 1 addition & 1 deletion benchmark/benchmark-01.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

X, y = make_classification(n_samples=10000, n_features=5, random_state=42, n_classes=2, weights=[0.5])

model = XGBClassifier(n_estimators=1000, objective="binary:logistic").fit(X, y)
model = XGBClassifier(n_estimators=3, objective="binary:logistic").fit(X, y)

model._Booster.dump_model("model.txt")
model._Booster.save_model("model.bin")
Expand Down
41 changes: 19 additions & 22 deletions include/fastforest.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ SOFTWARE.
#include <vector>
#include <string>

class FastForest {
public:
namespace fastforest {

// The floating point number type that will be used to accept features and store cut values
using FeatureType = float;
// The floating point number type that the individual trees return their responses in
Expand All @@ -42,25 +42,22 @@ class FastForest {
// Set to `unsigned char` for most compact fastforest objects if you have fewer than 256 features.
using CutIndexType = unsigned int;

FastForest(std::string const& txtpath, std::vector<std::string>& features);
FastForest(std::string const& txtpath);
TreeEnsembleResponseType operator()(const FeatureType* array) const;

auto const& cutIndices() const { return cutIndices_; }
auto const& cutValues() const { return cutValues_; }
auto const& leftIndices() const { return leftIndices_; }
auto const& rightIndices() const { return rightIndices_; }
auto const& responses() const { return responses_; }

void save(std::string const& filename) const;

private:
std::vector<int> rootIndices_;
std::vector<CutIndexType> cutIndices_;
std::vector<FeatureType> cutValues_;
std::vector<int> leftIndices_;
std::vector<int> rightIndices_;
std::vector<TreeResponseType> responses_;
};
// Plain data holder for a forest of decision trees stored in flat vectors.
// Instances are produced by fastforest::load_txt / fastforest::load_bin and
// can be written back to disk with write_bin.
struct FastForest {
// Evaluates the forest on the feature values pointed to by `array` and
// returns the response accumulated while looping over all entries of
// rootIndices_ (i.e. over all trees).
// NOTE(review): `array` is presumably indexed by the values in cutIndices_,
// so it must hold one value per feature in the order used at load time —
// confirm against the implementation.
TreeEnsembleResponseType operator()(const FeatureType* array) const;

// Writes this forest to `filename`, opened as a binary output stream.
void write_bin(std::string const& filename) const;

// For each tree, the position of its first node within the per-node vectors
// below (load_txt records the current node count when a new tree starts).
std::vector<int> rootIndices_;
// Per node: index of the feature the node cuts on.
std::vector<CutIndexType> cutIndices_;
// Per node: the cut value parsed for that node.
std::vector<FeatureType> cutValues_;
// Per node: target of the "yes" branch from the text dump, remapped by
// load_txt after each tree has been read.
std::vector<int> leftIndices_;
// Per node: target of the "no" branch from the text dump, remapped by
// load_txt after each tree has been read.
std::vector<int> rightIndices_;
// Per leaf: the response value of that leaf.
std::vector<TreeResponseType> responses_;
};

// Builds a FastForest from the text model dump at `txtpath`, using `features`
// as the list of feature names. Throws std::runtime_error if the file does not
// exist or if the dump cannot be parsed.
FastForest load_txt(std::string const& txtpath, std::vector<std::string>& features);
// Reads a FastForest from a binary file (as written by FastForest::write_bin).
// Despite its name, `txtpath` is the path of the binary file here.
FastForest load_bin(std::string const& txtpath);

} // namespace fastforest

#endif
74 changes: 42 additions & 32 deletions src/fastforest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ SOFTWARE.
#include <stdexcept>
#include <experimental/filesystem>

using namespace fastforest;

namespace {

namespace util {
Expand Down Expand Up @@ -106,13 +108,15 @@ namespace {

} // namespace

FastForest::FastForest(std::string const& txtpath, std::vector<std::string>& features) {
FastForest fastforest::load_txt(std::string const& txtpath, std::vector<std::string>& features) {
const std::string info = "constructing FastForest from " + txtpath + ": ";

if (!std::experimental::filesystem::exists(txtpath)) {
throw std::runtime_error(info + "file does not exists");
}

FastForest ff;

std::ifstream file(txtpath);

int nVariables = 0;
Expand Down Expand Up @@ -142,14 +146,14 @@ FastForest::FastForest(std::string const& txtpath, std::vector<std::string>& fea
auto subline = line.substr(foundBegin + 1, foundEnd - foundBegin - 1);
if (util::isInteger(subline)) {
detail::correctIndices(
rightIndices_.begin() + nPreviousNodes, rightIndices_.end(), nodeIndices, leafIndices);
ff.rightIndices_.begin() + nPreviousNodes, ff.rightIndices_.end(), nodeIndices, leafIndices);
detail::correctIndices(
leftIndices_.begin() + nPreviousNodes, leftIndices_.end(), nodeIndices, leafIndices);
ff.leftIndices_.begin() + nPreviousNodes, ff.leftIndices_.end(), nodeIndices, leafIndices);
nodeIndices.clear();
leafIndices.clear();
nPreviousNodes = cutValues_.size();
nPreviousLeaves = responses_.size();
rootIndices_.push_back(nPreviousNodes);
nPreviousNodes = ff.cutValues_.size();
nPreviousLeaves = ff.responses_.size();
ff.rootIndices_.push_back(nPreviousNodes);
} else {
std::stringstream ss(line);
int index;
Expand Down Expand Up @@ -182,10 +186,10 @@ FastForest::FastForest(std::string const& txtpath, std::vector<std::string>& fea
throw std::runtime_error(info + "problem while parsing the text dump");
}

cutValues_.push_back(cutValue);
cutIndices_.push_back(varIndices[varName]);
leftIndices_.push_back(yes);
rightIndices_.push_back(no);
ff.cutValues_.push_back(cutValue);
ff.cutIndices_.push_back(varIndices[varName]);
ff.leftIndices_.push_back(yes);
ff.rightIndices_.push_back(no);
nodeIndices[index] = nodeIndices.size() + nPreviousNodes;
}

Expand All @@ -197,16 +201,18 @@ FastForest::FastForest(std::string const& txtpath, std::vector<std::string>& fea
ss >> index;
line = ss.str();

responses_.push_back(output.value);
ff.responses_.push_back(output.value);
leafIndices[index] = leafIndices.size() + nPreviousLeaves;
}
}
}
detail::correctIndices(rightIndices_.begin() + nPreviousNodes, rightIndices_.end(), nodeIndices, leafIndices);
detail::correctIndices(leftIndices_.begin() + nPreviousNodes, leftIndices_.end(), nodeIndices, leafIndices);
detail::correctIndices(ff.rightIndices_.begin() + nPreviousNodes, ff.rightIndices_.end(), nodeIndices, leafIndices);
detail::correctIndices(ff.leftIndices_.begin() + nPreviousNodes, ff.leftIndices_.end(), nodeIndices, leafIndices);

return ff;
}

FastForest::TreeEnsembleResponseType FastForest::operator()(const FeatureType* array) const {
TreeEnsembleResponseType fastforest::FastForest::operator()(const FeatureType* array) const {
TreeEnsembleResponseType response = 0.;
for (int index : rootIndices_) {
do {
Expand All @@ -219,33 +225,37 @@ FastForest::TreeEnsembleResponseType FastForest::operator()(const FeatureType* a
return response;
}

FastForest::FastForest(std::string const& txtpath) {
FastForest fastforest::load_bin(std::string const& txtpath) {
FastForest ff;

std::ifstream is(txtpath, std::ios::binary);

int nRootNodes = rootIndices_.size();
int nNodes = cutValues_.size();
int nLeaves = responses_.size();
int nRootNodes = ff.rootIndices_.size();
int nNodes = ff.cutValues_.size();
int nLeaves = ff.responses_.size();

is.read((char*)&nRootNodes, sizeof(int));
is.read((char*)&nNodes, sizeof(int));
is.read((char*)&nLeaves, sizeof(int));

rootIndices_.resize(nRootNodes);
cutIndices_.resize(nNodes);
cutValues_.resize(nNodes);
leftIndices_.resize(nNodes);
rightIndices_.resize(nNodes);
responses_.resize(nLeaves);

is.read((char*)rootIndices_.data(), nRootNodes * sizeof(int));
is.read((char*)cutIndices_.data(), nNodes * sizeof(CutIndexType));
is.read((char*)cutValues_.data(), nNodes * sizeof(FeatureType));
is.read((char*)leftIndices_.data(), nNodes * sizeof(int));
is.read((char*)rightIndices_.data(), nNodes * sizeof(int));
is.read((char*)responses_.data(), nLeaves * sizeof(TreeResponseType));
ff.rootIndices_.resize(nRootNodes);
ff.cutIndices_.resize(nNodes);
ff.cutValues_.resize(nNodes);
ff.leftIndices_.resize(nNodes);
ff.rightIndices_.resize(nNodes);
ff.responses_.resize(nLeaves);

is.read((char*)ff.rootIndices_.data(), nRootNodes * sizeof(int));
is.read((char*)ff.cutIndices_.data(), nNodes * sizeof(CutIndexType));
is.read((char*)ff.cutValues_.data(), nNodes * sizeof(FeatureType));
is.read((char*)ff.leftIndices_.data(), nNodes * sizeof(int));
is.read((char*)ff.rightIndices_.data(), nNodes * sizeof(int));
is.read((char*)ff.responses_.data(), nLeaves * sizeof(TreeResponseType));

return ff;
}

void FastForest::save(std::string const& filename) const {
void fastforest::FastForest::write_bin(std::string const& filename) const {
std::ofstream os(filename, std::ios::binary);

int nRootNodes = rootIndices_.size();
Expand Down
38 changes: 19 additions & 19 deletions test/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,29 +7,29 @@
#include <cmath>

using ReferenceType = double;
constexpr FastForest::FeatureType tolerance = 1e-4;
constexpr fastforest::FeatureType tolerance = 1e-4;

BOOST_AUTO_TEST_CASE(ExampleTest) {
std::vector<std::string> features{"f0", "f1", "f2", "f3", "f4"};

FastForest fastForest("continuous/model.txt", features);
const auto fastForest = fastforest::load_txt("continuous/model.txt", features);

std::vector<FastForest::FeatureType> input{0.0, 0.2, 0.4, 0.6, 0.8};
std::vector<fastforest::FeatureType> input{0.0, 0.2, 0.4, 0.6, 0.8};

FastForest::FeatureType score = fastForest(input.data());
FastForest::FeatureType logistcScore = 1. / (1. + std::exp(-score));
fastforest::FeatureType score = fastForest(input.data());
fastforest::FeatureType logistcScore = 1. / (1. + std::exp(-score));
}

BOOST_AUTO_TEST_CASE(BasicTest) {
std::vector<std::string> features{"f0", "f1", "f2", "f3", "f4"};

FastForest fastForest("continuous/model.txt", features);
const auto fastForest = fastforest::load_txt("continuous/model.txt", features);

std::ifstream fileX("continuous/X.csv");
std::ifstream filePreds("continuous/preds.csv");

std::vector<FastForest::FeatureType> input(5);
FastForest::FeatureType score;
std::vector<fastforest::FeatureType> input(5);
fastforest::FeatureType score;
ReferenceType ref;

for (int i = 0; i < 100; ++i) {
Expand All @@ -46,17 +46,17 @@ BOOST_AUTO_TEST_CASE(BasicTest) {
BOOST_AUTO_TEST_CASE(SerializationTest) {
{
std::vector<std::string> features{"f0", "f1", "f2", "f3", "f4"};
FastForest fastForest("continuous/model.txt", features);
fastForest.save("continuous/forest.bin");
const auto fastForest = fastforest::load_txt("continuous/model.txt", features);
fastForest.write_bin("continuous/forest.bin");
}

FastForest fastForest("continuous/forest.bin");
const auto fastForest = fastforest::load_bin("continuous/forest.bin");

std::ifstream fileX("continuous/X.csv");
std::ifstream filePreds("continuous/preds.csv");

std::vector<FastForest::FeatureType> input(5);
FastForest::FeatureType score;
std::vector<fastforest::FeatureType> input(5);
fastforest::FeatureType score;
ReferenceType ref;

for (int i = 0; i < 100; ++i) {
Expand All @@ -73,13 +73,13 @@ BOOST_AUTO_TEST_CASE(SerializationTest) {
BOOST_AUTO_TEST_CASE(DiscreteTest) {
std::vector<std::string> features{"f0", "f1", "f2", "f3", "f4"};

FastForest fastForest("discrete/model.txt", features);
const auto fastForest = fastforest::load_txt("discrete/model.txt", features);

std::ifstream fileX("discrete/X.csv");
std::ifstream filePreds("discrete/preds.csv");

std::vector<FastForest::FeatureType> input(5);
FastForest::FeatureType score;
std::vector<fastforest::FeatureType> input(5);
fastforest::FeatureType score;
ReferenceType ref;

for (int i = 0; i < 100; ++i) {
Expand All @@ -99,13 +99,13 @@ BOOST_AUTO_TEST_CASE(ManyfeaturesTest) {
features.push_back(std::string("f") + std::to_string(i));
}

FastForest fastForest("manyfeatures/model.txt", features);
const auto fastForest = fastforest::load_txt("manyfeatures/model.txt", features);

std::ifstream fileX("manyfeatures/X.csv");
std::ifstream filePreds("manyfeatures/preds.csv");

std::vector<FastForest::FeatureType> input(features.size());
FastForest::FeatureType score;
std::vector<fastforest::FeatureType> input(features.size());
fastforest::FeatureType score;
ReferenceType ref;

for (int i = 0; i < 100; ++i) {
Expand Down

0 comments on commit 07984a3

Please sign in to comment.