diff --git a/CMakeLists.txt b/CMakeLists.txt index b6d4704..c98ac69 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,7 +14,7 @@ add_subdirectory (test) enable_testing () add_test (NAME fastforestTest COMMAND Test WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/test) -add_library (fastforest SHARED src/fastforest.cpp) +add_library (fastforest SHARED src/fastforest.cpp src/util.cpp) target_link_libraries(fastforest stdc++fs) set_target_properties(fastforest PROPERTIES VERSION ${PROJECT_VERSION}) diff --git a/README.md b/README.md index 721541e..64fbf76 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ The __FastForest__ library helps you to get your xgboost model into a C++ produc The mission of this library is to be: * __Easy__: deploying your xgboost model should be as painless as it can be * __Fast__: thanks to efficient data structures for storing the trees, this library goes easy on your CPU and memory -* __Safe__: the FastForest objects are immutable, and therefore they are an excellent choice in multithreading +* __Safe__: the FastForest objects are not mutated when used, and therefore they are an excellent choice in multithreading environments * __Portable__: FastForest has no dependency other than the C++ standard library @@ -56,7 +56,7 @@ In C++, you can now easily load the model into a `FastForest` and obtain predict int main() { std::vector features{"f0", "f1", "f2", "f3", "f4"}; - FastForest fastForest("model.txt", features); + const auto fastForest = fastforest::load_txt("model.txt", features); std::vector input{0.0, 0.2, 0.4, 0.6, 0.8}; @@ -99,13 +99,13 @@ The tests were performed on a Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz. ### Serialization The FastForests can serialized to it's own binary format. The binary format exactly reflects the memory layout of the -FastForest class, so saving and loading is as fast as it can be. The serialization to file is done with the `save` +FastForest class, so saving and loading is as fast as it can be. The serialization to file is done with the `write_bin` method. ```C++ -fastForest.save("forest.bin"); +fastForest.write_bin("forest.bin"); ``` The serialized FastForest can be read back with it's constructor, this time the one that does not take a reference to a vector for the feature names. ```C++ -FastForest fastForest("forest.bin"); +const auto fastForest = fastforest::load_bin("forest.bin"); ``` diff --git a/benchmark/benchmark-01.cpp b/benchmark/benchmark-01.cpp index 30c5554..f1b6ccf 100644 --- a/benchmark/benchmark-01.cpp +++ b/benchmark/benchmark-01.cpp @@ -14,7 +14,7 @@ int main() { std::vector features{"f0", "f1", "f2", "f3", "f4"}; - FastForest fastForest("model.txt", features); + const auto fastForest = fastforest::load_txt("model.txt", features); const int n = 100000; diff --git a/benchmark/benchmark-01.py b/benchmark/benchmark-01.py index caff31d..d81edde 100644 --- a/benchmark/benchmark-01.py +++ b/benchmark/benchmark-01.py @@ -11,7 +11,7 @@ X, y = make_classification(n_samples=10000, n_features=5, random_state=42, n_classes=2, weights=[0.5]) -model = XGBClassifier(n_estimators=1000, objective="binary:logistic").fit(X, y) +model = XGBClassifier(n_estimators=3, objective="binary:logistic").fit(X, y) model._Booster.dump_model("model.txt") model._Booster.save_model("model.bin") diff --git a/include/fastforest.h b/include/fastforest.h index 4eee358..a933afe 100644 --- a/include/fastforest.h +++ b/include/fastforest.h @@ -30,8 +30,8 @@ SOFTWARE. #include #include -class FastForest { - public: +namespace fastforest { + // The floating point number type that will be used to accept features and store cut values using FeatureType = float; // Tue floating point number type that the individual trees return their responses in @@ -42,25 +42,22 @@ class FastForest { // Set to `unsigned char` for most compact fastforest ofjects if you have less than 256 features. using CutIndexType = unsigned int; - FastForest(std::string const& txtpath, std::vector& features); - FastForest(std::string const& txtpath); - TreeEnsembleResponseType operator()(const FeatureType* array) const; - - auto const& cutIndices() const { return cutIndices_; } - auto const& cutValues() const { return cutValues_; } - auto const& leftIndices() const { return leftIndices_; } - auto const& rightIndices() const { return rightIndices_; } - auto const& responses() const { return responses_; } - - void save(std::string const& filename) const; - - private: - std::vector rootIndices_; - std::vector cutIndices_; - std::vector cutValues_; - std::vector leftIndices_; - std::vector rightIndices_; - std::vector responses_; -}; + struct FastForest { + TreeEnsembleResponseType operator()(const FeatureType* array) const; + + void write_bin(std::string const& filename) const; + + std::vector rootIndices_; + std::vector cutIndices_; + std::vector cutValues_; + std::vector leftIndices_; + std::vector rightIndices_; + std::vector responses_; + }; + + FastForest load_txt(std::string const& txtpath, std::vector& features); + FastForest load_bin(std::string const& txtpath); + +} // namespace fastforest #endif diff --git a/src/fastforest.cpp b/src/fastforest.cpp index c124e80..33bb0f4 100644 --- a/src/fastforest.cpp +++ b/src/fastforest.cpp @@ -35,6 +35,8 @@ SOFTWARE. #include #include +using namespace fastforest; + namespace { namespace util { @@ -106,13 +108,15 @@ namespace { } // namespace -FastForest::FastForest(std::string const& txtpath, std::vector& features) { +FastForest fastforest::load_txt(std::string const& txtpath, std::vector& features) { const std::string info = "constructing FastForest from " + txtpath + ": "; if (!std::experimental::filesystem::exists(txtpath)) { throw std::runtime_error(info + "file does not exists"); } + FastForest ff; + std::ifstream file(txtpath); int nVariables = 0; @@ -142,14 +146,14 @@ FastForest::FastForest(std::string const& txtpath, std::vector& fea auto subline = line.substr(foundBegin + 1, foundEnd - foundBegin - 1); if (util::isInteger(subline)) { detail::correctIndices( - rightIndices_.begin() + nPreviousNodes, rightIndices_.end(), nodeIndices, leafIndices); + ff.rightIndices_.begin() + nPreviousNodes, ff.rightIndices_.end(), nodeIndices, leafIndices); detail::correctIndices( - leftIndices_.begin() + nPreviousNodes, leftIndices_.end(), nodeIndices, leafIndices); + ff.leftIndices_.begin() + nPreviousNodes, ff.leftIndices_.end(), nodeIndices, leafIndices); nodeIndices.clear(); leafIndices.clear(); - nPreviousNodes = cutValues_.size(); - nPreviousLeaves = responses_.size(); - rootIndices_.push_back(nPreviousNodes); + nPreviousNodes = ff.cutValues_.size(); + nPreviousLeaves = ff.responses_.size(); + ff.rootIndices_.push_back(nPreviousNodes); } else { std::stringstream ss(line); int index; @@ -182,10 +186,10 @@ FastForest::FastForest(std::string const& txtpath, std::vector& fea throw std::runtime_error(info + "problem while parsing the text dump"); } - cutValues_.push_back(cutValue); - cutIndices_.push_back(varIndices[varName]); - leftIndices_.push_back(yes); - rightIndices_.push_back(no); + ff.cutValues_.push_back(cutValue); + ff.cutIndices_.push_back(varIndices[varName]); + ff.leftIndices_.push_back(yes); + ff.rightIndices_.push_back(no); nodeIndices[index] = nodeIndices.size() + nPreviousNodes; } @@ -197,16 +201,18 @@ FastForest::FastForest(std::string const& txtpath, std::vector& fea ss >> index; line = ss.str(); - responses_.push_back(output.value); + ff.responses_.push_back(output.value); leafIndices[index] = leafIndices.size() + nPreviousLeaves; } } } - detail::correctIndices(rightIndices_.begin() + nPreviousNodes, rightIndices_.end(), nodeIndices, leafIndices); - detail::correctIndices(leftIndices_.begin() + nPreviousNodes, leftIndices_.end(), nodeIndices, leafIndices); + detail::correctIndices(ff.rightIndices_.begin() + nPreviousNodes, ff.rightIndices_.end(), nodeIndices, leafIndices); + detail::correctIndices(ff.leftIndices_.begin() + nPreviousNodes, ff.leftIndices_.end(), nodeIndices, leafIndices); + + return ff; } -FastForest::TreeEnsembleResponseType FastForest::operator()(const FeatureType* array) const { +TreeEnsembleResponseType fastforest::FastForest::operator()(const FeatureType* array) const { TreeEnsembleResponseType response = 0.; for (int index : rootIndices_) { do { @@ -219,33 +225,37 @@ FastForest::TreeEnsembleResponseType FastForest::operator()(const FeatureType* a return response; } -FastForest::FastForest(std::string const& txtpath) { +FastForest fastforest::load_bin(std::string const& txtpath) { + FastForest ff; + std::ifstream is(txtpath, std::ios::binary); - int nRootNodes = rootIndices_.size(); - int nNodes = cutValues_.size(); - int nLeaves = responses_.size(); + int nRootNodes = ff.rootIndices_.size(); + int nNodes = ff.cutValues_.size(); + int nLeaves = ff.responses_.size(); is.read((char*)&nRootNodes, sizeof(int)); is.read((char*)&nNodes, sizeof(int)); is.read((char*)&nLeaves, sizeof(int)); - rootIndices_.resize(nRootNodes); - cutIndices_.resize(nNodes); - cutValues_.resize(nNodes); - leftIndices_.resize(nNodes); - rightIndices_.resize(nNodes); - responses_.resize(nLeaves); - - is.read((char*)rootIndices_.data(), nRootNodes * sizeof(int)); - is.read((char*)cutIndices_.data(), nNodes * sizeof(CutIndexType)); - is.read((char*)cutValues_.data(), nNodes * sizeof(FeatureType)); - is.read((char*)leftIndices_.data(), nNodes * sizeof(int)); - is.read((char*)rightIndices_.data(), nNodes * sizeof(int)); - is.read((char*)responses_.data(), nLeaves * sizeof(TreeResponseType)); + ff.rootIndices_.resize(nRootNodes); + ff.cutIndices_.resize(nNodes); + ff.cutValues_.resize(nNodes); + ff.leftIndices_.resize(nNodes); + ff.rightIndices_.resize(nNodes); + ff.responses_.resize(nLeaves); + + is.read((char*)ff.rootIndices_.data(), nRootNodes * sizeof(int)); + is.read((char*)ff.cutIndices_.data(), nNodes * sizeof(CutIndexType)); + is.read((char*)ff.cutValues_.data(), nNodes * sizeof(FeatureType)); + is.read((char*)ff.leftIndices_.data(), nNodes * sizeof(int)); + is.read((char*)ff.rightIndices_.data(), nNodes * sizeof(int)); + is.read((char*)ff.responses_.data(), nLeaves * sizeof(TreeResponseType)); + + return ff; } -void FastForest::save(std::string const& filename) const { +void fastforest::FastForest::write_bin(std::string const& filename) const { std::ofstream os(filename, std::ios::binary); int nRootNodes = rootIndices_.size(); diff --git a/test/test.cpp b/test/test.cpp index 9ebe3f9..6971233 100644 --- a/test/test.cpp +++ b/test/test.cpp @@ -7,29 +7,29 @@ #include using ReferenceType = double; -constexpr FastForest::FeatureType tolerance = 1e-4; +constexpr fastforest::FeatureType tolerance = 1e-4; BOOST_AUTO_TEST_CASE(ExampleTest) { std::vector features{"f0", "f1", "f2", "f3", "f4"}; - FastForest fastForest("continuous/model.txt", features); + const auto fastForest = fastforest::load_txt("continuous/model.txt", features); - std::vector input{0.0, 0.2, 0.4, 0.6, 0.8}; + std::vector input{0.0, 0.2, 0.4, 0.6, 0.8}; - FastForest::FeatureType score = fastForest(input.data()); - FastForest::FeatureType logistcScore = 1. / (1. + std::exp(-score)); + fastforest::FeatureType score = fastForest(input.data()); + fastforest::FeatureType logistcScore = 1. / (1. + std::exp(-score)); } BOOST_AUTO_TEST_CASE(BasicTest) { std::vector features{"f0", "f1", "f2", "f3", "f4"}; - FastForest fastForest("continuous/model.txt", features); + const auto fastForest = fastforest::load_txt("continuous/model.txt", features); std::ifstream fileX("continuous/X.csv"); std::ifstream filePreds("continuous/preds.csv"); - std::vector input(5); - FastForest::FeatureType score; + std::vector input(5); + fastforest::FeatureType score; ReferenceType ref; for (int i = 0; i < 100; ++i) { @@ -46,17 +46,17 @@ BOOST_AUTO_TEST_CASE(BasicTest) { BOOST_AUTO_TEST_CASE(SerializationTest) { { std::vector features{"f0", "f1", "f2", "f3", "f4"}; - FastForest fastForest("continuous/model.txt", features); - fastForest.save("continuous/forest.bin"); + const auto fastForest = fastforest::load_txt("continuous/model.txt", features); + fastForest.write_bin("continuous/forest.bin"); } - FastForest fastForest("continuous/forest.bin"); + const auto fastForest = fastforest::load_bin("continuous/forest.bin"); std::ifstream fileX("continuous/X.csv"); std::ifstream filePreds("continuous/preds.csv"); - std::vector input(5); - FastForest::FeatureType score; + std::vector input(5); + fastforest::FeatureType score; ReferenceType ref; for (int i = 0; i < 100; ++i) { @@ -73,13 +73,13 @@ BOOST_AUTO_TEST_CASE(SerializationTest) { BOOST_AUTO_TEST_CASE(DiscreteTest) { std::vector features{"f0", "f1", "f2", "f3", "f4"}; - FastForest fastForest("discrete/model.txt", features); + const auto fastForest = fastforest::load_txt("discrete/model.txt", features); std::ifstream fileX("discrete/X.csv"); std::ifstream filePreds("discrete/preds.csv"); - std::vector input(5); - FastForest::FeatureType score; + std::vector input(5); + fastforest::FeatureType score; ReferenceType ref; for (int i = 0; i < 100; ++i) { @@ -99,13 +99,13 @@ BOOST_AUTO_TEST_CASE(ManyfeaturesTest) { features.push_back(std::string("f") + std::to_string(i)); } - FastForest fastForest("manyfeatures/model.txt", features); + const auto fastForest = fastforest::load_txt("manyfeatures/model.txt", features); std::ifstream fileX("manyfeatures/X.csv"); std::ifstream filePreds("manyfeatures/preds.csv"); - std::vector input(features.size()); - FastForest::FeatureType score; + std::vector input(features.size()); + fastforest::FeatureType score; ReferenceType ref; for (int i = 0; i < 100; ++i) {