Commit 6f1de3b

[TMVA] Merge RBDT with FastForest
Consolidate RBDT as specified in the ROOT plan of work 2024. The backends of RBDT are replaced with a single new backend based on the logic of the FastForest library: https://github.com/guitargeek/XGBoost-FastForest. The logic in that library was originally taken from the GBRForest in CMSSW: https://github.com/cms-sw/cmssw/blob/master/CommonTools/MVAUtils/interface/GBRForestTools.h. The interface remains the same, except that the template parameter specifying the backend is gone. This change also adds support for unbalanced trees.
1 parent 7a39d4e commit 6f1de3b
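For reference, a minimal PyROOT sketch of how the consolidated interface is meant to be used; the names model, "myModel", and "model.root" and the four input features are illustrative placeholders, not part of this commit:

    import ROOT

    # Store a trained XGBoost classifier as a TMVA::Experimental::RBDT in a ROOT file.
    ROOT.TMVA.Experimental.SaveXGBoost(model, "myModel", "model.root", num_inputs=4)

    # Load it back; RBDT is no longer a class template, so no backend parameter is needed.
    bdt = ROOT.TMVA.Experimental.RBDT("myModel", "model.root")

    # Single-event inference on a vector-like container.
    scores = bdt.Compute([0.1, 0.2, 0.3, 0.4])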

16 files changed: +528 −939 lines changed


bindings/pyroot/pythonizations/python/ROOT/_facade.py

Lines changed: 2 additions & 1 deletion
@@ -347,10 +347,11 @@ def TMVA(self):
         hasRDF = "dataframe" in gROOT.GetConfigFeatures()
         if hasRDF:
             try:
-                from ._pythonization._tmva import inject_rbatchgenerator, _AsRTensor
+                from ._pythonization._tmva import inject_rbatchgenerator, _AsRTensor, SaveXGBoost

                 inject_rbatchgenerator(ns)
                 ns.Experimental.AsRTensor = _AsRTensor
+                ns.Experimental.SaveXGBoost = SaveXGBoost
             except:
                 raise Exception("Failed to pythonize the namespace TMVA")
         del type(self).TMVA
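The practical effect of this facade change, sketched below under the assumption of a ROOT build with RDataFrame support: SaveXGBoost is attached lazily to the TMVA namespace the first time ROOT.TMVA is accessed, mirroring the hasRDF check above.

    import ROOT

    # The pythonization runs only in builds with RDataFrame support.
    if "dataframe" in ROOT.gROOT.GetConfigFeatures():
        save_fn = ROOT.TMVA.Experimental.SaveXGBoost  # resolves after the lazy pythonization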

bindings/pyroot/pythonizations/python/ROOT/_pythonization/_tmva/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ def inject_rbatchgenerator(ns):

 #this should be available only when xgboost is there ?
 # We probably don't need a protection here since the code is run only when there is xgboost
-from ._tree_inference import SaveXGBoost, pythonize_tree_inference
+from ._tree_inference import SaveXGBoost


 # list of python classes that are used to pythonize TMVA classes
bindings/pyroot/pythonizations/python/ROOT/_pythonization/_tmva/_tree_inference.py

Lines changed: 51 additions & 97 deletions
@@ -1,18 +1,46 @@
-# Author: Stefan Wunsch CERN 09/2019
+# Author : Stefan Wunsch CERN 09 / 2019

 ################################################################################
-# Copyright (C) 1995-2019, Rene Brun and Fons Rademakers. #
-# All rights reserved. #
-# #
-# For the licensing terms see $ROOTSYS/LICENSE. #
-# For the list of contributors see $ROOTSYS/README/CREDITS. #
+# Copyright(C) 1995 - 2019, Rene Brun and Fons Rademakers.#
+# All rights reserved.#
+# #
+# For the licensing terms see $ROOTSYS / LICENSE.#
+# For the list of contributors see $ROOTSYS / README / CREDITS.#
 ################################################################################

 from .. import pythonization
 import cppyy

+import json

-def SaveXGBoost(self, xgb_model, key_name, output_path, num_inputs, tmp_path="/tmp", threshold_dtype="float"):
+
+def get_basescore(model):
+    """Get base score from an XGBoost sklearn estimator.
+
+    Copy-pasted from XGBoost unit test code.
+
+    See also:
+    * https://github.com/dmlc/xgboost/blob/a99bb38bd2762e35e6a1673a0c11e09eddd8e723/python-package/xgboost/testing/updater.py#L13
+    * https://github.com/dmlc/xgboost/issues/9347
+    * https://discuss.xgboost.ai/t/how-to-get-base-score-from-trained-booster/3192
+    """
+    base_score = float(json.loads(model.get_booster().save_config())["learner"]["learner_model_param"]["base_score"])
+    return base_score
+
+
+def SaveXGBoost(xgb_model, key_name, output_path, num_inputs):
+    """
+    Saves the XGBoost model to a ROOT file as a TMVA::Experimental::RBDT object.
+
+    Args:
+        xgb_model: The trained XGBoost model.
+        key_name (str): The name to use for storing the RBDT in the output file.
+        output_path (str): The path to save the output file.
+        num_inputs (int): The number of input features used in the model.
+
+    Raises:
+        Exception: If the XGBoost model has an unsupported objective.
+    """
     # Extract objective
     objective_map = {
         "multi:softprob": "softmax",  # Naming the objective softmax is more common today
@@ -29,99 +57,25 @@ def SaveXGBoost(self, xgb_model, key_name, output_path, num_inputs, tmp_path="/tmp", threshold_dtype="float"):
     )
     objective = cppyy.gbl.std.string(objective_map[model_objective])

-    # Extract max depth of the trees
-    max_depth = xgb_model.max_depth
-
     # Determine number of outputs
-    if "reg:" in model_objective:
-        num_outputs = 1
-    elif "binary:" in model_objective:
-        num_outputs = 1
-    else:
-        num_outputs = xgb_model.n_classes_
+    num_outputs = xgb_model.n_classes_ if "multi:" in model_objective else 1
+
+    # Dump XGB model as json file
+    xgb_model.get_booster().dump_model(output_path, dump_format="json")

-    # Dump XGB model to the tmp folder as json file
-    import os
-    import uuid
+    with open(output_path, "r") as json_file:
+        forest = json.load(json_file)

-    tmp_path = os.path.join(tmp_path, str(uuid.uuid4()) + ".json")
-    xgb_model.get_booster().dump_model(tmp_path, dump_format="json")
+    # Dump XGB model as txt file
+    xgb_model.get_booster().dump_model(output_path)

-    import json
+    features = cppyy.gbl.std.vector["std::string"]([f"f{i}" for i in range(num_inputs)])
+    bdt = cppyy.gbl.TMVA.Experimental.RBDT.LoadText(output_path, features, num_outputs)

-    with open(tmp_path, "r") as json_file:
-        forest = json.load(json_file)
+    bdt.logistic_ = objective == "logistic"
+
+    bs = get_basescore(xgb_model)
+    bdt.baseScore_ = cppyy.gbl.std.log(bs / (1.0 - bs)) if bdt.logistic_ else bs

-    # Determine whether the model has a bias paramter and write bias trees
-    if hasattr(xgb_model, "base_score") and "reg:" in model_objective:
-        bias = xgb_model.base_score
-        if not bias == 0.0:
-            forest += [{"leaf": bias}] * num_outputs
-    # print(str(forest).replace("u'", "'").replace("'", '"'))
-
-    # Extract parameters from json and write to arrays
-    num_trees = len(forest)
-    len_inputs = 2 ** max_depth - 1
-    inputs = cppyy.gbl.std.vector["int"](len_inputs * num_trees, -1)
-    len_thresholds = 2 ** (max_depth + 1) - 1
-    thresholds = cppyy.gbl.std.vector[threshold_dtype](len_thresholds * num_trees)
-
-    def fill_arrays(node, index, inputs_base, thresholds_base):
-        # Set leaf score as threshold value if this node is a leaf
-        if "leaf" in node:
-            thresholds[thresholds_base + index] = node["leaf"]
-            return
-
-        # Set input index
-        input_ = int(node["split"].replace("f", ""))
-        inputs[inputs_base + index] = input_
-
-        # Set threshold value
-        thresholds[thresholds_base + index] = node["split_condition"]
-
-        # Find next left (no) and right (yes) node
-        if node["children"][0]["nodeid"] == node["yes"]:
-            yes, no = 1, 0
-        else:
-            yes, no = 0, 1
-
-        # Fill values from the child nodes
-        fill_arrays(node["children"][no], 2 * index + 1, inputs_base, thresholds_base)
-        fill_arrays(node["children"][yes], 2 * index + 2, inputs_base, thresholds_base)
-
-    for i_tree, tree in enumerate(forest):
-        fill_arrays(tree, 0, len_inputs * i_tree, len_thresholds * i_tree)
-
-    # Determine to which output node a tree belongs
-    outputs = cppyy.gbl.std.vector["int"](num_trees)
-    if num_outputs != 1:
-        for i in range(num_trees):
-            outputs[i] = int(i % num_outputs)
-
-    # Store arrays in a ROOT file in a folder with the given key name
-    # TODO: Write single values as simple integers and not vectors.
-    f = cppyy.gbl.TFile(output_path, "RECREATE")
-    f.mkdir(key_name)
-    d = f.Get(key_name)
-    d.WriteObjectAny(inputs, "std::vector<int>", "inputs")
-    d.WriteObjectAny(outputs, "std::vector<int>", "outputs")
-    d.WriteObjectAny(thresholds, "std::vector<" + threshold_dtype + ">", "thresholds")
-    d.WriteObjectAny(objective, "std::string", "objective")
-    max_depth_ = cppyy.gbl.std.vector["int"](1, max_depth)
-    d.WriteObjectAny(max_depth_, "std::vector<int>", "max_depth")
-    num_trees_ = cppyy.gbl.std.vector["int"](1, num_trees)
-    d.WriteObjectAny(num_trees_, "std::vector<int>", "num_trees")
-    num_inputs_ = cppyy.gbl.std.vector["int"](1, num_inputs)
-    d.WriteObjectAny(num_inputs_, "std::vector<int>", "num_inputs")
-    num_outputs_ = cppyy.gbl.std.vector["int"](1, num_outputs)
-    d.WriteObjectAny(num_outputs_, "std::vector<int>", "num_outputs")
-    f.Write()
-    f.Close()
-
-
-@pythonization("SaveXGBoost", ns="TMVA::Experimental")
-def pythonize_tree_inference(klass):
-    # Parameters:
-    # klass: class to be pythonized
-
-    klass.__init__ = SaveXGBoost
+    with cppyy.gbl.TFile.Open(output_path, "RECREATE") as tFile:
+        tFile.WriteObject(bdt, key_name)
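A standalone sketch of the base-score handling above, assuming a trained binary:logistic model in xgb_model (an illustrative name): the score is read from the booster configuration and converted to a raw log-odds margin, so XGBoost's default base_score of 0.5 maps to 0.0.

    import json
    import math

    # Read base_score from the booster config, as get_basescore() does above.
    config = json.loads(xgb_model.get_booster().save_config())
    base_score = float(config["learner"]["learner_model_param"]["base_score"])

    # For logistic objectives, the stored baseScore_ is the log-odds of base_score.
    raw_margin = math.log(base_score / (1.0 - base_score))  # 0.5 -> 0.0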

tmva/tmva/CMakeLists.txt

Lines changed: 0 additions & 4 deletions
@@ -462,10 +462,6 @@ ROOT_STANDARD_LIBRARY_PACKAGE(TMVAUtils
    TMVA/RBatchGenerator.hxx
    TMVA/RBatchLoader.hxx
    TMVA/RChunkLoader.hxx
-   TMVA/TreeInference/PythonHelpers.hxx
-   TMVA/TreeInference/BranchlessTree.hxx
-   TMVA/TreeInference/Forest.hxx
-   TMVA/TreeInference/Objectives.hxx

    SOURCES
tmva/tmva/inc/LinkDefUtils.h

Lines changed: 2 additions & 3 deletions
@@ -11,11 +11,10 @@

 #ifdef R__HAS_DATAFRAME
 // BDT inference
-#pragma link C++ class TMVA::Experimental::RBDT<TMVA::Experimental::BranchlessForest<float>>+;
-#pragma link C++ class TMVA::Experimental::RBDT<TMVA::Experimental::BranchlessJittedForest<float>>+;
+#pragma link C++ class TMVA::Experimental::RBDT+;
 #endif

 // RTensor will have its own streamer function
 #pragma link C++ class TMVA::Experimental::RTensor<float,std::vector<float>>-;

-#endif
+#endif

tmva/tmva/inc/TMVA/RBDT.hxx

Lines changed: 45 additions & 82 deletions
@@ -1,14 +1,15 @@
 /**********************************************************************************
  * Project: ROOT - a Root-integrated toolkit for multivariate data analysis *
  * Package: TMVA *
- * *
+ * *
  * *
  * Description: *
  * *
  * Authors: *
  * Stefan Wunsch (stefan.wunsch@cern.ch) *
+ * Jonas Rembser (jonas.rembser@cern.ch) *
  * *
- * Copyright (c) 2019: *
+ * Copyright (c) 2024: *
  * CERN, Switzerland *
  * *
  * Redistribution and use in source and binary forms, with or without *
@@ -19,108 +20,70 @@
 #ifndef TMVA_RBDT
 #define TMVA_RBDT

-#include "TMVA/RTensor.hxx"
-#include "TMVA/TreeInference/Forest.hxx"
-#include "TFile.h"
+#include <Rtypes.h>
+#include <TMVA/RTensor.hxx>

-#include <vector>
+#include <array>
+#include <istream>
 #include <string>
-#include <sstream> // std::stringstream
-#include <memory>
+#include <vector>

 namespace TMVA {
+
 namespace Experimental {

-/// Fast boosted decision tree inference
-template <typename Backend = BranchlessJittedForest<float>>
-class RBDT {
+class RBDT final {
 public:
-   using Value_t = typename Backend::Value_t;
-   using Backend_t = Backend;
+   typedef float Value_t;

-private:
-   int fNumOutputs;
-   bool fNormalizeOutputs;
-   std::vector<Backend_t> fBackends;
+   /// IO constructor (both for ROOT IO and LoadText()).
+   RBDT() = default;

-public:
-   /// Construct backends from model in ROOT file
-   RBDT(const std::string &key, const std::string &filename)
-   {
-      // Get number of output nodes of the forest
-      std::unique_ptr<TFile> file{TFile::Open(filename.c_str(),"READ")};
-      if (!file || file->IsZombie()) {
-         throw std::runtime_error("Failed to open input file " + filename);
-      }
-      auto numOutputs = Internal::GetObjectSafe<std::vector<int>>(file.get(), filename, key + "/num_outputs");
-      fNumOutputs = numOutputs->at(0);
-      delete numOutputs;
-
-      // Get objective and decide whether to normalize output nodes for example in the multiclass case
-      auto objective = Internal::GetObjectSafe<std::string>(file.get(), filename, key + "/objective");
-      if (objective->compare("softmax") == 0)
-         fNormalizeOutputs = true;
-      else
-         fNormalizeOutputs = false;
-      delete objective;
-      file->Close();
-
-      // Initialize backends
-      fBackends = std::vector<Backend_t>(fNumOutputs);
-      for (int i = 0; i < fNumOutputs; i++)
-         fBackends[i].Load(key, filename, i);
-   }
+   /// Construct backends from model in ROOT file.
+   RBDT(const std::string &key, const std::string &filename);

-   /// Compute model prediction on a single event
+   /// Compute model prediction on a single event.
    ///
    /// The method is intended to be used with std::vectors-like containers,
    /// for example RVecs.
    template <typename Vector>
-   Vector Compute(const Vector &x)
+   Vector Compute(const Vector &x) const
    {
-      Vector y;
-      y.resize(fNumOutputs);
-      for (int i = 0; i < fNumOutputs; i++)
-         fBackends[i].Inference(&x[0], 1, true, &y[i]);
-      if (fNormalizeOutputs) {
-         Value_t s = 0.0;
-         for (int i = 0; i < fNumOutputs; i++)
-            s += y[i];
-         for (int i = 0; i < fNumOutputs; i++)
-            y[i] /= s;
-      }
+      std::size_t nOut = baseResponses_.size() > 2 ? baseResponses_.size() : 1;
+      Vector y(nOut);
+      ComputeImpl(x.data(), y.data());
      return y;
    }

-   /// Compute model prediction on a single event
-   std::vector<Value_t> Compute(const std::vector<Value_t> &x) { return this->Compute<std::vector<Value_t>>(x); }
+   /// Compute model prediction on a single event.
+   inline std::vector<Value_t> Compute(std::vector<Value_t> const &x) const { return Compute<std::vector<Value_t>>(x); }

-   /// Compute model prediction on input RTensor
-   RTensor<Value_t> Compute(const RTensor<Value_t> &x)
-   {
-      const auto rows = x.GetShape()[0];
-      RTensor<Value_t> y({rows, static_cast<std::size_t>(fNumOutputs)}, MemoryLayout::ColumnMajor);
-      const bool layout = x.GetMemoryLayout() == MemoryLayout::ColumnMajor ? false : true;
-      for (int i = 0; i < fNumOutputs; i++)
-         fBackends[i].Inference(x.GetData(), rows, layout, &y(0, i));
-      if (fNormalizeOutputs) {
-         Value_t s;
-         for (int i = 0; i < static_cast<int>(rows); i++) {
-            s = 0.0;
-            for (int j = 0; j < fNumOutputs; j++)
-               s += y(i, j);
-            for (int j = 0; j < fNumOutputs; j++)
-               y(i, j) /= s;
-         }
-      }
-      return y;
-   }
-};
+   RTensor<Value_t> Compute(RTensor<Value_t> const &x) const;

-extern template class TMVA::Experimental::RBDT<TMVA::Experimental::BranchlessForest<float>>;
-extern template class TMVA::Experimental::RBDT<TMVA::Experimental::BranchlessJittedForest<float>>;
+   static RBDT LoadText(std::string const &txtpath, std::vector<std::string> &features, int nClasses = 2);
+   static RBDT LoadText(std::istream &is, std::vector<std::string> &features, int nClasses = 2);
+
+   std::vector<int> rootIndices_;
+   std::vector<unsigned int> cutIndices_;
+   std::vector<Value_t> cutValues_;
+   std::vector<int> leftIndices_;
+   std::vector<int> rightIndices_;
+   std::vector<Value_t> responses_;
+   std::vector<int> treeNumbers_;
+   std::vector<Value_t> baseResponses_;
+   Value_t baseScore_ = 0.0;
+   bool logistic_ = false;
+
+private:
+   void Softmax(const Value_t *array, Value_t *out) const;
+   void ComputeImpl(const Value_t *array, Value_t *out) const;
+   Value_t EvaluateBinary(const Value_t *array) const;
+
+   ClassDefNV(RBDT, 1);
+};

 } // namespace Experimental
+
 } // namespace TMVA

 #endif // TMVA_RBDT
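Batch inference goes through the RTensor overload of Compute declared above. A hedged PyROOT sketch; the file and key names, the 10x4 input shape, and the use of numpy are illustrative assumptions:

    import numpy as np
    import ROOT

    bdt = ROOT.TMVA.Experimental.RBDT("myModel", "model.root")

    # Wrap a float32 numpy array as an RTensor<float> and evaluate all rows at once.
    x = ROOT.TMVA.Experimental.AsRTensor(np.random.rand(10, 4).astype(np.float32))
    y = bdt.Compute(x)  # RTensor with one score (or one per class) per event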
