[CI] Add xgboost and scikit-learn to requirements #15183

Merged: 4 commits, Apr 9, 2024
@@ -12,12 +12,13 @@
import cppyy


-def SaveXGBoost(self, xgb_model, key_name, output_path, num_inputs=None, tmp_path="/tmp", threshold_dtype="float"):
+def SaveXGBoost(self, xgb_model, key_name, output_path, num_inputs, tmp_path="/tmp", threshold_dtype="float"):
    # Extract objective
    objective_map = {
        "multi:softprob": "softmax", # Naming the objective softmax is more common today
        "binary:logistic": "logistic",
        "reg:linear": "identity",
+       "reg:squarederror": "identity",
    }
    model_objective = xgb_model.objective
    if model_objective not in objective_map:
@@ -48,7 +49,8 @@ def SaveXGBoost(self, xgb_model, key_name, output_path, num_inputs=None, tmp_pat

    import json

-   forest = json.load(open(tmp_path, "r"))
+   with open(tmp_path, "r") as json_file:
+       forest = json.load(json_file)

    # Determine whether the model has a bias parameter and write bias trees
    if hasattr(xgb_model, "base_score") and "reg:" in model_objective:
@@ -96,16 +98,6 @@ def fill_arrays(node, index, inputs_base, thresholds_base):
    for i in range(num_trees):
        outputs[i] = int(i % num_outputs)

-   # Determine number of input variables
-   if not num_inputs is None:
-       pass
-   elif hasattr(xgb_model, "_features_count"):
-       num_inputs = xgb_model._features_count
-   else:
-       raise Exception(
-           "Failed to get number of input variables from XGBoost model. Please provide the additional keyword argument 'num_inputs' to this function."
-       )
-
    # Store arrays in a ROOT file in a folder with the given key name
    # TODO: Write single values as simple integers and not vectors.
    f = cppyy.gbl.TFile(output_path, "RECREATE")
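With this change, num_inputs is a required argument of SaveXGBoost instead of being inferred from the model's _features_count attribute. A minimal usage sketch, mirroring the updated tests below (the toy dataset and file names are illustrative, not from the PR):

import numpy as np
import xgboost  # imported before ROOT; see the tutorial change at the bottom of this diff

import ROOT

# Illustrative toy dataset with 10 input features.
x = np.random.randn(1000, 10)
y = np.random.randint(0, 2, 1000)

model = xgboost.XGBClassifier(n_estimators=100, max_depth=3)
model.fit(x, y)

# num_inputs is now mandatory and must match the number of training features.
ROOT.TMVA.Experimental.SaveXGBoost(model, "myModel", "model.root", num_inputs=10)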
2 changes: 2 additions & 0 deletions requirements.txt
@@ -3,6 +3,8 @@
# PyROOT: Interoperability with numpy arrays
# TMVA: PyMVA interfaces
numpy>=1.4.1
+scikit-learn
+xgboost

# PyROOT: ROOT.Numba.Declare decorator
numba>=0.47.0 ; python_version < "3.11" # See https://github.com/numba/numba/issues/8304
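These two entries make the packages available wherever CI installs requirements.txt. A quick sanity check that both import cleanly (versions will vary by environment):

# Verify the newly listed CI dependencies resolve.
import sklearn
import xgboost

print("scikit-learn", sklearn.__version__)
print("xgboost", xgboost.__version__)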
14 changes: 8 additions & 6 deletions tmva/tmva/test/CMakeLists.txt
@@ -31,12 +31,14 @@ if(dataframe)
  ROOT_ADD_GTEST(rbdt rbdt.cxx LIBRARIES ROOTVecOps TMVAUtils)
endif()

-if(dataframe)
-  find_python_module(xgboost QUIET)
-  if (PY_XGBOOST_FOUND)
-    ROOT_ADD_PYUNITTEST(rbdt_xgboost rbdt_xgboost.py)
-  endif()
-endif()
+# Disabled because RBDT doesn't support the imbalanced tree structure of
+# XGBoost models.
+# if(dataframe)
+#   find_python_module(xgboost QUIET)
+#   if (PY_XGBOOST_FOUND)
+#     ROOT_ADD_PYUNITTEST(rbdt_xgboost rbdt_xgboost.py)
+#   endif()
+# endif()

#--stressTMVA--------------------------------------------------------------------------------------

4 changes: 2 additions & 2 deletions tmva/tmva/test/rbdt_xgboost.py
@@ -21,7 +21,7 @@ def _test_XGBBinary(backend, label):
    x, y = create_dataset(1000, 10, 2)
    xgb = xgboost.XGBClassifier(n_estimators=100, max_depth=3)
    xgb.fit(x, y)
-   ROOT.TMVA.Experimental.SaveXGBoost(xgb, "myModel", "testXGBBinary{}.root".format(label))
+   ROOT.TMVA.Experimental.SaveXGBoost(xgb, "myModel", "testXGBBinary{}.root".format(label), num_inputs=10)
    bdt = ROOT.TMVA.Experimental.RBDT[backend]("myModel", "testXGBBinary{}.root".format(label))

    y_xgb = xgb.predict_proba(x)[:, 1].squeeze()
@@ -51,7 +51,7 @@ def _test_XGBMulticlass(backend, label):
    x, y = create_dataset(1000, 10, 3)
    xgb = xgboost.XGBClassifier(n_estimators=100, max_depth=3)
    xgb.fit(x, y)
-   ROOT.TMVA.Experimental.SaveXGBoost(xgb, "myModel", "testXGBMulticlass{}.root".format(label))
+   ROOT.TMVA.Experimental.SaveXGBoost(xgb, "myModel", "testXGBMulticlass{}.root".format(label), num_inputs=10)
    bdt = ROOT.TMVA.Experimental.RBDT[backend]("myModel", "testXGBMulticlass{}.root".format(label))

    y_xgb = xgb.predict_proba(x)
7 changes: 5 additions & 2 deletions tutorials/tmva/tmva101_Training.py
@@ -12,9 +12,13 @@
## \date August 2019
## \author Stefan Wunsch

+# XGBoost has to be imported before ROOT to avoid crashes because of clashing
+# std::regexp symbols that are exported by cppyy.
+# See also: https://github.com/wlav/cppyy/issues/227
+from xgboost import XGBClassifier
import ROOT
import numpy as np
import pickle

from tmva100_DataPreparation import variables

@@ -45,7 +49,6 @@ def load_data(signal_filename, background_filename):
x, y, w = load_data("train_signal.root", "train_background.root")

# Fit xgboost model
-from xgboost import XGBClassifier
bdt = XGBClassifier(max_depth=3, n_estimators=500)
bdt.fit(x, y, sample_weight=w)

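The import-order constraint added to the tutorial applies to any script that mixes the two libraries. A minimal sketch of the safe pattern, assuming both packages are installed:

# xgboost must be imported before ROOT: cppyy exports clashing std::regexp
# symbols, and the reverse order can crash the interpreter
# (https://github.com/wlav/cppyy/issues/227).
from xgboost import XGBClassifier

import ROOT

# The model can then be trained and saved as in tmva101_Training.py.
bdt = XGBClassifier(max_depth=3, n_estimators=500)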