From 40eae8c4c45974705f8053e4d3d05b88e3cfaefd Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Tue, 20 Aug 2024 02:26:45 +0800
Subject: [PATCH] Avoid the use of shared variables. (#50)

- Use register instead.
- Fix Python benchmark script with latest pandas and XGBoost.
- Update the CI.

---------

Co-authored-by: Bradley Dice
---
 .github/workflows/pr.yaml  |   6 +-
 CMakeLists.txt             |   2 +-
 Doxyfile.in                |   2 +-
 GPUTreeShap/gpu_treeshap.h |   8 +--
 benchmark/benchmark.py     | 109 ++++++++++++++++++++++++-------------
 dependencies.yaml          |   4 ++
 fetch_rapids.cmake         |   2 +-
 7 files changed, 83 insertions(+), 50 deletions(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 4f044a4..38a092e 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -15,16 +15,16 @@ jobs:
       - checks
       - cpp-build-test
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10
     with:
      enable_check_generated_files: false
   cpp-build-test:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10
     with:
       build_type: pull-request
       script: ci/build_and_test.sh
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 34ac201..ef2a7cd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,7 +19,7 @@ cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)
 
 include(FetchContent)
 
-project(GPUTreeShap VERSION 24.04.00 LANGUAGES CXX CUDA)
+project(GPUTreeShap VERSION 24.10.00 LANGUAGES CXX CUDA)
 
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CUDA_STANDARD 17)
diff --git a/Doxyfile.in b/Doxyfile.in
index f3f1c6f..49a869f 100644
--- a/Doxyfile.in
+++ b/Doxyfile.in
@@ -38,7 +38,7 @@ PROJECT_NAME = "GPUTreeShap"
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
 
-PROJECT_NUMBER = "24.04"
+PROJECT_NUMBER = "24.10"
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
diff --git a/GPUTreeShap/gpu_treeshap.h b/GPUTreeShap/gpu_treeshap.h
index bc213d3..c9d5d6c 100644
--- a/GPUTreeShap/gpu_treeshap.h
+++ b/GPUTreeShap/gpu_treeshap.h
@@ -459,15 +459,13 @@ __global__ void __launch_bounds__(GPUTREESHAP_MAX_THREADS_PER_BLOCK)
                const PathElement* path_elements,
                const size_t* bin_segments, size_t num_groups, double* phis) {
   // Use shared memory for structs, otherwise nvcc puts in local memory
-  __shared__ DatasetT s_X;
-  s_X = X;
   __shared__ PathElement s_elements[kBlockSize];
   PathElement& e = s_elements[threadIdx.x];
 
   size_t start_row, end_row;
   bool thread_active;
   ConfigureThread(
-      s_X, bins_per_row, path_elements, bin_segments, &start_row, &end_row, &e,
+      X, bins_per_row, path_elements, bin_segments, &start_row, &end_row, &e,
       &thread_active);
   uint32_t mask = __ballot_sync(FULL_MASK, thread_active);
   if (!thread_active) return;
@@ -564,15 +562,13 @@ __global__ void __launch_bounds__(GPUTREESHAP_MAX_THREADS_PER_BLOCK)
                           const size_t* bin_segments, size_t num_groups,
                           double* phis_interactions) {
   // Use shared memory for structs, otherwise nvcc puts in local memory
-  __shared__ DatasetT s_X;
-  s_X = X;
   __shared__ PathElement s_elements[kBlockSize];
   PathElement* e = &s_elements[threadIdx.x];
 
   size_t start_row, end_row;
   bool thread_active;
   ConfigureThread(
-      s_X, bins_per_row, path_elements, bin_segments, &start_row, &end_row, e,
+      X, bins_per_row, path_elements, bin_segments, &start_row, &end_row, e,
       &thread_active);
   uint32_t mask = __ballot_sync(FULL_MASK, thread_active);
   if (!thread_active) return;
diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py
index 70caecd..dc916b8 100644
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@@ -1,42 +1,62 @@
-import xgboost as xgb
-import numpy as np
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from __future__ import annotations
+
+import argparse
 import time
-from sklearn import datasets
-from joblib import Memory
+
+import numpy as np
 import pandas as pd
-import argparse
+import xgboost as xgb
+from joblib import Memory
+from sklearn import datasets
 
 memory = Memory('./cachedir', verbose=0)
 
 
 # Contains a dataset in numpy format as well as the relevant objective and metric
 class TestDataset:
-    def __init__(self, name, Xy, objective
-                 ):
+    def __init__(self, name, Xy, objective):
         self.name = name
         self.objective = objective
         self.X, self.y = Xy
 
     def set_params(self, params_in):
-        params_in['objective'] = self.objective
+        params_in["objective"] = self.objective
         if self.objective == "multi:softmax":
             params_in["num_class"] = int(np.max(self.y) + 1)
         return params_in
 
     def get_dmat(self):
-        return xgb.DMatrix(self.X, self.y)
+        return xgb.QuantileDMatrix(self.X, self.y, enable_categorical=True)
 
     def get_test_dmat(self, num_rows):
         rs = np.random.RandomState(432)
-        return xgb.DMatrix(self.X[rs.randint(0, self.X.shape[0], size=num_rows), :])
+        if hasattr(self.X, "iloc"):
+            x = self.X.iloc[rs.randint(0, self.X.shape[0], size=num_rows), :]
+        else:
+            x = self.X[rs.randint(0, self.X.shape[0], size=num_rows), :]
+        return xgb.DMatrix(x, enable_categorical=True)
 
 
 @memory.cache
-def train_model(dataset, max_depth, num_rounds):
+def train_model(dataset: TestDataset, max_depth: int, num_rounds: int) -> xgb.Booster:
     dmat = dataset.get_dmat()
-    params = {'tree_method': 'gpu_hist', 'max_depth': max_depth, 'eta': 0.01}
+    params = {'tree_method': 'hist', "device": "gpu", 'max_depth': max_depth, 'eta': 0.01}
     params = dataset.set_params(params)
-    model = xgb.train(params, dmat, num_rounds, [(dmat, 'train')])
+    model = xgb.train(params, dmat, num_rounds, evals=[(dmat, 'train')])
     return model
 
 
@@ -64,33 +84,34 @@ def get_model_stats(model):
 
 
 class Model:
-    def __init__(self, name, dataset, num_rounds, max_depth):
+    def __init__(
+        self, name: str, dataset: TestDataset, num_rounds: int, max_depth: int
+    ) -> None:
         self.name = name
         self.dataset = dataset
         self.num_rounds = num_rounds
         self.max_depth = max_depth
 
         print("Training " + name)
         self.xgb_model = train_model(dataset, max_depth, num_rounds)
-        self.num_trees, self.num_leaves, self.average_depth = get_model_stats(self.xgb_model)
+        self.num_trees, self.num_leaves, self.average_depth = get_model_stats(
+            self.xgb_model
+        )
 
 
 def check_accuracy(shap, margin):
-    if len(shap.shape) == 2:
-        sum = np.sum(shap, axis=len(shap.shape) - 1)
-    else:
-        sum = np.sum(shap, axis=(len(shap.shape) - 1, len(shap.shape) - 2))
+    shap = np.sum(shap, axis=len(shap.shape) - 1)
 
-    if not np.allclose(sum, margin, 1e-1, 1e-1):
+    if not np.allclose(shap, margin, 1e-1, 1e-1):
         print("Warning: Failed 1e-1 accuracy")
 
 
-def get_models(model):
+def get_models(model: str) -> list[Model]:
     test_datasets = [
+        TestDataset("adult", fetch_adult(), "binary:logistic"),
         TestDataset("covtype", datasets.fetch_covtype(return_X_y=True), "multi:softmax"),
         TestDataset("cal_housing", datasets.fetch_california_housing(return_X_y=True), "reg:squarederror"),
         TestDataset("fashion_mnist", fetch_fashion_mnist(), "multi:softmax"),
-        TestDataset("adult", fetch_adult(), "binary:logistic"),
     ]
 
     models = []
@@ -110,24 +131,37 @@ def get_models(model):
 def print_model_stats(models, args):
     # get model statistics
     models_df = pd.DataFrame(
-        columns=["model", "num_rounds", "num_trees", "num_leaves", "max_depth", "average_depth"])
-    for m in models:
-        models_df = models_df.append(
-            {"model": m.name, "num_rounds": m.num_rounds, "num_trees": m.num_trees,
-             "num_leaves": m.num_leaves, "max_depth": m.max_depth,
-             "average_depth": m.average_depth},
-            ignore_index=True)
+        columns=[
+            "model",
+            "num_rounds",
+            "num_trees",
+            "num_leaves",
+            "max_depth",
+            "average_depth",
+        ]
+    )
+    for i, m in enumerate(models):
+        df = pd.DataFrame.from_dict(
+            {
+                "model": [m.name],
+                "num_rounds": [m.num_rounds],
+                "num_trees": [m.num_trees],
+                "num_leaves": [m.num_leaves],
+                "max_depth": [m.max_depth],
+                "average_depth": [m.average_depth],
+            }
+        )
+        models_df = pd.concat([models_df, df])
     print(models_df)
     print("Writing model statistics to: " + args.out_models)
     models_df.to_csv(args.out_models, index=False)
 
 
-def run_benchmark(args):
-    models = get_models(args)
+def run_benchmark(args: argparse.Namespace) -> None:
+    models = get_models(args.model)
     print_model_stats(models, args)
 
-    predictors = ["cpu_predictor", "gpu_predictor"]
-    # predictors = ["gpu_predictor"]
+    devices = ["cpu", "gpu"]
     test_rows = args.nrows
     df = pd.DataFrame(
         columns=["model", "test_rows", "cpu_time(s)", "cpu_std", "gpu_time(s)", "gpu_std",
@@ -135,8 +169,8 @@ def run_benchmark(args):
     for m in models:
         dtest = m.dataset.get_test_dmat(test_rows)
         result_row = {"model": m.name, "test_rows": test_rows, "cpu_time(s)": 0.0}
-        for p in predictors:
-            m.xgb_model.set_param({"predictor": p})
+        for p in devices:
+            m.xgb_model.set_param({"device": p})
             samples = []
             for i in range(args.niter):
                 start = time.perf_counter()
@@ -145,7 +179,7 @@ def run_benchmark(args):
                 else:
                     xgb_shap = m.xgb_model.predict(dtest, pred_contribs=True)
                 samples.append(time.perf_counter() - start)
-            if p is "gpu_predictor":
+            if p == "gpu":
                 result_row["gpu_time(s)"] = np.mean(samples)
                 result_row["gpu_std"] = np.std(samples)
             else:
@@ -156,8 +190,7 @@ def run_benchmark(args):
                 check_accuracy(xgb_shap, margin)
 
         result_row["speedup"] = result_row["cpu_time(s)"] / result_row["gpu_time(s)"]
-        df = df.append(result_row,
-                       ignore_index=True)
+        df = pd.concat([df, pd.DataFrame.from_records([result_row])])
     print(df)
     print("Writing results to: " + args.out)
     df.to_csv(args.out, index=False)
diff --git a/dependencies.yaml b/dependencies.yaml
index 333b122..360a5f4 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -45,6 +45,10 @@ dependencies:
               cuda: "12.2"
             packages:
               - cuda-version=12.2
+          - matrix:
+              cuda: "12.5"
+            packages:
+              - cuda-version=12.5
   cuda:
     specific:
       - output_types: conda
diff --git a/fetch_rapids.cmake b/fetch_rapids.cmake
index f494812..8e2ef84 100644
--- a/fetch_rapids.cmake
+++ b/fetch_rapids.cmake
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/GPUTREESHAP_RAPIDS.cmake)
-  file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.04/RAPIDS.cmake
+  file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.10/RAPIDS.cmake
        ${CMAKE_CURRENT_BINARY_DIR}/GPUTREESHAP_RAPIDS.cmake
   )
 endif()