rapidsai · Aug 23, 2022 · Sep 1, 2022 · Nov 5, 2022 · Apr 27, 2023 · Apr 28, 2023
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
@@ -0,0 +1,14 @@
+#gputreeshap code owners
+*                  @rapidsai/gputreeshap-write
+
+#CI code owners
+/.github/                @rapidsai/ci-codeowners
+/ci/                     @rapidsai/ci-codeowners
+/.pre-commit-config.yaml @rapidsai/ci-codeowners
+
+#packaging code owners
+/.devcontainer/    @rapidsai/packaging-codeowners
+/conda/            @rapidsai/packaging-codeowners
+/dependencies.yaml @rapidsai/packaging-codeowners
+/build.sh          @rapidsai/packaging-codeowners
+pyproject.toml     @rapidsai/packaging-codeowners
diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml
@@ -0,0 +1,4 @@
+# Configuration file for `copy-pr-bot` GitHub App
+# https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/
+
+enabled: true
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
@@ -0,0 +1,31 @@
+name: pr
+
+on:
+  push:
+    branches:
+      - "pull-request/[0-9]+"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  pr-builder:
+    needs:
+      - checks
+      - cpp-build-test
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10
+  checks:
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10
+    with:
+      enable_check_generated_files: false
+  cpp-build-test:
+    needs: checks
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10
+    with:
+      build_type: pull-request
+      script: ci/build_and_test.sh
+      matrix_filter: map(select(.ARCH == "amd64"))
diff --git a/.gitignore b/.gitignore
@@ -4,4 +4,4 @@ build/
 *.pdf
 *.csv
 *.png
-*.pyc
+*.pyc
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,26 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.5.0
+    hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+  - repo: https://github.com/cpplint/cpplint
+    rev: 1.6.1
+    hooks:
+      - id: cpplint
+        args: ["--recursive"]
+  - repo: https://github.com/rapidsai/pre-commit-hooks
+    rev: v0.0.3
+    hooks:
+      - id: verify-copyright
+        args: ["--fix", "--main-branch", "main"]
+  - repo: https://github.com/rapidsai/dependency-file-generator
+    rev: v1.13.11
+    hooks:
+      - id: rapids-dependency-file-generator
+        args: ["--clean"]
+
+default_language_version:
+  python: python3
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,8 +1,38 @@
-cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+#=============================================================================
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)
+
+
 include(FetchContent)
-project(GPUTreeShap LANGUAGES CXX CUDA)
-set(CMAKE_CXX_STANDARD 14)
-set(CMAKE_CUDA_STANDARD 14)
+
+include(fetch_rapids.cmake)
+
+include(rapids-cmake)
+include(rapids-cpm)
+include(rapids-cuda)
+include(rapids-export)
+include(rapids-find)
+
+# rapids_cuda_init_architectures() must run before project() and be given the same project name.
+rapids_cuda_init_architectures(GPUTreeShap)
+project(GPUTreeShap VERSION 24.10.00 LANGUAGES CXX CUDA)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CUDA_STANDARD 17)
+
 option(BUILD_GTEST "Build google tests" OFF)
 option(BUILD_EXAMPLES "Build examples" OFF)
 option(BUILD_BENCHMARKS "Build benchmarks" OFF)
@@ -12,31 +42,31 @@ if (NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE "Release" CACHE STRING "" FORCE)
 endif()
 
+# add third party dependencies using CPM
+rapids_cpm_init()
+
 # GPUTreeSHAP target is header-only
 add_library(GPUTreeShap INTERFACE)
 add_library(GPUTreeShap::GPUTreeShap ALIAS GPUTreeShap)
 target_sources(GPUTreeShap INTERFACE ${GPUTreeShap_SOURCE_DIR}/GPUTreeShap/gpu_treeshap.h)
 target_include_directories(GPUTreeShap INTERFACE ${GPUTreeShap_SOURCE_DIR})
 
+# CXX_STANDARD/CUDA_STANDARD are not usage requirements; use INTERFACE compile features so consumers inherit C++17.
+target_compile_features(GPUTreeShap INTERFACE cxx_std_17 cuda_std_17)
 set(COMPILE_OPTIONS --expt-extended-lambda -lineinfo --Werror all-warnings)
 set(GCC_COMPILE_OPTIONS -Xcompiler -Werror,-Wall,-Wextra)
+
 if(BUILD_GTEST)
-  set(gtest_force_shared_crt ON CACHE BOOL "Always use msvcrt.dll")
-  FetchContent_Declare(
-    googletest
-    GIT_REPOSITORY https://github.com/google/googletest.git
-    GIT_TAG        release-1.8.1
-    GIT_SHALLOW       true
-  )
-  FetchContent_MakeAvailable(googletest)
+  include(${rapids-cmake-dir}/cpm/gtest.cmake)
+  rapids_cpm_gtest()
+
   add_executable(TestGPUTreeShap tests/test_gpu_treeshap.cu)
   target_link_libraries(TestGPUTreeShap PRIVATE GPUTreeShap)
   target_compile_options(TestGPUTreeShap PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: ${COMPILE_OPTIONS}>)
   if(NOT MSVC)
     target_compile_options(TestGPUTreeShap PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:${GCC_COMPILE_OPTIONS}>)
-  else()
   endif()
-  target_link_libraries(TestGPUTreeShap  PRIVATE gtest gtest_main)
+  target_link_libraries(TestGPUTreeShap PRIVATE GTest::gtest GTest::gtest_main)
 endif()
 
 if(BUILD_EXAMPLES)
@@ -49,14 +79,9 @@ if(BUILD_EXAMPLES)
 endif()
 
 if(BUILD_BENCHMARKS)
-  FetchContent_Declare(
-    benchmark
-    GIT_REPOSITORY https://github.com/google/benchmark.git
-    GIT_TAG        v1.5.2
-    GIT_SHALLOW       true
-  )
-  set(BENCHMARK_ENABLE_TESTING OFF CACHE INTERNAL "Google benchmark tests off")
-  FetchContent_MakeAvailable(benchmark)
+  include(${rapids-cmake-dir}/cpm/gbench.cmake)
+  rapids_cpm_gbench()
+
   add_executable(BenchmarkGPUTreeShap benchmark/benchmark_gpu_treeshap.cu)
   target_link_libraries(BenchmarkGPUTreeShap PRIVATE GPUTreeShap)
   target_compile_options(BenchmarkGPUTreeShap PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: ${COMPILE_OPTIONS}>)
@@ -69,4 +94,4 @@ endif()
 include(cmake/doxygen.cmake)
 add_doxygen_target(IN_DOXYFILE Doxyfile.in
   OUT_DOXYFILE ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile
-  CWD ${CMAKE_CURRENT_BINARY_DIR})
+  CWD ${CMAKE_CURRENT_BINARY_DIR})
diff --git a/Doxyfile.in b/Doxyfile.in
@@ -38,7 +38,7 @@ PROJECT_NAME           = "GPUTreeShap"
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
 
-PROJECT_NUMBER         =
+PROJECT_NUMBER         = "24.10"
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a

diff --git a/GPUTreeShap/gpu_treeshap.h b/GPUTreeShap/gpu_treeshap.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,13 +15,27 @@
  */
 
 #pragma once
+
+#include <thrust/copy.h>
 #include <thrust/device_allocator.h>
 #include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+#include <thrust/for_each.h>
+#include <thrust/functional.h>
+#include <thrust/host_vector.h>
+#include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
 #include <thrust/logical.h>
+#include <thrust/pair.h>
 #include <thrust/reduce.h>
-#include <thrust/host_vector.h>
+#include <thrust/scan.h>
+#include <thrust/sort.h>
+#include <thrust/system/cuda/error.h>
+#include <thrust/system_error.h>
+
 #include <cub/cub.cuh>
+
 #include <algorithm>
 #include <functional>
 #include <set>
@@ -100,16 +114,16 @@ struct PathElement {
   }
 
   /*! Unique path index. */
-  size_t path_idx;
+  size_t path_idx{};
   /*! Feature of this split, -1 indicates bias term. */
-  int64_t feature_idx;
+  int64_t feature_idx{};
   /*! Indicates class for multiclass problems. */
-  int group;
-  SplitConditionT split_condition;
+  int group{};
+  SplitConditionT split_condition{};
   /*! Probability of following this path when feature_idx is not in the active
    * set. */
-  double zero_fraction;
-  float v;  // Leaf weight at the end of the path
+  double zero_fraction{};
+  float v{};  // Leaf weight at the end of the path
 };
 
 // Helper function that accepts an index into a flat contiguous array and the
@@ -445,15 +459,13 @@ __global__ void __launch_bounds__(GPUTREESHAP_MAX_THREADS_PER_BLOCK)
                const PathElement<SplitConditionT>* path_elements,
                const size_t* bin_segments, size_t num_groups, double* phis) {
   // Use shared memory for structs, otherwise nvcc puts in local memory
-  __shared__ DatasetT s_X;
-  s_X = X;
   __shared__ PathElement<SplitConditionT> s_elements[kBlockSize];
   PathElement<SplitConditionT>& e = s_elements[threadIdx.x];
 
   size_t start_row, end_row;
   bool thread_active;
   ConfigureThread<DatasetT, kBlockSize, kRowsPerWarp>(
-      s_X, bins_per_row, path_elements, bin_segments, &start_row, &end_row, &e,
+      X, bins_per_row, path_elements, bin_segments, &start_row, &end_row, &e,
       &thread_active);
   uint32_t mask = __ballot_sync(FULL_MASK, thread_active);
   if (!thread_active) return;
@@ -550,15 +562,13 @@ __global__ void __launch_bounds__(GPUTREESHAP_MAX_THREADS_PER_BLOCK)
                            const size_t* bin_segments, size_t num_groups,
                            double* phis_interactions) {
   // Use shared memory for structs, otherwise nvcc puts in local memory
-  __shared__ DatasetT s_X;
-  s_X = X;
   __shared__ PathElement<SplitConditionT> s_elements[kBlockSize];
   PathElement<SplitConditionT>* e = &s_elements[threadIdx.x];
 
   size_t start_row, end_row;
   bool thread_active;
   ConfigureThread<DatasetT, kBlockSize, kRowsPerWarp>(
-      s_X, bins_per_row, path_elements, bin_segments, &start_row, &end_row, e,
+      X, bins_per_row, path_elements, bin_segments, &start_row, &end_row, e,
       &thread_active);
   uint32_t mask = __ballot_sync(FULL_MASK, thread_active);
   if (!thread_active) return;
@@ -890,7 +900,7 @@ void DeduplicatePaths(PathVectorT* device_paths,
   size_t* h_num_runs_out;
   CheckCuda(cudaMallocHost(&h_num_runs_out, sizeof(size_t)));
 
-  auto combine = [] __device__(PathElement<SplitConditionT> a,
+  auto combine = [] __host__ __device__(PathElement<SplitConditionT> a,
                                PathElement<SplitConditionT> b) {
     // Combine duplicate features
     a.split_condition.Merge(b.split_condition);
@@ -1171,7 +1181,7 @@ void ComputeBias(const PathVectorT& device_paths, DoubleVectorT* bias) {
                                                   PathIdxTransformOp());
   PathVectorT combined(sorted_paths.size());
   auto combined_out = thrust::reduce_by_key(
-      thrust::cuda ::par(alloc), path_key, path_key + sorted_paths.size(),
+      thrust::cuda::par(alloc), path_key, path_key + sorted_paths.size(),
       sorted_paths.begin(), thrust::make_discard_iterator(), combined.begin(),
       thrust::equal_to<size_t>(),
       [=] __device__(PathElement<SplitConditionT> a,
@@ -1220,15 +1230,15 @@ void ComputeBias(const PathVectorT& device_paths, DoubleVectorT* bias) {
  * ensemble size.
  *
  * \exception std::invalid_argument Thrown when an invalid argument error
- * condition occurs. 
+ * condition occurs.
  * \tparam  PathIteratorT     Thrust type iterator, may be
  * thrust::device_ptr for device memory, or stl iterator/raw pointer for host
- * memory. 
+ * memory.
  * \tparam  PhiIteratorT      Thrust type iterator, may be
  * thrust::device_ptr for device memory, or stl iterator/raw pointer for host
- * memory. Value type must be floating point. 
+ * memory. Value type must be floating point.
  * \tparam  DatasetT User-specified
- * dataset container. 
+ * dataset container.
  * \tparam  DeviceAllocatorT  Optional thrust style
  * allocator.
  *
@@ -1242,13 +1252,13 @@ void ComputeBias(const PathVectorT& device_paths, DoubleVectorT* bias) {
  * root with feature_idx = -1 and zero_fraction = 1.0. The ordering of path
  * elements inside a unique path does not matter - the result will be the same.
  * Paths may contain duplicate features. See the PathElement class for more
- * information. 
- * \param end         Path end iterator. 
+ * information.
+ * \param end         Path end iterator.
  * \param num_groups  Number
  * of output groups. In multiclass classification the algorithm outputs feature
- * contributions per output class. 
+ * contributions per output class.
  * \param phis_begin  Begin iterator for output
- * phis. 
+ * phis.
  * \param phis_end    End iterator for output phis.
  */
 template <typename DeviceAllocatorT = thrust::device_allocator<int>,

diff --git a/README.md b/README.md
@@ -5,7 +5,7 @@ GPUTreeShap is a cuda implementation of the TreeShap algorithm by Lundberg et al
 See the associated publication [here](https://arxiv.org/abs/2010.13972)
 ```
 @misc{mitchell2022gputreeshap,
-      title={GPUTreeShap: Massively Parallel Exact Calculation of SHAP Scores for Tree Ensembles}, 
+      title={GPUTreeShap: Massively Parallel Exact Calculation of SHAP Scores for Tree Ensembles},
       author={Rory Mitchell and Eibe Frank and Geoffrey Holmes},
       year={2022},
       eprint={2010.13972},
-Original file line number
+Diff line change
@@ @@ -4,4 +4,4 @@ build/ @@
     *.pdf
     *.csv
     *.png
-    *.pyc
+    *.pyc