From 40eae8c4c45974705f8053e4d3d05b88e3cfaefd Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Tue, 20 Aug 2024 02:26:45 +0800
Subject: [PATCH] Avoid the use of shared variables. (#50)

- Use register instead.
- Fix Python benchmark script with latest pandas and XGBoost.
- Update the CI.

---------

Co-authored-by: Bradley Dice
---
 .github/workflows/pr.yaml  |   6 +-
 CMakeLists.txt             |   2 +-
 Doxyfile.in                |   2 +-
 GPUTreeShap/gpu_treeshap.h |   8 +--
 benchmark/benchmark.py     | 109 ++++++++++++++++++++++++-------------
 dependencies.yaml          |   4 ++
 fetch_rapids.cmake         |   2 +-
 7 files changed, 83 insertions(+), 50 deletions(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 4f044a4..38a092e 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -15,16 +15,16 @@ jobs:
       - checks
       - cpp-build-test
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10
     with:
      enable_check_generated_files: false
   cpp-build-test:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10
     with:
       build_type: pull-request
       script: ci/build_and_test.sh
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 34ac201..ef2a7cd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,7 +19,7 @@ cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)
 
 include(FetchContent)
 
-project(GPUTreeShap VERSION 24.04.00 LANGUAGES CXX CUDA)
+project(GPUTreeShap VERSION 24.10.00 LANGUAGES CXX CUDA)
 
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CUDA_STANDARD 17)
diff --git a/Doxyfile.in b/Doxyfile.in
index f3f1c6f..49a869f 100644
--- a/Doxyfile.in
+++ b/Doxyfile.in
@@ -38,7 +38,7 @@ PROJECT_NAME = "GPUTreeShap"
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
 
-PROJECT_NUMBER = "24.04"
+PROJECT_NUMBER = "24.10"
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
diff --git a/GPUTreeShap/gpu_treeshap.h b/GPUTreeShap/gpu_treeshap.h
index bc213d3..c9d5d6c 100644
--- a/GPUTreeShap/gpu_treeshap.h
+++ b/GPUTreeShap/gpu_treeshap.h
@@ -459,15 +459,13 @@ __global__ void __launch_bounds__(GPUTREESHAP_MAX_THREADS_PER_BLOCK)
                const PathElement* path_elements,
                const size_t* bin_segments, size_t num_groups, double* phis) {
   // Use shared memory for structs, otherwise nvcc puts in local memory
-  __shared__ DatasetT s_X;
-  s_X = X;
   __shared__ PathElement s_elements[kBlockSize];
   PathElement& e = s_elements[threadIdx.x];
 
   size_t start_row, end_row;
   bool thread_active;
   ConfigureThread(
-      s_X, bins_per_row, path_elements, bin_segments, &start_row, &end_row, &e,
+      X, bins_per_row, path_elements, bin_segments, &start_row, &end_row, &e,
       &thread_active);
   uint32_t mask = __ballot_sync(FULL_MASK, thread_active);
   if (!thread_active) return;
@@ -564,15 +562,13 @@ __global__ void __launch_bounds__(GPUTREESHAP_MAX_THREADS_PER_BLOCK)
                           const size_t* bin_segments, size_t num_groups,
                           double* phis_interactions) {
   // Use shared memory for structs, otherwise nvcc puts in local memory
-  __shared__ DatasetT s_X;
-  s_X = X;
   __shared__ PathElement s_elements[kBlockSize];
   PathElement* e = &s_elements[threadIdx.x];
 
   size_t start_row, end_row;
   bool thread_active;
   ConfigureThread(
-      s_X, bins_per_row, path_elements, bin_segments, &start_row, &end_row, e,
+      X, bins_per_row, path_elements, bin_segments, &start_row, &end_row, e,
       &thread_active);
   uint32_t mask = __ballot_sync(FULL_MASK, thread_active);
   if (!thread_active) return;
diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py
index 70caecd..dc916b8 100644
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@@ -1,42 +1,62 @@
-import xgboost as xgb
-import numpy as np
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from __future__ import annotations
+
+import argparse
 import time
-from sklearn import datasets
-from joblib import Memory
+
+import numpy as np
 import pandas as pd
-import argparse
+import xgboost as xgb
+from joblib import Memory
+from sklearn import datasets
 
 memory = Memory('./cachedir', verbose=0)
 
 
 # Contains a dataset in numpy format as well as the relevant objective and metric
 class TestDataset:
-    def __init__(self, name, Xy, objective
-                 ):
+    def __init__(self, name, Xy, objective):
         self.name = name
         self.objective = objective
         self.X, self.y = Xy
 
     def set_params(self, params_in):
-        params_in['objective'] = self.objective
+        params_in["objective"] = self.objective
         if self.objective == "multi:softmax":
             params_in["num_class"] = int(np.max(self.y) + 1)
         return params_in
 
     def get_dmat(self):
-        return xgb.DMatrix(self.X, self.y)
+        return xgb.QuantileDMatrix(self.X, self.y, enable_categorical=True)
 
     def get_test_dmat(self, num_rows):
         rs = np.random.RandomState(432)
-        return xgb.DMatrix(self.X[rs.randint(0, self.X.shape[0], size=num_rows), :])
+        if hasattr(self.X, "iloc"):
+            x = self.X.iloc[rs.randint(0, self.X.shape[0], size=num_rows), :]
+        else:
+            x = self.X[rs.randint(0, self.X.shape[0], size=num_rows), :]
+        return xgb.DMatrix(x, enable_categorical=True)
 
 
 @memory.cache
-def train_model(dataset, max_depth, num_rounds):
+def train_model(dataset: TestDataset, max_depth: int, num_rounds: int) -> xgb.Booster:
     dmat = dataset.get_dmat()
-    params = {'tree_method': 'gpu_hist', 'max_depth': max_depth, 'eta': 0.01}
+    params = {'tree_method': 'hist', "device": "gpu", 'max_depth': max_depth, 'eta': 0.01}
     params = dataset.set_params(params)
-    model = xgb.train(params, dmat, num_rounds, [(dmat, 'train')])
+    model = xgb.train(params, dmat, num_rounds, evals=[(dmat, 'train')])
     return model
 
 
@@ -64,33 +84,34 @@ def get_model_stats(model):
 
 
 class Model:
-    def __init__(self, name, dataset, num_rounds, max_depth):
+    def __init__(
+        self, name: str, dataset: TestDataset, num_rounds: int, max_depth: int
+    ) -> None:
         self.name = name
         self.dataset = dataset
         self.num_rounds = num_rounds
         self.max_depth = max_depth
 
         print("Training " + name)
         self.xgb_model = train_model(dataset, max_depth, num_rounds)
-        self.num_trees, self.num_leaves, self.average_depth = get_model_stats(self.xgb_model)
+        self.num_trees, self.num_leaves, self.average_depth = get_model_stats(
+            self.xgb_model
+        )
 
 
 def check_accuracy(shap, margin):
-    if len(shap.shape) == 2:
-        sum = np.sum(shap, axis=len(shap.shape) - 1)
-    else:
-        sum = np.sum(shap, axis=(len(shap.shape) - 1, len(shap.shape) - 2))
+    shap = np.sum(shap, axis=len(shap.shape) - 1)
 
-    if not np.allclose(sum, margin, 1e-1, 1e-1):
+    if not np.allclose(shap, margin, 1e-1, 1e-1):
         print("Warning: Failed 1e-1 accuracy")
 
 
-def get_models(model):
+def get_models(model: str) -> list[Model]:
     test_datasets = [
+        TestDataset("adult", fetch_adult(), "binary:logistic"),
         TestDataset("covtype", datasets.fetch_covtype(return_X_y=True), "multi:softmax"),
         TestDataset("cal_housing", datasets.fetch_california_housing(return_X_y=True), "reg:squarederror"),
         TestDataset("fashion_mnist", fetch_fashion_mnist(), "multi:softmax"),
-        TestDataset("adult", fetch_adult(), "binary:logistic"),
     ]
 
     models = []
@@ -110,24 +131,37 @@ def get_models(model):
 def print_model_stats(models, args):
     # get model statistics
     models_df = pd.DataFrame(
-        columns=["model", "num_rounds", "num_trees", "num_leaves", "max_depth", "average_depth"])
-    for m in models:
-        models_df = models_df.append(
-            {"model": m.name, "num_rounds": m.num_rounds, "num_trees": m.num_trees,
-             "num_leaves": m.num_leaves, "max_depth": m.max_depth,
-             "average_depth": m.average_depth},
-            ignore_index=True)
+        columns=[
+            "model",
+            "num_rounds",
+            "num_trees",
+            "num_leaves",
+            "max_depth",
+            "average_depth",
+        ]
+    )
+    for i, m in enumerate(models):
+        df = pd.DataFrame.from_dict(
+            {
+                "model": [m.name],
+                "num_rounds": [m.num_rounds],
+                "num_trees": [m.num_trees],
+                "num_leaves": [m.num_leaves],
+                "max_depth": [m.max_depth],
+                "average_depth": [m.average_depth],
+            }
+        )
+        models_df = pd.concat([models_df, df])
     print(models_df)
     print("Writing model statistics to: " + args.out_models)
     models_df.to_csv(args.out_models, index=False)
 
 
-def run_benchmark(args):
-    models = get_models(args)
+def run_benchmark(args: argparse.Namespace) -> None:
+    models = get_models(args.model)
     print_model_stats(models, args)
 
-    predictors = ["cpu_predictor", "gpu_predictor"]
-    # predictors = ["gpu_predictor"]
+    devices = ["cpu", "gpu"]
     test_rows = args.nrows
     df = pd.DataFrame(
         columns=["model", "test_rows", "cpu_time(s)", "cpu_std", "gpu_time(s)", "gpu_std",
@@ -135,8 +169,8 @@ def run_benchmark(args):
     for m in models:
         dtest = m.dataset.get_test_dmat(test_rows)
         result_row = {"model": m.name, "test_rows": test_rows, "cpu_time(s)": 0.0}
-        for p in predictors:
-            m.xgb_model.set_param({"predictor": p})
+        for p in devices:
+            m.xgb_model.set_param({"device": p})
             samples = []
             for i in range(args.niter):
                 start = time.perf_counter()
@@ -145,7 +179,7 @@ def run_benchmark(args):
                 else:
                     xgb_shap = m.xgb_model.predict(dtest, pred_contribs=True)
                 samples.append(time.perf_counter() - start)
-            if p is "gpu_predictor":
+            if p == "gpu":
                 result_row["gpu_time(s)"] = np.mean(samples)
                 result_row["gpu_std"] = np.std(samples)
             else:
@@ -156,8 +190,7 @@ def run_benchmark(args):
                 check_accuracy(xgb_shap, margin)
 
         result_row["speedup"] = result_row["cpu_time(s)"] / result_row["gpu_time(s)"]
-        df = df.append(result_row,
-                       ignore_index=True)
+        df = pd.concat([df, pd.DataFrame.from_records([result_row])])
     print(df)
     print("Writing results to: " + args.out)
     df.to_csv(args.out, index=False)
diff --git a/dependencies.yaml b/dependencies.yaml
index 333b122..360a5f4 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -45,6 +45,10 @@ dependencies:
               cuda: "12.2"
             packages:
               - cuda-version=12.2
+          - matrix:
+              cuda: "12.5"
+            packages:
+              - cuda-version=12.5
   cuda:
     specific:
       - output_types: conda
diff --git a/fetch_rapids.cmake b/fetch_rapids.cmake
index f494812..8e2ef84 100644
--- a/fetch_rapids.cmake
+++ b/fetch_rapids.cmake
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/GPUTREESHAP_RAPIDS.cmake)
-  file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.04/RAPIDS.cmake
+  file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.10/RAPIDS.cmake
        ${CMAKE_CURRENT_BINARY_DIR}/GPUTREESHAP_RAPIDS.cmake
   )
 endif()