Dynamic cuda wrapper #1065

Closed
9 changes: 5 additions & 4 deletions .github/workflows/python-package.yml
@@ -120,15 +120,16 @@ jobs:
[[ "${{ matrix.os }}" = windows-* ]] && python3 -m pip install ninja
for NO_CUBLASLT in ON OFF; do
if [ ${build_os:0:6} == ubuntu ]; then
image=nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu22.04
image=nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu20.04
echo "Using image $image"
docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \
"apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
&& cmake -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} . \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends python3 python3-pip \
&& pip install cmake==3.27.9 \
&& cmake -DCOMPUTE_BACKEND=cuda -DUSE_CUDA_WRAPPER=ON -DNO_CUBLASLT=${NO_CUBLASLT} . \
&& cmake --build ."
else
cmake -G Ninja -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} -DCMAKE_BUILD_TYPE=Release -S .
cmake -G Ninja -DCOMPUTE_BACKEND=cuda -DUSE_CUDA_WRAPPER=ON -DNO_CUBLASLT=${NO_CUBLASLT} -DCMAKE_BUILD_TYPE=Release -S .
cmake --build . --config Release
fi
done
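For reference, the same configuration can be reproduced outside CI with the flags used above (a sketch assuming a local CUDA toolkit and a recent CMake are installed; NO_CUBLASLT is just one value from the loop):

cmake -DCOMPUTE_BACKEND=cuda -DUSE_CUDA_WRAPPER=ON -DNO_CUBLASLT=OFF .
cmake --build . --config Release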
18 changes: 15 additions & 3 deletions CMakeLists.txt
@@ -26,6 +26,7 @@ list(APPEND SRC_FILES ${CPP_FILES})
set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, mps)")
set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda mps)
option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF)
option(USE_CUDA_WRAPPER "Dynamic CUDA Linking" OFF)

if(APPLE)
set(CMAKE_OSX_DEPLOYMENT_TARGET 13.1)
@@ -43,6 +44,7 @@ if(${COMPUTE_BACKEND} STREQUAL "cuda")
set(BUILD_CUDA ON)
set(BUILD_MPS OFF)
message(STATUS "NO_CUBLASLT := ${NO_CUBLASLT}")
message(STATUS "USE_CUDA_WRAPPER := ${USE_CUDA_WRAPPER}")
elseif(${COMPUTE_BACKEND} STREQUAL "mps")
if(NOT APPLE)
message(FATAL_ERROR "MPS is only supported on macOS" )
@@ -111,7 +113,13 @@ if(BUILD_CUDA)

list(APPEND SRC_FILES ${CUDA_FILES})

string(APPEND BNB_OUTPUT_NAME "_cuda${CUDA_VERSION_SHORT}")
if(USE_CUDA_WRAPPER)
string(APPEND BNB_OUTPUT_NAME "_cuda")
else()
string(APPEND BNB_OUTPUT_NAME "_cuda${CUDA_VERSION_SHORT}")
endif()
add_compile_definitions(USE_CUDA_WRAPPER)

if(NO_CUBLASLT)
string(APPEND BNB_OUTPUT_NAME "_nocublaslt")
endif()
@@ -160,11 +168,15 @@ target_include_directories(bitsandbytes PUBLIC csrc include)

if(BUILD_CUDA)
target_include_directories(bitsandbytes PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
target_link_libraries(bitsandbytes PUBLIC CUDA::cudart CUDA::cublas CUDA::cusparse)
if(NOT USE_CUDA_WRAPPER)
target_link_libraries(bitsandbytes PUBLIC CUDA::cublas CUDA::cusparse)
endif()
if(NO_CUBLASLT)
target_compile_definitions(bitsandbytes PUBLIC NO_CUBLASLT)
else()
target_link_libraries(bitsandbytes PUBLIC CUDA::cublasLt)
if(NOT USE_CUDA_WRAPPER)
target_link_libraries(bitsandbytes PUBLIC CUDA::cublasLt)
endif()
Comment on lines +171 to +179

Member:
CMake has CMAKE_CUDA_RUNTIME_LIBRARY to control this; maybe that would be better here.

Contributor (author):
https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_RUNTIME_LIBRARY.html
The documentation says CMAKE_CUDA_RUNTIME_LIBRARY only controls cudart.
(Currently, with or without this PR, cudart_static is added by default.)
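For reference, that variable would be set like this (a sketch; it selects how the CUDA runtime, i.e. cudart, is linked and does not affect cuBLAS, cuSPARSE, or cuBLASLt):

set(CMAKE_CUDA_RUNTIME_LIBRARY Shared)  # accepted values: None, Shared, Static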

endif()

set_target_properties(bitsandbytes
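With USE_CUDA_WRAPPER=ON the library name therefore carries no CUDA version suffix; a sketch of the resulting file names (assuming Linux and, for the non-wrapper case, CUDA 12.1):

libbitsandbytes_cuda.so                  # wrapper build
libbitsandbytes_cuda_nocublaslt.so       # wrapper build, NO_CUBLASLT=ON
libbitsandbytes_cuda121.so               # regular build, version-suffixed
libbitsandbytes_cuda121_nocublaslt.so    # regular build, NO_CUBLASLT=ON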
5 changes: 5 additions & 0 deletions bitsandbytes/cextension.py
@@ -26,6 +26,11 @@
lib.get_context.restype = ct.c_void_p
lib.get_cusparse.restype = ct.c_void_p
lib.cget_managed_ptr.restype = ct.c_void_p
try:
lib.initCudaLibs()
except Exception:
# ignore
pass
COMPILED_WITH_CUDA = True
except AttributeError as ex:
warn("The installed version of bitsandbytes was compiled without GPU support. "
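An equivalent guard (a sketch, not part of this diff) would probe for the export instead of catching the missing-symbol error, since ctypes raises AttributeError for symbols a library does not provide:

if hasattr(lib, "initCudaLibs"):  # older binaries may not export initCudaLibs
    lib.initCudaLibs()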
11 changes: 9 additions & 2 deletions bitsandbytes/cuda_setup/main.py
@@ -30,7 +30,7 @@

DYNAMIC_LIBRARY_SUFFIX = { "Darwin": ".dylib", "Windows": ".dll", "Linux": ".so"}.get(platform.system(), ".so")
if platform.system() == "Windows": # Windows
CUDA_RUNTIME_LIBS = ["nvcuda.dll"]
CUDA_RUNTIME_LIBS = ["cudart64_110.dll", "cudart64_12.dll"]
else: # Linux or other
# these are the most common libs names
# libcudart.so is missing by default for a conda install with PyTorch 2.0 and instead
@@ -383,7 +383,14 @@ def evaluate_cuda_setup():
# we use ls -l instead of nvcc to determine the cuda version
# since most installations will have the libcudart.so installed, but not the compiler

binary_name = f"libbitsandbytes_cuda{cuda_version_string}"
binary = f"libbitsandbytes_cuda{DYNAMIC_LIBRARY_SUFFIX}"
package_dir = Path(__file__).parent.parent
binary_path = package_dir / binary
# check binary_path without cuda_version_string
if binary_path.exists():
binary_name = "libbitsandbytes_cuda"
else:
binary_name = f"libbitsandbytes_cuda{cuda_version_string}"
if not has_cublaslt:
# if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt
binary_name += "_nocublaslt"
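The lookup therefore prefers an unversioned wrapper binary when one is shipped next to the package. For example (a sketch assuming Linux and a GPU with compute capability below 7.5): if libbitsandbytes_cuda.so exists in the package directory, libbitsandbytes_cuda_nocublaslt.so is selected; otherwise the usual version-suffixed libbitsandbytes_cuda{version}_nocublaslt.so is used.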
4 changes: 2 additions & 2 deletions csrc/ops.cu
@@ -255,7 +255,7 @@ void gemmex(Context *context, bool transposeA, bool transposeB, int m, int n, in
m, n, k,
alpha, A, CUDA_R_8I, lda, B, CUDA_R_8I, ldb, beta,
C, CUDA_R_32I, ldc,
CUDA_R_32I, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
CUBLAS_COMPUTE_32I, CUBLAS_GEMM_DEFAULT_TENSOR_OP);

if (status != CUBLAS_STATUS_SUCCESS)
{
@@ -285,7 +285,7 @@ void strided_gemmex(Context *context, bool transposeA, bool transposeB, int m, i
m, n, k,
alpha, A, CUDA_R_8I, lda, (long long int)strideA, B, CUDA_R_8I, ldb, (long long int)strideB, beta,
C, CUDA_R_32I, ldc, (long long int)strideC, batchCount,
CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
CUBLAS_COMPUTE_32I, CUBLAS_GEMM_DEFAULT);

if (status != CUBLAS_STATUS_SUCCESS)
{
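Background for these two replacements: CUDA_R_32I is a cudaDataType_t, while the computeType parameter of cublasGemmEx and cublasGemmStridedBatchedEx (and of the cublasGemmEx_t / cublasGemmStridedBatchedEx_t function-pointer types introduced in csrc/ops.cuh below) is a cublasComputeType_t, so CUBLAS_COMPUTE_32I is the matching enumerator when the calls go through the wrapper.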
253 changes: 253 additions & 0 deletions csrc/ops.cuh
@@ -22,7 +22,260 @@
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

#if USE_CUDA_WRAPPER

#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#else
#include <dlfcn.h>
#endif

typedef const char* (*cudaGetErrorString_t)(cudaError_t err);
typedef const char* (*cusparseGetErrorString_t)(cusparseStatus_t status);
typedef cusparseStatus_t (*cusparseCreate_t)(cusparseHandle_t* handle);
typedef cublasStatus_t (*cublasCreate_v2_t)(cublasHandle_t* handle);
typedef cublasStatus_t (*cublasLtCreate_t)(cublasLtHandle_t* lightHandle);
//typedef cudaError_t (*cudaMallocManaged_t)(void **devPtr, size_t size, unsigned int flags);
//typedef cudaError_t (*cudaMemPrefetchAsync_t)(const void *devPtr, size_t count, int dstDevice, cudaStream_t stream);
//typedef cudaError_t (*cudaDeviceGetAttribute_t)(int *value, enum cudaDeviceAttr attr, int device);

typedef cusparseStatus_t (*cusparseCreateCoo_t)(cusparseSpMatDescr_t* spMatDescr,
int64_t rows,
int64_t cols,
int64_t nnz,
void* cooRowInd,
void* cooColInd,
void* cooValues,
cusparseIndexType_t cooIdxType,
cusparseIndexBase_t idxBase,
cudaDataType valueType);

typedef cusparseStatus_t (*cusparseCreateDnMat_t)(cusparseDnMatDescr_t* dnMatDescr,
int64_t rows,
int64_t cols,
int64_t ld,
void* values,
cudaDataType valueType,
cusparseOrder_t order);

typedef cusparseStatus_t (*cusparseSpMM_bufferSize_t)(cusparseHandle_t handle,
cusparseOperation_t opA,
cusparseOperation_t opB,
const void* alpha,
cusparseSpMatDescr_t matA,
cusparseDnMatDescr_t matB,
const void* beta,
cusparseDnMatDescr_t matC,
cudaDataType computeType,
cusparseSpMMAlg_t alg,
size_t* bufferSize);

typedef cusparseStatus_t (*cusparseSpMM_t)(cusparseHandle_t handle,
cusparseOperation_t opA,
cusparseOperation_t opB,
const void* alpha,
cusparseSpMatDescr_t matA,
cusparseDnMatDescr_t matB,
const void* beta,
cusparseDnMatDescr_t matC,
cudaDataType computeType,
cusparseSpMMAlg_t alg,
void* externalBuffer);

typedef cusparseStatus_t (*cusparseDestroySpMat_t)(cusparseSpMatDescr_t spMatDescr);
typedef cusparseStatus_t (*cusparseDestroyDnMat_t)(cusparseDnMatDescr_t dnMatDescr);

typedef cudaError_t (*cudaMemset_t)(void *devPtr, int value, size_t count);
typedef cudaError_t (*cudaMalloc_t)(void **devPtr, size_t size);
typedef cudaError_t (*cudaFree_t)(void *devPtr);
typedef cudaError_t (*cudaPeekAtLastError_t)(void);

typedef cublasStatus_t (*cublasGemmEx_t)(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const void* alpha, /* host or device pointer */
const void* A,
cudaDataType Atype,
int lda,
const void* B,
cudaDataType Btype,
int ldb,
const void* beta, /* host or device pointer */
void* C,
cudaDataType Ctype,
int ldc,
cublasComputeType_t computeType,
cublasGemmAlgo_t algo);

typedef cublasStatus_t (*cublasGemmStridedBatchedEx_t)(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const void* alpha, /* host or device pointer */
const void* A,
cudaDataType Atype,
int lda,
long long int strideA, /* purposely signed */
const void* B,
cudaDataType Btype,
int ldb,
long long int strideB,
const void* beta, /* host or device pointer */
void* C,
cudaDataType Ctype,
int ldc,
long long int strideC,
int batchCount,
cublasComputeType_t computeType,
cublasGemmAlgo_t algo);

typedef cublasStatus_t (*cublasLtMatrixLayoutCreate_t)( //
cublasLtMatrixLayout_t* matLayout,
cudaDataType type,
uint64_t rows,
uint64_t cols,
int64_t ld);

typedef cublasStatus_t (*cublasLtMatrixLayoutSetAttribute_t)( //
cublasLtMatrixLayout_t matLayout,
cublasLtMatrixLayoutAttribute_t attr,
const void* buf,
size_t sizeInBytes);

typedef cublasStatus_t (*cublasLtMatrixTransform_t)(cublasLtHandle_t lightHandle,
cublasLtMatrixTransformDesc_t transformDesc,
const void* alpha, /* host or device pointer */
const void* A,
cublasLtMatrixLayout_t Adesc,
const void* beta, /* host or device pointer */
const void* B,
cublasLtMatrixLayout_t Bdesc,
void* C,
cublasLtMatrixLayout_t Cdesc,
cudaStream_t stream);

typedef cublasStatus_t (*cublasLtMatrixTransformDescCreate_t)(cublasLtMatrixTransformDesc_t* transformDesc,
cudaDataType scaleType);

typedef cublasStatus_t (*cublasLtMatrixTransformDescSetAttribute_t)( //
cublasLtMatrixTransformDesc_t transformDesc,
cublasLtMatrixTransformDescAttributes_t attr,
const void* buf,
size_t sizeInBytes);

typedef cublasStatus_t (*cublasLtMatrixLayoutDestroy_t)(cublasLtMatrixLayout_t matLayout);

typedef cublasStatus_t (*cublasLtMatrixTransformDescDestroy_t)(cublasLtMatrixTransformDesc_t transformDesc);

typedef cublasStatus_t (*cublasLtMatmul_t)(cublasLtHandle_t lightHandle,
cublasLtMatmulDesc_t computeDesc,
const void* alpha, /* host or device pointer */
const void* A,
cublasLtMatrixLayout_t Adesc,
const void* B,
cublasLtMatrixLayout_t Bdesc,
const void* beta, /* host or device pointer */
const void* C,
cublasLtMatrixLayout_t Cdesc,
void* D,
cublasLtMatrixLayout_t Ddesc,
const cublasLtMatmulAlgo_t* algo,
void* workspace,
size_t workspaceSizeInBytes,
cudaStream_t stream);

typedef cublasStatus_t (*cublasLtMatmulDescCreate_t)(cublasLtMatmulDesc_t* matmulDesc,
cublasComputeType_t computeType,
cudaDataType_t scaleType);

typedef cublasStatus_t (*cublasLtMatmulDescDestroy_t)(cublasLtMatmulDesc_t matmulDesc);

typedef cublasStatus_t (*cublasLtMatmulDescSetAttribute_t)( //
cublasLtMatmulDesc_t matmulDesc,
cublasLtMatmulDescAttributes_t attr,
const void* buf,
size_t sizeInBytes);


/* externs */
extern cudaGetErrorString_t _cudaGetErrorString;
extern cusparseGetErrorString_t _cusparseGetErrorString;
//extern cudaMallocManaged_t _cudaMallocManaged;
//extern cudaMemPrefetchAsync_t _cudaMemPrefetchAsync;
//extern cudaDeviceGetAttribute_t _cudaDeviceGetAttribute;

extern cusparseCreate_t _cusparseCreate;
extern cublasCreate_v2_t _cublasCreate_v2;
extern cublasLtCreate_t _cublasLtCreate;

extern cusparseDestroySpMat_t _cusparseDestroySpMat;
extern cusparseDestroyDnMat_t _cusparseDestroyDnMat;
extern cusparseCreateCoo_t _cusparseCreateCoo;
extern cusparseSpMM_t _cusparseSpMM;
extern cusparseSpMM_bufferSize_t _cusparseSpMM_bufferSize;
extern cusparseCreateDnMat_t _cusparseCreateDnMat;

extern cudaMemset_t _cudaMemset;
extern cudaMalloc_t _cudaMalloc;
extern cudaFree_t _cudaFree;
extern cudaPeekAtLastError_t _cudaPeekAtLastError;

extern cublasGemmEx_t _cublasGemmEx;
extern cublasGemmStridedBatchedEx_t _cublasGemmStridedBatchedEx;

extern cublasLtMatrixLayoutCreate_t _cublasLtMatrixLayoutCreate;
extern cublasLtMatrixLayoutSetAttribute_t _cublasLtMatrixLayoutSetAttribute;
extern cublasLtMatrixTransform_t _cublasLtMatrixTransform;
extern cublasLtMatrixTransformDescCreate_t _cublasLtMatrixTransformDescCreate;
extern cublasLtMatrixTransformDescSetAttribute_t _cublasLtMatrixTransformDescSetAttribute;
extern cublasLtMatrixLayoutDestroy_t _cublasLtMatrixLayoutDestroy;
extern cublasLtMatrixTransformDescDestroy_t _cublasLtMatrixTransformDescDestroy;
extern cublasLtMatmul_t _cublasLtMatmul;
extern cublasLtMatmulDescCreate_t _cublasLtMatmulDescCreate;
extern cublasLtMatmulDescDestroy_t _cublasLtMatmulDescDestroy;
extern cublasLtMatmulDescSetAttribute_t _cublasLtMatmulDescSetAttribute;


#define cudaGetErrorString _cudaGetErrorString
#define cusparseGetErrorString _cusparseGetErrorString
#define cusparseCreate _cusparseCreate
#define cublasCreate_v2 _cublasCreate_v2
#define cublasLtCreate _cublasLtCreate

#define cudaMemset _cudaMemset
#define cudaMalloc _cudaMalloc
#define cudaFree _cudaFree
#define cudaPeekAtLastError _cudaPeekAtLastError

#define cusparseCreateCoo _cusparseCreateCoo
#define cusparseDestroySpMat _cusparseDestroySpMat
#define cusparseDestroyDnMat _cusparseDestroyDnMat
#define cusparseSpMM _cusparseSpMM
#define cusparseSpMM_bufferSize _cusparseSpMM_bufferSize
#define cusparseCreateDnMat _cusparseCreateDnMat

#define cublasGemmEx _cublasGemmEx
#define cublasGemmStridedBatchedEx _cublasGemmStridedBatchedEx
#define cublasLtMatrixLayoutCreate _cublasLtMatrixLayoutCreate
#define cublasLtMatrixLayoutSetAttribute _cublasLtMatrixLayoutSetAttribute
#define cublasLtMatrixTransform _cublasLtMatrixTransform
#define cublasLtMatrixTransformDescCreate _cublasLtMatrixTransformDescCreate

#define cublasLtMatrixTransformDescSetAttribute _cublasLtMatrixTransformDescSetAttribute
#define cublasLtMatrixLayoutDestroy _cublasLtMatrixLayoutDestroy
#define cublasLtMatrixTransformDescDestroy _cublasLtMatrixTransformDescDestroy
#define cublasLtMatmul _cublasLtMatmul
#define cublasLtMatmulDescCreate _cublasLtMatmulDescCreate
#define cublasLtMatmulDescDestroy _cublasLtMatmulDescDestroy
#define cublasLtMatmulDescSetAttribute _cublasLtMatmulDescSetAttribute

#endif /* USE_CUDA_WRAPPER */

#define CUDA_CHECK_RETURN(value) { \
cudaError_t _m_cudaStat = value; \
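The header only declares the function-pointer types, extern pointers, and redirection macros; the code that actually fills the pointers (presumably the initCudaLibs() invoked from bitsandbytes/cextension.py) is not part of this excerpt. A minimal sketch of how such a pointer is typically resolved on Linux, using _cudaMalloc as an example and assuming libcudart.so can be found by the dynamic loader:

#include <cuda_runtime_api.h>  // cudaError_t
#include <dlfcn.h>             // dlopen, dlsym, dlerror
#include <cstdio>

typedef cudaError_t (*cudaMalloc_t)(void **devPtr, size_t size);
static cudaMalloc_t _cudaMalloc = nullptr;

// Resolve cudaMalloc at run time instead of linking against libcudart at build time.
static bool load_cudart_symbols() {
    void *handle = dlopen("libcudart.so", RTLD_NOW | RTLD_GLOBAL);
    if (!handle) { std::fprintf(stderr, "dlopen failed: %s\n", dlerror()); return false; }
    _cudaMalloc = reinterpret_cast<cudaMalloc_t>(dlsym(handle, "cudaMalloc"));
    return _cudaMalloc != nullptr;
}

On Windows the same role is played by LoadLibrary and GetProcAddress, which is why the header pulls in <windows.h> behind WIN32_LEAN_AND_MEAN above.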