Dynamic cuda wrapper #1065

Closed
9 changes: 5 additions & 4 deletions .github/workflows/python-package.yml
@@ -120,15 +120,16 @@ jobs:
[[ "${{ matrix.os }}" = windows-* ]] && python3 -m pip install ninja
for NO_CUBLASLT in ON OFF; do
if [ ${build_os:0:6} == ubuntu ]; then
image=nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu22.04
image=nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu20.04
echo "Using image $image"
docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \
"apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
&& cmake -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} . \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends python3 python3-pip \
&& pip install cmake==3.27.9 \
&& cmake -DCOMPUTE_BACKEND=cuda -DUSE_CUDA_WRAPPER=ON -DNO_CUBLASLT=${NO_CUBLASLT} . \
&& cmake --build ."
else
cmake -G Ninja -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} -DCMAKE_BUILD_TYPE=Release -S .
cmake -G Ninja -DCOMPUTE_BACKEND=cuda -DUSE_CUDA_WRAPPER=ON -DNO_CUBLASLT=${NO_CUBLASLT} -DCMAKE_BUILD_TYPE=Release -S .
cmake --build . --config Release
fi
done
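For reference, the same configuration can be reproduced outside CI with the flags used above (a sketch assuming a local CUDA toolkit and a recent CMake are installed; NO_CUBLASLT is just one value from the loop):

cmake -DCOMPUTE_BACKEND=cuda -DUSE_CUDA_WRAPPER=ON -DNO_CUBLASLT=OFF .
cmake --build . --config Release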
18 changes: 15 additions & 3 deletions CMakeLists.txt
@@ -26,6 +26,7 @@ list(APPEND SRC_FILES ${CPP_FILES})
set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, mps)")
set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda mps)
option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF)
option(USE_CUDA_WRAPPER "Dynamic CUDA Linking" OFF)

if(APPLE)
set(CMAKE_OSX_DEPLOYMENT_TARGET 13.1)
@@ -43,6 +44,7 @@ if(${COMPUTE_BACKEND} STREQUAL "cuda")
set(BUILD_CUDA ON)
set(BUILD_MPS OFF)
message(STATUS "NO_CUBLASLT := ${NO_CUBLASLT}")
message(STATUS "USE_CUDA_WRAPPER := ${USE_CUDA_WRAPPER}")
elseif(${COMPUTE_BACKEND} STREQUAL "mps")
if(NOT APPLE)
message(FATAL_ERROR "MPS is only supported on macOS" )
@@ -111,7 +113,13 @@ if(BUILD_CUDA)

list(APPEND SRC_FILES ${CUDA_FILES})

string(APPEND BNB_OUTPUT_NAME "_cuda${CUDA_VERSION_SHORT}")
if(USE_CUDA_WRAPPER)
string(APPEND BNB_OUTPUT_NAME "_cuda")
else()
string(APPEND BNB_OUTPUT_NAME "_cuda${CUDA_VERSION_SHORT}")
endif()
add_compile_definitions(USE_CUDA_WRAPPER)

if(NO_CUBLASLT)
string(APPEND BNB_OUTPUT_NAME "_nocublaslt")
endif()
@@ -160,11 +168,15 @@ target_include_directories(bitsandbytes PUBLIC csrc include)

if(BUILD_CUDA)
target_include_directories(bitsandbytes PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
target_link_libraries(bitsandbytes PUBLIC CUDA::cudart CUDA::cublas CUDA::cusparse)
if(NOT USE_CUDA_WRAPPER)
target_link_libraries(bitsandbytes PUBLIC CUDA::cublas CUDA::cusparse)
endif()
if(NO_CUBLASLT)
target_compile_definitions(bitsandbytes PUBLIC NO_CUBLASLT)
else()
target_link_libraries(bitsandbytes PUBLIC CUDA::cublasLt)
if(NOT USE_CUDA_WRAPPER)
target_link_libraries(bitsandbytes PUBLIC CUDA::cublasLt)
endif()
Comment on lines +171 to +179

Member:
CMake has CMAKE_CUDA_RUNTIME_LIBRARY to control this; maybe that would be better here.

Contributor (author):
https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_RUNTIME_LIBRARY.html
The documentation says CMAKE_CUDA_RUNTIME_LIBRARY only controls cudart.
(Currently, with or without this PR, cudart_static is added by default.)
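For reference, that variable would be set like this (a sketch; it selects how the CUDA runtime, i.e. cudart, is linked and does not affect cuBLAS, cuSPARSE, or cuBLASLt):

set(CMAKE_CUDA_RUNTIME_LIBRARY Shared)  # accepted values: None, Shared, Static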

endif()

set_target_properties(bitsandbytes
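With USE_CUDA_WRAPPER=ON the library name therefore carries no CUDA version suffix; a sketch of the resulting file names (assuming Linux and, for the non-wrapper case, CUDA 12.1):

libbitsandbytes_cuda.so                  # wrapper build
libbitsandbytes_cuda_nocublaslt.so       # wrapper build, NO_CUBLASLT=ON
libbitsandbytes_cuda121.so               # regular build, version-suffixed
libbitsandbytes_cuda121_nocublaslt.so    # regular build, NO_CUBLASLT=ON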
5 changes: 5 additions & 0 deletions bitsandbytes/cextension.py
@@ -26,6 +26,11 @@
lib.get_context.restype = ct.c_void_p
lib.get_cusparse.restype = ct.c_void_p
lib.cget_managed_ptr.restype = ct.c_void_p
try:
lib.initCudaLibs()
except Exception:
# ignore
pass
COMPILED_WITH_CUDA = True
except AttributeError as ex:
warn("The installed version of bitsandbytes was compiled without GPU support. "
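An equivalent guard (a sketch, not part of this diff) would probe for the export instead of catching the missing-symbol error, since ctypes raises AttributeError for symbols a library does not provide:

if hasattr(lib, "initCudaLibs"):  # older binaries may not export initCudaLibs
    lib.initCudaLibs()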
11 changes: 9 additions & 2 deletions bitsandbytes/cuda_setup/main.py
@@ -30,7 +30,7 @@

DYNAMIC_LIBRARY_SUFFIX = { "Darwin": ".dylib", "Windows": ".dll", "Linux": ".so"}.get(platform.system(), ".so")
if platform.system() == "Windows": # Windows
CUDA_RUNTIME_LIBS = ["nvcuda.dll"]
CUDA_RUNTIME_LIBS = ["cudart64_110.dll", "cudart64_12.dll"]
else: # Linux or other
# these are the most common libs names
# libcudart.so is missing by default for a conda install with PyTorch 2.0 and instead
@@ -383,7 +383,14 @@ def evaluate_cuda_setup():
# we use ls -l instead of nvcc to determine the cuda version
# since most installations will have the libcudart.so installed, but not the compiler

binary_name = f"libbitsandbytes_cuda{cuda_version_string}"
binary = f"libbitsandbytes_cuda{DYNAMIC_LIBRARY_SUFFIX}"
package_dir = Path(__file__).parent.parent
binary_path = package_dir / binary
# check binary_path without cuda_version_string
if binary_path.exists():
binary_name = "libbitsandbytes_cuda"
else:
binary_name = f"libbitsandbytes_cuda{cuda_version_string}"
if not has_cublaslt:
# if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt
binary_name += "_nocublaslt"
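The lookup therefore prefers an unversioned wrapper binary when one is shipped next to the package. For example (a sketch assuming Linux and a GPU with compute capability below 7.5): if libbitsandbytes_cuda.so exists in the package directory, libbitsandbytes_cuda_nocublaslt.so is selected; otherwise the usual version-suffixed libbitsandbytes_cuda{version}_nocublaslt.so is used.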
4 changes: 2 additions & 2 deletions csrc/ops.cu
@@ -255,7 +255,7 @@ void gemmex(Context *context, bool transposeA, bool transposeB, int m, int n, in
m, n, k,
alpha, A, CUDA_R_8I, lda, B, CUDA_R_8I, ldb, beta,
C, CUDA_R_32I, ldc,
CUDA_R_32I, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
CUBLAS_COMPUTE_32I, CUBLAS_GEMM_DEFAULT_TENSOR_OP);

if (status != CUBLAS_STATUS_SUCCESS)
{
@@ -285,7 +285,7 @@ void strided_gemmex(Context *context, bool transposeA, bool transposeB, int m, i
m, n, k,
alpha, A, CUDA_R_8I, lda, (long long int)strideA, B, CUDA_R_8I, ldb, (long long int)strideB, beta,
C, CUDA_R_32I, ldc, (long long int)strideC, batchCount,
CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
CUBLAS_COMPUTE_32I, CUBLAS_GEMM_DEFAULT);

if (status != CUBLAS_STATUS_SUCCESS)
{
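Background for these two replacements: CUDA_R_32I is a cudaDataType_t, while the computeType parameter of cublasGemmEx and cublasGemmStridedBatchedEx (and of the cublasGemmEx_t / cublasGemmStridedBatchedEx_t function-pointer types introduced in csrc/ops.cuh below) is a cublasComputeType_t, so CUBLAS_COMPUTE_32I is the matching enumerator when the calls go through the wrapper.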
253 changes: 253 additions & 0 deletions csrc/ops.cuh
@@ -22,7 +22,260 @@
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

#if USE_CUDA_WRAPPER

#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#else
#include <dlfcn.h>
#endif

typedef const char* (*cudaGetErrorString_t)(cudaError_t err);
typedef const char* (*cusparseGetErrorString_t)(cusparseStatus_t status);
typedef cusparseStatus_t (*cusparseCreate_t)(cusparseHandle_t* handle);
typedef cublasStatus_t (*cublasCreate_v2_t)(cublasHandle_t* handle);
typedef cublasStatus_t (*cublasLtCreate_t)(cublasLtHandle_t* lightHandle);
//typedef cudaError_t (*cudaMallocManaged_t)(void **devPtr, size_t size, unsigned int flags);
//typedef cudaError_t (*cudaMemPrefetchAsync_t)(const void *devPtr, size_t count, int dstDevice, cudaStream_t stream);
//typedef cudaError_t (*cudaDeviceGetAttribute_t)(int *value, enum cudaDeviceAttr attr, int device);

typedef cusparseStatus_t (*cusparseCreateCoo_t)(cusparseSpMatDescr_t* spMatDescr,
int64_t rows,
int64_t cols,
int64_t nnz,
void* cooRowInd,
void* cooColInd,
void* cooValues,
cusparseIndexType_t cooIdxType,
cusparseIndexBase_t idxBase,
cudaDataType valueType);

typedef cusparseStatus_t (*cusparseCreateDnMat_t)(cusparseDnMatDescr_t* dnMatDescr,
int64_t rows,
int64_t cols,
int64_t ld,
void* values,
cudaDataType valueType,
cusparseOrder_t order);

typedef cusparseStatus_t (*cusparseSpMM_bufferSize_t)(cusparseHandle_t handle,
cusparseOperation_t opA,
cusparseOperation_t opB,
const void* alpha,
cusparseSpMatDescr_t matA,
cusparseDnMatDescr_t matB,
const void* beta,
cusparseDnMatDescr_t matC,
cudaDataType computeType,
cusparseSpMMAlg_t alg,
size_t* bufferSize);

typedef cusparseStatus_t (*cusparseSpMM_t)(cusparseHandle_t handle,
cusparseOperation_t opA,
cusparseOperation_t opB,
const void* alpha,
cusparseSpMatDescr_t matA,
cusparseDnMatDescr_t matB,
const void* beta,
cusparseDnMatDescr_t matC,
cudaDataType computeType,
cusparseSpMMAlg_t alg,
void* externalBuffer);

typedef cusparseStatus_t (*cusparseDestroySpMat_t)(cusparseSpMatDescr_t spMatDescr);
typedef cusparseStatus_t (*cusparseDestroyDnMat_t)(cusparseDnMatDescr_t dnMatDescr);

typedef cudaError_t (*cudaMemset_t)(void *devPtr, int value, size_t count);
typedef cudaError_t (*cudaMalloc_t)(void **devPtr, size_t size);
typedef cudaError_t (*cudaFree_t)(void *devPtr);
typedef cudaError_t (*cudaPeekAtLastError_t)(void);

typedef cublasStatus_t (*cublasGemmEx_t)(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const void* alpha, /* host or device pointer */
const void* A,
cudaDataType Atype,
int lda,
const void* B,
cudaDataType Btype,
int ldb,
const void* beta, /* host or device pointer */
void* C,
cudaDataType Ctype,
int ldc,
cublasComputeType_t computeType,
cublasGemmAlgo_t algo);

typedef cublasStatus_t (*cublasGemmStridedBatchedEx_t)(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const void* alpha, /* host or device pointer */
const void* A,
cudaDataType Atype,
int lda,
long long int strideA, /* purposely signed */
const void* B,
cudaDataType Btype,
int ldb,
long long int strideB,
const void* beta, /* host or device pointer */
void* C,
cudaDataType Ctype,
int ldc,
long long int strideC,
int batchCount,
cublasComputeType_t computeType,
cublasGemmAlgo_t algo);

typedef cublasStatus_t (*cublasLtMatrixLayoutCreate_t)( //
cublasLtMatrixLayout_t* matLayout,
cudaDataType type,
uint64_t rows,
uint64_t cols,
int64_t ld);

typedef cublasStatus_t (*cublasLtMatrixLayoutSetAttribute_t)( //
cublasLtMatrixLayout_t matLayout,
cublasLtMatrixLayoutAttribute_t attr,
const void* buf,
size_t sizeInBytes);

typedef cublasStatus_t (*cublasLtMatrixTransform_t)(cublasLtHandle_t lightHandle,
cublasLtMatrixTransformDesc_t transformDesc,
const void* alpha, /* host or device pointer */
const void* A,
cublasLtMatrixLayout_t Adesc,
const void* beta, /* host or device pointer */
const void* B,
cublasLtMatrixLayout_t Bdesc,
void* C,
cublasLtMatrixLayout_t Cdesc,
cudaStream_t stream);

typedef cublasStatus_t (*cublasLtMatrixTransformDescCreate_t)(cublasLtMatrixTransformDesc_t* transformDesc,
cudaDataType scaleType);

typedef cublasStatus_t (*cublasLtMatrixTransformDescSetAttribute_t)( //
cublasLtMatrixTransformDesc_t transformDesc,
cublasLtMatrixTransformDescAttributes_t attr,
const void* buf,
size_t sizeInBytes);

typedef cublasStatus_t (*cublasLtMatrixLayoutDestroy_t)(cublasLtMatrixLayout_t matLayout);

typedef cublasStatus_t (*cublasLtMatrixTransformDescDestroy_t)(cublasLtMatrixTransformDesc_t transformDesc);

typedef cublasStatus_t (*cublasLtMatmul_t)(cublasLtHandle_t lightHandle,
cublasLtMatmulDesc_t computeDesc,
const void* alpha, /* host or device pointer */
const void* A,
cublasLtMatrixLayout_t Adesc,
const void* B,
cublasLtMatrixLayout_t Bdesc,
const void* beta, /* host or device pointer */
const void* C,
cublasLtMatrixLayout_t Cdesc,
void* D,
cublasLtMatrixLayout_t Ddesc,
const cublasLtMatmulAlgo_t* algo,
void* workspace,
size_t workspaceSizeInBytes,
cudaStream_t stream);

typedef cublasStatus_t (*cublasLtMatmulDescCreate_t)(cublasLtMatmulDesc_t* matmulDesc,
cublasComputeType_t computeType,
cudaDataType_t scaleType);

typedef cublasStatus_t (*cublasLtMatmulDescDestroy_t)(cublasLtMatmulDesc_t matmulDesc);

typedef cublasStatus_t (*cublasLtMatmulDescSetAttribute_t)( //
cublasLtMatmulDesc_t matmulDesc,
cublasLtMatmulDescAttributes_t attr,
const void* buf,
size_t sizeInBytes);


/* externs */
extern cudaGetErrorString_t _cudaGetErrorString;
extern cusparseGetErrorString_t _cusparseGetErrorString;
//extern cudaMallocManaged_t _cudaMallocManaged;
//extern cudaMemPrefetchAsync_t _cudaMemPrefetchAsync;
//extern cudaDeviceGetAttribute_t _cudaDeviceGetAttribute;

extern cusparseCreate_t _cusparseCreate;
extern cublasCreate_v2_t _cublasCreate_v2;
extern cublasLtCreate_t _cublasLtCreate;

extern cusparseDestroySpMat_t _cusparseDestroySpMat;
extern cusparseDestroyDnMat_t _cusparseDestroyDnMat;
extern cusparseCreateCoo_t _cusparseCreateCoo;
extern cusparseSpMM_t _cusparseSpMM;
extern cusparseSpMM_bufferSize_t _cusparseSpMM_bufferSize;
extern cusparseCreateDnMat_t _cusparseCreateDnMat;

extern cudaMemset_t _cudaMemset;
extern cudaMalloc_t _cudaMalloc;
extern cudaFree_t _cudaFree;
extern cudaPeekAtLastError_t _cudaPeekAtLastError;

extern cublasGemmEx_t _cublasGemmEx;
extern cublasGemmStridedBatchedEx_t _cublasGemmStridedBatchedEx;

extern cublasLtMatrixLayoutCreate_t _cublasLtMatrixLayoutCreate;
extern cublasLtMatrixLayoutSetAttribute_t _cublasLtMatrixLayoutSetAttribute;
extern cublasLtMatrixTransform_t _cublasLtMatrixTransform;
extern cublasLtMatrixTransformDescCreate_t _cublasLtMatrixTransformDescCreate;
extern cublasLtMatrixTransformDescSetAttribute_t _cublasLtMatrixTransformDescSetAttribute;
extern cublasLtMatrixLayoutDestroy_t _cublasLtMatrixLayoutDestroy;
extern cublasLtMatrixTransformDescDestroy_t _cublasLtMatrixTransformDescDestroy;
extern cublasLtMatmul_t _cublasLtMatmul;
extern cublasLtMatmulDescCreate_t _cublasLtMatmulDescCreate;
extern cublasLtMatmulDescDestroy_t _cublasLtMatmulDescDestroy;
extern cublasLtMatmulDescSetAttribute_t _cublasLtMatmulDescSetAttribute;


#define cudaGetErrorString _cudaGetErrorString
#define cusparseGetErrorString _cusparseGetErrorString
#define cusparseCreate _cusparseCreate
#define cublasCreate_v2 _cublasCreate_v2
#define cublasLtCreate _cublasLtCreate

#define cudaMemset _cudaMemset
#define cudaMalloc _cudaMalloc
#define cudaFree _cudaFree
#define cudaPeekAtLastError _cudaPeekAtLastError

#define cusparseCreateCoo _cusparseCreateCoo
#define cusparseDestroySpMat _cusparseDestroySpMat
#define cusparseDestroyDnMat _cusparseDestroyDnMat
#define cusparseSpMM _cusparseSpMM
#define cusparseSpMM_bufferSize _cusparseSpMM_bufferSize
#define cusparseCreateDnMat _cusparseCreateDnMat

#define cublasGemmEx _cublasGemmEx
#define cublasGemmStridedBatchedEx _cublasGemmStridedBatchedEx
#define cublasLtMatrixLayoutCreate _cublasLtMatrixLayoutCreate
#define cublasLtMatrixLayoutSetAttribute _cublasLtMatrixLayoutSetAttribute
#define cublasLtMatrixTransform _cublasLtMatrixTransform
#define cublasLtMatrixTransformDescCreate _cublasLtMatrixTransformDescCreate

#define cublasLtMatrixTransformDescSetAttribute _cublasLtMatrixTransformDescSetAttribute
#define cublasLtMatrixLayoutDestroy _cublasLtMatrixLayoutDestroy
#define cublasLtMatrixTransformDescDestroy _cublasLtMatrixTransformDescDestroy
#define cublasLtMatmul _cublasLtMatmul
#define cublasLtMatmulDescCreate _cublasLtMatmulDescCreate
#define cublasLtMatmulDescDestroy _cublasLtMatmulDescDestroy
#define cublasLtMatmulDescSetAttribute _cublasLtMatmulDescSetAttribute

#endif /* USE_CUDA_WRAPPER */

#define CUDA_CHECK_RETURN(value) { \
cudaError_t _m_cudaStat = value; \
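The header only declares the function-pointer types, extern pointers, and redirection macros; the code that actually fills the pointers (presumably the initCudaLibs() invoked from bitsandbytes/cextension.py) is not part of this excerpt. A minimal sketch of how such a pointer is typically resolved on Linux, using _cudaMalloc as an example and assuming libcudart.so can be found by the dynamic loader:

#include <cuda_runtime_api.h>  // cudaError_t
#include <dlfcn.h>             // dlopen, dlsym, dlerror
#include <cstdio>

typedef cudaError_t (*cudaMalloc_t)(void **devPtr, size_t size);
static cudaMalloc_t _cudaMalloc = nullptr;

// Resolve cudaMalloc at run time instead of linking against libcudart at build time.
static bool load_cudart_symbols() {
    void *handle = dlopen("libcudart.so", RTLD_NOW | RTLD_GLOBAL);
    if (!handle) { std::fprintf(stderr, "dlopen failed: %s\n", dlerror()); return false; }
    _cudaMalloc = reinterpret_cast<cudaMalloc_t>(dlsym(handle, "cudaMalloc"));
    return _cudaMalloc != nullptr;
}

On Windows the same role is played by LoadLibrary and GetProcAddress, which is why the header pulls in <windows.h> behind WIN32_LEAN_AND_MEAN above.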