rocblas -> hipblas changes for ROCm (#5401)

Fixes #4989 In addition to this PR, below changes are required to build below extensions successfully. Please note that not all unit tests for these extensions will pass with this PR. More details on the unit test results are below. These unit tests are skipped in CI anyway, so they will not break the CI. - transformer_inference - quantizer - random_ltd - pytorch/pytorch#121030 - #5402 Unit test results (rocm/pytorch:rocm6.1_ubuntu20.04_py3.9_pytorch_2.1.2) on MI200: **transformer_inference:** pytest --color=yes --durations=0 --verbose -s -m "inference_ops" -rF -n 4 unit/ops/transformer/inference Before this PR: ==== 674 failed, 622 skipped, 8 warnings, 1728 errors in 123.66s (0:02:03) ===== After this PR: ========== 555 failed, 983 passed, 1486 skipped, 8 warnings in 14.35s ========== **quantizer:** pytest --color=yes --durations=0 --verbose -s -m "inference_ops" -rF -n 4 unit/ops/quantizer Before this PR: ==== 244 failed, 8 warnings in 48.02s ==== After this PR: ===== 187 failed, 57 passed, 8 warnings in 14.74s ==== I could not find random_ltd related unit tests to run. --------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Logan Adams <loadams@microsoft.com>
microsoft · May 17, 2024 · d3dd8e7 · d3dd8e7
1 parent 8e4f6e4
commit d3dd8e7
Show file tree

Hide file tree

Showing 9 changed files with 293 additions and 66 deletions.
diff --git a/csrc/includes/cublas_wrappers.h b/csrc/includes/cublas_wrappers.h
@@ -17,6 +17,7 @@
 #include <rocblas/rocblas.h>
 #endif
 #include <stdio.h>
+#include <torch/version.h>
 
 int cublas_gemm_ex(cublasHandle_t handle,
                    cublasOperation_t transa,
@@ -29,7 +30,9 @@ int cublas_gemm_ex(cublasHandle_t handle,
                    const float* A,
                    const float* B,
                    float* C,
-#ifdef __HIP_PLATFORM_AMD__
+// TODO HIP: Remove backward compatibility for torch<=2.0 in future
+#if defined(__HIP_PLATFORM_AMD__) && \
+    ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0))
                    rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
 #else
                    cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
@@ -46,7 +49,8 @@ int cublas_gemm_ex(cublasHandle_t handle,
                    const __half* A,
                    const __half* B,
                    __half* C,
-#ifdef __HIP_PLATFORM_AMD__
+#if defined(__HIP_PLATFORM_AMD__) && \
+    ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0))
                    rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
 #else
                    cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
@@ -67,7 +71,8 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
                                 int stride_B,
                                 int stride_C,
                                 int batch,
-#ifdef __HIP_PLATFORM_AMD__
+#if defined(__HIP_PLATFORM_AMD__) && \
+    ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0))
                                 rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
 #else
                                 cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
@@ -88,7 +93,8 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
                                 int stride_B,
                                 int stride_C,
                                 int batch,
-#ifdef __HIP_PLATFORM_AMD__
+#if defined(__HIP_PLATFORM_AMD__) && \
+    ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0))
                                 rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
 #else
                                 cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);

diff --git a/csrc/includes/feed_forward.h b/csrc/includes/feed_forward.h
@@ -48,7 +48,9 @@ class FeedForward {
                        weights,
                        input_ptr,
                        out,
-#ifdef __HIP_PLATFORM_AMD__
+// TODO HIP: Remove backward compatibility for torch<=2.0 in future
+#if defined(__HIP_PLATFORM_AMD__) && \
+    ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0))
                        rocblas_gemm_algo(config_.gemm_algos[0]));
 #else
                        cublasGemmAlgo_t(config_.gemm_algos[0]));
@@ -77,7 +79,8 @@ class FeedForward {
                        input_ptr,
                        out_grad,
                        weights_grad,
-#ifdef __HIP_PLATFORM_AMD__
+#if defined(__HIP_PLATFORM_AMD__) && \
+    ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0))
                        rocblas_gemm_algo(config_.gemm_algos[1]));
 #else
                        cublasGemmAlgo_t(config_.gemm_algos[1]));
@@ -94,7 +97,8 @@ class FeedForward {
                        weights,
                        out_grad,
                        inp_grad_out,
-#ifdef __HIP_PLATFORM_AMD__
+#if defined(__HIP_PLATFORM_AMD__) && \
+    ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0))
                        rocblas_gemm_algo(config_.gemm_algos[2]));
 #else
                        cublasGemmAlgo_t(config_.gemm_algos[2]));

diff --git a/csrc/includes/gemm_test.h b/csrc/includes/gemm_test.h
@@ -67,7 +67,9 @@ class GemmTest {
                            B,
                            A,
                            C,
-#ifdef __HIP_PLATFORM_AMD__
+// TODO HIP: Remove backward compatibility for torch<=2.0 in future
+#if defined(__HIP_PLATFORM_AMD__) && \
+    ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0))
                            static_cast<rocblas_gemm_algo>(algo));
 #else
                            static_cast<cublasGemmAlgo_t>(algo));
@@ -86,7 +88,8 @@ class GemmTest {
                            A,
                            C,
                            B,
-#ifdef __HIP_PLATFORM_AMD__
+#if defined(__HIP_PLATFORM_AMD__) && \
+    ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0))
                            static_cast<rocblas_gemm_algo>(algo));
 #else
                            static_cast<cublasGemmAlgo_t>(algo));
@@ -105,7 +108,8 @@ class GemmTest {
                            B,
                            C,
                            A,
-#ifdef __HIP_PLATFORM_AMD__
+#if defined(__HIP_PLATFORM_AMD__) && \
+    ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0))
                            static_cast<rocblas_gemm_algo>(algo));
 #else
                            static_cast<cublasGemmAlgo_t>(algo));
@@ -121,8 +125,11 @@ class GemmTest {
         float fast_latency = (std::numeric_limits<float>::max)();
         int fast_algo = 0;
 
-#ifdef __HIP_PLATFORM_AMD__
+#if defined(__HIP_PLATFORM_AMD__) && \
+    ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0))
         for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
+#elif defined(__HIP_PLATFORM_AMD__)
+        for (int algo = (int)HIPBLAS_GEMM_DEFAULT; algo <= (int)HIPBLAS_GEMM_DEFAULT;
 #else
         for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
              algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
@@ -211,7 +218,8 @@ class StridedGemmTest {
                                         stride_b,
                                         stride_c,
                                         bsz,
-#ifdef __HIP_PLATFORM_AMD__
+#if defined(__HIP_PLATFORM_AMD__) && \
+    ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0))
                                         static_cast<rocblas_gemm_algo>(algo));
 #else
                                         static_cast<cublasGemmAlgo_t>(algo));
@@ -245,7 +253,8 @@ class StridedGemmTest {
                                         stride_b,
                                         stride_c,
                                         bsz,
-#ifdef __HIP_PLATFORM_AMD__
+#if defined(__HIP_PLATFORM_AMD__) && \
+    ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0))
                                         static_cast<rocblas_gemm_algo>(algo));
 #else
                                         static_cast<cublasGemmAlgo_t>(algo));
@@ -276,7 +285,8 @@ class StridedGemmTest {
                                         stride_b,
                                         stride_c,
                                         bsz,
-#ifdef __HIP_PLATFORM_AMD__
+#if defined(__HIP_PLATFORM_AMD__) && \
+    ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0))
                                         static_cast<rocblas_gemm_algo>(algo));
 #else
                                         static_cast<cublasGemmAlgo_t>(algo));
@@ -292,11 +302,17 @@ class StridedGemmTest {
         float fast_latency = (std::numeric_limits<float>::max)();
         int fast_algo = 0;
 
-#ifdef __HIP_PLATFORM_AMD__
+#if defined(__HIP_PLATFORM_AMD__) && \
+    ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0))
         for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
+#else
+#ifdef __HIP_PLATFORM_AMD__
+        for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
+             algo <= (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
 #else
         for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
              algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
+#endif
 #endif
              algo++) {
             int warm_up = 5;

diff --git a/csrc/includes/strided_batch_gemm.h b/csrc/includes/strided_batch_gemm.h
@@ -77,7 +77,9 @@ class StridedBatchGemm {
                                     stride_b,
                                     stride_c,
                                     bsz,
-#ifdef __HIP_PLATFORM_AMD__
+// TODO HIP: Remove backward compatibility for torch<=2.0 in future
+#if defined(__HIP_PLATFORM_AMD__) && \
+    ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0))
                                     rocblas_gemm_algo(_config.gemm_algos[0]));
 #else
                                     cublasGemmAlgo_t(_config.gemm_algos[0]));
@@ -105,7 +107,8 @@ class StridedBatchGemm {
                                     stride_b,
                                     stride_c,
                                     _config.batch_size,
-#ifdef __HIP_PLATFORM_AMD__
+#if defined(__HIP_PLATFORM_AMD__) && \
+    ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0))
                                     rocblas_gemm_algo(_config.gemm_algos[0]));
 #else
                                     cublasGemmAlgo_t(_config.gemm_algos[0]));
@@ -149,7 +152,8 @@ class StridedBatchGemm {
                                     stride_b,
                                     stride_c,
                                     bsz,
-#ifdef __HIP_PLATFORM_AMD__
+#if defined(__HIP_PLATFORM_AMD__) && \
+    ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0))
                                     rocblas_gemm_algo(_config.gemm_algos[1]));
 #else
                                     cublasGemmAlgo_t(_config.gemm_algos[1]));
@@ -178,7 +182,8 @@ class StridedBatchGemm {
                                     stride_b,
                                     stride_c,
                                     bsz,
-#ifdef __HIP_PLATFORM_AMD__
+#if defined(__HIP_PLATFORM_AMD__) && \
+    ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0))
                                     rocblas_gemm_algo(_config.gemm_algos[2]));
 #else
                                     cublasGemmAlgo_t(_config.gemm_algos[2]));