From 6284d2a82b5361691479004a65216583c2569cb0 Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein
Date: Wed, 21 Jul 2021 19:29:19 -0700
Subject: [PATCH] wrap cudaStreamSynchronize calls (#61889)

Summary:
This is a first step towards creating a context manager that errors out on synchronizing calls.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/61889

Reviewed By: albanD

Differential Revision: D29805280

Pulled By: ngimel

fbshipit-source-id: b66400fbe0941b7daa51e6b30abe27b9cccd4e8a
---
 .github/workflows/lint.yml                    |  4 +++
 Makefile                                      |  3 +-
 aten/src/ATen/native/cuda/CUDAScalar.cu       | 18 +---------
 aten/src/ATen/native/cuda/Copy.cu             |  7 +---
 aten/src/ATen/native/cuda/MiscUtils.h         |  4 +--
 aten/src/ATen/native/cuda/Nonzero.cu          |  4 +--
 aten/src/ATen/native/cuda/TensorModeKernel.cu |  5 ++-
 aten/src/THC/generic/THCStorage.cpp           | 24 +++----------
 aten/src/THC/generic/THCStorageCopy.cpp       | 34 ++++---------------
 c10/cuda/CMakeLists.txt                       |  3 +-
 c10/cuda/CUDAException.h                      |  3 +-
 c10/cuda/CUDAFunctions.cpp                    | 16 ---------
 c10/cuda/CUDAFunctions.h                      | 25 +++++++++++++-
 c10/cuda/CUDAMiscFunctions.cpp                | 20 +++++++++++
 c10/cuda/CUDAMiscFunctions.h                  | 11 ++++++
 c10/cuda/CUDAStream.h                         |  5 ++-
 torch/csrc/CudaIPCTypes.cpp                   |  4 +--
 torch/csrc/generic/StorageSharing.cpp         |  2 +-
 torch/csrc/jit/codegen/cuda/executor.cpp      |  3 +-
 torch/utils/hipify/cuda_to_hip_mappings.py    |  1 +
 20 files changed, 89 insertions(+), 107 deletions(-)
 create mode 100644 c10/cuda/CUDAMiscFunctions.cpp
 create mode 100644 c10/cuda/CUDAMiscFunctions.h

diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index ef8bda66c5a1d..0d0d36f989c43 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -86,6 +86,10 @@ jobs:
       if: always()
       run: |
         (! git --no-pager grep -I -no $'#include
diff --git a/aten/src/ATen/native/cuda/CUDAScalar.cu b/aten/src/ATen/native/cuda/CUDAScalar.cu
--- a/aten/src/ATen/native/cuda/CUDAScalar.cu
+++ b/aten/src/ATen/native/cuda/CUDAScalar.cu
 #include
-#include
-
-#ifdef __HIP_PLATFORM_HCC__
-#include
-#endif
 
 namespace at {
 namespace native {
 Scalar _local_scalar_dense_cuda(const Tensor& self) {
   Scalar r;
-#if HIP_VERSION >= 301
-  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
-    at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "_local_scalar_dense_cuda", [&] {
-        scalar_t value;
-        cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-        AT_CUDA_CHECK(hipMemcpyWithStream(&value, self.data_ptr(), sizeof(scalar_t), cudaMemcpyDeviceToHost, stream));
-        r = Scalar(value);
-      });
-#else
   AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
     at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "_local_scalar_dense_cuda", [&] {
         scalar_t value;
         cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-        AT_CUDA_CHECK(cudaMemcpyAsync(&value, self.data_ptr(), sizeof(scalar_t), cudaMemcpyDeviceToHost, stream));
-        AT_CUDA_CHECK(cudaStreamSynchronize(stream));
+        at::cuda::memcpy_and_sync(&value, self.data_ptr(), sizeof(scalar_t), cudaMemcpyDeviceToHost, stream);
         r = Scalar(value);
       });
-#endif
   return r;
 }
diff --git a/aten/src/ATen/native/cuda/Copy.cu b/aten/src/ATen/native/cuda/Copy.cu
index 968f7207d5780..84e016016d9b5 100644
--- a/aten/src/ATen/native/cuda/Copy.cu
+++ b/aten/src/ATen/native/cuda/Copy.cu
@@ -224,12 +224,7 @@ static void copy_kernel_cuda(TensorIterator& iter, bool non_blocking) {
     void* ptr = (dst_device == kCPU ? dst : src);
     AT_CUDA_CHECK(THCCachingHostAllocator_recordEvent(ptr, stream));
   } else {
-#if HIP_VERSION >= 301
-    AT_CUDA_CHECK(hipMemcpyWithStream(dst, src, nbytes, kind, stream));
-#else
-    AT_CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, kind, stream));
-    AT_CUDA_CHECK(cudaStreamSynchronize(stream));
-#endif
+    at::cuda::memcpy_and_sync(dst, src, nbytes, kind, stream);
   }
 
   if (iter.tensor(0).is_conj() != iter.tensor(1).is_conj()) {
diff --git a/aten/src/ATen/native/cuda/MiscUtils.h b/aten/src/ATen/native/cuda/MiscUtils.h
index 5373f61edb8fb..b28f77c701744 100644
--- a/aten/src/ATen/native/cuda/MiscUtils.h
+++ b/aten/src/ATen/native/cuda/MiscUtils.h
@@ -75,14 +75,14 @@ struct MagmaStreamSyncGuard {
   MagmaStreamSyncGuard() {
     auto stream = at::cuda::getCurrentCUDAStream();
     if (stream != at::cuda::getDefaultCUDAStream()) {
-      AT_CUDA_CHECK(cudaStreamSynchronize(stream));
+      at::cuda::stream_synchronize(stream);
     }
   }
 
  ~MagmaStreamSyncGuard() noexcept(false) {
    auto default_stream = at::cuda::getDefaultCUDAStream();
    if (at::cuda::getCurrentCUDAStream() != default_stream) {
-     AT_CUDA_CHECK(cudaStreamSynchronize(default_stream));
+     at::cuda::stream_synchronize(default_stream);
    }
  }
 };
diff --git a/aten/src/ATen/native/cuda/Nonzero.cu b/aten/src/ATen/native/cuda/Nonzero.cu
index 79ee419336a6d..e72493738a407 100644
--- a/aten/src/ATen/native/cuda/Nonzero.cu
+++ b/aten/src/ATen/native/cuda/Nonzero.cu
@@ -61,9 +61,7 @@ void nonzero_cuda_out_impl(const Tensor& self, Tensor& out){
   auto temp_storage = allocator.allocate(temp_storage_bytes);
   cub::DeviceReduce::Sum(temp_storage.get(), temp_storage_bytes, itr, (int*)num_nonzeros.get(), N, stream);
   int num_nonzeros_h;
-  C10_CUDA_CHECK(cudaMemcpyAsync(&num_nonzeros_h, num_nonzeros.get(), sizeof(int), cudaMemcpyDeviceToHost, stream));
-  //need to synchronize to make sure data is available on the host
-  C10_CUDA_CHECK(cudaStreamSynchronize(stream));
+  at::cuda::memcpy_and_sync(&num_nonzeros_h, num_nonzeros.get(), sizeof(int), cudaMemcpyDeviceToHost, stream);
   //expected output size is num_nonzeros x ndim
   //we are producing output with size {num_nonzeros, ndim} and strides {num_nonzeros, 1} (that is, transposed ndim x num_nonzeros output)
   //we are able to directly use passed output with this size and strides, and we can also (per contract)
diff --git a/aten/src/ATen/native/cuda/TensorModeKernel.cu b/aten/src/ATen/native/cuda/TensorModeKernel.cu
index c7455f55760e3..c24d7fb25037d 100644
--- a/aten/src/ATen/native/cuda/TensorModeKernel.cu
+++ b/aten/src/ATen/native/cuda/TensorModeKernel.cu
@@ -105,9 +105,8 @@ void calculate_mode(
   AT_CUDA_CHECK(cudaMemcpyAsync(
       values_data, &mode, sizeof(scalar_t), cudaMemcpyHostToDevice, stream));
-  AT_CUDA_CHECK(cudaMemcpyAsync(
-      indices_data, &index, sizeof(scalar_t), cudaMemcpyHostToDevice, stream));
-  AT_CUDA_CHECK(cudaStreamSynchronize(stream));
+  //memcpy_and_sync will synchronize results
+  at::cuda::memcpy_and_sync(indices_data, &index, sizeof(scalar_t), cudaMemcpyHostToDevice, stream);
 }
 
 template
diff --git a/aten/src/THC/generic/THCStorage.cpp b/aten/src/THC/generic/THCStorage.cpp
index 59664404880d3..63ce73388e831 100644
--- a/aten/src/THC/generic/THCStorage.cpp
+++ b/aten/src/THC/generic/THCStorage.cpp
@@ -5,9 +5,6 @@
 #include
 #include
 
-#ifdef __HIP_PLATFORM_HCC__
-#include
-#endif
 
 scalar_t* THCStorage_(data)(THCState *state, const THCStorage *self)
 {
@@ -26,16 +23,9 @@ void THCStorage_(set)(THCState *state, THCStorage *self, ptrdiff_t index, scalar
              2, "index out of bounds");
   cudaStream_t stream = c10::cuda::getCurrentCUDAStream();
-#if HIP_VERSION >= 301
-  THCudaCheck(hipMemcpyWithStream(THCStorage_(data)(state, self) + index, &value, sizeof(scalar_t),
-                                  cudaMemcpyHostToDevice,
-                                  stream));
-#else
-  THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self) + index, &value, sizeof(scalar_t),
+  at::cuda::memcpy_and_sync(THCStorage_(data)(state, self) + index, &value, sizeof(scalar_t),
                               cudaMemcpyHostToDevice,
-                              stream));
-  THCudaCheck(cudaStreamSynchronize(stream));
-#endif
+                              stream);
 }
 
 scalar_t THCStorage_(get)(THCState *state, const THCStorage *self, ptrdiff_t index)
@@ -46,14 +36,8 @@ scalar_t THCStorage_(get)(THCState *state, const THCStorage *self, ptrdiff_t ind
              "index out of bounds");
   scalar_t value;
   cudaStream_t stream = c10::cuda::getCurrentCUDAStream();
-#if HIP_VERSION >= 301
-  THCudaCheck(hipMemcpyWithStream(&value, THCStorage_(data)(state, self) + index, sizeof(scalar_t),
-                                  cudaMemcpyDeviceToHost, stream));
-#else
-  THCudaCheck(cudaMemcpyAsync(&value, THCStorage_(data)(state, self) + index, sizeof(scalar_t),
-                              cudaMemcpyDeviceToHost, stream));
-  THCudaCheck(cudaStreamSynchronize(stream));
-#endif
+  at::cuda::memcpy_and_sync(&value, THCStorage_(data)(state, self) + index, sizeof(scalar_t),
+                            cudaMemcpyDeviceToHost, stream);
   return value;
 }
diff --git a/aten/src/THC/generic/THCStorageCopy.cpp b/aten/src/THC/generic/THCStorageCopy.cpp
index 1411e30fd2c54..086b6d421a00a 100644
--- a/aten/src/THC/generic/THCStorageCopy.cpp
+++ b/aten/src/THC/generic/THCStorageCopy.cpp
@@ -2,30 +2,18 @@
 #define THC_GENERIC_FILE "THC/generic/THCStorageCopy.cpp"
 #else
 
-#ifdef __HIP_PLATFORM_HCC__
-#include
-#endif
+#include
 
 void THCStorage_(copyCPU)(THCState *state, THCStorage *self, struct THStorage *src)
 {
   THArgCheck(self->nbytes() == src->nbytes(), 2, "size does not match");
   cudaStream_t stream = c10::cuda::getCurrentCUDAStream();
-#if HIP_VERSION >= 301
-  THCudaCheck(hipMemcpyWithStream(
-      THCStorage_(data)(state, self),
+  at::cuda::memcpy_and_sync(THCStorage_(data)(state, self),
       THStorage_(data)(src),
       self->nbytes(),
       cudaMemcpyHostToDevice,
-      stream));
-#else
-  THCudaCheck(cudaMemcpyAsync(
-      THCStorage_(data)(state, self),
-      THStorage_(data)(src),
-      self->nbytes(),
-      cudaMemcpyHostToDevice,
-      stream));
-  THCudaCheck(cudaStreamSynchronize(stream));
-#endif
+      stream);
+
 }
 
 #define TH_CUDA_STORAGE_IMPLEMENT_COPY(TYPEC)                        \
@@ -61,22 +49,12 @@ void THStorage_(copyCuda)(THCState *state, THStorage *self, struct THCStorage *s
 {
   THArgCheck(self->nbytes() == src->nbytes(), 2, "size does not match");
   cudaStream_t stream = c10::cuda::getCurrentCUDAStream();
-#if HIP_VERSION >= 301
-  THCudaCheck(hipMemcpyWithStream(
+  at::cuda::memcpy_and_sync(
       THStorage_(data)(self),
       THCStorage_(data)(state, src),
       self->nbytes(),
       cudaMemcpyDeviceToHost,
-      stream));
-#else
-  THCudaCheck(cudaMemcpyAsync(
-      THStorage_(data)(self),
-      THCStorage_(data)(state, src),
-      self->nbytes(),
-      cudaMemcpyDeviceToHost,
-      stream));
-  THCudaCheck(cudaStreamSynchronize(stream));
-#endif
+      stream);
 }
 
 #define TH_CUDA_STORAGE_IMPLEMENT_COPYTO(TYPEC)                        \
diff --git a/c10/cuda/CMakeLists.txt b/c10/cuda/CMakeLists.txt
index 256fc54b08a18..3803498b33523 100644
--- a/c10/cuda/CMakeLists.txt
+++ b/c10/cuda/CMakeLists.txt
@@ -22,10 +22,10 @@ configure_file(
 set(C10_CUDA_SRCS
     CUDAStream.cpp
     CUDAFunctions.cpp
+    CUDAMiscFunctions.cpp
     CUDACachingAllocator.cpp
     impl/CUDAGuardImpl.cpp
     impl/CUDATest.cpp
-    CUDAFunctions.cpp
 )
 set(C10_CUDA_HEADERS
     CUDAException.h
@@ -34,6 +34,7 @@ set(C10_CUDA_HEADERS
     CUDAMathCompat.h
     CUDAStream.h
     CUDAFunctions.h
+    CUDAMiscFunctions.h
     impl/CUDAGuardImpl.h
     impl/CUDATest.h
 )
diff --git a/c10/cuda/CUDAException.h b/c10/cuda/CUDAException.h
index 2c084d6cbf93e..c3ff821bb3322 100644
--- a/c10/cuda/CUDAException.h
+++ b/c10/cuda/CUDAException.h
@@ -1,6 +1,7 @@
 #pragma once
 
-#include
+#include
+#include
 #include
 #include
 #include
diff --git a/c10/cuda/CUDAFunctions.cpp b/c10/cuda/CUDAFunctions.cpp
index 6838b05a7fa72..1f781b37099a3 100644
--- a/c10/cuda/CUDAFunctions.cpp
+++ b/c10/cuda/CUDAFunctions.cpp
@@ -1,6 +1,3 @@
-#include
-
-#include
 #include
 #include
 
@@ -141,18 +138,5 @@ void device_synchronize() {
   C10_CUDA_CHECK(cudaDeviceSynchronize());
 }
 
-const char* get_cuda_check_suffix() noexcept {
-  static char* device_blocking_flag = getenv("CUDA_LAUNCH_BLOCKING");
-  static bool blocking_enabled =
-      (device_blocking_flag && atoi(device_blocking_flag));
-  if (blocking_enabled) {
-    return "";
-  } else {
-    return "\nCUDA kernel errors might be asynchronously reported at some"
-           " other API call,so the stacktrace below might be incorrect."
-           "\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1.";
-  }
-}
-
 } // namespace cuda
 } // namespace c10
diff --git a/c10/cuda/CUDAFunctions.h b/c10/cuda/CUDAFunctions.h
index 25f687827dfc3..1464999d715cc 100644
--- a/c10/cuda/CUDAFunctions.h
+++ b/c10/cuda/CUDAFunctions.h
@@ -8,7 +8,12 @@
 // The naming convention used here matches the naming convention of torch.cuda
 
 #include
+#include
 #include
+#ifdef __HIP_PLATFORM_HCC__
+#include
+#endif
+#include
 
 namespace c10 {
 namespace cuda {
@@ -30,7 +35,25 @@ C10_CUDA_API void set_device(DeviceIndex device);
 
 C10_CUDA_API void device_synchronize();
 
-C10_CUDA_API const char* get_cuda_check_suffix() noexcept;
+// the subsequent functions are defined in the header because for performance
+// reasons we want them to be inline
+C10_CUDA_API void __inline__ memcpy_and_sync(
+    void* dst,
+    void* src,
+    int64_t nbytes,
+    cudaMemcpyKind kind,
+    cudaStream_t stream) {
+#if defined(HIP_VERSION) && (HIP_VERSION >= 301)
+  C10_CUDA_CHECK(hipMemcpyWithStream(dst, src, nbytes, kind, stream));
+#else
+  C10_CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, kind, stream));
+  C10_CUDA_CHECK(cudaStreamSynchronize(stream));
+#endif
+}
+
+C10_CUDA_API void __inline__ stream_synchronize(cudaStream_t stream) {
+  C10_CUDA_CHECK(cudaStreamSynchronize(stream));
+}
 
 } // namespace cuda
 } // namespace c10
diff --git a/c10/cuda/CUDAMiscFunctions.cpp b/c10/cuda/CUDAMiscFunctions.cpp
new file mode 100644
index 0000000000000..7655ca8c6a600
--- /dev/null
+++ b/c10/cuda/CUDAMiscFunctions.cpp
@@ -0,0 +1,20 @@
+#include
+#include
+
+namespace c10 {
+namespace cuda {
+
+const char* get_cuda_check_suffix() noexcept {
+  static char* device_blocking_flag = getenv("CUDA_LAUNCH_BLOCKING");
+  static bool blocking_enabled =
+      (device_blocking_flag && atoi(device_blocking_flag));
+  if (blocking_enabled) {
+    return "";
+  } else {
+    return "\nCUDA kernel errors might be asynchronously reported at some"
+           " other API call,so the stacktrace below might be incorrect."
+           "\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1.";
+  }
+}
+} // namespace cuda
+} // namespace c10
diff --git a/c10/cuda/CUDAMiscFunctions.h b/c10/cuda/CUDAMiscFunctions.h
new file mode 100644
index 0000000000000..eca8fd042f615
--- /dev/null
+++ b/c10/cuda/CUDAMiscFunctions.h
@@ -0,0 +1,11 @@
+#pragma once
+// this file is to avoid circular dependency between CUDAFunctions.h and
+// CUDAExceptions.h
+
+#include
+
+namespace c10 {
+namespace cuda {
+C10_CUDA_API const char* get_cuda_check_suffix() noexcept;
+}
+} // namespace c10
diff --git a/c10/cuda/CUDAStream.h b/c10/cuda/CUDAStream.h
index f46ae5bf007a1..07d4997a087f0 100644
--- a/c10/cuda/CUDAStream.h
+++ b/c10/cuda/CUDAStream.h
@@ -7,8 +7,7 @@
 #include
 #include
 
-#include
-#include
+#include
 #include
 
 /*
@@ -128,7 +127,7 @@ class C10_CUDA_API CUDAStream {
 
   void synchronize() const {
     DeviceGuard guard{stream_.device()};
-    C10_CUDA_CHECK(cudaStreamSynchronize(stream()));
+    c10::cuda::stream_synchronize(stream());
   }
 
   int priority() const {
diff --git a/torch/csrc/CudaIPCTypes.cpp b/torch/csrc/CudaIPCTypes.cpp
index 62574976b57f5..13801bc1fa19f 100644
--- a/torch/csrc/CudaIPCTypes.cpp
+++ b/torch/csrc/CudaIPCTypes.cpp
@@ -161,7 +161,7 @@ CudaIPCSentData::CudaIPCSentData(
     event_sync_required_ = true;
   } else {
     auto stream = c10::cuda::getCurrentCUDAStream(device.index());
-    C10_CUDA_CHECK(cudaStreamSynchronize(stream));
+    at::cuda::stream_synchronize(stream);
     event_ = nullptr;
     event_sync_required_ = false;
   }
@@ -169,7 +169,7 @@ CudaIPCSentData::CudaIPCSentData(
   // cuIpcGetEventHandle with HIP is not supported, so we have to sync
   // stream instead of passing event
   auto stream = c10::cuda::getCurrentCUDAStream(device.index());
-  C10_CUDA_CHECK(cudaStreamSynchronize(stream));
+  at::cuda::stream_synchronize(stream);
   event_sync_required_ = false;
 #endif
 }
diff --git a/torch/csrc/generic/StorageSharing.cpp b/torch/csrc/generic/StorageSharing.cpp
index 98706dda0f0e4..30511534253d8 100644
--- a/torch/csrc/generic/StorageSharing.cpp
+++ b/torch/csrc/generic/StorageSharing.cpp
@@ -426,7 +426,7 @@ static PyObject * THPStorage_(newSharedCuda)(PyObject *_unused, PyObject *args)
 
   // TODO: Instead of cudaStreamSynchronize it is possible to add Stream
   // Callback and release counter inside of it (need to check performance impact)
-  cudaStreamSynchronize(c10::cuda::getCurrentCUDAStream(device));
+  at::cuda::stream_synchronize(c10::cuda::getCurrentCUDAStream(device));
 
   // We don't want to break existing code, so resource deletion is best
   // effort basis. Exception expected if producer process terminated
diff --git a/torch/csrc/jit/codegen/cuda/executor.cpp b/torch/csrc/jit/codegen/cuda/executor.cpp
index d9ca0cc977ed1..e2c40bd3d5d64 100644
--- a/torch/csrc/jit/codegen/cuda/executor.cpp
+++ b/torch/csrc/jit/codegen/cuda/executor.cpp
@@ -9,7 +9,6 @@
 #include
 #include
-#include
 #include
 #include
 #include
@@ -529,7 +528,7 @@ std::vector FusionExecutor::runFusion(
         stream,
         kernel_arguments.getBuffer(),
         nullptr));
-    AT_CUDA_CHECK(cudaStreamSynchronize(stream));
+    at::cuda::stream_synchronize(stream);
   }
 
   return alloced_outputs;
diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py
index 0494df31d899e..5ab001c82c7e2 100644
--- a/torch/utils/hipify/cuda_to_hip_mappings.py
+++ b/torch/utils/hipify/cuda_to_hip_mappings.py
@@ -8123,6 +8123,7 @@
         ("c10/cuda/CUDAMacros.h", ("c10/hip/HIPMacros.h", API_C10)),
         ("c10/cuda/CUDAMathCompat.h", ("c10/hip/HIPMathCompat.h", API_C10)),
         ("c10/cuda/CUDAFunctions.h", ("c10/hip/HIPFunctions.h", API_C10)),
+        ("c10/cuda/CUDAMiscFunctions.h", ("c10/hip/HIPMiscFunctions.h", API_C10)),
        ("c10/cuda/CUDAStream.h", ("c10/hip/HIPStream.h", API_C10)),
        ("c10/cuda/CUDAGraphsC10Utils.h", ("c10/hip/HIPGraphsC10Utils.h", API_C10)),
        ("c10/cuda/CUDACachingAllocator.h", ("c10/hip/HIPCachingAllocator.h", API_C10)),
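
Usage note (not part of the patch): the two helpers introduced above, c10::cuda::memcpy_and_sync and c10::cuda::stream_synchronize, replace the raw cudaMemcpyAsync + cudaStreamSynchronize pairs removed throughout this diff. Below is a minimal caller-side sketch; the functions read_device_flag and wait_for_current_stream are hypothetical names used only for illustration.

    #include <c10/cuda/CUDAFunctions.h>
    #include <c10/cuda/CUDAStream.h>

    // Blocking device->host read of a single int: one call replaces the old
    // cudaMemcpyAsync + cudaStreamSynchronize pair, and on ROCm builds with
    // HIP_VERSION >= 301 it lowers to hipMemcpyWithStream.
    int read_device_flag(int* device_flag) {
      int host_flag = 0;
      cudaStream_t stream = c10::cuda::getCurrentCUDAStream();
      c10::cuda::memcpy_and_sync(
          &host_flag, device_flag, sizeof(int), cudaMemcpyDeviceToHost, stream);
      return host_flag;
    }

    // Plain stream synchronization now goes through the wrapper rather than
    // calling cudaStreamSynchronize directly, so all syncs share one choke
    // point that the planned context manager can later intercept.
    void wait_for_current_stream() {
      c10::cuda::stream_synchronize(c10::cuda::getCurrentCUDAStream());
    }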