diff --git a/pytorch_blade/scripts/build_pytorch_blade_dcu.sh b/pytorch_blade/scripts/build_pytorch_blade_dcu.sh
index 73c66ec0e5e..ea0b6508450 100644
--- a/pytorch_blade/scripts/build_pytorch_blade_dcu.sh
+++ b/pytorch_blade/scripts/build_pytorch_blade_dcu.sh
@@ -33,7 +33,7 @@ function ci_build() {
   elif [ "$TORCH_BLADE_BUILD_WITH_DCU_ROCM_SUPPORT" = "ON" ]; then
     export TORCH_BLADE_BUILD_TENSORRT=OFF
     export TORCH_BLADE_BUILD_TENSORRT_STATIC=${TORCH_BLADE_BUILD_TENSORRT_STATIC:-OFF}
-    python3 ../scripts/python/common_setup.py
+    python3 ../scripts/python/common_setup.py --rocm_path=/opt/dtk
   else
     python3 ../scripts/python/common_setup.py --cpu_only
   fi
diff --git a/tao_compiler/mlir/xla/ral/BUILD b/tao_compiler/mlir/xla/ral/BUILD
index 7e891e1f094..4390fabc0c2 100644
--- a/tao_compiler/mlir/xla/ral/BUILD
+++ b/tao_compiler/mlir/xla/ral/BUILD
@@ -16,7 +16,7 @@ load(
     "if_cuda_is_configured",
     "cuda_library",
 )
-load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured")
+load("@local_config_rocm//rocm:build_defs.bzl", "if_dcu", "if_rocm_is_configured")
 load(
     "@com_google_protobuf//:protobuf.bzl",
     "cc_proto_library",
@@ -323,7 +323,8 @@ cc_library(
     copts = [
         # "-DTF_1_12",
         "-fopenmp",
-    ] + if_cuda_or_rocm(["-DTAO_RAL_USE_STREAM_EXECUTOR"]),
+    ] + if_dcu(["-DTENSORFLOW_USE_DCU=1"]) +
+        if_cuda_or_rocm(["-DTAO_RAL_USE_STREAM_EXECUTOR"]),
     linkopts = [
         "-fopenmp",
         "-ldl",
@@ -508,6 +509,8 @@ cc_library(
     ]),
     copts = if_cuda_is_configured([
         "-DGOOGLE_CUDA=1"
+    ]) + if_dcu([
+        "-DTENSORFLOW_USE_DCU=1"
     ]) + if_cuda_or_rocm(["-DTAO_RAL_USE_STREAM_EXECUTOR"]),
     alwayslink = 1,
 )
@@ -695,6 +698,8 @@ cc_library(
         "-DDISC_BUILD_FROM_TF_BRIDGE"
     ] + if_cuda_or_rocm([
         "-DTAO_RAL_USE_STREAM_EXECUTOR"
+    ]) + if_dcu([
+        "-DTENSORFLOW_USE_DCU=1"
     ]) + if_rocm_is_configured([
         "-DTENSORFLOW_USE_ROCM=1",
         "-x rocm",
@@ -908,6 +913,8 @@ cc_library(
     ] + if_rocm_is_configured([
         "-DTENSORFLOW_USE_ROCM=1",
         "-x rocm",
+    ]) + if_dcu([
+        "-DTENSORFLOW_USE_DCU=1"
     ]) + if_cuda_or_rocm([
         "-DTAO_RAL_USE_STREAM_EXECUTOR"
     ]) + cuda_default_copts(),
diff --git a/tao_compiler/mlir/xla/ral/context/stream_executor_based_impl.cc b/tao_compiler/mlir/xla/ral/context/stream_executor_based_impl.cc
index 49caff24735..b3a98ca4297 100644
--- a/tao_compiler/mlir/xla/ral/context/stream_executor_based_impl.cc
+++ b/tao_compiler/mlir/xla/ral/context/stream_executor_based_impl.cc
@@ -202,7 +202,8 @@ static bool DoGemmWithAlgorithm(
         /*leading dim of LHS=*/lhs_matrix.num_cols,
         /*beta=*/static_cast(beta), &output_data,
         /*leading dim of output=*/n, computation_type, *algorithm,
-#if (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION > 8) && TENSORFLOW_USE_ROCM
+#if (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION > 8) && TENSORFLOW_USE_ROCM && \
+    (!TENSORFLOW_USE_DCU)
         output_profile_result, se::blas::CallContext::kNone)
 #elif (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION >= 11)
         se::blas::kDefaultComputePrecision, output_profile_result)
@@ -225,7 +226,8 @@ static bool DoGemmWithAlgorithm(
         /*leading dim of LHS=*/lhs_matrix.num_cols, lhs_stride,
         /*beta=*/static_cast(beta), &output_data,
         /*leading dim of output=*/n, output_stride,
-#if (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION > 8) && TENSORFLOW_USE_ROCM
+#if (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION > 8) && TENSORFLOW_USE_ROCM && \
+    (!TENSORFLOW_USE_DCU)
         batch_size, se::blas::CallContext::kNone)
 #elif (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION >= 11)
         batch_size, se::blas::kDefaultComputePrecision)
@@ -242,7 +244,8 @@ static bool DoGemmWithAlgorithm(
         /*leading dim of RHS=*/rhs_matrix.num_cols, lhs_data,
         /*leading dim of LHS=*/lhs_matrix.num_cols,
         /*beta=*/static_cast(beta), &output_data,
-#if (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION > 8) && TENSORFLOW_USE_ROCM
+#if (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION > 8) && TENSORFLOW_USE_ROCM && \
+    (!TENSORFLOW_USE_DCU)
         /*leading dim of output=*/n, se::blas::CallContext::kNone)
 #else
         /*leading dim of output=*/n)
@@ -738,7 +741,8 @@ std::vector GetMIOpenAlgorithms(
         params.input_descriptor, operand_buffers[0], params.filter_descriptor,
         operand_buffers[1], params.output_descriptor, result_buffer,
         params.convolution_descriptor, scratch_allocator,
-#if (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION > 8) && TENSORFLOW_USE_ROCM
+#if (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION > 8) && TENSORFLOW_USE_ROCM && \
+    (!TENSORFLOW_USE_DCU)
         se::dnn::CallContext::kNone, &algorithms)) {
 #else
         &algorithms)) {
@@ -1340,13 +1344,15 @@ Status RunCudnnConvolution(CudnnConvParams& params,
   }
 
   Status status = Status::OK();
-#if (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION > 8) && TENSORFLOW_USE_ROCM
+#if (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION > 8) && TENSORFLOW_USE_ROCM && \
+    (!TENSORFLOW_USE_DCU)
   se::dnn::CallContext call_context = se::dnn::CallContext::kNone;
 #endif
 
   switch (kind) {
     case ConvolutionKind::FORWARD:
-#if (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION > 8) && TENSORFLOW_USE_ROCM
+#if (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION > 8) && TENSORFLOW_USE_ROCM && \
+    (!TENSORFLOW_USE_DCU)
       // TF2.9 and ROCM
       call_context = se::dnn::CallContext::kForward;
       status = stream->ConvolveWithAlgorithm(
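
Note on `if_dcu`: the BUILD hunks above load `if_dcu` from `@local_config_rocm//rocm:build_defs.bzl`, but its definition is not part of this patch. A minimal Starlark sketch of how such a configure-time switch could look, assuming a `_DCU_ENABLED` flag written into the generated build_defs.bzl by the rocm_configure repository rule (the flag name and that mechanism are assumptions for illustration, not the repository's actual code):

# Hypothetical sketch -- not the repository's definition. Assumes the
# rocm_configure repository rule substitutes _DCU_ENABLED = True when the
# toolkit at --rocm_path (e.g. /opt/dtk) is a Hygon DCU/DTK stack.
_DCU_ENABLED = False

def if_dcu(if_true, if_false = []):
    """Returns if_true only when the configured ROCm toolkit is a DCU (DTK) install."""
    if _DCU_ENABLED:
        return if_true
    return if_false

With that shape, `if_dcu(["-DTENSORFLOW_USE_DCU=1"])` injects the define only on DCU builds, and the `(!TENSORFLOW_USE_DCU)` clauses in the C++ hunks then steer those builds to the older call signatures without the `se::blas::CallContext` / `se::dnn::CallContext` arguments, presumably because the DTK stack predates those ROCm APIs.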