diff --git a/pytorch_blade/scripts/build_pytorch_blade_dcu.sh b/pytorch_blade/scripts/build_pytorch_blade_dcu.sh
index 73c66ec0e5e..ea0b6508450 100644
--- a/pytorch_blade/scripts/build_pytorch_blade_dcu.sh
+++ b/pytorch_blade/scripts/build_pytorch_blade_dcu.sh
@@ -33,7 +33,7 @@ function ci_build() {
   elif [ "$TORCH_BLADE_BUILD_WITH_DCU_ROCM_SUPPORT" = "ON" ]; then
     export TORCH_BLADE_BUILD_TENSORRT=OFF
     export TORCH_BLADE_BUILD_TENSORRT_STATIC=${TORCH_BLADE_BUILD_TENSORRT_STATIC:-OFF}
-    python3 ../scripts/python/common_setup.py
+    python3 ../scripts/python/common_setup.py --rocm_path=/opt/dtk
   else
     python3 ../scripts/python/common_setup.py --cpu_only
   fi
diff --git a/tao_compiler/mlir/xla/ral/BUILD b/tao_compiler/mlir/xla/ral/BUILD
index 7e891e1f094..4390fabc0c2 100644
--- a/tao_compiler/mlir/xla/ral/BUILD
+++ b/tao_compiler/mlir/xla/ral/BUILD
@@ -16,7 +16,7 @@ load(
     "if_cuda_is_configured",
     "cuda_library",
 )
-load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured")
+load("@local_config_rocm//rocm:build_defs.bzl", "if_dcu", "if_rocm_is_configured")
 load(
     "@com_google_protobuf//:protobuf.bzl",
     "cc_proto_library",
@@ -323,7 +323,8 @@ cc_library(
     copts = [
         # "-DTF_1_12",
         "-fopenmp",
-    ] + if_cuda_or_rocm(["-DTAO_RAL_USE_STREAM_EXECUTOR"]),
+    ] + if_dcu(["-DTENSORFLOW_USE_DCU=1"]) +
+        if_cuda_or_rocm(["-DTAO_RAL_USE_STREAM_EXECUTOR"]),
     linkopts = [
         "-fopenmp",
         "-ldl",
@@ -508,6 +509,8 @@ cc_library(
     ]),
     copts = if_cuda_is_configured([
         "-DGOOGLE_CUDA=1"
+    ]) + if_dcu([
+        "-DTENSORFLOW_USE_DCU=1"
     ]) + if_cuda_or_rocm(["-DTAO_RAL_USE_STREAM_EXECUTOR"]),
     alwayslink = 1,
 )
@@ -695,6 +698,8 @@ cc_library(
         "-DDISC_BUILD_FROM_TF_BRIDGE"
     ] + if_cuda_or_rocm([
         "-DTAO_RAL_USE_STREAM_EXECUTOR"
+    ]) + if_dcu([
+        "-DTENSORFLOW_USE_DCU=1"
     ]) + if_rocm_is_configured([
         "-DTENSORFLOW_USE_ROCM=1",
         "-x rocm",
@@ -908,6 +913,8 @@ cc_library(
     ] + if_rocm_is_configured([
         "-DTENSORFLOW_USE_ROCM=1",
         "-x rocm",
+    ]) + if_dcu([
+        "-DTENSORFLOW_USE_DCU=1"
     ]) + if_cuda_or_rocm([
         "-DTAO_RAL_USE_STREAM_EXECUTOR"
     ]) + cuda_default_copts(),
diff --git a/tao_compiler/mlir/xla/ral/context/stream_executor_based_impl.cc b/tao_compiler/mlir/xla/ral/context/stream_executor_based_impl.cc
index 49caff24735..b3a98ca4297 100644
--- a/tao_compiler/mlir/xla/ral/context/stream_executor_based_impl.cc
+++ b/tao_compiler/mlir/xla/ral/context/stream_executor_based_impl.cc
@@ -202,7 +202,8 @@ static bool DoGemmWithAlgorithm(
         /*leading dim of LHS=*/lhs_matrix.num_cols,
         /*beta=*/static_cast(beta), &output_data,
         /*leading dim of output=*/n, computation_type, *algorithm,
-#if (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION > 8) && TENSORFLOW_USE_ROCM
+#if (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION > 8) && TENSORFLOW_USE_ROCM && \
+    (!TENSORFLOW_USE_DCU)
         output_profile_result, se::blas::CallContext::kNone)
 #elif (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION >= 11)
         se::blas::kDefaultComputePrecision, output_profile_result)
@@ -225,7 +226,8 @@ static bool DoGemmWithAlgorithm(
         /*leading dim of LHS=*/lhs_matrix.num_cols, lhs_stride,
         /*beta=*/static_cast(beta), &output_data,
         /*leading dim of output=*/n, output_stride,
-#if (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION > 8) && TENSORFLOW_USE_ROCM
+#if (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION > 8) && TENSORFLOW_USE_ROCM && \
+    (!TENSORFLOW_USE_DCU)
         batch_size, se::blas::CallContext::kNone)
 #elif (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION >= 11)
         batch_size, se::blas::kDefaultComputePrecision)
@@ -242,7 +244,8 @@ static bool DoGemmWithAlgorithm(
         /*leading dim of RHS=*/rhs_matrix.num_cols, lhs_data,
         /*leading dim of LHS=*/lhs_matrix.num_cols,
         /*beta=*/static_cast(beta), &output_data,
-#if (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION > 8) && TENSORFLOW_USE_ROCM
+#if (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION > 8) && TENSORFLOW_USE_ROCM && \
+    (!TENSORFLOW_USE_DCU)
         /*leading dim of output=*/n, se::blas::CallContext::kNone)
 #else
         /*leading dim of output=*/n)
@@ -738,7 +741,8 @@ std::vector GetMIOpenAlgorithms(
         params.input_descriptor, operand_buffers[0], params.filter_descriptor,
         operand_buffers[1], params.output_descriptor, result_buffer,
         params.convolution_descriptor, scratch_allocator,
-#if (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION > 8) && TENSORFLOW_USE_ROCM
+#if (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION > 8) && TENSORFLOW_USE_ROCM && \
+    (!TENSORFLOW_USE_DCU)
         se::dnn::CallContext::kNone, &algorithms)) {
 #else
         &algorithms)) {
@@ -1340,13 +1344,15 @@ Status RunCudnnConvolution(CudnnConvParams& params,
   }
 
   Status status = Status::OK();
-#if (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION > 8) && TENSORFLOW_USE_ROCM
+#if (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION > 8) && TENSORFLOW_USE_ROCM && \
+    (!TENSORFLOW_USE_DCU)
   se::dnn::CallContext call_context = se::dnn::CallContext::kNone;
 #endif
 
   switch (kind) {
     case ConvolutionKind::FORWARD:
-#if (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION > 8) && TENSORFLOW_USE_ROCM
+#if (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION > 8) && TENSORFLOW_USE_ROCM && \
+    (!TENSORFLOW_USE_DCU)
       // TF2.9 and ROCM
       call_context = se::dnn::CallContext::kForward;
       status = stream->ConvolveWithAlgorithm(
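
Note on `if_dcu`: the BUILD hunks above load `if_dcu` from `@local_config_rocm//rocm:build_defs.bzl`, but its definition is not part of this patch. A minimal Starlark sketch of how such a configure-time switch could look, assuming a `_DCU_ENABLED` flag written into the generated build_defs.bzl by the rocm_configure repository rule (the flag name and that mechanism are assumptions for illustration, not the repository's actual code):

# Hypothetical sketch -- not the repository's definition. Assumes the
# rocm_configure repository rule substitutes _DCU_ENABLED = True when the
# toolkit at --rocm_path (e.g. /opt/dtk) is a Hygon DCU/DTK stack.
_DCU_ENABLED = False

def if_dcu(if_true, if_false = []):
    """Returns if_true only when the configured ROCm toolkit is a DCU (DTK) install."""
    if _DCU_ENABLED:
        return if_true
    return if_false

With that shape, `if_dcu(["-DTENSORFLOW_USE_DCU=1"])` injects the define only on DCU builds, and the `(!TENSORFLOW_USE_DCU)` clauses in the C++ hunks then steer those builds to the older call signatures without the `se::blas::CallContext` / `se::dnn::CallContext` arguments, presumably because the DTK stack predates those ROCm APIs.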