refine fp32 precision api #125888
@@ -19,9 +19,69 @@
#if defined(__aarch64__) && !defined(C10_MOBILE)
#include <cpuinfo.h>
#endif

namespace at {

namespace {

/*
  These const variables define the fp32 precisions for the different backends.
  We currently have the "generic", "cuda", and "mkldnn" backends, and the fp32
  precision can be chosen from "ieee", "tf32", "bf16", and "none". "ieee" means
  the IEEE standard floating-point format; "tf32" and "bf16" mean that we are
  allowed to use TF32 or BF16 as the internal computation data type for fp32
  computations; "none" means the setting can be overridden by the parent node.

  generic->mkldnn->matmul
                 ->conv
                 ->rnn
         ->cuda ->matmul
                 ->conv
                 ->rnn
*/
const std::map<std::string, std::vector<std::string>> _fp32_precisions = {
    {"generic", {{"ieee", "tf32", "bf16", "none"}}},
    {"mkldnn", {{"ieee", "bf16", "none"}}},
    {"cuda", {{"ieee", "tf32", "none"}}}};
// Check whether the backend and op are legal
void check_fp32_prec_backend_and_op(
    const std::string& backend,
    const std::string& op) {
  static std::vector<std::string> backends = {"generic", "mkldnn", "cuda"};
  static std::vector<std::string> operators = {"conv", "matmul", "rnn", "all"};
  TORCH_CHECK(
      std::find(backends.begin(), backends.end(), backend) != backends.end(),
      "Invalid backend: ",
      backend);
  TORCH_CHECK(
      std::find(operators.begin(), operators.end(), op) != operators.end(),
      "Invalid operator: ",
      op);
  if (backend == "generic") {
    TORCH_CHECK(op == "all", "Invalid operation for generic backend: ", op);
  }
}

// Return whether the precision is supported by the backend
bool validate_fp32_prec(
    const std::string& backend,
    const std::string& precision) {
  auto iterp = _fp32_precisions.find(backend);
  TORCH_CHECK(iterp != _fp32_precisions.end());
  auto precisions = iterp->second;
  bool valid = std::find(precisions.begin(), precisions.end(), precision) !=
      precisions.end();
  return valid;
}

C10_ALWAYS_INLINE void warn_deprecated_fp32_precision_api() {
  TORCH_WARN_ONCE(
      "This API is going to be deprecated, please see "
      "https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices");
}
} // namespace

Context::Context() = default;

// TODO: This could be bad juju if someone calls globalContext() in the
@@ -115,12 +175,29 @@ void Context::setUserEnabledNNPACK(bool e) {
  enabled_nnpack = e;
}

bool Context::allowTF32CuDNN() const {
bool Context::allowTF32CuDNN(const std::string& op) const {
  if (op.size() == 0) {
    bool allow_tf32_rnn = float32Precision("cuda", "rnn") == "tf32";
    bool allow_tf32_conv = float32Precision("cuda", "conv") == "tf32";
    TORCH_CHECK(
        allow_tf32_rnn == allow_tf32_conv && allow_tf32_rnn == allow_tf32_cudnn,
        "PyTorch is checking whether allow_tf32 is enabled for cuDNN without a specific operator name, ",
        "but the current flag(s) indicate that cuDNN conv and cuDNN RNN have different TF32 flags. ",
        "This combination indicates that you have used a mix of the legacy and new APIs to set the TF32 flags. ",
        "We suggest only using the new API to set the TF32 flag(s). See also: ",
        "https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices");
  } else {
    return float32Precision("cuda", op) == "tf32";
  }
  warn_deprecated_fp32_precision_api();
  return allow_tf32_cudnn;
}

void Context::setAllowTF32CuDNN(bool b) {
  setFloat32Precision("cuda", "rnn", b ? "tf32" : "none");
  setFloat32Precision("cuda", "conv", b ? "tf32" : "none");
  allow_tf32_cudnn = b;
  warn_deprecated_fp32_precision_api();
}

Reviewer: Given that cuda has multiple backends (cuDNN, cuBLAS, more?), do we want to allow nesting these as well in the future? The current design shouldn't be blocking us from doing that, right?

Author: The current design contains three layers: backend, operator, and precision. If we want to allow nesting backends for cuda, the current design will not block that. Right now the backend is a string; we could construct a class for the backend to nest it.
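To show how the legacy switch and the new per-op API interact, here is a rough sketch (illustrative only; it assumes at::globalContext() and the Context methods added in this diff):

#include <ATen/Context.h>

// Sketch: the legacy cuDNN switch fans out to the per-op cuda settings.
void cudnn_tf32_sketch() {
  auto& ctx = at::globalContext();
  ctx.setAllowTF32CuDNN(true);                     // legacy API: cuda/conv and cuda/rnn become "tf32"
  ctx.allowTF32CuDNN("conv");                      // true, answered via the new per-op query
  ctx.setFloat32Precision("cuda", "rnn", "ieee");  // new API only: conv and rnn now disagree
  // Calling the op-less legacy query ctx.allowTF32CuDNN() at this point would
  // trip the TORCH_CHECK above, because the flags were set with a mix of APIs.
}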
void Context::setSDPPriorityOrder(const std::vector<int64_t>& order) {

@@ -259,7 +336,16 @@ bool Context::allowTF32CuBLAS() const {
    return false;
  }
#endif
  return float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST;
  bool legacy_allow_tf32 = float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST;
  bool allow_tf32_new = float32Precision("cuda", "matmul") == "tf32";
  TORCH_CHECK(
      legacy_allow_tf32 == allow_tf32_new,
      "PyTorch is checking whether allow_tf32 is enabled for cuBLAS matmul. ",
      "The current status indicates that you have used a mix of the legacy and new APIs to set the TF32 status for cuBLAS matmul. ",
      "We suggest only using the new API to set the TF32 flag. See also: ",
      "https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices");
  warn_deprecated_fp32_precision_api();
  return allow_tf32_new;
}

void Context::setAllowTF32CuBLAS(bool b) {

@@ -272,27 +358,54 @@ void Context::setAllowTF32CuBLAS(bool b) {
  }
#endif
  float32_matmul_precision = b ? at::Float32MatmulPrecision::HIGH : at::Float32MatmulPrecision::HIGHEST;
  setFloat32Precision("cuda", "matmul", b ? "tf32" : "ieee");
}
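Similarly, a brief sketch of how the legacy cuBLAS flag and the new cuda/matmul setting stay in sync (illustrative only):

#include <ATen/Context.h>

// Sketch: the legacy cuBLAS switch also updates the new cuda/matmul entry.
void cublas_tf32_sketch() {
  auto& ctx = at::globalContext();
  ctx.setAllowTF32CuBLAS(true);                       // sets the legacy enum to "high" and cuda/matmul to "tf32"
  ctx.allowTF32CuBLAS();                              // true; legacy and new flags agree
  ctx.setFloat32Precision("cuda", "matmul", "ieee");  // new API only
  // The legacy enum still reads "high", so the op-less allowTF32CuBLAS() query
  // would now fail its consistency TORCH_CHECK until the two flags agree again.
}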
Float32MatmulPrecision Context::float32MatmulPrecision() const {
  bool invalid = float32Precision("cuda", "matmul") == "tf32" &&
      float32_matmul_precision == at::Float32MatmulPrecision::HIGHEST;
  invalid = invalid ||
      (float32Precision("mkldnn", "matmul") == "bf16" &&
       float32_matmul_precision != at::Float32MatmulPrecision::MEDIUM);
  TORCH_CHECK(
      !invalid,
      "PyTorch is checking the matmul precision without a specific backend name. ",
      "The current status indicates that you have used a mix of the legacy and new APIs to set the matmul precision. ",
      "We suggest only using the new API for matmul precision. See also: ",
      "https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices");
  warn_deprecated_fp32_precision_api();
  return float32_matmul_precision;
}

Reviewer: Same as above, is there a reason we can't just convert and only maintain one set of flags?

Author: Same as above.

void Context::setFloat32MatmulPrecision(Float32MatmulPrecision p) {
  float32_matmul_precision = p;
std::string Context::float32Precision(const std::string& backend, const std::string& op) const {
  check_fp32_prec_backend_and_op(backend, op);
  auto precision = fp32_precision.find(backend)->second.find(op)->second;
  if (precision == "none")
    precision = fp32_precision.find(backend)->second.find("all")->second;
  if (precision == "none")
    precision = fp32_precision.find("generic")->second.find("all")->second;
  bool valid_prec = validate_fp32_prec(backend, precision);
  return valid_prec ? precision : "none";
}
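The getter resolves the precision in the order op -> backend-wide "all" -> generic "all", and falls back to "none" when the inherited value is not supported by the queried backend. A rough sketch of that fallback (illustrative, using the default table from this diff):

#include <ATen/Context.h>

// Sketch: fallback to the generic setting, subject to per-backend validation.
void fallback_sketch() {
  auto& ctx = at::globalContext();
  ctx.setFloat32Precision("generic", "all", "tf32");
  ctx.float32Precision("cuda", "conv");    // "tf32": the op-level default already says tf32
  ctx.float32Precision("mkldnn", "conv");  // "none": inherits "tf32" from generic, but mkldnn
                                           // does not support tf32, so the getter reports "none"
}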
void Context::setFloat32MatmulPrecision(const std::string &s) {
  auto match = [this](const std::string & s_) {
    warn_deprecated_fp32_precision_api();
    // TODO: consider if CuDNN field needs to also be set for potential future CuDNN ops like multi-headed attention
    if (s_ == "highest") {
      float32_matmul_precision = at::Float32MatmulPrecision::HIGHEST;
      setFloat32Precision("cuda", "matmul", "ieee");
      setFloat32Precision("mkldnn", "matmul", "ieee");
      return true;
    } else if (s_ == "high") {
      float32_matmul_precision = at::Float32MatmulPrecision::HIGH;
      setFloat32Precision("cuda", "matmul", "tf32");
      setFloat32Precision("mkldnn", "matmul", "ieee");
      return true;
    } else if (s_ == "medium") {
      float32_matmul_precision = at::Float32MatmulPrecision::MEDIUM;
      setFloat32Precision("cuda", "matmul", "tf32");
      setFloat32Precision("mkldnn", "matmul", "bf16");
      return true;
    }
    return false;

@@ -306,6 +419,27 @@ void Context::setFloat32MatmulPrecision(const std::string &s) {
      "setFloat32MatmulPrecision call has no effect.");
}
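For reference, a short sketch of how the legacy precision strings map onto the new per-backend matmul settings (illustrative; it simply mirrors the branches above):

#include <ATen/Context.h>

// Sketch: "highest"/"high"/"medium" drive both the legacy enum and the new table.
void matmul_precision_sketch() {
  auto& ctx = at::globalContext();
  ctx.setFloat32MatmulPrecision("medium");
  ctx.float32Precision("cuda", "matmul");    // "tf32"
  ctx.float32Precision("mkldnn", "matmul");  // "bf16"
  ctx.setFloat32MatmulPrecision("highest");
  ctx.float32Precision("cuda", "matmul");    // "ieee"
  ctx.float32Precision("mkldnn", "matmul");  // "ieee"
}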
void Context::setFloat32Precision(const std::string& backend, const std::string& op, const std::string& p) {
  check_fp32_prec_backend_and_op(backend, op);
  if (validate_fp32_prec(backend, p)) {
    fp32_precision[backend][op] = p;
  } else {
    std::string msg;
    auto iterp = _fp32_precisions.find(backend);
    TORCH_CHECK(iterp != _fp32_precisions.end());
    for (const auto& prec : iterp->second) {
      msg += prec;
      msg += " ";
    }
    TORCH_WARN(
        "You have set an invalid precision for backend: ",
        backend,
        ", so this setFloat32Precision call has no effect. ",
        "Please choose a precision from: ",
        msg);
  }
}
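A short usage sketch of the setter, including the invalid-precision path (illustrative only):

#include <ATen/Context.h>

// Sketch: valid settings are stored; invalid ones warn and are ignored.
void set_precision_sketch() {
  auto& ctx = at::globalContext();
  ctx.setFloat32Precision("mkldnn", "conv", "bf16");  // accepted: bf16 is valid for mkldnn
  ctx.float32Precision("mkldnn", "conv");             // "bf16"
  ctx.setFloat32Precision("cuda", "conv", "bf16");    // rejected: cuda only allows ieee/tf32/none,
                                                      // so this warns and has no effect
  ctx.float32Precision("cuda", "conv");               // still "tf32" (the default in this diff)
}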
at::LinalgBackend Context::linalgPreferredBackend() const {
  return linalg_preferred_backend;
}
@@ -28,6 +28,7 @@
#include <c10/util/irange.h>

#include <cstdint>
#include <map>
#include <mutex>

namespace at {
@@ -336,14 +337,20 @@ class TORCH_API Context {
  void alertCuBLASConfigNotDeterministic() const;

  void setFloat32MatmulPrecision(const std::string& s);
  bool allowTF32CuDNN() const;
  void setFloat32Precision(
      const std::string& backend,
      const std::string& op,
      const std::string& s);
  bool allowTF32CuDNN(const std::string& op = std::string()) const;
  void setAllowTF32CuDNN(bool);
  bool allowTF32OneDNN() const;
  void setAllowTF32OneDNN(bool);
  bool allowTF32CuBLAS() const;
  void setAllowTF32CuBLAS(bool);
  Float32MatmulPrecision float32MatmulPrecision() const;
  void setFloat32MatmulPrecision(Float32MatmulPrecision p);
  std::string float32Precision(
      const std::string& backend,
      const std::string& op) const;
  bool allowFP16ReductionCuBLAS() const;
  void setAllowFP16ReductionCuBLAS(bool);
  bool allowBF16ReductionCuBLAS() const;
@@ -469,6 +476,23 @@ class TORCH_API Context {
  bool enable_sparse_tensor_invariant_checks = false;
  bool allow_fp16_reduction_cpu = false;

  std::map<std::string, std::map<std::string, std::string>> fp32_precision = {
      {"generic", {{"all", "none"}}},
      {"mkldnn",
       {{"matmul", "none"},
        {"conv", "none"},
        {"rnn", "none"},
        {"all", "none"}}},
      {"cuda",
       {{"matmul",
         float32_matmul_precision == at::Float32MatmulPrecision::HIGHEST
             ? "none"
             : "tf32"},
        {"conv", "tf32"},
        {"rnn", "tf32"},
        {"all", "none"}}},
  };

  Allocator* prev_allocator_ptr_{nullptr};
};

Reviewer: Are we guaranteed that float32_matmul_precision defined above will be initialized by now?

Reviewer: So inline class attribute initialization is ordered?
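On the initialization-order question: in C++, non-static data members are initialized in declaration order, and a default member initializer may read members declared earlier, so float32_matmul_precision (declared above in the class, as the diff context suggests) is already set when the fp32_precision initializer runs. A minimal standalone sketch of that guarantee (hypothetical names, not from this PR):

#include <cassert>

// Default member initializers run in declaration order, so table_ can safely read mode_.
struct OrderDemo {
  int mode_ = 1;                        // initialized first (declared first)
  int table_ = (mode_ == 1) ? 10 : 20;  // initialized second; observes mode_ == 1
};

int main() {
  OrderDemo d;
  assert(d.table_ == 10);
  return 0;
}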
Reviewer: Is this a long-term constraint, or just to keep this PR smaller so we can add it in a follow-up?

Author: In our earlier design we did not plan to support setting the precision for individual operators in the "generic" backend. Do you think we need to support this?

Reviewer: Sounds OK to start with, but I'm not sure why we would limit ourselves to that.