Skip to content

Commit 0c6b378

Browse files
author
Siyuan Feng
committed
support config fragment shape and layout using intrinsic
1 parent 4bb9b54 commit 0c6b378

File tree

8 files changed

+264
-25
lines changed

8 files changed

+264
-25
lines changed

include/tvm/ir.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1310,6 +1310,16 @@ constexpr const char* opengl_stage_scope = "opengl_stage_scope";
13101310
*/
13111311
constexpr const char* device_scope = "device_scope";
13121312

1313+
/*!
1314+
 * \brief Mark the shape of the TensorCore fragment
1315+
*/
1316+
constexpr const char* fragment_shape = "fragment_shape";
1317+
1318+
/*!
1319+
 * \brief Mark the layout of the TensorCore fragment
1320+
*/
1321+
constexpr const char* fragment_layout = "fragment_layout";
1322+
13131323
/*!
13141324
* \brief Check if attr_key is a pragma key extension
13151325
* \param attr_key The attr key to be compared
@@ -1319,6 +1329,7 @@ inline bool IsPragmaKey(const std::string& attr_key) {
13191329
return attr_key.compare(0, 7, "pragma_") == 0;
13201330
}
13211331

1332+
13221333
} // namespace attr
13231334

13241335
/*! \brief namespace of TVM Intrinsic functions */
@@ -1559,7 +1570,6 @@ constexpr const char* tvm_load_matrix_sync = "tvm_load_matrix_sync";
15591570
constexpr const char* tvm_mma_sync = "tvm_mma_sync";
15601571
constexpr const char* tvm_fill_fragment = "tvm_fill_fragment";
15611572
constexpr const char* tvm_store_matrix_sync = "tvm_store_matrix_sync";
1562-
constexpr const char* tvm_access_fragement = "tvm_access_fragement";
15631573

15641574
} // namespace intrinsic
15651575

include/tvm/ir_pass.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,14 @@ LoweredFunc LowerIntrin(LoweredFunc f, const std::string& target);
525525
*/
526526
LoweredFunc LowerCustomDatatypes(LoweredFunc f, const std::string& target);
527527

528+
/*!
529+
 * \brief Infer the TensorCore fragment information using tensor intrinsics
530+
*
531+
 * \param f The device function to be transformed
532+
 * \return Transformed function.
533+
*/
534+
LoweredFunc InferFragment(LoweredFunc f);
535+
528536
/*!
529537
* \brief Verify if memory accesses are legal for a specific target device type.
530538
*

python/tvm/build_module.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -464,6 +464,7 @@ def _build_for_device(flist, target, target_host):
464464
func = ir_pass.ThreadSync(func, "global")
465465
func = ir_pass.ThreadSync(func, "shared")
466466
func = ir_pass.ThreadSync(func, "warp")
467+
func = ir_pass.InferFragment(func)
467468
warp_size = target.thread_warp_size
468469
func = ir_pass.LowerThreadAllreduce(func, warp_size)
469470
fsplits = [s for s in ir_pass.SplitHostDevice(func)]

src/api/api_pass.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,5 +160,6 @@ REGISTER_PASS(VerifyGPUCode);
160160
REGISTER_PASS(DecorateDeviceScope);
161161
REGISTER_PASS(InstrumentBoundCheckers);
162162
REGISTER_PASS(VerifyCompactBuffer);
163+
REGISTER_PASS(InferFragment)
163164
} // namespace ir
164165
} // namespace tvm

src/codegen/codegen_cuda.cc

Lines changed: 36 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -305,39 +305,39 @@ void CodeGenCUDA::PrintStorageScope(
305305
void CodeGenCUDA::VisitExpr_(const Call *op, std::ostream& os) {
306306
if (op->is_intrinsic(intrinsic::tvm_fill_fragment)) {
307307
need_mma_h_ = true;
308-
CHECK_EQ(op->args.size(), 3U);
308+
CHECK_EQ(op->args.size(), 6U);
309309
os << "nvcuda::wmma::fill_fragment(";
310310
this->PrintExpr(op->args[0], os);
311311
os << "[";
312-
this->PrintExpr(op->args[1], os);
312+
this->PrintExpr(op->args[4], os);
313313
os << "], ";
314-
this->PrintExpr(op->args[2], os);
314+
this->PrintExpr(op->args[5], os);
315315
os << ")";
316316
} else if (op->is_intrinsic(intrinsic::tvm_load_matrix_sync)) {
317317
need_mma_h_ = true;
318-
CHECK_EQ(op->args.size(), 4U);
318+
CHECK_EQ(op->args.size(), 8U);
319319
os << "nvcuda::wmma::load_matrix_sync(";
320320
this->PrintExpr(op->args[0], os);
321321
os << "[";
322-
this->PrintExpr(op->args[1], os);
322+
this->PrintExpr(op->args[4], os);
323323
os << "], ";
324-
this->PrintExpr(op->args[2], os);
324+
this->PrintExpr(op->args[5], os);
325325
os << ", ";
326-
this->PrintExpr(op->args[3], os);
326+
this->PrintExpr(op->args[6], os);
327327
os << ")";
328328
} else if (op->is_intrinsic(intrinsic::tvm_store_matrix_sync)) {
329329
need_mma_h_ = true;
330-
CHECK_EQ(op->args.size(), 5U);
330+
CHECK_EQ(op->args.size(), 8U);
331331
os << "nvcuda::wmma::store_matrix_sync(";
332-
this->PrintExpr(op->args[2], os);
332+
this->PrintExpr(op->args[5], os);
333333
os << ", ";
334334
this->PrintExpr(op->args[0], os);
335335
os << "[";
336-
this->PrintExpr(op->args[1], os);
336+
this->PrintExpr(op->args[4], os);
337337
os << "], ";
338-
this->PrintExpr(op->args[3], os);
339-
if (const StringImm *str = op->args[4].as<StringImm>()) {
340-
os << ", nvcuda::wmma::" << str->value;
338+
this->PrintExpr(op->args[6], os);
339+
if (const StringImm *str = op->args[7].as<StringImm>()) {
340+
os << ", nvcuda::wmma::mem_" << str->value;
341341
} else {
342342
LOG(FATAL) << "Invalid parameters";
343343
}
@@ -357,6 +357,19 @@ void CodeGenCUDA::VisitExpr_(const Call *op, std::ostream& os) {
357357
}
358358
}
359359

360+
void CodeGenCUDA::VisitStmt_(const AttrStmt* op) {
361+
if (op->attr_key == attr::fragment_shape) {
362+
const Variable* buffer = op->node.as<Variable>();
363+
const StringImm* shape_str = op->value.as<StringImm>();
364+
fragment_shapes[buffer] = shape_str->value;
365+
} else if (op->attr_key == attr::fragment_layout) {
366+
const Variable* buffer = op->node.as<Variable>();
367+
const StringImm* layout_str = op->value.as<StringImm>();
368+
fragment_layouts[buffer] = layout_str->value;
369+
}
370+
CodeGenC::VisitStmt_(op);
371+
}
372+
360373
void CodeGenCUDA::VisitStmt_(const Allocate* op) {
361374
CHECK(!is_zero(op->condition));
362375
std::string vid = AllocVarID(op->buffer_var.get());
@@ -383,7 +396,7 @@ void CodeGenCUDA::VisitStmt_(const Allocate* op) {
383396
<< "Accumulator only support half and float type for now";
384397
}
385398
constant_size /= 256;
386-
PrintWmmaScope(scope, op->type, stream);
399+
PrintWmmaScope(scope, op->type, buffer, stream);
387400
} else {
388401
PrintStorageScope(scope, stream);
389402
stream << ' ';
@@ -498,18 +511,23 @@ void CodeGenCUDA::VisitExpr_(const FloatImm *op, std::ostream& os) { // NOLINT(*
498511
PrintConst(op, os, this);
499512
}
500513

501-
void CodeGenCUDA::PrintWmmaScope(const std::string &scope, Type t, std::ostream &os) {
514+
void CodeGenCUDA::PrintWmmaScope(const std::string &scope, Type t, const Variable* variable, std::ostream &os) {
502515
std::stringstream type;
503516
PrintType(t, type);
517+
std::string shape_str = fragment_shapes[variable];
504518
if (scope == "wmma.matrix_a") {
505519
need_mma_h_ = true;
506-
os << "nvcuda::wmma::fragment<nvcuda::wmma::matrix_a, 16, 16, 16, " << type.str() << ", nvcuda::wmma::row_major>";
520+
std::string layout_str = fragment_layouts[variable];
521+
os << "nvcuda::wmma::fragment<nvcuda::wmma::matrix_a, "
522+
<< shape_str << ", " << type.str() << ", nvcuda::wmma::" << layout_str <<">";
507523
} else if (scope == "wmma.matrix_b") {
508524
need_mma_h_ = true;
509-
os << "nvcuda::wmma::fragment<nvcuda::wmma::matrix_b, 16, 16, 16, "<< type.str() << ", nvcuda::wmma::row_major>";
525+
std::string layout_str = fragment_layouts[variable];
526+
os << "nvcuda::wmma::fragment<nvcuda::wmma::matrix_b, "
527+
<< shape_str << ", " << type.str() << ", nvcuda::wmma::" << layout_str <<">";
510528
} else if (scope == "wmma.accumulator") {
511529
need_mma_h_ = true;
512-
os << "nvcuda::wmma::fragment<nvcuda::wmma::accumulator, 16, 16, 16, "<< type.str() << ">";
530+
os << "nvcuda::wmma::fragment<nvcuda::wmma::accumulator, " << shape_str << ", "<< type.str() << ">";
513531
}
514532
}
515533

src/codegen/codegen_cuda.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ class CodeGenCUDA final : public CodeGenC {
6363
void VisitExpr_(const Call *op, std::ostream& os) final;
6464
void VisitStmt_(const Evaluate *op) final;
6565
void VisitStmt_(const Allocate *op) final;
66+
void VisitStmt_(const AttrStmt *op) final;
6667

6768
private:
6869
// Whether global barrier is needed.
@@ -79,8 +80,11 @@ class CodeGenCUDA final : public CodeGenC {
7980
bool need_math_constants_h_{false};
8081
// whether need mma.h
8182
bool need_mma_h_{false};
83+
84+
std::unordered_map<const Variable*, std::string> fragment_shapes;
85+
std::unordered_map<const Variable*, std::string> fragment_layouts;
8286
friend void PrintConst(const FloatImm* op, std::ostream& os, CodeGenCUDA* p);
83-
void PrintWmmaScope(const std::string& scope, Type t, std::ostream& os);
87+
void PrintWmmaScope(const std::string& scope, Type t, const Variable* variable, std::ostream& os);
8488
};
8589

8690
} // namespace codegen

src/pass/infer_fragment.cc

Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
/*!
21+
* Copyright (c) 2019 by Contributors
22+
 * \file infer_fragment.cc
23+
*/
24+
#include <tvm/ir.h>
25+
#include <tvm/ir_pass.h>
26+
#include <tvm/ir_mutator.h>
27+
#include <tvm/ir_visitor.h>
28+
#include <unordered_map>
29+
#include <unordered_set>
30+
#include "ir_util.h"
31+
#include "storage_access.h"
32+
#include "../runtime/thread_storage_scope.h"
33+
34+
namespace tvm {
35+
namespace ir {
36+
37+
class FragmentGetter : public IRVisitor {
38+
public:
39+
struct FragmentInfo {
40+
int m, n, k;
41+
std::string layout;
42+
FragmentInfo() = default;
43+
FragmentInfo(int _m, int _n, int _k, const std::string& _layout)
44+
: m(_m), n(_n), k(_k), layout(_layout) {}
45+
};
46+
47+
void Visit_(const Call* op) final {
48+
IRVisitor::Visit_(op);
49+
50+
if (op->is_intrinsic(intrinsic::tvm_load_matrix_sync) ||
51+
op->is_intrinsic(intrinsic::tvm_store_matrix_sync)) {
52+
CHECK_EQ(op->args.size(), 8U);
53+
const Variable* buffer_var = op->args[0].as<Variable>();
54+
CHECK(buffer_var);
55+
const IntImm* m = op->args[1].as<IntImm>();
56+
const IntImm* n = op->args[2].as<IntImm>();
57+
const IntImm* k = op->args[3].as<IntImm>();
58+
const StringImm* layout = op->args[7].as<StringImm>();
59+
CHECK(m);
60+
CHECK(n);
61+
CHECK(k);
62+
CHECK(layout);
63+
64+
std::string scope = scopes[buffer_var];
65+
if (fragments.count(buffer_var)) {
66+
FragmentInfo info = fragments[buffer_var];
67+
CHECK_EQ(m->value, info.m);
68+
CHECK_EQ(n->value, info.n);
69+
CHECK_EQ(k->value, info.k);
70+
if (scope == "wmma.matrix_a" || scope == "wmma.matrix_b") {
71+
CHECK_EQ(layout->value, info.layout);
72+
}
73+
} else {
74+
FragmentInfo info;
75+
if (scope == "wmma.matrix_a" || scope == "wmma.matrix_b") {
76+
info = FragmentInfo(m->value, n->value, k->value, layout->value);
77+
} else if (scope == "wmma.accumulator") {
78+
info = FragmentInfo(m->value, n->value, k->value, "");
79+
}
80+
fragments[buffer_var] = info;
81+
}
82+
} else if (op->is_intrinsic(intrinsic::tvm_fill_fragment)) {
83+
CHECK_EQ(op->args.size(), 6U);
84+
const Variable* buffer_var = op->args[0].as<Variable>();
85+
CHECK(buffer_var);
86+
const IntImm* m = op->args[1].as<IntImm>();
87+
const IntImm* n = op->args[2].as<IntImm>();
88+
const IntImm* k = op->args[3].as<IntImm>();
89+
CHECK(m);
90+
CHECK(n);
91+
CHECK(k);
92+
93+
std::string scope = scopes[buffer_var];
94+
CHECK_EQ(scope, "wmma.accumulator");
95+
if (fragments.count(buffer_var)) {
96+
FragmentInfo info = fragments[buffer_var];
97+
CHECK_EQ(m->value, info.m);
98+
CHECK_EQ(n->value, info.n);
99+
CHECK_EQ(k->value, info.k);
100+
} else {
101+
FragmentInfo info(m->value, n->value, k->value, "");
102+
fragments[buffer_var] = info;
103+
}
104+
}
105+
}
106+
107+
void Visit_(const AttrStmt* op) final {
108+
if (op->attr_key == attr::storage_scope) {
109+
const Variable* buffer = op->node.as<Variable>();
110+
CHECK(buffer);
111+
scopes[buffer] = op->value.as<StringImm>()->value;
112+
}
113+
IRVisitor::Visit_(op);
114+
}
115+
116+
std::unordered_map<const Variable*, std::string> scopes;
117+
std::unordered_map<const Variable*, FragmentInfo> fragments;
118+
};
119+
120+
class FragmentChecker : public IRVisitor {
121+
public:
122+
FragmentChecker(const FragmentGetter &getter) : fragment_getter(getter) {}
123+
124+
void Visit_(const Call* op) final {
125+
if (op->is_intrinsic(intrinsic::tvm_mma_sync)) {
126+
CHECK_EQ(op->args.size(), 8U);
127+
const Variable* buffer_var_d = op->args[0].as<Variable>();
128+
const Variable* buffer_var_a = op->args[2].as<Variable>();
129+
const Variable* buffer_var_b = op->args[4].as<Variable>();
130+
const Variable* buffer_var_c = op->args[6].as<Variable>();
131+
CHECK(buffer_var_d);
132+
CHECK(buffer_var_a);
133+
CHECK(buffer_var_b);
134+
CHECK(buffer_var_c);
135+
CHECK(CheckShape(buffer_var_d, buffer_var_a));
136+
CHECK(CheckShape(buffer_var_d, buffer_var_b));
137+
CHECK(CheckShape(buffer_var_d, buffer_var_c));
138+
}
139+
}
140+
private:
141+
bool CheckShape(const Variable* buffer1, const Variable* buffer2) {
142+
CHECK(fragment_getter.fragments.count(buffer1));
143+
CHECK(fragment_getter.fragments.count(buffer2));
144+
FragmentGetter::FragmentInfo info1 = fragment_getter.fragments.at(buffer1);
145+
FragmentGetter::FragmentInfo info2 = fragment_getter.fragments.at(buffer2);
146+
return info1.m == info2.m && info1.n == info2.n && info1.k == info2.k;
147+
148+
}
149+
const FragmentGetter &fragment_getter;
150+
151+
};
152+
153+
class InferFragmenter : public IRMutator {
154+
public:
155+
InferFragmenter(const FragmentGetter &getter) : fragment_getter(getter) {}
156+
157+
Stmt Mutate_(const Allocate* op, const Stmt& s) final {
158+
Stmt stmt = IRMutator::Mutate_(op, s);
159+
const Variable* buffer = op->buffer_var.get();
160+
if (fragment_getter.fragments.count(buffer)) {
161+
FragmentGetter::FragmentInfo info = fragment_getter.fragments.at(buffer);
162+
std::string shape = std::to_string(info.n) + ", " +
163+
std::to_string(info.m) + ", " +
164+
std::to_string(info.k);
165+
Expr shape_expr = StringImm::make(shape);
166+
Stmt shape_attr = AttrStmt::make(op->buffer_var, attr::fragment_shape, shape_expr, stmt);
167+
if (info.layout != "") {
168+
Stmt layout_attr = AttrStmt::make(op->buffer_var, attr::fragment_layout,
169+
StringImm::make(info.layout), shape_attr);
170+
return layout_attr;
171+
} else {
172+
return shape_attr;
173+
}
174+
}
175+
return stmt;
176+
}
177+
private:
178+
const FragmentGetter &fragment_getter;
179+
};
180+
181+
Stmt InferFragment(Stmt stmt) {
182+
FragmentGetter getter;
183+
getter.Visit(stmt);
184+
FragmentChecker(getter).Visit(stmt);
185+
stmt = InferFragmenter(getter).Mutate(stmt);
186+
return stmt;
187+
}
188+
189+
LoweredFunc InferFragment(LoweredFunc f) {
190+
CHECK_NE(f->func_type, kHostFunc);
191+
auto n = make_node<LoweredFuncNode>(*f.operator->());
192+
n->body = InferFragment(f->body);
193+
return LoweredFunc(n);
194+
}
195+
196+
} // namespace ir
197+
} // namespace tvm

0 commit comments

Comments
 (0)