# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
.. _opt-conv-tensorcore:

How to optimize convolution using TensorCores
==============================================
**Author**: `Siyuan Feng <https://github.com/Hzfengsy>`_

In this tutorial, we will demonstrate how to write a high-performance convolution
schedule using TensorCores in TVM. In this example, we assume the input to the
convolution has a large batch. We strongly recommend covering the :ref:`opt-conv-gpu` tutorial first.

"""

################################################################
# TensorCore Introduction
# -------------------------
# Each Tensor Core provides a 4x4x4 matrix processing array that performs
# :code:`D = A * B + C`, where A, B, C and D are 4x4 matrices. The matrix
# multiplication inputs A and B are FP16 matrices, while the accumulation
# matrices C and D may be FP16 or FP32 matrices.
#
# However, CUDA programmers can only use the warp-level primitive
# :code:`wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag)` to perform
# 16x16x16 half-precision matrix multiplication on Tensor Cores. Before invoking
# the matrix multiplication, programmers must explicitly load data from memory into
# registers with the primitive :code:`wmma::load_matrix_sync`. The NVCC compiler translates
# that primitive into multiple memory load instructions. At run time, every thread loads
# 16 elements from matrix A and 16 elements from B.

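# As a small illustration of the mixed-precision semantics described above, we can
# emulate one 16x16x16 WMMA operation in NumPy: FP16 inputs, FP32 accumulation.
# This is a sketch for intuition only, not the hardware API; the ``*_demo`` names
# are used only in this snippet.
import numpy as np

a_demo = np.random.uniform(size=(16, 16)).astype("float16")
b_demo = np.random.uniform(size=(16, 16)).astype("float16")
c_demo = np.random.uniform(size=(16, 16)).astype("float32")
d_demo = a_demo.astype("float32") @ b_demo.astype("float32") + c_demo  # D = A * B + C
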
################################################################
# Preparation and Algorithm
# --------------------------
# We use fixed-size input tensors with 256 channels and 14 x 14 dimensions.
# The batch size is 256. The convolution filters contain 512 filters of size 3 x 3.
# We use stride size 1 and padding size 1 for the convolution. In this example, we use
# the NHWCnc memory layout. The following code defines the convolution algorithm in TVM.

import tvm
import numpy as np
from tvm.contrib import nvcc

# The sizes of inputs and filters
batch_size = 256
height = 14
width = 14
in_channels = 256
out_channels = 512
kernel_h = 3
kernel_w = 3
pad_h = 1
pad_w = 1
stride_h = 1
stride_w = 1

# TensorCore shape
block_size = 16

assert (batch_size % block_size == 0)
assert (in_channels % block_size == 0)
assert (out_channels % block_size == 0)

# Input feature map: (N, H, W, IC, n, ic)
data_shape = (batch_size // block_size,
              height,
              width,
              in_channels // block_size,
              block_size,
              block_size)
# Kernel: (H, W, IC, OC, ic, oc)
kernel_shape = (kernel_h,
                kernel_w,
                in_channels // block_size,
                out_channels // block_size,
                block_size,
                block_size)
# Output feature map: (N, H, W, OC, n, oc)
output_shape = (batch_size // block_size,
                height,
                width,
                out_channels // block_size,
                block_size,
                block_size)

# Reduction axes
kh = tvm.reduce_axis((0, kernel_h), name='kh')
kw = tvm.reduce_axis((0, kernel_w), name='kw')
ic = tvm.reduce_axis((0, in_channels // block_size), name='ic')
ii = tvm.reduce_axis((0, block_size), name='ii')

# Algorithm
A = tvm.placeholder(data_shape, name='A', dtype="float16")
W = tvm.placeholder(kernel_shape, name='W', dtype="float16")
Apad = tvm.compute(
    (batch_size // block_size, height + 2 * pad_h, width + 2 * pad_w, in_channels // block_size, block_size,
     block_size),
    lambda n, h, w, i, nn, ii: tvm.if_then_else(
        tvm.all(h >= pad_h, h - pad_h < height,
                w >= pad_w, w - pad_w < width),
        A[n, h - pad_h, w - pad_w, i, nn, ii], tvm.const(0., "float16")),
    name='Apad')
Conv = tvm.compute(output_shape,
                   lambda n, h, w, o, nn, oo: tvm.sum(
                       Apad[n, h * stride_h + kh, w * stride_w + kw, ic, nn, ii].astype("float32") *
                       W[kh, kw, ic, o, ii, oo].astype("float32"),
                       axis=[ic, kh, kw, ii]),
                   name="Conv")

s = tvm.create_schedule(Conv.op)
s[Apad].compute_inline()

###############################################################################
# Memory Scope
# ----------------
#
# In a traditional GPU schedule, we have global, shared and local memory scopes.
# To support TensorCores, we add three more special memory scopes: :code:`wmma.matrix_a`,
# :code:`wmma.matrix_b` and :code:`wmma.accumulator`. On hardware, all fragment scopes
# are stored at the on-chip register level, the same place as local memory.

# Designate the memory hierarchy
AS = s.cache_read(Apad, 'shared', [Conv])
WS = s.cache_read(W, 'shared', [Conv])
AF = s.cache_read(AS, 'wmma.matrix_a', [Conv])
WF = s.cache_read(WS, 'wmma.matrix_b', [Conv])
ConvF = s.cache_write(Conv, 'wmma.accumulator')

###############################################################################
# Define Tensor Intrinsic
# -----------------------
# In fact, TensorCore is a special hardware operation. So, we can just use tensorize
# to replace a unit of computation with the TensorCore instruction. The first step is
# to define the tensor intrinsics.
#
# There are four basic operations in TensorCore: :code:`fill_fragment`, :code:`load_matrix`,
# :code:`mma_sync` and :code:`store_matrix`. Since :code:`fill_fragment` and :code:`mma_sync`
# are both used in matrix multiplication, we can just write the following three intrinsics.

def intrin_wmma_load_matrix(scope):
    n = 16
    A = tvm.placeholder((n, n), name='A', dtype='float16')
    BA = tvm.decl_buffer(A.shape, A.dtype, scope='shared', data_alignment=32, offset_factor=256)
    C = tvm.compute((n, n), lambda i, j: A[i, j], name='C')
    BC = tvm.decl_buffer(C.shape, C.dtype, scope=scope, data_alignment=32, offset_factor=256)
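    # Note: a WMMA fragment holds 16x16 = 256 elements, so offset_factor=256 above
    # keeps elem_offset a multiple of 256, and elem_offset // 256 below is the
    # fragment index passed to the intrinsic.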

    def intrin_func(ins, outs):
        ib = tvm.ir_builder.create()

        BA = ins[0]
        BC = outs[0]
        ib.emit(tvm.call_intrin('handle', 'tvm_load_matrix_sync',
                                BC.data, n, n, n, BC.elem_offset // 256,
                                BA.access_ptr('r'), n, 'row_major'))
        return ib.get()

    return tvm.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC})


def intrin_wmma_gemm():
    n = 16
    A = tvm.placeholder((n, n), name='A', dtype='float16')
    B = tvm.placeholder((n, n), name='B', dtype='float16')
    k = tvm.reduce_axis((0, n), name="k")
    C = tvm.compute((n, n),
                    lambda ii, jj:
                    tvm.sum(A[ii, k].astype('float') * B[k, jj].astype('float'), axis=k),
                    name='C')
    BA = tvm.decl_buffer(A.shape, A.dtype, name='BA', scope='wmma.matrix_a', data_alignment=32, offset_factor=256)
    BB = tvm.decl_buffer(B.shape, B.dtype, name='BB', scope='wmma.matrix_b', data_alignment=32, offset_factor=256)
    BC = tvm.decl_buffer(C.shape, C.dtype, name='BC', scope='wmma.accumulator', data_alignment=32, offset_factor=256)

    def intrin_func(ins, outs):
        BA, BB = ins
        BC, = outs

        def init():
            ib = tvm.ir_builder.create()
            ib.emit(tvm.call_intrin('handle', 'tvm_fill_fragment', BC.data, n, n, n, BC.elem_offset // 256, 0.0))
            return ib.get()

        def update():
            ib = tvm.ir_builder.create()
            ib.emit(tvm.call_intrin('handle', 'tvm_mma_sync',
                                    BC.data, BC.elem_offset // 256,
                                    BA.data, BA.elem_offset // 256,
                                    BB.data, BB.elem_offset // 256,
                                    BC.data, BC.elem_offset // 256))
            return ib.get()

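        # tensorize expects (body, reduce_init, reduce_update): fill_fragment
        # initializes the accumulator, while mma_sync performs the update.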
        return update(), init(), update()

    return tvm.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, B: BB, C: BC})


def intrin_wmma_store_matrix():
    n = 16
    A = tvm.placeholder((n, n), name='A', dtype='float32')
    BA = tvm.decl_buffer(A.shape, A.dtype, scope='wmma.accumulator', data_alignment=32, offset_factor=256)
    C = tvm.compute((n, n), lambda i, j: A[i, j], name='C')
    BC = tvm.decl_buffer(C.shape, C.dtype, scope='global', data_alignment=32, offset_factor=256)

    def intrin_func(ins, outs):
        ib = tvm.ir_builder.create()
        BA = ins[0]
        BC = outs[0]
        ib.emit(tvm.call_intrin('handle', 'tvm_store_matrix_sync',
                                BA.data, n, n, n, BA.elem_offset // 256,
                                BC.access_ptr('w'), n, 'row_major'))
        return ib.get()

    return tvm.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC})

###############################################################################
# Scheduling the Computation
# --------------------------
# To use TensorCores in TVM, we must schedule the computation into a specific structure
# to match the tensor intrinsic. As in traditional GPU programs, we can also use shared
# memory to boost the speed. If you have any questions about blocking and shared
# memory, please refer to :ref:`opt-conv-gpu`.
#
# In this example, each block contains 2x4 warps, and each warp calls 4x2 TensorCore
# instructions. Thus, the output shape of each warp is 64x32 and each block outputs
# 128x128 tiles. Due to the limited shared memory space, we only load 2 blocks (2x128x128 tiles)
# at a time.
#
# .. note::
#
#   *Warp-level Operation*
#
#   Note that all TensorCore instructions are warp-level instructions, which means all 32 threads
#   in a warp should execute the instruction simultaneously. Making threadIdx.x extent=32 is one of the
#   easiest ways to achieve this. Then we can bind threadIdx.x to any loops except those that contain
#   TensorCore intrinsics directly or indirectly. Also note that this is not the only solution.
#   The only thing we must do is to make sure all threads in a warp can call TensorCore at the same time.
#


# Define tiling sizes
block_row_warps = 2
block_col_warps = 4
warp_row_tiles = 4
warp_col_tiles = 2
warp_size = 32
chunk = 2
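
# Sanity-check the tiling arithmetic described above: each warp computes a
# 64x32 output tile and each block computes a 128x128 output tile.
assert warp_row_tiles * block_size == 64
assert warp_col_tiles * block_size == 32
assert block_row_warps * warp_row_tiles * block_size == 128
assert block_col_warps * warp_col_tiles * block_size == 128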

block_x = tvm.thread_axis('blockIdx.x')
block_y = tvm.thread_axis('blockIdx.y')
block_z = tvm.thread_axis('blockIdx.z')
thread_x = tvm.thread_axis('threadIdx.x')
thread_y = tvm.thread_axis('threadIdx.y')
thread_z = tvm.thread_axis('threadIdx.z')

nc, hc, wc, oc, nnc, ooc = Conv.op.axis
block_k = s[Conv].fuse(hc, wc)
s[Conv].bind(block_k, block_z)
nc, nci = s[Conv].split(nc, factor=warp_row_tiles)
block_i, nc = s[Conv].split(nc, factor=block_row_warps)
oc, oci = s[Conv].split(oc, factor=warp_col_tiles)
block_j, oc = s[Conv].split(oc, factor=block_col_warps)
s[Conv].reorder(block_k, block_i, block_j, nc, oc, nci, oci, nnc, ooc)
s[Conv].bind(block_i, block_x)
s[Conv].bind(block_j, block_y)
s[Conv].bind(nc, thread_y)
s[Conv].bind(oc, thread_z)

# Schedule local computation
s[ConvF].compute_at(s[Conv], oc)
n, h, w, o, nnf, oof = ConvF.op.axis
ko, ki = s[ConvF].split(ic, factor=chunk)
s[ConvF].reorder(ko, kh, ki, kw, n, o, nnf, oof, ii)

# Move intermediate computation into each output compute tile
s[AF].compute_at(s[ConvF], kw)
s[WF].compute_at(s[ConvF], kw)

# Schedule for A's shared memory
s[AS].compute_at(s[ConvF], kh)
n, h, w, i, nn, ii = AS.op.axis
tx, xo = s[AS].split(n, nparts=block_row_warps)
ty, yo = s[AS].split(xo, nparts=block_col_warps)
t = s[AS].fuse(nn, ii)
to, ti = s[AS].split(t, factor=warp_size)
s[AS].bind(tx, thread_y)
s[AS].bind(ty, thread_z)
s[AS].bind(ti, thread_x)

# Schedule for W's shared memory
s[WS].compute_at(s[ConvF], kh)
kh, kw, ic, o, ii, oo = WS.op.axis
tx, xo = s[WS].split(o, nparts=block_row_warps)
ty, yo = s[WS].split(xo, nparts=block_col_warps)
t = s[WS].fuse(ii, oo)
to, ti = s[WS].split(t, nparts=warp_size)
s[WS].bind(tx, thread_y)
s[WS].bind(ty, thread_z)
s[WS].bind(to, thread_x)
s[WS].vectorize(ti)
print(tvm.lower(s, [A, W, Conv], simple_mode=True))

###############################################################################
# Lowering Computation to Intrinsics
# ----------------------------------
# The last phase is to lower the computation loops down to TensorCore hardware intrinsics
# by mapping the 2D convolution to the tensor intrinsics.
#

s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix('wmma.matrix_a'))
s[WF].tensorize(WF.op.axis[-2], intrin_wmma_load_matrix('wmma.matrix_b'))
s[Conv].tensorize(nnc, intrin_wmma_store_matrix())
s[ConvF].tensorize(nnf, intrin_wmma_gemm())
print(tvm.lower(s, [A, W, Conv], simple_mode=True))

###############################################################################
# Generate CUDA Kernel
# --------------------
# Finally we use TVM to generate and compile the CUDA kernel, and evaluate the latency of convolution.
# Since TensorCores are only supported by NVIDIA GPUs with Compute Capability 7.0 or higher,
# the kernel may not run on our build server.

ctx = tvm.gpu(0)
if nvcc.have_tensorcore(ctx.compute_version):
    func = tvm.build(s, [A, W, Conv], 'cuda')
    a_np = np.random.uniform(size=data_shape).astype(A.dtype)
    w_np = np.random.uniform(size=kernel_shape).astype(W.dtype)
    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), ctx)
    evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
    print('conv2d with tensor core: %f ms' % (evaluator(a, w, c).mean * 1e3))
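
    # A sketch of an optional correctness check (not part of the original tutorial):
    # compare the GPU result against a naive NumPy reference, assuming stride 1 as in
    # this example. All names below are local to this check.
    a_flat = a_np.transpose(0, 4, 1, 2, 3, 5).reshape(batch_size, height, width, in_channels)
    w_flat = w_np.transpose(0, 1, 2, 4, 3, 5).reshape(kernel_h, kernel_w, in_channels, out_channels)
    a_pad_np = np.pad(a_flat, ((0, 0), (pad_h, pad_h), (pad_w, pad_w), (0, 0)),
                      mode='constant').astype("float32")
    ref = np.zeros((batch_size, height, width, out_channels), dtype="float32")
    for r in range(kernel_h):
        for q in range(kernel_w):
            # Accumulate the contribution of kernel position (r, q) for all pixels at once
            ref += np.tensordot(a_pad_np[:, r:r + height, q:q + width, :],
                                w_flat[r, q].astype("float32"), axes=([3], [0]))
    # Repack the NHWC reference into the NHWCnc output layout used by Conv
    ref = ref.reshape(batch_size // block_size, block_size, height, width,
                      out_channels // block_size, block_size).transpose(0, 2, 3, 4, 1, 5)
    np.testing.assert_allclose(c.asnumpy(), ref, rtol=1e-2, atol=1e-2)
    print('verification against NumPy reference passed')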

###############################################################################
# Summary
# -------
# This tutorial demonstrates how TVM scheduling primitives can be used to
# call TensorCores on specific GPUs.