[VTA][OpenCL] intelfocl (#6126)

* intelfocl support
* disable tsim test
* bugfix to vta autotvm
* disable tsim test in task_python_vta_tsim.sh
* fix integration test
* update vta submodule and re-enable tsim tests
* remove unnecessary comments
apache · Apr 20, 2021 · d0a0194 · d0a0194
1 parent fbdffeb
commit d0a0194
Show file tree

Hide file tree

Showing 23 changed files with 676 additions and 52 deletions.
diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake
@@ -104,6 +104,10 @@ elseif(PYTHON)
       find_library(__cma_lib NAMES cma PATH /usr/lib)
     elseif(${VTA_TARGET} STREQUAL "de10nano")  # DE10-Nano rules
       file(GLOB FPGA_RUNTIME_SRCS ${VTA_HW_PATH}/src/de10nano/*.cc ${VTA_HW_PATH}/src/*.cc)
+    elseif(${VTA_TARGET} STREQUAL "intelfocl")  # Intel OpenCL for FPGA rules
+      file(GLOB FOCL_SRC ${VTA_HW_PATH}/src/oclfpga/*.cc)
+      list(APPEND FPGA_RUNTIME_SRCS ${FOCL_SRC})
+      list(APPEND FPGA_RUNTIME_SRCS ${VTA_HW_PATH}/src/vmem/virtual_memory.cc ${VTA_HW_PATH}/src/vmem/virtual_memory.h)
     endif()
     # Target lib: vta
     add_library(vta SHARED ${FPGA_RUNTIME_SRCS})
@@ -123,6 +127,10 @@ elseif(PYTHON)
       target_include_directories(vta SYSTEM PUBLIC 3rdparty)
       target_include_directories(vta SYSTEM PUBLIC
         "/usr/local/intelFPGA_lite/18.1/embedded/ds-5/sw/gcc/arm-linux-gnueabihf/include")
+    elseif(${VTA_TARGET} STREQUAL "intelfocl")  # Intel OpenCL for FPGA rules
+      target_include_directories(vta PUBLIC 3rdparty)
+      set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")
+      target_link_libraries(vta -lOpenCL)
     endif()
   endif()
 

diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py
@@ -227,7 +227,7 @@ def _decorate(topi_schedule):
         @_register_task_schedule(task_name)
         def wrapper(outs, *args, **kwargs):
             """wrapper function for topi schedule"""
-            workload = get_workload(outs)
+            workload = get_workload(outs, task_name)
             if workload is None:
                 raise RuntimeError("Cannot find workload in attribute of this schedule")
             tgt = Target.current()
@@ -241,18 +241,21 @@ def wrapper(outs, *args, **kwargs):
     return _decorate
 
 
-def get_workload(outs):
+def get_workload(outs, task_name=None):
     """Retrieve the workload from outputs"""
 
     def traverse(tensors):
         """traverse all ops to find attached workload"""
         for t in tensors:
             op = t.op
-            if "workload" in op.attrs:
-                return args_to_workload(op.attrs["workload"])
             wkl = traverse(op.input_tensors)
             if wkl:
                 return wkl
+
+            if "workload" in op.attrs:
+                ret = args_to_workload(op.attrs["workload"])
+                if task_name is None or ret[0] == task_name:
+                    return ret
         return None
 
     outs = [outs] if isinstance(outs, tensor.Tensor) else outs

diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py
@@ -53,6 +53,15 @@ def wrapper(attrs, outs, target):
     return wrapper
 
 
+def wrap_topi_compute(topi_compute):
+    """Wrap TOPI compute that ignores attrs and out_type; returns [topi_compute(*inputs)]"""
+
+    def wrapper(attrs, inputs, out_type):
+        return [topi_compute(*inputs)]  # attrs and out_type are accepted but unused
+
+    return wrapper
+
+
 def get_conv2d_in_channels(data_shape, data_layout):
     """Get conv2d input channels"""
     data_shape = get_const_tuple(data_shape)

diff --git a/python/tvm/relay/testing/tf.py b/python/tvm/relay/testing/tf.py
@@ -32,7 +32,7 @@
 
 try:
     tf_compat_v1 = tf.compat.v1
-except ImportError:
+except (ImportError, AttributeError):
     tf_compat_v1 = tf
 
 ######################################################################

diff --git a/python/tvm/topi/x86/bitserial_dense.py b/python/tvm/topi/x86/bitserial_dense.py
@@ -122,7 +122,7 @@ def bitserial_dense(
     return matmul
 
 
-@autotvm.register_topi_schedule("biserial_dense.x86")
+@autotvm.register_topi_schedule("bitserial_dense.x86")
 def schedule_bitserial_dense(cfg, outs):
     """Schedule for bitserial_dense.
 

diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc
@@ -251,7 +251,7 @@ class ScheduleGetter : public backend::MemoizedExprTranslator<Array<te::Tensor>>
           << "Cannot apply TOPI schedule to a primitive function with two complicated ops"
           << " anchor=" << anchor_op_ << " current=" << op;
     }
-    if (op_pattern >= anchor_op_pattern_) {
+    if (op_pattern > anchor_op_pattern_) {
       anchor_op_ = op;
       anchor_attrs_ = call_node->attrs;
       anchor_op_pattern_ = op_pattern;
@@ -309,7 +309,7 @@ class ScheduleGetter : public backend::MemoizedExprTranslator<Array<te::Tensor>>
   tvm::Target target_;
   Op anchor_op_;
   Attrs anchor_attrs_;
-  int anchor_op_pattern_{0};
+  int anchor_op_pattern_{-1};
   OpImplementation anchor_implementation_;
   std::ostringstream readable_name_stream_;
   Array<te::Operation> scalars_;

diff --git a/src/runtime/workspace_pool.cc b/src/runtime/workspace_pool.cc
@@ -115,7 +115,6 @@ class WorkspacePool::Pool {
   }
   // Release all resources
   void Release(Device dev, DeviceAPI* device) {
-    ICHECK_EQ(allocated_.size(), 1);
     for (size_t i = 1; i < free_list_.size(); ++i) {
       device->FreeDataSpace(dev, free_list_[i].data);
     }

diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc
@@ -109,16 +109,6 @@ class BuiltinLower : public StmtExprMutator {
     op = stmt.as<AllocateNode>();
     // Get constant allocation bound.
     int64_t nbytes = GetVectorBytes(op->dtype);
-    if (device_type_.defined()) {
-      if (const auto* dev_type = device_type_.as<IntImmNode>()) {
-        if (dev_type->value == kDLCPU) {
-          int32_t constant_size = op->constant_allocation_size();
-          if (constant_size > 0 && constant_size * nbytes < runtime::kMaxStackAlloca) {
-            return stmt;
-          }
-        }
-      }
-    }
     PrimExpr total_bytes = make_const(op->extents[0].dtype(), nbytes);
     for (size_t i = 0; i < op->extents.size(); ++i) {
       total_bytes = total_bytes * op->extents[i];

diff --git a/vta/python/vta/autotvm.py b/vta/python/vta/autotvm.py
@@ -46,7 +46,7 @@ def reprogram_fpga(remote, _build_result):
         _build_result : tvm.autotvm.measure.measure_methods.BuildResult
             Artifact from the build phase, unused here.
         """
-        rpc_client.program_bitstream(remote, bitstream)
+        rpc_client.program_fpga(remote, bitstream)
         rpc_client.reconfig_runtime(remote)
 
     return default_module_loader(reprogram_fpga)
diff --git a/vta/python/vta/environment.py b/vta/python/vta/environment.py
@@ -66,11 +66,13 @@ class DevContext(object):
     MEM_ID_INP = 2
     MEM_ID_ACC = 3
     MEM_ID_OUT = 4
+    MEM_ID_ACC_8BIT = 5
     # VTA ALU Opcodes
     ALU_OPCODE_MIN = 0
     ALU_OPCODE_MAX = 1
     ALU_OPCODE_ADD = 2
     ALU_OPCODE_SHR = 3
+    ALU_OPCODE_MUL = 4
     # Task queue id (pipeline stage)
     QID_LOAD_INP = 1
     QID_LOAD_WGT = 1
@@ -232,7 +234,7 @@ def target_host(self):
             return "llvm -mtriple=armv7-none-linux-gnueabihf"
         if self.TARGET == "ultra96":
             return "llvm -mtriple=aarch64-linux-gnu"
-        if self.TARGET in ["sim", "tsim"]:
+        if self.TARGET in ["sim", "tsim", "intelfocl"]:
             return "llvm"
         raise ValueError("Unknown target %s" % self.TARGET)
 

diff --git a/vta/python/vta/program_bitstream.py b/vta/python/vta/program_bitstream.py
@@ -57,14 +57,26 @@ def de10nano_bitstream_program(bitstream_path):
     program(bitstream_path)
 
 
-def bitstream_program(target, bitstream):
+def intelfocl_bitstream_program(bitstream_path, mem_size=4 * 1024 * 1024 * 1024):
+    # pylint: disable=import-outside-toplevel
+    from tvm import get_global_func  # imported here rather than at module top level
+
+    program = get_global_func("vta.oclfpga.program")  # looked up by name at call time
+    program(bitstream_path, mem_size)  # NOTE(review): mem_size presumably in bytes - confirm
+
+
+def bitstream_program(target, bitstream, *args):
+    """Program bitstream to the device selected by target; raises RuntimeError if unknown"""
+
     if target in ["pynq", "ultra96"]:
         pynq_bitstream_program(bitstream)
     elif target in ["de10nano"]:
         de10nano_bitstream_program(bitstream)
     elif target in ["sim", "tsim"]:
         # In simulation, bit stream programming is a no-op
         return
+    elif target in ["intelfocl"]:
+        intelfocl_bitstream_program(bitstream, *args)  # only this branch consumes *args
     else:
         raise RuntimeError("Unknown target {}".format(target))
 

diff --git a/vta/python/vta/rpc_client.py b/vta/python/vta/rpc_client.py
@@ -17,6 +17,8 @@
 """VTA RPC client function"""
 import os
 
+from tvm import rpc
+from vta import program_bitstream
 from .environment import get_env
 from .bitstream import download_bitstream, get_bitstream_path
 
@@ -45,16 +47,20 @@ def program_fpga(remote, bitstream=None):
     bitstream : str, optional
         Path to a local bitstream file. If unset, tries to download from cache server.
     """
+    env = get_env()
+
     if bitstream:
         assert os.path.isfile(bitstream)
     else:
         bitstream = get_bitstream_path()
         if not os.path.isfile(bitstream):
-            env = get_env()
             if env.TARGET == "de10nano":
                 return
             download_bitstream()
 
-    fprogram = remote.get_function("tvm.contrib.vta.init")
-    remote.upload(bitstream)
-    fprogram(os.path.basename(bitstream))
+    if isinstance(remote, rpc.LocalSession):
+        program_bitstream.bitstream_program(env.TARGET, bitstream)
+    else:
+        fprogram = remote.get_function("tvm.contrib.vta.init")
+        remote.upload(bitstream)
+        fprogram(os.path.basename(bitstream))
diff --git a/vta/python/vta/testing/simulator.py b/vta/python/vta/testing/simulator.py
@@ -27,7 +27,13 @@ def _load_sw():
     """Load hardware library for simulator."""
 
     env = get_env()
-    lib_driver_name = "libvta_tsim" if env.TARGET == "tsim" else "libvta_fsim"
+    lib_driver_name = (
+        "libvta_tsim"
+        if env.TARGET == "tsim"
+        else "libvta"
+        if env.TARGET == "intelfocl"
+        else "libvta_fsim"
+    )
     require_sim = env.TARGET in ("sim", "tsim")
     libs = []
 

diff --git a/vta/python/vta/testing/utils.py b/vta/python/vta/testing/utils.py
@@ -32,7 +32,7 @@ def run(run_func):
     """
     env = get_env()
 
-    if env.TARGET in ["sim", "tsim"]:
+    if env.TARGET in ["sim", "tsim", "intelfocl"]:
         # Talk to local RPC if necessary to debug RPC server.
         # Compile vta on your host with make at the root.
         # Make sure TARGET is set to "sim" in the config.json file.

diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py
@@ -423,7 +423,7 @@ def visit_call(self, call):
                 self.start_pack and call.op == op.op.get("cast") and input_types[0].dtype == "int32"
             ):
                 cast = relay.Call(op.op.get("cast"), [args[0]], call.attrs)
-                return relay.Call(op.op.get("copy"), [cast])
+                return cast
             elif call.op == self.pad:
                 pad_width = call.attrs.pad_width
                 if len(pad_width) == 6: