From 374be000cdc8bbbee7770290e493de987752c944 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Wed, 11 Apr 2018 13:45:11 -0700 Subject: [PATCH] [DRIVER][RUNTIME] Make runtime fully device agnostic (#23) --- include/vta/driver.h | 87 ++++++++++----------- src/data_buffer.h | 4 +- src/pynq/pynq_driver.cc | 140 ++++++++++++++++++++++++++++++-------- src/pynq/pynq_driver.h | 8 ++- src/runtime.cc | 95 +++++++------------------- src/tvm/vta_device_api.cc | 4 -- 6 files changed, 188 insertions(+), 150 deletions(-) diff --git a/include/vta/driver.h b/include/vta/driver.h index c93021d96e4b..8a29fc47aa84 100644 --- a/include/vta/driver.h +++ b/include/vta/driver.h @@ -1,7 +1,9 @@ /*! * Copyright (c) 2018 by Contributors * \file vta_driver.h - * \brief General driver interface. + * \brief Driver interface that is used by runtime. + * + * Driver's implementation is device specific. */ #ifndef VTA_DRIVER_H_ @@ -11,16 +13,50 @@ extern "C" { #endif -#include #include +#include -/*! \brief Memory management constants */ +/*! \brief Memory management constants for cached memory */ #define VTA_CACHED 1 -/*! \brief Memory management constants */ +/*! \brief Memory management constants for non-cached memory */ #define VTA_NOT_CACHED 0 -/*! \brief VTA command handle */ -typedef void * VTAHandle; +/*! \brief Physically contiguous buffer size limit */ +#ifndef VTA_MAX_XFER +#define VTA_MAX_XFER (1<<22) +#endif + +/*! \brief Device resource context */ +typedef void * VTADeviceHandle; + +/*! \brief physical address */ +typedef uint32_t vta_phy_addr_t; + +/*! + * \brief Allocate a device resource handle + * \return The device handle. + */ +VTADeviceHandle VTADeviceAlloc(); + +/*! + * \brief Free a device handle + * \param handle The device handle to be freed. + */ +void VTADeviceFree(VTADeviceHandle handle); + +/*! + * \brief Launch the instructions block until done. + * \param device The device handle. + * \param insn_phy_addr The physical address of instruction stream. 
+ * \param insn_count Instruction count. + * \param wait_cycles The maximum number of cycles to wait. + * + * \return 0 if running is successful, 1 if timeout. + */ +int VTADeviceRun(VTADeviceHandle device, + vta_phy_addr_t insn_phy_addr, + uint32_t insn_count, + uint32_t wait_cycles); /*! * \brief Allocates physically contiguous region in memory (limited by MAX_XFER). @@ -41,52 +77,23 @@ void VTAMemFree(void* buf); * \param buf Pointer to memory region allocated with VTAMemAlloc. * \return The physical address of the memory region. */ -uint32_t VTAGetMemPhysAddr(void* buf); +vta_phy_addr_t VTAGetMemPhysAddr(void* buf); /*! * \brief Flushes the region of memory out of the CPU cache to DRAM. * \param buf Pointer to memory region allocated with VTAMemAlloc to be flushed. + * This needs to be the physical address. * \param size Size of the region to flush in Bytes. */ -void VTAFlushCache(void* buf, int size); +void VTAFlushCache(vta_phy_addr_t buf, int size); /*! * \brief Invalidates the region of memory that is cached. * \param buf Pointer to memory region allocated with VTAMemAlloc to be invalidated. + * This needs to be the physical address. * \param size Size of the region to invalidate in Bytes. */ -void VTAInvalidateCache(void* buf, int size); - -/*! - * \brief Returns a memory map to FPGA configuration registers. - * \param addr The base physical address of the configuration registers. - * \param length The size of the memory mapped region in bytes. - * \return A pointer to the memory mapped region. - */ -void *VTAMapRegister(unsigned addr, size_t length); - -/*! - * \brief Deletes the configuration register memory map. - * \param vta The memory mapped region. - * \param length The size of the memory mapped region in bytes. - */ -void VTAUnmapRegister(void *vta, size_t length); - -/*! - * \brief Writes to a memory mapped configuration register. - * \param vta_base The handle to the memory mapped configuration registers. 
- * \param offset The offset of the register to write to. - * \param val The value to be written to the memory mapped register. - */ -void VTAWriteMappedReg(VTAHandle vta_base, unsigned offset, unsigned val); - -/*! - * \brief Reads from the memory mapped configuration register. - * \param vta_base The handle to the memory mapped configuration registers. - * \param offset The offset of the register to read from. - * \return The value read from the memory mapped register. - */ -unsigned VTAReadMappedReg(VTAHandle vta_base, unsigned offset); +void VTAInvalidateCache(vta_phy_addr_t buf, int size); /*! * \brief Programming the bit stream on the FPGA. diff --git a/src/data_buffer.h b/src/data_buffer.h index 117a423d034e..aed92c49e795 100644 --- a/src/data_buffer.h +++ b/src/data_buffer.h @@ -35,7 +35,7 @@ struct DataBuffer { */ void InvalidateCache(size_t offset, size_t size) { if (!kBufferCoherent) { - VTAInvalidateCache(reinterpret_cast(phy_addr_ + offset), size); + VTAInvalidateCache(phy_addr_ + offset, size); } } /*! @@ -45,7 +45,7 @@ struct DataBuffer { */ void FlushCache(size_t offset, size_t size) { if (!kBufferCoherent) { - VTAFlushCache(reinterpret_cast(phy_addr_ + offset), size); + VTAFlushCache(phy_addr_ + offset, size); } } /*! 
diff --git a/src/pynq/pynq_driver.cc b/src/pynq/pynq_driver.cc index 1787af8da526..0330450db285 100644 --- a/src/pynq/pynq_driver.cc +++ b/src/pynq/pynq_driver.cc @@ -5,6 +5,7 @@ */ #include +#include #include "./pynq_driver.h" @@ -16,16 +17,16 @@ void VTAMemFree(void* buf) { cma_free(buf); } -uint32_t VTAGetMemPhysAddr(void* buf) { +vta_phy_addr_t VTAGetMemPhysAddr(void* buf) { return cma_get_phy_addr(buf); } -void VTAFlushCache(void* buf, int size) { - xlnkFlushCache(buf, size); +void VTAFlushCache(vta_phy_addr_t buf, int size) { + xlnkFlushCache(reinterpret_cast(buf), size); } -void VTAInvalidateCache(void* buf, int size) { - xlnkInvalidateCache(buf, size); +void VTAInvalidateCache(vta_phy_addr_t buf, int size) { + xlnkInvalidateCache(reinterpret_cast(buf), size); } void *VTAMapRegister(uint32_t addr, size_t length) { @@ -57,33 +58,112 @@ uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset) { return *((volatile uint32_t *) (reinterpret_cast(base_addr) + offset)); } +class VTADevice { + public: + VTADevice() { + // VTA stage handles + vta_fetch_handle_ = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE); + vta_load_handle_ = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE); + vta_compute_handle_ = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE); + vta_store_handle_ = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE); + } + + ~VTADevice() { + // Close VTA stage handle + VTAUnmapRegister(vta_fetch_handle_, VTA_RANGE); + VTAUnmapRegister(vta_load_handle_, VTA_RANGE); + VTAUnmapRegister(vta_compute_handle_, VTA_RANGE); + VTAUnmapRegister(vta_store_handle_, VTA_RANGE); + } + + int Run(vta_phy_addr_t insn_phy_addr, + uint32_t insn_count, + uint32_t wait_cycles) { + // NOTE: Register address map is derived from the auto-generated + // driver files available under hardware/build/vivado//export/driver + // FETCH @ 0x10 : Data signal of insn_count_V + VTAWriteMappedReg(vta_fetch_handle_, 0x10, insn_count); + // FETCH @ 0x18 : Data signal of insns_V + VTAWriteMappedReg(vta_fetch_handle_, 0x18, 
insn_phy_addr); + // LOAD @ 0x10 : Data signal of inputs_V + VTAWriteMappedReg(vta_load_handle_, 0x10, 0); + // LOAD @ 0x18 : Data signal of weight_V + VTAWriteMappedReg(vta_load_handle_, 0x18, 0); + // COMPUTE @ 0x20 : Data signal of uops_V + VTAWriteMappedReg(vta_compute_handle_, 0x20, 0); + // COMPUTE @ 0x28 : Data signal of biases_V + VTAWriteMappedReg(vta_compute_handle_, 0x28, 0); + // STORE @ 0x10 : Data signal of outputs_V + VTAWriteMappedReg(vta_store_handle_, 0x10, 0); + + // VTA start + VTAWriteMappedReg(vta_fetch_handle_, 0x0, VTA_START); + VTAWriteMappedReg(vta_load_handle_, 0x0, VTA_AUTORESTART); + VTAWriteMappedReg(vta_compute_handle_, 0x0, VTA_AUTORESTART); + VTAWriteMappedReg(vta_store_handle_, 0x0, VTA_AUTORESTART); + + // Loop until the VTA is done + unsigned t, flag = 0; + for (t = 0; t < wait_cycles; ++t) { + flag = VTAReadMappedReg(vta_compute_handle_, 0x18); + if (flag == VTA_DONE) break; + std::this_thread::yield(); + } + // Report error if timeout + return t < wait_cycles ? 
0 : 1; + } + + private: + // VTA handles (register maps) + void* vta_fetch_handle_{nullptr}; + void* vta_load_handle_{nullptr}; + void* vta_compute_handle_{nullptr}; + void* vta_store_handle_{nullptr}; +}; + +VTADeviceHandle VTADeviceAlloc() { + return new VTADevice(); +} + +void VTADeviceFree(VTADeviceHandle handle) { + delete static_cast(handle); +} + +int VTADeviceRun(VTADeviceHandle handle, + vta_phy_addr_t insn_phy_addr, + uint32_t insn_count, + uint32_t wait_cycles) { + return static_cast(handle)->Run( + insn_phy_addr, insn_count, wait_cycles); +} + void VTAProgram(const char* bitstream) { - int elem; - FILE *src, *dst, *partial; - partial = fopen(VTA_PYNQ_BS_IS_PARTIAL, "w"); - if (partial == NULL) { - printf("Cannot open partial config file %s\n", VTA_PYNQ_BS_IS_PARTIAL); + int elem; + FILE *src, *dst, *partial; + partial = fopen(VTA_PYNQ_BS_IS_PARTIAL, "w"); + if (partial == NULL) { + printf("Cannot open partial config file %s\n", VTA_PYNQ_BS_IS_PARTIAL); fclose(partial); exit(1); - } - fputc('0', partial); - fclose(partial); - src = fopen(bitstream, "rb"); - if (src == NULL) { - printf("Cannot open bitstream %s\n", bitstream); - exit(1); - } - dst = fopen(VTA_PYNQ_BS_XDEVCFG, "wb"); - if (dst == NULL) { - printf("Cannot open device file %s\n", VTA_PYNQ_BS_XDEVCFG); - fclose(dst); - exit(1); - } - elem = fgetc(src); - while (elem != EOF) { - fputc(elem, dst); - elem = fgetc(src); - } - fclose(src); + } + fputc('0', partial); + fclose(partial); + src = fopen(bitstream, "rb"); + if (src == NULL) { + printf("Cannot open bitstream %s\n", bitstream); + exit(1); + } + dst = fopen(VTA_PYNQ_BS_XDEVCFG, "wb"); + if (dst == NULL) { + printf("Cannot open device file %s\n", VTA_PYNQ_BS_XDEVCFG); fclose(dst); + exit(1); + } + elem = fgetc(src); + while (elem != EOF) { + fputc(elem, dst); + elem = fgetc(src); + } + fclose(src); + fclose(dst); } diff --git a/src/pynq/pynq_driver.h b/src/pynq/pynq_driver.h index 481df6bbe077..74da67d7c035 100644 --- 
a/src/pynq/pynq_driver.h +++ b/src/pynq/pynq_driver.h @@ -32,6 +32,11 @@ void xlnkFlushCache(void* buf, int size); void xlnkInvalidateCache(void* buf, int size); #endif +void *VTAMapRegister(uint32_t addr, size_t length); +void VTAUnmapRegister(void *vta, size_t length); +void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val); +uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset); + /*! \brief (Pynq only) Partial bitstream status file path */ #define VTA_PYNQ_BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream" /*! \brief (Pynq only) Bitstream destination file path */ @@ -44,9 +49,6 @@ void xlnkInvalidateCache(void* buf, int size); /*! \brief (Pynq only) MMIO driver constant */ #define VTA_PYNQ_MMIO_WORD_MASK (~(MMIO_WORD_LENGTH - 1)) -/*! \brief Physically contiguous buffer size limit */ -#define VTA_MAX_XFER (1<<22) - /*! \brief VTA configuration register address range */ #define VTA_RANGE 0x100 /*! \brief VTA configuration register start value */ diff --git a/src/runtime.cc b/src/runtime.cc index 9333e393fc12..a8819323fc11 100644 --- a/src/runtime.cc +++ b/src/runtime.cc @@ -1,13 +1,13 @@ /*! * Copyright (c) 2018 by Contributors * \file runtime.cc - * \brief VTA runtime for PYNQ in C++11 + * \brief Generic VTA runtime in C++11. + * + * The runtime depends on specific instruction + * stream spec as specified in hw_spec.h + * It is intended to be used as a dynamic library + * to enable hot swapping of hardware configurations. */ - -#ifdef VTA_PYNQ_TARGET -#include "./pynq/pynq_driver.h" -#endif // VTA_PYNQ_TARGET - #include #include #include @@ -245,8 +245,8 @@ class BaseQueue { if (!coherent_ && always_cache_ && dram_extent != 0) { dram_begin = dram_begin * elem_bits / 8; dram_extent = dram_extent * elem_bits / 8; - VTAFlushCache(reinterpret_cast(dram_phy_addr_ + dram_begin), - dram_extent); + VTAFlushCache(dram_phy_addr_ + dram_begin, + dram_extent); } } /*! 
\brief Read barrier to make sure that data written by VTA is visible to CPU. */ @@ -254,8 +254,8 @@ class BaseQueue { if (!coherent_ && always_cache_ && dram_extent != 0) { dram_begin = dram_begin * elem_bits / 8; dram_extent = dram_extent * elem_bits / 8; - VTAInvalidateCache(reinterpret_cast(dram_phy_addr_ + dram_begin), - dram_extent); + VTAInvalidateCache(dram_phy_addr_ + dram_begin, + dram_extent); } } @@ -818,20 +818,13 @@ class CommandQueue { void InitSpace() { uop_queue_.InitSpace(); insn_queue_.InitSpace(); - // VTA stage handles - vta_fetch_handle_ = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE); - vta_load_handle_ = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE); - vta_compute_handle_ = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE); - vta_store_handle_ = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE); + device_ = VTADeviceAlloc(); + assert(device_ != nullptr); printf("Initialize VTACommandHandle...\n"); } ~CommandQueue() { - // Close VTA stage handle - VTAUnmapRegister(vta_fetch_handle_, VTA_RANGE); - VTAUnmapRegister(vta_load_handle_, VTA_RANGE); - VTAUnmapRegister(vta_compute_handle_, VTA_RANGE); - VTAUnmapRegister(vta_store_handle_, VTA_RANGE); + VTADeviceFree(device_); printf("Close VTACommandhandle...\n"); } @@ -951,44 +944,14 @@ class CommandQueue { assert(reinterpret_cast( insn_queue_.data())[insn_queue_.count()-1].opcode == VTA_OPCODE_FINISH); -#ifdef VTA_PYNQ_TARGET // Make sure that we don't exceed contiguous physical memory limits - assert(insn_queue_.count() < VTA_MAX_XFER); - - // NOTE: Register address map is derived from the auto-generated - // driver files available under hardware/build/vivado//export/driver - // FETCH @ 0x10 : Data signal of insn_count_V - VTAWriteMappedReg(vta_fetch_handle_, 0x10, insn_queue_.count()); - // FETCH @ 0x18 : Data signal of insns_V - VTAWriteMappedReg(vta_fetch_handle_, 0x18, insn_queue_.dram_phy_addr()); - // LOAD @ 0x10 : Data signal of inputs_V - VTAWriteMappedReg(vta_load_handle_, 0x10, 0); - // LOAD @ 0x18 : Data 
signal of weight_V - VTAWriteMappedReg(vta_load_handle_, 0x18, 0); - // COMPUTE @ 0x20 : Data signal of uops_V - VTAWriteMappedReg(vta_compute_handle_, 0x20, 0); - // COMPUTE @ 0x28 : Data signal of biases_V - VTAWriteMappedReg(vta_compute_handle_, 0x28, 0); - // STORE @ 0x10 : Data signal of outputs_V - VTAWriteMappedReg(vta_store_handle_, 0x10, 0); - - // VTA start - VTAWriteMappedReg(vta_fetch_handle_, 0x0, VTA_START); - VTAWriteMappedReg(vta_load_handle_, 0x0, VTA_AUTORESTART); - VTAWriteMappedReg(vta_compute_handle_, 0x0, VTA_AUTORESTART); - VTAWriteMappedReg(vta_store_handle_, 0x0, VTA_AUTORESTART); - - // Loop until the VTA is done - unsigned t, flag = 0; - for (t = 0; t < wait_cycles; ++t) { - flag = VTAReadMappedReg(vta_compute_handle_, 0x18); - if (flag == VTA_DONE) break; - std::this_thread::yield(); - } - // Report error if timeout - assert(t < wait_cycles); -#endif // VTA_PYNQ_TARGET - + assert(insn_queue_.count() * sizeof(VTAGenericInsn) < VTA_MAX_XFER); + int timeout = VTADeviceRun( + device_, + insn_queue_.dram_phy_addr(), + insn_queue_.count(), + wait_cycles); + assert(timeout == 0); // Reset buffers uop_queue_.Reset(); insn_queue_.Reset(); @@ -1147,7 +1110,7 @@ class CommandQueue { void CheckInsnOverFlow() { // At each API call, we can at most commit: // one pending store, one pending load, and one uop - if (insn_queue_.count() >= VTA_MAX_XFER) { + if ((insn_queue_.count() + 4) * sizeof(VTAGenericInsn) >= VTA_MAX_XFER) { this->AutoSync(); } } @@ -1155,11 +1118,7 @@ class CommandQueue { void AutoSync() { this->Synchronize(1 << 31); } - // VTA handles (register maps) - VTAHandle vta_fetch_handle_{nullptr}; - VTAHandle vta_load_handle_{nullptr}; - VTAHandle vta_compute_handle_{nullptr}; - VTAHandle vta_store_handle_{nullptr}; + // Internal debug flag int debug_flag_{0}; // The kernel we currently recording @@ -1168,6 +1127,8 @@ class CommandQueue { UopQueue uop_queue_; // instruction queue InsnQueue insn_queue_; + // Device handle + VTADeviceHandle 
device_{nullptr}; }; } // namespace vta @@ -1302,11 +1263,3 @@ void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles) { static_cast(cmd)-> Synchronize(wait_cycles); } - -extern "C" int VTARuntimeDynamicMagic() { -#ifdef VTA_DYNAMIC_MAGIC - return VTA_DYNAMIC_MAGIC; -#else - return 0; -#endif -} diff --git a/src/tvm/vta_device_api.cc b/src/tvm/vta_device_api.cc index ce864df09402..450b23b05fee 100644 --- a/src/tvm/vta_device_api.cc +++ b/src/tvm/vta_device_api.cc @@ -11,10 +11,6 @@ #include "../../nnvm/tvm/src/runtime/workspace_pool.h" -extern "C" { - typedef void (*FShutdown)(); - typedef int (*FDynamicMagic)(); -} namespace tvm { namespace runtime {