From 9d64d321435d9149058b151644b51425d183a9e2 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 22 Jul 2019 08:31:37 -0700 Subject: [PATCH] [VTA] Runtime refactor to allow for non-shared memory FPGAs (e.g. F1) (#3554) * updated runtime to support non-shared memory FPGAs for instruction and micro-op kernels * adding driver-defined memcpy function to handle F1 cases * refactor to include flush/invalidate in memcpy driver function * update tsim driver * bug fixes * cleanup * pre-allocate fpga readable buffers to improve perf * fix * remove instruction stream address rewrite pass for micro op kernels * fix: * white spaces * fix lint * avoid signed/unsigned compilation warning * avoid signed/unsigned compilation warning * fix * fix * addressing comments * whitespace * moving flush/invalidate out of memmove * clearnup * fix * cosmetic * rename API * comment fix --- vta/include/vta/driver.h | 20 +++- vta/include/vta/runtime.h | 1 + vta/src/pynq/pynq_driver.cc | 19 +++- vta/src/pynq/pynq_driver.h | 7 -- vta/src/runtime.cc | 227 ++++++++++++++++++++++++++------------------ vta/src/sim/sim_driver.cc | 8 ++ vta/src/tsim/tsim_driver.cc | 8 ++ 7 files changed, 190 insertions(+), 100 deletions(-) diff --git a/vta/include/vta/driver.h b/vta/include/vta/driver.h index 2d8e9c2..a6f5fd2 100644 --- a/vta/include/vta/driver.h +++ b/vta/include/vta/driver.h @@ -98,7 +98,7 @@ int VTADeviceRun(VTADeviceHandle device, #endif /*! - * \brief Allocates physically contiguous region in memory (limited by MAX_XFER). + * \brief Allocates physically contiguous region in memory readable/writeable by FPGA. * \param size Size of the region in Bytes. * \param cached Region can be set to not cached (write-back) if set to 0. * \return A pointer to the allocated region. @@ -106,7 +106,7 @@ int VTADeviceRun(VTADeviceHandle device, void* VTAMemAlloc(size_t size, int cached); /*! - * \brief Frees a physically contiguous region in memory. 
+ * \brief Frees a physically contiguous region in memory readable/writeable by FPGA. * \param buf Buffer to free. */ void VTAMemFree(void* buf); @@ -119,6 +119,22 @@ void VTAMemFree(void* buf); vta_phy_addr_t VTAMemGetPhyAddr(void* buf); /*! + * \brief Performs a copy operation from host memory to buffer allocated with VTAMemAlloc. + * \param dst The destination buffer in FPGA-accessible memory. Has to be allocated with VTAMemAlloc. + * \param src The source buffer in host memory. + * \param size Size of the region in Bytes. + */ +void VTAMemCopyFromHost(void* dst, const void* src, size_t size); + +/*! + * \brief Performs a copy operation from buffer allocated with VTAMemAlloc to host memory. + * \param dst The destination buffer in host memory. + * \param src The source buffer in FPGA-accessible memory. Has to be allocated with VTAMemAlloc. + * \param size Size of the region in Bytes. + */ +void VTAMemCopyToHost(void* dst, const void* src, size_t size); + +/*! * \brief Flushes the region of memory out of the CPU cache to DRAM. * \param buf Pointer to memory region allocated with VTAMemAlloc to be flushed. * This need to be the physical address. diff --git a/vta/include/vta/runtime.h b/vta/include/vta/runtime.h index e19e1ab..5e99b41 100644 --- a/vta/include/vta/runtime.h +++ b/vta/include/vta/runtime.h @@ -105,6 +105,7 @@ TVM_DLL void VTAWriteBarrier(VTACommandHandle cmd, uint32_t elem_bits, uint32_t start, uint32_t extent); + /*! * \brief Perform a read barrier to a memory region visible to VTA. * \param cmd The VTA command handle. 
diff --git a/vta/src/pynq/pynq_driver.cc b/vta/src/pynq/pynq_driver.cc index 5f96b65..47ca604 100644 --- a/vta/src/pynq/pynq_driver.cc +++ b/vta/src/pynq/pynq_driver.cc @@ -29,10 +29,13 @@ void* VTAMemAlloc(size_t size, int cached) { + assert(size <= VTA_MAX_XFER); + // Rely on the pynq-specific cma library return cma_alloc(size, cached); } void VTAMemFree(void* buf) { + // Rely on the pynq-specific cma library cma_free(buf); } @@ -40,11 +43,25 @@ vta_phy_addr_t VTAMemGetPhyAddr(void* buf) { return cma_get_phy_addr(buf); } +void VTAMemCopyFromHost(void* dst, const void* src, size_t size) { + // For SoC-based FPGAs that use shared memory with the CPU, use memcpy() + memcpy(dst, src, size); +} + +void VTAMemCopyToHost(void* dst, const void* src, size_t size) { + // For SoC-based FPGAs that use shared memory with the CPU, use memcpy() + memcpy(dst, src, size); +} + void VTAFlushCache(vta_phy_addr_t buf, int size) { + // Call the xlnkFlushCache on the CMA buffer + // so that the FPGA can read the buffer data. xlnkFlushCache(reinterpret_cast(buf), size); } void VTAInvalidateCache(vta_phy_addr_t buf, int size) { + // Call the xlnkInvalidateCache on the CMA buffer + // so that the host can read the buffer data. 
xlnkInvalidateCache(reinterpret_cast(buf), size); } @@ -54,7 +71,7 @@ void *VTAMapRegister(uint32_t addr, size_t length) { // Calculate base address offset w.r.t the base address uint32_t virt_offset = addr - virt_base; // Open file and mmap - uint32_t mmap_file = open(VTA_PYNQ_DEV_MEM_PATH, O_RDWR|O_SYNC); + uint32_t mmap_file = open("/dev/mem", O_RDWR|O_SYNC); return mmap(NULL, (length+virt_offset), PROT_READ|PROT_WRITE, diff --git a/vta/src/pynq/pynq_driver.h b/vta/src/pynq/pynq_driver.h index 61ff7f2..7cfee4c 100644 --- a/vta/src/pynq/pynq_driver.h +++ b/vta/src/pynq/pynq_driver.h @@ -56,13 +56,6 @@ void VTAUnmapRegister(void *vta, size_t length); void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val); uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset); -/*! \brief (Pynq only) Path to /dev/mem */ -#define VTA_PYNQ_DEV_MEM_PATH "/dev/mem" -/*! \brief (Pynq only) MMIO driver constant */ -#define VTA_PYNQ_MMIO_WORD_LENGTH 4 -/*! \brief (Pynq only) MMIO driver constant */ -#define VTA_PYNQ_MMIO_WORD_MASK (~(MMIO_WORD_LENGTH - 1)) - /*! \brief VTA configuration register address range */ #define VTA_RANGE 0x100 /*! \brief VTA configuration register start value */ diff --git a/vta/src/runtime.cc b/vta/src/runtime.cc index f44e3ca..cebfaf7 100644 --- a/vta/src/runtime.cc +++ b/vta/src/runtime.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -44,7 +44,7 @@ namespace vta { static_assert(VTA_UOP_WIDTH == sizeof(VTAUop) * 8, "VTA_UOP_WIDTH do not match VTAUop size"); -/*! \brief Enable coherent access between VTA and CPU. */ +/*! 
\brief Enable coherent access between VTA and CPU (used on shared mem systems). */ static const bool kBufferCoherent = true; /*! @@ -80,6 +80,24 @@ struct DataBuffer { } } /*! + * \brief Performs a copy operation from host memory to buffer allocated with VTAMemAlloc. + * \param dst The destination buffer in FPGA-accessible memory. Has to be allocated with VTAMemAlloc(). + * \param src The source buffer in host memory. + * \param size Size of the region in Bytes. + */ + void MemCopyFromHost(void* dst, const void* src, size_t size) { + VTAMemCopyFromHost(dst, src, size); + } + /*! + * \brief Performs a copy operation from buffer allocated with VTAMemAlloc to host memory. + * \param dst The destination buffer in host memory. + * \param src The source buffer in FPGA-accessible memory. Has to be allocated with VTAMemAlloc(). + * \param size Size of the region in Bytes. + */ + void MemCopyToHost(void* dst, const void* src, size_t size) { + VTAMemCopyToHost(dst, src, size); + } + /*! * \brief Allocate a buffer of a given size. * \param size The size of the buffer. */ @@ -274,7 +292,7 @@ class UopKernel { template friend class UopQueue; friend class CommandQueue; - // SRAM location if begin != end. + // SRAM location if begin != end uint32_t sram_begin_{0}; uint32_t sram_end_{0}; // The signature used for verification @@ -290,11 +308,12 @@ class UopKernel { /*! * \brief Base class of all queues to send and recv serial data. */ +template class BaseQueue { public: ~BaseQueue() { - if (dram_buffer_ != nullptr) { - VTAMemFree(dram_buffer_); + if (fpga_buff_ != nullptr) { + VTAMemFree(fpga_buff_); } } /*! \return Content of DRAM buffer. */ @@ -303,7 +322,8 @@ class BaseQueue { } /*! \return Physical address of DRAM. */ vta_phy_addr_t dram_phy_addr() const { - return dram_phy_addr_; + CHECK(fpga_buff_phy_); + return fpga_buff_phy_; } /*! \return Whether there is pending information. 
*/ bool pending() const { @@ -314,43 +334,23 @@ class BaseQueue { coherent_ = coherent; always_cache_ = always_cache; elem_bytes_ = elem_bytes; - dram_buffer_ = static_cast(VTAMemAlloc( - max_bytes, coherent || always_cache_)); - CHECK(dram_buffer_ != nullptr); - dram_phy_addr_ = VTAMemGetPhyAddr(dram_buffer_); + // Allocate buffer ahead of time + fpga_buff_ = static_cast(VTAMemAlloc( + max_bytes, coherent_ || always_cache_)); + CHECK(fpga_buff_ != nullptr); + fpga_buff_phy_ = VTAMemGetPhyAddr(fpga_buff_); } /*! * \brief Reset the pointer of the buffer. * Set SRAM pointer to be the current end. */ void Reset() { - dram_begin_ = dram_end_ = 0; + dram_buffer_.clear(); sram_begin_ = sram_end_; } - void AutoReadBarrier() { - ReadBarrier(elem_bytes_ * 8, 0, dram_end_); - } - /*! \brief Writer barrier to make sure that data written by CPU is visible to VTA. */ - void ReadBarrier(uint32_t elem_bits, uint32_t dram_begin, uint32_t dram_extent) { - if (!coherent_ && always_cache_ && dram_extent != 0) { - dram_begin = dram_begin * elem_bits / 8; - dram_extent = dram_extent * elem_bits / 8; - VTAFlushCache(dram_phy_addr_ + dram_begin, - dram_extent); - } - } - /*! \brief Read barrier to make sure that data written by VTA is visible to CPU. 
*/ - void WriteBarrier(uint32_t elem_bits, uint32_t dram_begin, uint32_t dram_extent) { - if (!coherent_ && always_cache_ && dram_extent != 0) { - dram_begin = dram_begin * elem_bits / 8; - dram_extent = dram_extent * elem_bits / 8; - VTAInvalidateCache(dram_phy_addr_ + dram_begin, - dram_extent); - } - } protected: - // Cache coherence access + // Cache coherence access (shared memory only) bool coherent_{false}; // Make the buffer cacheable bool always_cache_{false}; @@ -360,21 +360,19 @@ class BaseQueue { uint32_t sram_begin_{0}; // End location of current SRAM write in FIFO mode uint32_t sram_end_{0}; - // The current pending offset in DRAM in FIFO mode - uint32_t dram_begin_{0}; - // The current pending offset in DRAM in FIFO mode - uint32_t dram_end_{0}; // The buffer in DRAM - char* dram_buffer_{nullptr}; - // Physics address of the buffer - vta_phy_addr_t dram_phy_addr_; + std::vector dram_buffer_; + // FPGA accessible buffer + void* fpga_buff_{NULL}; + // Physical address of the FPGA buffer + vta_phy_addr_t fpga_buff_phy_{0}; }; /*! * \brief Micro op buffer that manages the micro op cache. 
*/ template -class UopQueue : public BaseQueue { +class UopQueue : public BaseQueue { public: void InitSpace() { BaseQueue::InitSpace(kElemBytes, kMaxBytes, kCoherent, kAlwaysCache); @@ -382,17 +380,20 @@ class UopQueue : public BaseQueue { // Push data to the queue template void Push(UopKernel* kernel, FAutoSync fautosync) { + // if the micro-op is cached in VTA SRAM, skip if (kernel->cached()) return; + // check if we've exceeded the size of the allocated FPGA readable buffer size_t num_op = kernel->size(); - if (dram_end_ + num_op > kMaxElems) { + if (dram_buffer_.size() + num_op > kMaxElems) { fautosync(); - CHECK(dram_end_ <= kMaxElems); + CHECK(dram_buffer_.size() <= kMaxElems); } + // Cannot have a micro-op kernel larger than SRAM buffer CHECK(num_op <= kMaxNumUop); uint32_t uop_begin = 0; if (sram_end_ + num_op > kMaxNumUop) { // Need to evict - cache_ptr_ = 0; + cache_idx_ = 0; sram_begin_ = 0; sram_end_ = num_op; } else { @@ -400,51 +401,81 @@ class UopQueue : public BaseQueue { sram_end_ += num_op; } // Simple eviction policy - uint32_t evict_begin = cache_ptr_; - for (; cache_ptr_ < cache_.size(); ++cache_ptr_) { - if (cache_[cache_ptr_]->sram_begin_ >= sram_end_) break; - cache_[cache_ptr_]->sram_begin_ = 0; - cache_[cache_ptr_]->sram_end_ = 0; + uint32_t evict_begin = cache_idx_; + for (; cache_idx_ < cache_.size(); ++cache_idx_) { + if (cache_[cache_idx_]->sram_begin_ >= sram_end_) break; + // Mark the kernel as "invalid" + cache_[cache_idx_]->sram_begin_ = 0; + cache_[cache_idx_]->sram_end_ = 0; } - memcpy(dram_buffer_ + dram_end_ * kElemBytes, - kernel->data(), - num_op * kElemBytes); - dram_end_ += num_op; + // Increase size of buffer kernel->sram_begin_ = uop_begin; kernel->sram_end_ = sram_end_; CHECK(kernel->cached()); - CHECK(uop_begin != sram_end_); - cache_.insert(cache_.begin() + cache_ptr_, kernel); - cache_.erase(cache_.begin() + evict_begin, cache_.begin() + cache_ptr_); - cache_ptr_ = evict_begin + 1; + cache_.insert(cache_.begin() + 
cache_idx_, kernel); + cache_.erase(cache_.begin() + evict_begin, cache_.begin() + cache_idx_); + cache_idx_ = evict_begin + 1; } - // Flush as weight load + // Flush micro op load instruction void FlushUopLoad(VTAMemInsn* insn) { if (sram_begin_ != sram_end_) { - CHECK((dram_end_ - dram_begin_) == (sram_end_ - sram_begin_)); + // Derive offset in FPGA-readable buffer + int32_t offset = 0; + for (uint32_t i = 0; i < cache_idx_ - 1; ++i) { + offset += cache_[i]->size() * kElemBytes; + } insn->memory_type = VTA_MEM_ID_UOP; insn->sram_base = sram_begin_; + // Update cache idx to physical address map #ifdef USE_TSIM - insn->dram_base = (uint32_t) dram_phy_addr_ + dram_begin_*kElemBytes; + insn->dram_base = fpga_buff_phy_ + offset; #else - insn->dram_base = dram_phy_addr_ / kElemBytes + dram_begin_; + insn->dram_base = (fpga_buff_phy_ + offset) / kElemBytes; #endif insn->y_size = 1; - insn->x_size = (dram_end_ - dram_begin_); - insn->x_stride = (dram_end_ - dram_begin_); + insn->x_size = (sram_end_ - sram_begin_); + insn->x_stride = (sram_end_ - sram_begin_); insn->y_pad_0 = 0; insn->y_pad_1 = 0; insn->x_pad_0 = 0; insn->x_pad_1 = 0; // Reset indices sram_begin_ = sram_end_; - dram_begin_ = dram_end_; + } + } + void AutoReadBarrier() { + ReadBarrier(); + } + /*! \brief Writer barrier to make sure that data written by CPU is visible to VTA. 
*/ + void ReadBarrier() { + CHECK(fpga_buff_ != nullptr); + CHECK(fpga_buff_phy_); + // Iterate over caches; allocate buffer in FPGA-readable memory + uint32_t buff_size = 0; + for (uint32_t i = 0; i < cache_.size(); ++i) { + buff_size += cache_[i]->size() * kElemBytes; + } + CHECK(buff_size <= kMaxBytes); + // Move kernel contents to FPGA readable buffer + uint32_t offset = 0; + for (uint32_t i = 0; i < cache_.size(); ++i) { + uint32_t ksize = cache_[i]->size() * kElemBytes; + VTAMemCopyFromHost(static_cast(fpga_buff_) + offset, + cache_[i]->data(), + ksize); + // Update offset + offset += ksize; + } + // Flush if we're using a shared memory system + // and if interface is non-coherent + if (!coherent_ && always_cache_) { + VTAFlushCache(fpga_buff_phy_, offset); } } private: // Cache pointer - uint32_t cache_ptr_{0}; + uint32_t cache_idx_{0}; // Cached ring, sorted by sram_begin std::vector cache_; // Constants @@ -485,7 +516,7 @@ enum PipelineStage : int { // Instruction Queue template -class InsnQueue : public BaseQueue { +class InsnQueue : public BaseQueue { public: /*! \brief Initialize the space. */ void InitSpace() { @@ -496,11 +527,11 @@ class InsnQueue : public BaseQueue { } /*! \return The data pointer. */ VTAGenericInsn* data() { - return reinterpret_cast(dram_buffer_); + return dram_buffer_.data(); } /*! \return Number of instructions. 
*/ uint32_t count() { - return dram_end_; + return dram_buffer_.size(); } // Insert dependency push of load void DepPop(int from, int to) { @@ -524,9 +555,8 @@ class InsnQueue : public BaseQueue { void DepPush(int from, int to) { // NOTE: this instruction executes on queue[from] this->CommitPendingPop(from); - if (dram_end_ != 0) { - VTAMemInsn* mptr = - reinterpret_cast(dram_buffer_) + dram_end_ - 1; + if (!dram_buffer_.empty()) { + VTAMemInsn* mptr = reinterpret_cast(&dram_buffer_.back()); if (GetPipelineStage(mptr) == from) { if (from < to && !mptr->push_next_dep) { // push(LD->C) or push(C->ST) @@ -600,7 +630,6 @@ class InsnQueue : public BaseQueue { } } } - // Helper function: Get Opcode string const char* getOpcodeString(int opcode, bool use_imm) { // The string name @@ -628,7 +657,6 @@ class InsnQueue : public BaseQueue { return "unknown op"; } - // Dump instructions in the queue void DumpInsn() { // Keep tabs on dependence queues @@ -790,7 +818,6 @@ class InsnQueue : public BaseQueue { printf("\ts2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue); } } - // Commit all pending pop of corresponding stage void CommitPendingPop(int stage) { // Handle the LD<->compute queue @@ -805,13 +832,11 @@ class InsnQueue : public BaseQueue { pending_pop_next_[stage] = 0; } } - void CommitPending() { for (int i = kLoadStage; i <= kStoreStage; ++i) { CommitPendingPop(i); } } - bool PendingPop() { for (int i = kLoadStage; i <= kStoreStage; ++i) { if (pending_pop_prev_[i]) return true; @@ -819,14 +844,32 @@ class InsnQueue : public BaseQueue { } return false; } + void AutoReadBarrier() { + ReadBarrier(); + } + /*! \brief Writer barrier to make sure that data written by CPU is visible to VTA. 
*/ + void ReadBarrier() { + CHECK(fpga_buff_ != nullptr); + CHECK(fpga_buff_phy_); + uint32_t buff_size = dram_buffer_.size() * elem_bytes_; + CHECK(buff_size <= kMaxBytes); + // Copy contents of DRAM buffer to FPGA buff + VTAMemCopyFromHost(fpga_buff_, + dram_buffer_.data(), + buff_size); + // Flush if we're using a shared memory system + // and if interface is non-coherent + if (!coherent_ && always_cache_) { + VTAFlushCache(fpga_buff_phy_, buff_size); + } + } protected: /*! \return Add new instruction to the buffer. */ VTAGenericInsn* NextInsn() { - VTAGenericInsn* insn = data() + dram_end_; - ++dram_end_; - CHECK(dram_end_ < kMaxElems); - return insn; + VTAGenericInsn insn; + dram_buffer_.push_back(insn); + return &dram_buffer_.back(); } // Create a new instruction for a given stage VTAGenericInsn* Create(PipelineStage stage) { @@ -859,7 +902,7 @@ class InsnQueue : public BaseQueue { if (insn->opcode == VTA_OPCODE_STORE) { // FIXME: Right now memory_type is a 2-bit field which means that // VTA_MEM_ID_OUT will appear as 0. For now we'll refrain from - // checking the memory_type to avoid an CHECKion error... + // checking the memory_type to avoid a CHECK error... return kStoreStage; } LOG(FATAL) << "not reached"; @@ -938,7 +981,7 @@ class CommandQueue { } /* * elements size should not larger than VTA_PAGE_BYTES. 
- * + * */ CHECK_GE(VTA_PAGE_BYTES, elem_bytes); return elem_bytes; @@ -1256,7 +1299,7 @@ class CommandQueue { // Internal debug flag int debug_flag_{0}; - // The kernel we currently recording + // The kernel we are currently recording UopKernel* record_kernel_{nullptr}; // Micro op queue UopQueue uop_queue_; @@ -1303,14 +1346,18 @@ void VTABufferCopy(const void* from, to_buffer = vta::DataBuffer::FromHandle(to); to = to_buffer->virt_addr(); } + if (from_buffer) { + // This is an FPGA to host mem transfer from_buffer->InvalidateCache(from_offset, size); - } - - memcpy(static_cast(to) + to_offset, - static_cast(from) + from_offset, - size); - if (to_buffer) { + from_buffer->MemCopyToHost(static_cast(to) + to_offset, + static_cast(from) + from_offset, + size); + } else if (to_buffer) { + // This is a host to FPGA mem transfer + to_buffer->MemCopyFromHost(static_cast(to) + to_offset, + static_cast(from) + from_offset, + size); to_buffer->FlushCache(to_offset, size); } } diff --git a/vta/src/sim/sim_driver.cc b/vta/src/sim/sim_driver.cc index cf7d6dc..9d81bef 100644 --- a/vta/src/sim/sim_driver.cc +++ b/vta/src/sim/sim_driver.cc @@ -607,6 +607,14 @@ vta_phy_addr_t VTAMemGetPhyAddr(void* buf) { return vta::sim::DRAM::Global()->GetPhyAddr(buf); } +void VTAMemCopyFromHost(void* dst, const void* src, size_t size) { + memcpy(dst, src, size); +} + +void VTAMemCopyToHost(void* dst, const void* src, size_t size) { + memcpy(dst, src, size); +} + void VTAFlushCache(vta_phy_addr_t buf, int size) { } diff --git a/vta/src/tsim/tsim_driver.cc b/vta/src/tsim/tsim_driver.cc index 67716ea..799ee27 100644 --- a/vta/src/tsim/tsim_driver.cc +++ b/vta/src/tsim/tsim_driver.cc @@ -220,6 +220,14 @@ vta_phy_addr_t VTAMemGetPhyAddr(void* buf) { return reinterpret_cast(reinterpret_cast(buf)); } +void VTAMemCopyFromHost(void* dst, const void* src, size_t size) { + memcpy(dst, src, size); +} + +void VTAMemCopyToHost(void* dst, const void* src, size_t size) { + memcpy(dst, src, size); +} + void 
VTAFlushCache(vta_phy_addr_t buf, int size) { } -- 2.7.4