From: Haichen Shen Date: Thu, 25 Jun 2020 14:55:40 +0000 (-0700) Subject: [Relay][Vm] Some performance improvement to VM (#5901) X-Git-Tag: upstream/0.7.0~498 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=7830561446f471c4ecd22f03044e0b640017a3a1;p=platform%2Fupstream%2Ftvm.git [Relay][Vm] Some performance improvement to VM (#5901) * make alignment constant * tweak copyto and loadscalarint * some safety check * x * lint * fix --- diff --git a/include/tvm/runtime/vm.h b/include/tvm/runtime/vm.h index 552edc5..b9ccbf9 100644 --- a/include/tvm/runtime/vm.h +++ b/include/tvm/runtime/vm.h @@ -241,7 +241,7 @@ struct Instruction { /*! \brief The size of the allocation. */ RegName allocation_size; /*! \brief The alignment of the allocation. */ - RegName alignment; + Index alignment; /*! \brief The hint of the dtype. */ DLDataType dtype_hint; } alloc_storage; @@ -386,7 +386,7 @@ struct Instruction { * \param dst The destination to place the storage. * \return The alloc storage instruction. */ - static Instruction AllocStorage(RegName size, RegName alignment, DLDataType dtype_hint, + static Instruction AllocStorage(RegName size, Index alignment, DLDataType dtype_hint, RegName dst); Instruction(); @@ -733,7 +733,7 @@ class VirtualMachine : public runtime::ModuleNode { * \param reg The register to read from. * \return The read scalar. */ - int32_t LoadScalarInt(RegName reg) const; + inline int64_t LoadScalarInt(RegName reg) const; /*! * \brief Invoke a VM function. 
diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index 0af1949..0b839a2 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -204,6 +204,9 @@ TreeObjectPtr BuildDecisionTreeFromClauses(MatchValuePtr data, tvm::Array<Clause> clauses) { std::vector<int64_t> ToAllocTensorShape(NDArray shape) { std::vector<int64_t> raw_shape; + if (shape->ndim == 0) { + return raw_shape; + } CHECK_EQ(shape->ndim, 1u); CHECK_EQ(shape->dtype.code, 0U) << "The dtype of constant shape must be int32 or int64, but got " << DLDataType2String(shape->dtype); @@ -425,10 +428,8 @@ class VMFunctionCompiler : ExprFunctor<void(const Expr& expr)> { // Prepare input and output registers std::vector<Index> argument_registers; for (auto input : inputs) { - auto reg = var_register_map_.find(Downcast<Var>(input)); - CHECK(reg != var_register_map_.end()) - << "internal error: all variables should be in the register mapping"; - argument_registers.push_back(reg->second); + VisitExpr(input); + argument_registers.push_back(last_register_); } for (auto output : outputs) { @@ -457,10 +458,8 @@ class VMFunctionCompiler : ExprFunctor<void(const Expr& expr)> { << "please file a bug in the memory manifestation pass"; for (auto input : input_tuple->fields) { - auto reg = var_register_map_.find(Downcast<Var>(input)); - CHECK(reg != var_register_map_.end()) - << "internal error: all variables should be in the register mapping"; - argument_registers.push_back(reg->second); + VisitExpr(input); + argument_registers.push_back(last_register_); } for (auto output : output_tuple->fields) { @@ -566,16 +565,20 @@ class VMFunctionCompiler : ExprFunctor<void(const Expr& expr)> { this->VisitExpr(args[0]); auto size_register = last_register_; - this->VisitExpr(args[1]); - auto alignment_register = last_register_; + CHECK(args[1].as<ConstantNode>()); + NDArray alignment_arr = args[1].as<ConstantNode>()->data; + CHECK_EQ(alignment_arr->dtype.code, 0U) + << "The dtype of constant shape must be int32 or int64, but got " + << DLDataType2String(alignment_arr->dtype); + CHECK_EQ(alignment_arr->dtype.bits, 64U); + Index alignment = 
reinterpret_cast<int64_t*>(alignment_arr->data)[0]; // Get the dtype hint from the attributes. auto alloc_attrs = attrs.as<AllocStorageAttrs>(); CHECK(alloc_attrs != nullptr) << "must be the alloc tensor attrs"; auto dtype = alloc_attrs->dtype; - Emit(Instruction::AllocStorage(size_register, alignment_register, dtype, - NewRegister())); + Emit(Instruction::AllocStorage(size_register, alignment, dtype, NewRegister())); }) .Match("memory.shape_func", [this](const Array<Expr>& args, const Attrs& attrs, const Array<Type>& type_arg) { @@ -890,7 +893,9 @@ transform::Sequential MemoryOpt(tvm::Target host_target) { pass_seqs.push_back(transform::FoldConstant()); // Lift constants to the top-level of the block to simplify VM code generation. - pass_seqs.push_back(transform::LiftConstants()); + // TODO(@icemelon9, @jroesch): Remove this pass for now because some + // instructions need to access to constant + // pass_seqs.push_back(transform::LiftConstants()); return transform::Sequential(pass_seqs); } diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc index 47bdd1c..65b1a2f 100644 --- a/src/runtime/vm/executable.cc +++ b/src/runtime/vm/executable.cc @@ -552,7 +552,7 @@ Instruction DeserializeInstruction(const VMInstructionSerializer& instr) { case Opcode::AllocTensor: { // Number of fields = 7 + instr.alloc_tensor.ndim DCHECK_GE(instr.fields.size(), 7U); - DCHECK_EQ(instr.fields.size(), 7U + static_cast<size_t>(instr.fields[4])); + DCHECK_EQ(instr.fields.size(), 7U + static_cast<size_t>(instr.fields[5])); RegName storage_reg = instr.fields[0]; RegName offset = instr.fields[1]; diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc index 42bca37..0c0ca35 100644 --- a/src/runtime/vm/vm.cc +++ b/src/runtime/vm/vm.cc @@ -529,8 +529,7 @@ void InstructionPrint(std::ostream& os, const Instruction& instr) { } case Opcode::AllocTensorReg: { os << "alloc_tensor_reg $" << instr.dst << " $" << instr.alloc_tensor_reg.storage << " $" - << instr.alloc_tensor_reg.storage << " $" << instr.alloc_tensor_reg.offset << " $" - 
<< instr.alloc_tensor_reg.shape_register << " "; + << instr.alloc_tensor_reg.offset << " $" << instr.alloc_tensor_reg.shape_register << " "; DLDatatypePrint(os, instr.alloc_tensor_reg.dtype); break; } @@ -581,7 +580,7 @@ void InstructionPrint(std::ostream& os, const Instruction& instr) { break; } case Opcode::AllocStorage: { - os << "alloc_storage $" << instr.dst << " $" << instr.alloc_storage.allocation_size << " $" + os << "alloc_storage $" << instr.dst << " $" << instr.alloc_storage.allocation_size << " " << instr.alloc_storage.alignment << " " << DLDataType2String(instr.alloc_storage.dtype_hint); break; } @@ -822,6 +821,9 @@ void VirtualMachine::LoadExecutable(const Executable* exec) { CHECK(pf != nullptr) << "Cannot find function in module: " << packed_name; packed_funcs_[packed_index] = pf; } + for (size_t i = 0; i < packed_funcs_.size(); ++i) { + CHECK(packed_funcs_[i] != nullptr) << "Packed function " << i << " is not initialized"; + } } void VirtualMachine::Init(const std::vector<TVMContext>& ctxs) { ctxs_ = ctxs; } @@ -834,18 +836,34 @@ inline ObjectRef VirtualMachine::ReadRegister(Index r) const { return frames_.back().register_file[r]; } -inline int32_t VirtualMachine::LoadScalarInt(Index r) const { - int32_t result; +inline int64_t VirtualMachine::LoadScalarInt(Index r) const { + int64_t result = 0; const auto& obj = ReadRegister(r); - auto nd_array = Downcast<NDArray>(obj); - NDArray array = nd_array.CopyTo({kDLCPU, 0}); + NDArray array = Downcast<NDArray>(CopyTo(obj, {kDLCPU, 0})); - if (array->dtype.bits <= 8) { - result = reinterpret_cast<int8_t*>(array->data)[0]; - } else if (array->dtype.bits <= 16) { - result = reinterpret_cast<int16_t*>(array->data)[0]; - } else { - result = reinterpret_cast<int32_t*>(array->data)[0]; + switch (array->dtype.bits) { + case 1: { + result = reinterpret_cast<int8_t*>(array->data)[0]; + break; + } + case 8: { + result = reinterpret_cast<int8_t*>(array->data)[0]; + break; + } + case 16: { + result = reinterpret_cast<int16_t*>(array->data)[0]; + break; + } + case 32: { + result = 
reinterpret_cast<int32_t*>(array->data)[0]; + break; + } + case 64: { + result = reinterpret_cast<int64_t*>(array->data)[0]; + break; + } + default: + LOG(FATAL) << "Unknown scalar int type: " << DLDataType2String(array->dtype); } return result; } @@ -908,8 +926,8 @@ void VirtualMachine::RunLoop() { goto main_loop; } case Opcode::InvokePacked: { - DLOG(INFO) << "InvokedPacked " - << "arity=" << instr.arity; + DLOG(INFO) << "InvokedPacked " << instr.packed_index << " arity=" << instr.arity; + CHECK_LE(instr.packed_index, packed_funcs_.size()); const auto& func = packed_funcs_[instr.packed_index]; const auto& arity = instr.arity; std::vector<ObjectRef> args; @@ -996,9 +1014,8 @@ void VirtualMachine::RunLoop() { DLContext cpu_ctx; cpu_ctx.device_type = kDLCPU; cpu_ctx.device_id = 0; - auto shape_tensor_obj = ReadRegister(instr.alloc_tensor_reg.shape_register); - const auto shape_arr = Downcast<NDArray>(shape_tensor_obj); - NDArray shape_tensor = shape_arr.CopyTo(cpu_ctx); + auto shape_obj = ReadRegister(instr.alloc_tensor_reg.shape_register); + NDArray shape_tensor = Downcast<NDArray>(CopyTo(shape_obj, cpu_ctx)); auto shape = ToShape(shape_tensor); auto storage_obj = ReadRegister(instr.alloc_tensor_reg.storage); auto storage = Downcast<Storage>(storage_obj); @@ -1030,7 +1047,7 @@ void VirtualMachine::RunLoop() { } case Opcode::AllocStorage: { auto size = LoadScalarInt(instr.alloc_storage.allocation_size); - auto alignment = LoadScalarInt(instr.alloc_storage.alignment); + auto alignment = instr.alloc_storage.alignment; DLOG(INFO) << "AllocStorage: allocation_size=" << size << "alignment=" << alignment << "dtype_hint=" << DLDataType2String(instr.alloc_storage.dtype_hint);