From: Zhigang Gong Date: Tue, 6 May 2014 10:31:13 +0000 (+0800) Subject: GBE: implement uniform analysis. X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=746420f2af5ccc62b5d57b645be72df3deae85af;p=contrib%2Fbeignet.git GBE: implement uniform analysis. We have many uniform (scalar) input values, which include the kernel input arguments and some special registers. All variables derived only from uniform values are also uniform values. This patch analyzes this type of register at the liveness analysis stage, and changes each uniform register's type to scalar type. Later on, these registers need less register space. Signed-off-by: Zhigang Gong Reviewed-by: Ruiling Song --- diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp index ceaa199..aa30b05 100644 --- a/backend/src/backend/gen_encoder.cpp +++ b/backend/src/backend/gen_encoder.cpp @@ -1213,7 +1213,7 @@ namespace gbe assert(dst.file == GEN_GENERAL_REGISTER_FILE); assert(src0.file == GEN_GENERAL_REGISTER_FILE); assert(src1.file == GEN_GENERAL_REGISTER_FILE); - assert(dst.hstride == GEN_HORIZONTAL_STRIDE_1); + assert(dst.hstride == GEN_HORIZONTAL_STRIDE_1 || dst.hstride == GEN_HORIZONTAL_STRIDE_0); if (function == GEN_MATH_FUNCTION_INT_DIV_QUOTIENT || function == GEN_MATH_FUNCTION_INT_DIV_REMAINDER || @@ -1233,7 +1233,7 @@ namespace gbe if (function == GEN_MATH_FUNCTION_INT_DIV_QUOTIENT || function == GEN_MATH_FUNCTION_INT_DIV_REMAINDER) { - insn->header.execution_size = GEN_WIDTH_8; + insn->header.execution_size = this->curr.execWidth == 1 ? 
GEN_WIDTH_1 : GEN_WIDTH_8; insn->header.quarter_control = GEN_COMPRESSION_Q1; if(this->curr.execWidth == 16) { @@ -1258,7 +1258,7 @@ namespace gbe GenNativeInstruction *insn = this->next(GEN_OPCODE_MATH); assert(dst.file == GEN_GENERAL_REGISTER_FILE); assert(src.file == GEN_GENERAL_REGISTER_FILE); - assert(dst.hstride == GEN_HORIZONTAL_STRIDE_1); + assert(dst.hstride == GEN_HORIZONTAL_STRIDE_1 || dst.hstride == GEN_HORIZONTAL_STRIDE_0); assert(src.type == GEN_TYPE_F); insn->header.destreg_or_condmod = function; diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index 88ec408..3be6efc 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -1718,108 +1718,114 @@ namespace gbe const ir::Type insnType = insn.getType(); const GenRegister dst = sel.selReg(insn.getDst(0), getType(opcode, insnType)); const GenRegister src = sel.selReg(insn.getSrc(0), getType(opcode, insnType)); - switch (opcode) { - case ir::OP_ABS: - if (insn.getType() == ir::TYPE_S32) { - const GenRegister src_ = GenRegister::retype(src, GEN_TYPE_D); - const GenRegister dst_ = GenRegister::retype(dst, GEN_TYPE_D); - sel.MOV(dst_, GenRegister::abs(src_)); - } else { - GBE_ASSERT(insn.getType() == ir::TYPE_FLOAT); - sel.MOV(dst, GenRegister::abs(src)); - } - break; - case ir::OP_MOV: - if (dst.isdf()) { - ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD); - sel.MOV_DF(dst, src, sel.selReg(r)); - } else { - sel.push(); - if (sel.getRegisterFamily(insn.getDst(0)) == ir::FAMILY_BOOL) { - sel.curr.physicalFlag = 0; + sel.push(); + if (sel.isScalarOrBool(insn.getDst(0)) == true) { + sel.curr.execWidth = 1; + sel.curr.predicate = GEN_PREDICATE_NONE; + sel.curr.noMask = 1; + } + switch (opcode) { + case ir::OP_ABS: + if (insn.getType() == ir::TYPE_S32) { + const GenRegister src_ = GenRegister::retype(src, GEN_TYPE_D); + const GenRegister dst_ = GenRegister::retype(dst, GEN_TYPE_D); + sel.MOV(dst_, 
GenRegister::abs(src_)); + } else { + GBE_ASSERT(insn.getType() == ir::TYPE_FLOAT); + sel.MOV(dst, GenRegister::abs(src)); + } + break; + case ir::OP_MOV: + if (dst.isdf()) { + ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD); + sel.MOV_DF(dst, src, sel.selReg(r)); + } else { + sel.push(); + if (sel.getRegisterFamily(insn.getDst(0)) == ir::FAMILY_BOOL) { + sel.curr.physicalFlag = 0; sel.curr.flagIndex = (uint16_t)(insn.getDst(0)); sel.curr.modFlag = 1; } sel.MOV(dst, src); - sel.pop(); - } - break; - case ir::OP_RNDD: sel.RNDD(dst, src); break; - case ir::OP_RNDE: sel.RNDE(dst, src); break; - case ir::OP_RNDU: sel.RNDU(dst, src); break; - case ir::OP_RNDZ: sel.RNDZ(dst, src); break; - case ir::OP_FBH: sel.FBH(dst, src); break; - case ir::OP_FBL: sel.FBL(dst, src); break; - case ir::OP_COS: sel.MATH(dst, GEN_MATH_FUNCTION_COS, src); break; - case ir::OP_SIN: sel.MATH(dst, GEN_MATH_FUNCTION_SIN, src); break; - case ir::OP_LOG: sel.MATH(dst, GEN_MATH_FUNCTION_LOG, src); break; - case ir::OP_EXP: sel.MATH(dst, GEN_MATH_FUNCTION_EXP, src); break; - case ir::OP_SQR: sel.MATH(dst, GEN_MATH_FUNCTION_SQRT, src); break; - case ir::OP_RSQ: sel.MATH(dst, GEN_MATH_FUNCTION_RSQ, src); break; - case ir::OP_RCP: sel.MATH(dst, GEN_MATH_FUNCTION_INV, src); break; - case ir::OP_SIMD_ANY: - { - const GenRegister constZero = GenRegister::immuw(0);; - const GenRegister regOne = GenRegister::uw1grf(ir::ocl::one); - const GenRegister flag01 = GenRegister::flag(0, 1); - - sel.push(); - int simdWidth = sel.curr.execWidth; - sel.curr.predicate = GEN_PREDICATE_NONE; - sel.curr.execWidth = 1; - sel.curr.noMask = 1; - sel.MOV(flag01, constZero); - - sel.curr.execWidth = simdWidth; - sel.curr.noMask = 0; - - sel.curr.flag = 0; - sel.curr.subFlag = 1; - sel.CMP(GEN_CONDITIONAL_NEQ, src, constZero); - - if (sel.curr.execWidth == 16) - sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H; - else if (sel.curr.execWidth == 8) - sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H; - else - 
NOT_IMPLEMENTED; - sel.SEL(dst, regOne, constZero); - sel.pop(); - } - break; - case ir::OP_SIMD_ALL: - { - const GenRegister constZero = GenRegister::immuw(0); - const GenRegister regOne = GenRegister::uw1grf(ir::ocl::one); - const GenRegister flag01 = GenRegister::flag(0, 1); - - sel.push(); - int simdWidth = sel.curr.execWidth; - sel.curr.predicate = GEN_PREDICATE_NONE; - sel.curr.execWidth = 1; - sel.curr.noMask = 1; - sel.MOV(flag01, regOne); - - sel.curr.execWidth = simdWidth; - sel.curr.noMask = 0; - - sel.curr.flag = 0; - sel.curr.subFlag = 1; - sel.CMP(GEN_CONDITIONAL_NEQ, src, constZero); - - if (sel.curr.execWidth == 16) - sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H; - else if (sel.curr.execWidth == 8) - sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H; - else - NOT_IMPLEMENTED; - sel.SEL(dst, regOne, constZero); - sel.pop(); - } - break; + sel.pop(); + } + break; + case ir::OP_RNDD: sel.RNDD(dst, src); break; + case ir::OP_RNDE: sel.RNDE(dst, src); break; + case ir::OP_RNDU: sel.RNDU(dst, src); break; + case ir::OP_RNDZ: sel.RNDZ(dst, src); break; + case ir::OP_FBH: sel.FBH(dst, src); break; + case ir::OP_FBL: sel.FBL(dst, src); break; + case ir::OP_COS: sel.MATH(dst, GEN_MATH_FUNCTION_COS, src); break; + case ir::OP_SIN: sel.MATH(dst, GEN_MATH_FUNCTION_SIN, src); break; + case ir::OP_LOG: sel.MATH(dst, GEN_MATH_FUNCTION_LOG, src); break; + case ir::OP_EXP: sel.MATH(dst, GEN_MATH_FUNCTION_EXP, src); break; + case ir::OP_SQR: sel.MATH(dst, GEN_MATH_FUNCTION_SQRT, src); break; + case ir::OP_RSQ: sel.MATH(dst, GEN_MATH_FUNCTION_RSQ, src); break; + case ir::OP_RCP: sel.MATH(dst, GEN_MATH_FUNCTION_INV, src); break; + case ir::OP_SIMD_ANY: + { + const GenRegister constZero = GenRegister::immuw(0);; + const GenRegister regOne = GenRegister::uw1grf(ir::ocl::one); + const GenRegister flag01 = GenRegister::flag(0, 1); + + sel.push(); + int simdWidth = sel.curr.execWidth; + sel.curr.predicate = GEN_PREDICATE_NONE; + sel.curr.execWidth = 1; + sel.curr.noMask = 
1; + sel.MOV(flag01, constZero); + sel.curr.execWidth = simdWidth; + sel.curr.noMask = 0; + + sel.curr.flag = 0; + sel.curr.subFlag = 1; + sel.CMP(GEN_CONDITIONAL_NEQ, src, constZero); + + if (sel.curr.execWidth == 16) + sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H; + else if (sel.curr.execWidth == 8) + sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H; + else + NOT_IMPLEMENTED; + sel.SEL(dst, regOne, constZero); + sel.pop(); + } + break; + case ir::OP_SIMD_ALL: + { + const GenRegister constZero = GenRegister::immuw(0); + const GenRegister regOne = GenRegister::uw1grf(ir::ocl::one); + const GenRegister flag01 = GenRegister::flag(0, 1); + + sel.push(); + int simdWidth = sel.curr.execWidth; + sel.curr.predicate = GEN_PREDICATE_NONE; + sel.curr.execWidth = 1; + sel.curr.noMask = 1; + sel.MOV(flag01, regOne); + + sel.curr.execWidth = simdWidth; + sel.curr.noMask = 0; + + sel.curr.flag = 0; + sel.curr.subFlag = 1; + sel.CMP(GEN_CONDITIONAL_NEQ, src, constZero); + + if (sel.curr.execWidth == 16) + sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H; + else if (sel.curr.execWidth == 8) + sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H; + else + NOT_IMPLEMENTED; + sel.SEL(dst, regOne, constZero); + sel.pop(); + } + break; - default: NOT_SUPPORTED; - } + default: NOT_SUPPORTED; + } + sel.pop(); return true; } DECL_CTOR(UnaryInstruction, 1, 1) @@ -1905,8 +1911,19 @@ namespace gbe const Type type = insn.getType(); GenRegister dst = sel.selReg(insn.getDst(0), type); + sel.push(); + + // Boolean values use scalars + if (sel.isScalarOrBool(insn.getDst(0)) == true) { + sel.curr.execWidth = 1; + sel.curr.predicate = GEN_PREDICATE_NONE; + sel.curr.noMask = 1; + } + if(opcode == OP_DIV || opcode == OP_REM) { - return this->emitDivRemInst(sel, dag, opcode); + bool ret = this->emitDivRemInst(sel, dag, opcode); + sel.pop(); + return ret; } // Immediates not supported if (opcode == OP_POW) { @@ -1919,17 +1936,11 @@ namespace gbe NOT_IMPLEMENTED; } markAllChildren(dag); + sel.pop(); 
return true; } - sel.push(); - - // Boolean values use scalars - if (sel.isScalarOrBool(insn.getDst(0)) == true) { - sel.curr.execWidth = 1; - sel.curr.predicate = GEN_PREDICATE_NONE; - sel.curr.noMask = 1; - } + //printf("reg = %d isscalarorbool %d \n", insn.getDst(0), sel.isScalarOrBool(insn.getDst(0))); // Look for immediate values GenRegister src0, src1; @@ -2252,16 +2263,21 @@ namespace gbe const ir::Opcode opcode = cmpInsn.getOpcode(); if(opcode == OP_ORD) return false; const uint32_t genCmp = getGenCompare(opcode); + sel.push(); + if (sel.isScalarOrBool(insn.getDst(0)) == true) { + sel.curr.execWidth = 1; + sel.curr.predicate = GEN_PREDICATE_NONE; + sel.curr.noMask = 1; + } - // Like for regular selects, we need a temporary since we cannot predicate - // properly - const ir::Type type = cmpInsn.getType(); - const uint32_t simdWidth = sel.curr.execWidth; - const GenRegister dst = sel.selReg(insn.getDst(0), type); - const GenRegister src0 = sel.selReg(cmpInsn.getSrc(0), type); - const GenRegister src1 = sel.selReg(cmpInsn.getSrc(1), type); + // Like for regular selects, we need a temporary since we cannot predicate + // properly + const ir::Type type = cmpInsn.getType(); + const uint32_t simdWidth = sel.curr.execWidth; + const GenRegister dst = sel.selReg(insn.getDst(0), type); + const GenRegister src0 = sel.selReg(cmpInsn.getSrc(0), type); + const GenRegister src1 = sel.selReg(cmpInsn.getSrc(1), type); - sel.push(); sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.execWidth = simdWidth; sel.SEL_CMP(genCmp, dst, src0, src1); @@ -2289,15 +2305,20 @@ namespace gbe { using namespace ir; const ir::BinaryInstruction &insn = cast(dag.insn); - const uint32_t simdWidth = sel.curr.execWidth; const Type type = insn.getType(); if (type == TYPE_U32 || type == TYPE_S32) { + sel.push(); + if (sel.isScalarOrBool(insn.getDst(0)) == true) { + sel.curr.execWidth = 1; + sel.curr.predicate = GEN_PREDICATE_NONE; + sel.curr.noMask = 1; + } + const uint32_t simdWidth = 
sel.curr.execWidth; + GenRegister dst = sel.selReg(insn.getDst(0), type); GenRegister src0 = sel.selReg(insn.getSrc(0), type); GenRegister src1 = sel.selReg(insn.getSrc(1), type); - sel.push(); - // Either left part of the 16-wide register or just a simd 8 register dst = GenRegister::retype(dst, GEN_TYPE_D); src0 = GenRegister::retype(src0, GEN_TYPE_D); @@ -2308,6 +2329,7 @@ namespace gbe sel.curr.accWrEnable = 1; sel.MACH(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), src0, src1); sel.curr.accWrEnable = 0; + sel.curr.execWidth = simdWidth != 1 ? 8 : 1;; sel.MOV(GenRegister::retype(dst, GEN_TYPE_F), GenRegister::acc()); // Right part of the 16-wide register now @@ -2378,17 +2400,33 @@ namespace gbe const Type type = imm.type; GBE_ASSERT(type == TYPE_U32 || type == TYPE_S32); if (type == TYPE_U32 && imm.data.u32 <= 0xffff) { - sel.MUL(sel.selReg(dst, type), - sel.selReg(src1, type), - GenRegister::immuw(imm.data.u32)); + sel.push(); + if (sel.isScalarOrBool(insn.getDst(0)) == true) { + sel.curr.execWidth = 1; + sel.curr.predicate = GEN_PREDICATE_NONE; + sel.curr.noMask = 1; + } + + sel.MUL(sel.selReg(dst, type), + sel.selReg(src1, type), + GenRegister::immuw(imm.data.u32)); + sel.pop(); if (dag.child[childID ^ 1] != NULL) dag.child[childID ^ 1]->isRoot = 1; return true; } if (type == TYPE_S32 && (imm.data.s32 >= -32768 && imm.data.s32 <= 32767)) { - sel.MUL(sel.selReg(dst, type), - sel.selReg(src1, type), - GenRegister::immw(imm.data.s32)); + sel.push(); + if (sel.isScalarOrBool(insn.getDst(0)) == true) { + sel.curr.execWidth = 1; + sel.curr.predicate = GEN_PREDICATE_NONE; + sel.curr.noMask = 1; + } + + sel.MUL(sel.selReg(dst, type), + sel.selReg(src1, type), + GenRegister::immw(imm.data.s32)); + sel.pop(); if (dag.child[childID ^ 1] != NULL) dag.child[childID ^ 1]->isRoot = 1; return true; @@ -2407,9 +2445,16 @@ namespace gbe const Register src0 = insn.getSrc(childID); const Register src1 = insn.getSrc(childID ^ 1); if (is16BitSpecialReg(src0)) { - 
sel.MUL(sel.selReg(dst, type), - sel.selReg(src1, type), - sel.selReg(src0, TYPE_U32)); + sel.push(); + if (sel.isScalarOrBool(insn.getDst(0)) == true) { + sel.curr.execWidth = 1; + sel.curr.predicate = GEN_PREDICATE_NONE; + sel.curr.noMask = 1; + } + sel.MUL(sel.selReg(dst, type), + sel.selReg(src1, type), + sel.selReg(src0, TYPE_U32)); + sel.pop(); markAllChildren(dag); return true; } @@ -2660,7 +2705,7 @@ namespace gbe insn.getAddressSpace() == MEM_CONSTANT || insn.getAddressSpace() == MEM_PRIVATE || insn.getAddressSpace() == MEM_LOCAL); - GBE_ASSERT(sel.isScalarReg(insn.getValue(0)) == false); + //GBE_ASSERT(sel.isScalarReg(insn.getValue(0)) == false); const Type type = insn.getValueType(); const uint32_t elemSize = getByteScatterGatherSize(type); if (insn.getAddressSpace() == MEM_CONSTANT) { @@ -2967,11 +3012,14 @@ namespace gbe const GenRegister dst = sel.selReg(insn.getDst(0), dstType); const GenRegister src = sel.selReg(insn.getSrc(0), srcType); const Opcode opcode = insn.getOpcode(); - - if(opcode == ir::OP_SAT_CVT) { - sel.push(); + sel.push(); + if (sel.isScalarOrBool(insn.getDst(0)) == true) { + sel.curr.execWidth = 1; + sel.curr.predicate = GEN_PREDICATE_NONE; + sel.curr.noMask = 1; + } + if(opcode == ir::OP_SAT_CVT) sel.curr.saturate = 1; - } // We need two instructions to make the conversion if (opcode == OP_F16TO32) { @@ -3037,8 +3085,7 @@ namespace gbe } else sel.MOV(dst, src); - if(opcode == ir::OP_SAT_CVT) - sel.pop(); + sel.pop(); return true; } diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp index 8831d47..266e652 100644 --- a/backend/src/ir/function.hpp +++ b/backend/src/ir/function.hpp @@ -82,6 +82,7 @@ namespace ir { } } set undefPhiRegs; + set definedPhiRegs; private: friend class Function; //!< Owns the basic blocks BlockSet predecessors; //!< Incoming blocks @@ -176,6 +177,10 @@ namespace ir { uint32_t getSimdWidth(void) const { return simdWidth; } /*! 
Extract the register from the register file */ INLINE RegisterData getRegisterData(Register reg) const { return file.get(reg); } + /*! set a register to uniform or nonuniform type. */ + INLINE void setRegisterUniform(Register reg, bool uniform) { file.setUniform(reg, uniform); } + /*! return true if the specified regsiter is uniform type */ + INLINE bool isUniformRegister(Register reg) { return file.isUniform(reg); } /*! Get the register family from the register itself */ INLINE RegisterFamily getRegisterFamily(Register reg) const { return this->getRegisterData(reg).family; diff --git a/backend/src/ir/liveness.cpp b/backend/src/ir/liveness.cpp index 474ed3e..ac61e7e 100644 --- a/backend/src/ir/liveness.cpp +++ b/backend/src/ir/liveness.cpp @@ -64,8 +64,11 @@ namespace ir { const uint32_t srcNum = insn.getSrcNum(); const uint32_t dstNum = insn.getDstNum(); // First look for used before killed + bool uniform = true; for (uint32_t srcID = 0; srcID < srcNum; ++srcID) { const Register reg = insn.getSrc(srcID); + if (!fn.isUniformRegister(reg)) + uniform = false; // Not killed -> it is really an upward use if (info.varKill.contains(reg) == false) info.upwardUsed.insert(reg); @@ -73,6 +76,13 @@ namespace ir { // A destination is a killed value for (uint32_t dstID = 0; dstID < dstNum; ++dstID) { const Register reg = insn.getDst(dstID); + if ( uniform && + fn.getRegisterFamily(reg) != ir::FAMILY_BOOL && + fn.getRegisterFamily(reg) != ir::FAMILY_QWORD && + !info.bb.definedPhiRegs.contains(reg) && + insn.getOpcode() != ir::OP_LOAD && + insn.getOpcode() != ir::OP_ATOMIC ) + fn.setRegisterUniform(reg, true); info.varKill.insert(reg); } } diff --git a/backend/src/ir/register.hpp b/backend/src/ir/register.hpp index 340ebc8..0f7c6a1 100644 --- a/backend/src/ir/register.hpp +++ b/backend/src/ir/register.hpp @@ -84,6 +84,7 @@ namespace ir { INLINE ~RegisterData(void) {} RegisterFamily family; //!< Register size or if it is a flag INLINE const bool isUniform() const { return uniform; 
} + INLINE void setUniform(bool uni) { uniform = uni; } private: bool uniform; GBE_CLASS(RegisterData); @@ -135,6 +136,10 @@ namespace ir { INLINE void appendTuple(void) {} /*! Return a copy of the register at index */ INLINE RegisterData get(Register index) const { return regs[index]; } + /*! Return true if the specified register is uniform type. */ + INLINE bool isUniform(Register index) { return regs[index].isUniform(); } + /*! Set a register to uniform or varying data type*/ + INLINE void setUniform(Register index, bool uniform) { regs[index].setUniform(uniform); } /*! Get the register index from the tuple */ INLINE Register get(Tuple index, uint32_t which) const { return regTuples[uint16_t(index) + which]; diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp index c71a17d..82429d0 100644 --- a/backend/src/llvm/llvm_gen_backend.cpp +++ b/backend/src/llvm/llvm_gen_backend.cpp @@ -1076,6 +1076,7 @@ namespace gbe ctx.MOV(type, dst, src); } assert(!ctx.getBlock()->undefPhiRegs.contains(dst)); + ctx.getBlock()->definedPhiRegs.insert(dst); } else { // If this is an undefined value, we don't need emit phi copy here. // But we need to record it. As latter, at liveness's backward analysis,