From: Ilia Mirkin Date: Sun, 5 Feb 2017 03:31:04 +0000 (-0500) Subject: nvc0/ir: add support for all the new int64 tgsi opcodes X-Git-Tag: upstream/17.1.0~2406 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=1aefd6159c07cd5b646ce99afd96d4500020418a;p=platform%2Fupstream%2Fmesa.git nvc0/ir: add support for all the new int64 tgsi opcodes A few thoughts: - Some of that LegalizeSSA logic should really live much earlier and be subject to the likes of DCE and other useful passes - Some of the "lowering" done in from_tgsi should be done later so that proper optimization might be done. However this all works and the above can be improved upon later. Signed-off-by: Ilia Mirkin --- diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp index 6320e52..80cc7fa 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp @@ -354,6 +354,14 @@ unsigned int Instruction::srcMask(unsigned int s) const case TGSI_OPCODE_DSGE: case TGSI_OPCODE_DSEQ: case TGSI_OPCODE_DSNE: + case TGSI_OPCODE_U64SEQ: + case TGSI_OPCODE_U64SNE: + case TGSI_OPCODE_I64SLT: + case TGSI_OPCODE_U64SLT: + case TGSI_OPCODE_I64SGE: + case TGSI_OPCODE_U64SGE: + case TGSI_OPCODE_I642F: + case TGSI_OPCODE_U642F: switch (util_bitcount(mask)) { case 1: return 0x3; case 2: return 0xf; @@ -557,6 +565,7 @@ nv50_ir::DataType Instruction::inferSrcType() const case TGSI_OPCODE_SHL: case TGSI_OPCODE_U2F: case TGSI_OPCODE_U2D: + case TGSI_OPCODE_U2I64: case TGSI_OPCODE_UADD: case TGSI_OPCODE_UDIV: case TGSI_OPCODE_UMOD: @@ -587,6 +596,7 @@ nv50_ir::DataType Instruction::inferSrcType() const return nv50_ir::TYPE_U32; case TGSI_OPCODE_I2F: case TGSI_OPCODE_I2D: + case TGSI_OPCODE_I2I64: case TGSI_OPCODE_IDIV: case TGSI_OPCODE_IMUL_HI: case TGSI_OPCODE_IMAX: @@ -608,6 +618,8 @@ nv50_ir::DataType Instruction::inferSrcType() const case TGSI_OPCODE_D2F: case 
TGSI_OPCODE_D2I: case TGSI_OPCODE_D2U: + case TGSI_OPCODE_D2I64: + case TGSI_OPCODE_D2U64: case TGSI_OPCODE_DABS: case TGSI_OPCODE_DNEG: case TGSI_OPCODE_DADD: @@ -630,6 +642,34 @@ nv50_ir::DataType Instruction::inferSrcType() const case TGSI_OPCODE_DFLR: case TGSI_OPCODE_DROUND: return nv50_ir::TYPE_F64; + case TGSI_OPCODE_U64SEQ: + case TGSI_OPCODE_U64SNE: + case TGSI_OPCODE_U64SLT: + case TGSI_OPCODE_U64SGE: + case TGSI_OPCODE_U64MIN: + case TGSI_OPCODE_U64MAX: + case TGSI_OPCODE_U64ADD: + case TGSI_OPCODE_U64MUL: + case TGSI_OPCODE_U64SHL: + case TGSI_OPCODE_U64SHR: + case TGSI_OPCODE_U64DIV: + case TGSI_OPCODE_U64MOD: + case TGSI_OPCODE_U642F: + case TGSI_OPCODE_U642D: + return nv50_ir::TYPE_U64; + case TGSI_OPCODE_I64ABS: + case TGSI_OPCODE_I64SSG: + case TGSI_OPCODE_I64NEG: + case TGSI_OPCODE_I64SLT: + case TGSI_OPCODE_I64SGE: + case TGSI_OPCODE_I64MIN: + case TGSI_OPCODE_I64MAX: + case TGSI_OPCODE_I64SHR: + case TGSI_OPCODE_I64DIV: + case TGSI_OPCODE_I64MOD: + case TGSI_OPCODE_I642F: + case TGSI_OPCODE_I642D: + return nv50_ir::TYPE_S64; default: return nv50_ir::TYPE_F32; } @@ -650,17 +690,35 @@ nv50_ir::DataType Instruction::inferDstType() const case TGSI_OPCODE_DSGE: case TGSI_OPCODE_DSLT: case TGSI_OPCODE_DSNE: + case TGSI_OPCODE_I64SLT: + case TGSI_OPCODE_I64SGE: + case TGSI_OPCODE_U64SEQ: + case TGSI_OPCODE_U64SNE: + case TGSI_OPCODE_U64SLT: + case TGSI_OPCODE_U64SGE: case TGSI_OPCODE_PK2H: return nv50_ir::TYPE_U32; case TGSI_OPCODE_I2F: case TGSI_OPCODE_U2F: case TGSI_OPCODE_D2F: + case TGSI_OPCODE_I642F: + case TGSI_OPCODE_U642F: case TGSI_OPCODE_UP2H: return nv50_ir::TYPE_F32; case TGSI_OPCODE_I2D: case TGSI_OPCODE_U2D: case TGSI_OPCODE_F2D: + case TGSI_OPCODE_I642D: + case TGSI_OPCODE_U642D: return nv50_ir::TYPE_F64; + case TGSI_OPCODE_I2I64: + case TGSI_OPCODE_U2I64: + case TGSI_OPCODE_F2I64: + case TGSI_OPCODE_D2I64: + return nv50_ir::TYPE_S64; + case TGSI_OPCODE_F2U64: + case TGSI_OPCODE_D2U64: + return nv50_ir::TYPE_U64; default: return 
inferSrcType(); } @@ -676,6 +734,8 @@ nv50_ir::CondCode Instruction::getSetCond() const case TGSI_OPCODE_USLT: case TGSI_OPCODE_FSLT: case TGSI_OPCODE_DSLT: + case TGSI_OPCODE_I64SLT: + case TGSI_OPCODE_U64SLT: return CC_LT; case TGSI_OPCODE_SLE: return CC_LE; @@ -684,6 +744,8 @@ nv50_ir::CondCode Instruction::getSetCond() const case TGSI_OPCODE_USGE: case TGSI_OPCODE_FSGE: case TGSI_OPCODE_DSGE: + case TGSI_OPCODE_I64SGE: + case TGSI_OPCODE_U64SGE: return CC_GE; case TGSI_OPCODE_SGT: return CC_GT; @@ -691,10 +753,12 @@ nv50_ir::CondCode Instruction::getSetCond() const case TGSI_OPCODE_USEQ: case TGSI_OPCODE_FSEQ: case TGSI_OPCODE_DSEQ: + case TGSI_OPCODE_U64SEQ: return CC_EQ; case TGSI_OPCODE_SNE: case TGSI_OPCODE_FSNE: case TGSI_OPCODE_DSNE: + case TGSI_OPCODE_U64SNE: return CC_NEU; case TGSI_OPCODE_USNE: return CC_NE; @@ -832,6 +896,35 @@ static nv50_ir::operation translateOpcode(uint opcode) NV50_IR_OPCODE_CASE(DFLR, FLOOR); NV50_IR_OPCODE_CASE(DROUND, CVT); + NV50_IR_OPCODE_CASE(U64SEQ, SET); + NV50_IR_OPCODE_CASE(U64SNE, SET); + NV50_IR_OPCODE_CASE(U64SLT, SET); + NV50_IR_OPCODE_CASE(U64SGE, SET); + NV50_IR_OPCODE_CASE(I64SLT, SET); + NV50_IR_OPCODE_CASE(I64SGE, SET); + NV50_IR_OPCODE_CASE(I2I64, CVT); + NV50_IR_OPCODE_CASE(U2I64, CVT); + NV50_IR_OPCODE_CASE(F2I64, CVT); + NV50_IR_OPCODE_CASE(F2U64, CVT); + NV50_IR_OPCODE_CASE(D2I64, CVT); + NV50_IR_OPCODE_CASE(D2U64, CVT); + NV50_IR_OPCODE_CASE(I642F, CVT); + NV50_IR_OPCODE_CASE(U642F, CVT); + NV50_IR_OPCODE_CASE(I642D, CVT); + NV50_IR_OPCODE_CASE(U642D, CVT); + + NV50_IR_OPCODE_CASE(I64MIN, MIN); + NV50_IR_OPCODE_CASE(U64MIN, MIN); + NV50_IR_OPCODE_CASE(I64MAX, MAX); + NV50_IR_OPCODE_CASE(U64MAX, MAX); + NV50_IR_OPCODE_CASE(I64ABS, ABS); + NV50_IR_OPCODE_CASE(I64NEG, NEG); + NV50_IR_OPCODE_CASE(U64ADD, ADD); + NV50_IR_OPCODE_CASE(U64MUL, MUL); + NV50_IR_OPCODE_CASE(U64SHL, SHL); + NV50_IR_OPCODE_CASE(I64SHR, SHR); + NV50_IR_OPCODE_CASE(U64SHR, SHR); + NV50_IR_OPCODE_CASE(IMUL_HI, MUL); 
NV50_IR_OPCODE_CASE(UMUL_HI, MUL); @@ -3721,6 +3814,8 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn) case TGSI_OPCODE_INTERP_OFFSET: handleINTERP(dst0); break; + case TGSI_OPCODE_I642F: + case TGSI_OPCODE_U642F: case TGSI_OPCODE_D2I: case TGSI_OPCODE_D2U: case TGSI_OPCODE_D2F: { @@ -3737,16 +3832,79 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn) } break; } + case TGSI_OPCODE_I2I64: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + dst0[c] = fetchSrc(0, c / 2); + mkOp2(OP_SHR, TYPE_S32, dst0[c + 1], dst0[c], loadImm(NULL, 31)); + c++; + } + break; + case TGSI_OPCODE_U2I64: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + dst0[c] = fetchSrc(0, c / 2); + dst0[c + 1] = zero; + c++; + } + break; + case TGSI_OPCODE_F2I64: + case TGSI_OPCODE_F2U64: case TGSI_OPCODE_I2D: case TGSI_OPCODE_U2D: case TGSI_OPCODE_F2D: FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { Value *dreg = getSSA(8); - mkCvt(OP_CVT, dstTy, dreg, srcTy, fetchSrc(0, c / 2)); + Instruction *cvt = mkCvt(OP_CVT, dstTy, dreg, srcTy, fetchSrc(0, c / 2)); + if (!isFloatType(dstTy)) + cvt->rnd = ROUND_Z; mkSplit(&dst0[c], 4, dreg); c++; } break; + case TGSI_OPCODE_D2I64: + case TGSI_OPCODE_D2U64: + case TGSI_OPCODE_I642D: + case TGSI_OPCODE_U642D: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + src0 = getSSA(8); + Value *dst = getSSA(8), *tmp[2]; + tmp[0] = fetchSrc(0, c); + tmp[1] = fetchSrc(0, c + 1); + mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]); + Instruction *cvt = mkCvt(OP_CVT, dstTy, dst, srcTy, src0); + if (!isFloatType(dstTy)) + cvt->rnd = ROUND_Z; + mkSplit(&dst0[c], 4, dst); + c++; + } + break; + case TGSI_OPCODE_I64NEG: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + src0 = getSSA(8); + Value *dst = getSSA(8), *tmp[2]; + tmp[0] = fetchSrc(0, c); + tmp[1] = fetchSrc(0, c + 1); + mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]); + mkOp2(OP_SUB, dstTy, dst, zero, src0); + mkSplit(&dst0[c], 4, dst); + c++; + } + break; + case TGSI_OPCODE_I64ABS: + 
FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + src0 = getSSA(8); + Value *neg = getSSA(8), *srcComp[2], *negComp[2]; + srcComp[0] = fetchSrc(0, c); + srcComp[1] = fetchSrc(0, c + 1); + mkOp2(OP_MERGE, TYPE_U64, src0, srcComp[0], srcComp[1]); + mkOp2(OP_SUB, dstTy, neg, zero, src0); + mkSplit(negComp, 4, neg); + mkCmp(OP_SLCT, CC_LT, TYPE_S32, dst0[c], TYPE_S32, + negComp[0], srcComp[0], srcComp[1]); + mkCmp(OP_SLCT, CC_LT, TYPE_S32, dst0[c + 1], TYPE_S32, + negComp[1], srcComp[1], srcComp[1]); + c++; + } + break; case TGSI_OPCODE_DABS: case TGSI_OPCODE_DNEG: case TGSI_OPCODE_DRCP: @@ -3779,6 +3937,12 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn) c++; } break; + case TGSI_OPCODE_U64SEQ: + case TGSI_OPCODE_U64SNE: + case TGSI_OPCODE_U64SLT: + case TGSI_OPCODE_U64SGE: + case TGSI_OPCODE_I64SLT: + case TGSI_OPCODE_I64SGE: case TGSI_OPCODE_DSLT: case TGSI_OPCODE_DSGE: case TGSI_OPCODE_DSEQ: @@ -3800,6 +3964,46 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn) } break; } + case TGSI_OPCODE_U64MIN: + case TGSI_OPCODE_U64MAX: + case TGSI_OPCODE_I64MIN: + case TGSI_OPCODE_I64MAX: { + dstTy = isSignedIntType(dstTy) ? 
TYPE_S32 : TYPE_U32; + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + Value *flag = getSSA(1, FILE_FLAGS); + src0 = fetchSrc(0, c + 1); + src1 = fetchSrc(1, c + 1); + geni = mkOp2(op, dstTy, dst0[c + 1], src0, src1); + geni->subOp = NV50_IR_SUBOP_MINMAX_HIGH; + geni->setFlagsDef(1, flag); + + src0 = fetchSrc(0, c); + src1 = fetchSrc(1, c); + geni = mkOp2(op, TYPE_U32, dst0[c], src0, src1); + geni->subOp = NV50_IR_SUBOP_MINMAX_LOW; + geni->setFlagsSrc(2, flag); + + c++; + } + break; + } + case TGSI_OPCODE_U64SHL: + case TGSI_OPCODE_I64SHR: + case TGSI_OPCODE_U64SHR: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + src0 = getSSA(8); + Value *dst = getSSA(8), *tmp[2]; + tmp[0] = fetchSrc(0, c); + tmp[1] = fetchSrc(0, c + 1); + mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]); + src1 = fetchSrc(1, c / 2); + mkOp2(op, dstTy, dst, src0, src1); + mkSplit(&dst0[c], 4, dst); + c++; + } + break; + case TGSI_OPCODE_U64ADD: + case TGSI_OPCODE_U64MUL: case TGSI_OPCODE_DADD: case TGSI_OPCODE_DMUL: case TGSI_OPCODE_DDIV: @@ -3873,6 +4077,22 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn) mkSplit(&dst0[c], 4, dst); c++; } + case TGSI_OPCODE_I64SSG: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + src0 = getSSA(8); + Value *tmp[2]; + tmp[0] = fetchSrc(0, c); + tmp[1] = fetchSrc(0, c + 1); + mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]); + + val0 = getScratch(); + val1 = getScratch(); + mkCmp(OP_SET, CC_GT, TYPE_U32, val0, TYPE_S64, src0, zero); + mkCmp(OP_SET, CC_LT, TYPE_U32, val1, TYPE_S64, src0, zero); + mkOp2(OP_SUB, TYPE_S32, dst0[c], val1, val0); + mkOp2(OP_SHR, TYPE_S32, dst0[c + 1], dst0[c], loadImm(0, 31)); + c++; + } break; default: ERROR("unhandled TGSI opcode: %u\n", tgsi.getOpcode()); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index ec50578..772ea61 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -147,6 +147,59 @@ NVC0LegalizeSSA::handleTEXLOD(TexInstruction *i) i->moveSources(arg + 1, -1); } +void +NVC0LegalizeSSA::handleShift(Instruction *lo) +{ /* Splits a shift with an 8-byte source type (visit(BasicBlock *) only calls this when typeSizeof(i->sType) == 8) into two 32-bit instructions: the original one plus a new instruction with the same op inserted right after it. Each defines one half of the result, and an OP_MERGE rebuilds the original 64-bit def from the two halves. */ + Instruction *hi = new_Instruction(func, lo->op, TYPE_U32); + lo->bb->insertAfter(lo, hi); + bld.setPosition(lo, false); + /* src[] receives the two 4-byte halves of the 64-bit source; dst[] holds fresh SSA values for the two result halves */ + Value *src[2], *dst[2] = {bld.getSSA(), bld.getSSA()}; + Value *dst64 = lo->getDef(0), *shift = lo->getSrc(1); + bld.mkSplit(src, 4, lo->getSrc(0)); + + hi->sType = lo->sType; + lo->dType = TYPE_U32; + + hi->setDef(0, dst[1]); + if (lo->op == OP_SHR) + hi->subOp |= NV50_IR_SUBOP_SHIFT_HIGH; + lo->setDef(0, dst[0]); + + bld.setPosition(hi, true); + /* For SHL the two pointers are exchanged: from here on 'hi' names the original instruction (def dst[0]) and 'lo' the inserted one (def dst[1]). For SHR they keep their initial meaning. */ + if (lo->op == OP_SHL) + std::swap(hi, lo); + /* 'hi' gets a zero immediate, the shift amount and one source half (src[0] for SHL, src[1] for SHR); 'lo' gets both source halves around the shift amount */ + hi->setSrc(0, new_ImmediateValue(prog, 0u)); + hi->setSrc(1, shift); + hi->setSrc(2, lo->op == OP_SHL ? src[0] : src[1]); + + lo->setSrc(0, src[0]); + lo->setSrc(1, shift); + lo->setSrc(2, src[1]); + + bld.mkOp2(OP_MERGE, TYPE_U64, dst64, dst[0], dst[1]); +} + +void +NVC0LegalizeSSA::handleSET(CmpInstruction *cmp) +{ /* Rewrites a SET on 8-byte integer sources as a 32-bit compare of the second source halves (src0[1], src1[1]) with an extra flags input produced by an OP_SUB of the first halves (src0[0], src1[0]). hTy keeps the signedness: S32 for S64 sources, U32 otherwise. */ + DataType hTy = cmp->sType == TYPE_S64 ? 
TYPE_S32 : TYPE_U32; + Value *carry; + Value *src0[2], *src1[2]; + bld.setPosition(cmp, false); + + bld.mkSplit(src0, 4, cmp->getSrc(0)); + bld.mkSplit(src1, 4, cmp->getSrc(1)); + bld.mkOp2(OP_SUB, hTy, NULL, src0[0], src1[0]) /* no value def (NULL): only the flags def 'carry' set on the next line is consumed */ + ->setFlagsDef(1, (carry = bld.getSSA(1, FILE_FLAGS))); + cmp->setFlagsSrc(cmp->srcCount(), carry); /* flags source index is the source count at this point, i.e. one past the existing sources */ + cmp->setSrc(0, src0[1]); + cmp->setSrc(1, src1[1]); + cmp->sType = hTy; +} + bool NVC0LegalizeSSA::visit(Function *fn) { @@ -179,6 +232,18 @@ NVC0LegalizeSSA::visit(BasicBlock *bb) case OP_TXF: handleTEXLOD(i->asTex()); break; + case OP_SHR: + case OP_SHL: + if (typeSizeof(i->sType) == 8) + handleShift(i); + break; + case OP_SET: + case OP_SET_AND: + case OP_SET_OR: + case OP_SET_XOR: + if (typeSizeof(i->sType) == 8 && i->sType != TYPE_F64) + handleSET(i->asCmp()); + break; default: break; } @@ -612,7 +677,7 @@ NVC0LegalizePostRA::visit(BasicBlock *bb) } else { // TODO: Move this to before register allocation for operations that // need the $c register ! - if (typeSizeof(i->dType) == 8) { + if (typeSizeof(i->sType) == 8 || typeSizeof(i->dType) == 8) { Instruction *hi; hi = BuildUtil::split64BitOpPostRA(func, i, rZero, carry); if (hi) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h index d91b6aa..7fae7e2 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h @@ -35,7 +35,9 @@ private: void handleDIV(Instruction *); // integer division, modulus void handleRCPRSQ(Instruction *); // double precision float recip/rsqrt void handleFTZ(Instruction *); + void handleSET(CmpInstruction *); void handleTEXLOD(TexInstruction *); + void handleShift(Instruction *); protected: BuildUtil bld; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index d788b36..d79e87d 100644 --- 
a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -1054,8 +1054,12 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32)); } break; - case OP_ADD: case OP_SUB: + if (imm0.isInteger(0) && s == 0 && typeSizeof(i->dType) == 8 && + !isFloatType(i->dType)) + break; + /* fallthrough */ + case OP_ADD: if (i->usesFlags()) break; if (imm0.isInteger(0)) { diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp index 273ec34..298e7c6 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp @@ -392,7 +392,8 @@ Program::emitBinary(struct nv50_ir_prog_info *info) for (Instruction *i = fn->bbArray[b]->getEntry(); i; i = i->next) { emit->emitInstruction(i); info->bin.instructions++; - if (i->sType == TYPE_F64 || i->dType == TYPE_F64) + if ((typeSizeof(i->sType) == 8 || typeSizeof(i->dType) == 8) && + (isFloatType(i->sType) || isFloatType(i->dType))) info->io.fp64 = true; } } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp index 8a67b0a..abdb328 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp @@ -329,6 +329,10 @@ TargetNVC0::insnCanLoad(const Instruction *i, int s, // indirect loads can only be done by OP_LOAD/VFETCH/INTERP on nvc0 if (ld->src(0).isIndirect(0)) return false; + // these are implemented using shf.r and shf.l which can't load consts + if ((i->op == OP_SHL || i->op == OP_SHR) && typeSizeof(i->sType) == 8 && + sf == FILE_MEMORY_CONST) + return false; for (int k = 0; i->srcExists(k); ++k) { if (i->src(k).getFile() == FILE_IMMEDIATE) { @@ -340,7 +344,8 @@ TargetNVC0::insnCanLoad(const Instruction *i, int s, return 
false; } else if (i->src(k).getFile() != FILE_GPR && - i->src(k).getFile() != FILE_PREDICATE) { + i->src(k).getFile() != FILE_PREDICATE && + i->src(k).getFile() != FILE_FLAGS) { return false; } }