From: Stanislav Mekhanoshin Date: Fri, 26 Apr 2019 16:37:51 +0000 (+0000) Subject: [AMDGPU] gfx1010 VOP2 changes X-Git-Tag: llvmorg-10-init~6914 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=8f3da70eed3f002441427be5873560f76a505988;p=platform%2Fupstream%2Fllvm.git [AMDGPU] gfx1010 VOP2 changes Differential Revision: https://reviews.llvm.org/D61156 llvm-svn: 359316 --- diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 6e44eba..acd90b7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -201,8 +201,10 @@ private: SDValue getHi16Elt(SDValue In) const; void SelectADD_SUB_I64(SDNode *N); + void SelectAddcSubb(SDNode *N); void SelectUADDO_USUBO(SDNode *N); void SelectDIV_SCALE(SDNode *N); + void SelectDIV_FMAS(SDNode *N); void SelectMAD_64_32(SDNode *N); void SelectFMA_W_CHAIN(SDNode *N); void SelectFMUL_W_CHAIN(SDNode *N); @@ -650,6 +652,13 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { SelectADD_SUB_I64(N); return; } + case ISD::ADDCARRY: + case ISD::SUBCARRY: + if (N->getValueType(0) != MVT::i32) + break; + + SelectAddcSubb(N); + return; case ISD::UADDO: case ISD::USUBO: { SelectUADDO_USUBO(N); @@ -765,6 +774,10 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { SelectDIV_SCALE(N); return; } + case AMDGPUISD::DIV_FMAS: { + SelectDIV_FMAS(N); + return; + } case AMDGPUISD::MAD_I64_I32: case AMDGPUISD::MAD_U64_U32: { SelectMAD_64_32(N); @@ -928,6 +941,19 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { ReplaceNode(N, RegSequence); } +void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) { + SDLoc DL(N); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue CI = N->getOperand(2); + + unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64 + : AMDGPU::V_SUBB_U32_e64; + CurDAG->SelectNodeTo( + N, Opc, N->getVTList(), + {LHS, RHS, CI, CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/}); +} + void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) { // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned // carry out despite the _i32 name. These were renamed in VI to _U32. @@ -983,6 +1009,32 @@ void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); } +void AMDGPUDAGToDAGISel::SelectDIV_FMAS(SDNode *N) { + SDLoc SL(N); + EVT VT = N->getValueType(0); + + assert(VT == MVT::f32 || VT == MVT::f64); + + unsigned Opc + = (VT == MVT::f64) ? AMDGPU::V_DIV_FMAS_F64 : AMDGPU::V_DIV_FMAS_F32; + + SDValue CarryIn = N->getOperand(3); + // V_DIV_FMAS implicitly reads VCC. + SDValue VCC = CurDAG->getCopyToReg(CurDAG->getEntryNode(), SL, + AMDGPU::VCC, CarryIn, SDValue()); + + SDValue Ops[10]; + + SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]); + SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]); + SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]); + + Ops[8] = VCC; + Ops[9] = VCC.getValue(1); + + CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); +} + // We need to handle this here because tablegen doesn't support matching // instructions with multiple outputs. void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) { diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index cd368e5..72dd112 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -2609,61 +2609,114 @@ static bool IsRevOpcode(const unsigned Opcode) switch (Opcode) { case AMDGPU::V_SUBREV_F32_e32: case AMDGPU::V_SUBREV_F32_e64: - case AMDGPU::V_SUBREV_F32_e32_si: + case AMDGPU::V_SUBREV_F32_e32_gfx10: + case AMDGPU::V_SUBREV_F32_e32_gfx6_gfx7: case AMDGPU::V_SUBREV_F32_e32_vi: - case AMDGPU::V_SUBREV_F32_e64_si: + case AMDGPU::V_SUBREV_F32_e64_gfx10: + case AMDGPU::V_SUBREV_F32_e64_gfx6_gfx7: case AMDGPU::V_SUBREV_F32_e64_vi: + case AMDGPU::V_SUBREV_I32_e32: case AMDGPU::V_SUBREV_I32_e64: - case AMDGPU::V_SUBREV_I32_e32_si: - case AMDGPU::V_SUBREV_I32_e64_si: + case AMDGPU::V_SUBREV_I32_e32_gfx6_gfx7: + case AMDGPU::V_SUBREV_I32_e64_gfx6_gfx7: + case AMDGPU::V_SUBBREV_U32_e32: case AMDGPU::V_SUBBREV_U32_e64: - case AMDGPU::V_SUBBREV_U32_e32_si: + case AMDGPU::V_SUBBREV_U32_e32_gfx6_gfx7: case AMDGPU::V_SUBBREV_U32_e32_vi: - case AMDGPU::V_SUBBREV_U32_e64_si: + case AMDGPU::V_SUBBREV_U32_e64_gfx6_gfx7: case AMDGPU::V_SUBBREV_U32_e64_vi: + case AMDGPU::V_SUBREV_U32_e32: case AMDGPU::V_SUBREV_U32_e64: case AMDGPU::V_SUBREV_U32_e32_gfx9: case AMDGPU::V_SUBREV_U32_e32_vi: case AMDGPU::V_SUBREV_U32_e64_gfx9: case AMDGPU::V_SUBREV_U32_e64_vi: + case AMDGPU::V_SUBREV_F16_e32: case AMDGPU::V_SUBREV_F16_e64: + case AMDGPU::V_SUBREV_F16_e32_gfx10: case AMDGPU::V_SUBREV_F16_e32_vi: + case AMDGPU::V_SUBREV_F16_e64_gfx10: case AMDGPU::V_SUBREV_F16_e64_vi: + case AMDGPU::V_SUBREV_U16_e32: case AMDGPU::V_SUBREV_U16_e64: case AMDGPU::V_SUBREV_U16_e32_vi: case AMDGPU::V_SUBREV_U16_e64_vi: + case AMDGPU::V_SUBREV_CO_U32_e32_gfx9: + case AMDGPU::V_SUBREV_CO_U32_e64_gfx10: case AMDGPU::V_SUBREV_CO_U32_e64_gfx9: + case AMDGPU::V_SUBBREV_CO_U32_e32_gfx9: case AMDGPU::V_SUBBREV_CO_U32_e64_gfx9: - case AMDGPU::V_LSHLREV_B32_e32_si: - case AMDGPU::V_LSHLREV_B32_e64_si: - case AMDGPU::V_LSHLREV_B16_e32_vi: - case AMDGPU::V_LSHLREV_B16_e64_vi: + + case AMDGPU::V_SUBREV_NC_U32_e32_gfx10: + case AMDGPU::V_SUBREV_NC_U32_e64_gfx10: + + case AMDGPU::V_SUBREV_CO_CI_U32_e32_gfx10: + case AMDGPU::V_SUBREV_CO_CI_U32_e64_gfx10: + + case AMDGPU::V_LSHRREV_B32_e32: + case AMDGPU::V_LSHRREV_B32_e64: + case AMDGPU::V_LSHRREV_B32_e32_gfx6_gfx7: + case AMDGPU::V_LSHRREV_B32_e64_gfx6_gfx7: + case AMDGPU::V_LSHRREV_B32_e32_vi: + case AMDGPU::V_LSHRREV_B32_e64_vi: + case AMDGPU::V_LSHRREV_B32_e32_gfx10: + case AMDGPU::V_LSHRREV_B32_e64_gfx10: + + case AMDGPU::V_ASHRREV_I32_e32: + case AMDGPU::V_ASHRREV_I32_e64: + case AMDGPU::V_ASHRREV_I32_e32_gfx10: + case AMDGPU::V_ASHRREV_I32_e32_gfx6_gfx7: + case AMDGPU::V_ASHRREV_I32_e32_vi: + case AMDGPU::V_ASHRREV_I32_e64_gfx10: + case AMDGPU::V_ASHRREV_I32_e64_gfx6_gfx7: + case AMDGPU::V_ASHRREV_I32_e64_vi: + + case AMDGPU::V_LSHLREV_B32_e32: + case AMDGPU::V_LSHLREV_B32_e64: + case AMDGPU::V_LSHLREV_B32_e32_gfx10: + case AMDGPU::V_LSHLREV_B32_e32_gfx6_gfx7: case AMDGPU::V_LSHLREV_B32_e32_vi: + case AMDGPU::V_LSHLREV_B32_e64_gfx10: + case AMDGPU::V_LSHLREV_B32_e64_gfx6_gfx7: case AMDGPU::V_LSHLREV_B32_e64_vi: - case AMDGPU::V_LSHLREV_B64_vi: - case AMDGPU::V_LSHRREV_B32_e32_si: - case AMDGPU::V_LSHRREV_B32_e64_si: + + case AMDGPU::V_LSHLREV_B16_e32: + case AMDGPU::V_LSHLREV_B16_e64: + case AMDGPU::V_LSHLREV_B16_e32_vi: + case AMDGPU::V_LSHLREV_B16_e64_vi: + + case AMDGPU::V_LSHRREV_B16_e32: + case AMDGPU::V_LSHRREV_B16_e64: case AMDGPU::V_LSHRREV_B16_e32_vi: case AMDGPU::V_LSHRREV_B16_e64_vi: - case AMDGPU::V_LSHRREV_B32_e32_vi: - case AMDGPU::V_LSHRREV_B32_e64_vi: - case AMDGPU::V_LSHRREV_B64_vi: - case AMDGPU::V_ASHRREV_I32_e64_si: - case AMDGPU::V_ASHRREV_I32_e32_si: + + case AMDGPU::V_ASHRREV_I16_e32: + case AMDGPU::V_ASHRREV_I16_e64: case AMDGPU::V_ASHRREV_I16_e32_vi: case AMDGPU::V_ASHRREV_I16_e64_vi: - case AMDGPU::V_ASHRREV_I32_e32_vi: - case AMDGPU::V_ASHRREV_I32_e64_vi: + + case AMDGPU::V_LSHLREV_B64: + case AMDGPU::V_LSHLREV_B64_vi: + + case AMDGPU::V_LSHRREV_B64: + case AMDGPU::V_LSHRREV_B64_vi: + + case AMDGPU::V_ASHRREV_I64: case AMDGPU::V_ASHRREV_I64_vi: + + case AMDGPU::V_PK_LSHLREV_B16: case AMDGPU::V_PK_LSHLREV_B16_vi: + + case AMDGPU::V_PK_LSHRREV_B16: case AMDGPU::V_PK_LSHRREV_B16_vi: + case AMDGPU::V_PK_ASHRREV_I16: case AMDGPU::V_PK_ASHRREV_I16_vi: return true; default: @@ -5523,10 +5576,13 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands, // it has src2 register operand that is tied to dst operand // we don't allow modifiers for this operand in assembler so src2_modifiers // should be 0. - if (Opc == AMDGPU::V_MAC_F32_e64_si || + if (Opc == AMDGPU::V_MAC_F32_e64_gfx6_gfx7 || + Opc == AMDGPU::V_MAC_F32_e64_gfx10 || Opc == AMDGPU::V_MAC_F32_e64_vi || Opc == AMDGPU::V_MAC_F16_e64_vi || - Opc == AMDGPU::V_FMAC_F32_e64_vi) { + Opc == AMDGPU::V_FMAC_F32_e64_gfx10 || + Opc == AMDGPU::V_FMAC_F32_e64_vi || + Opc == AMDGPU::V_FMAC_F16_e64_gfx10) { auto it = Inst.begin(); std::advance(it, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers)); it = Inst.insert(it, MCOperand::createImm(0)); // no modifiers for src2 diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 84222e5..9ee6098 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -183,10 +183,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, bool IsSDWA = false; // ToDo: AMDGPUDisassembler supports only VI ISA. - if (!STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding]) + if (!STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding] && !isGFX10()) report_fatal_error("Disassembly not yet supported for subtarget"); - const unsigned MaxInstBytesNum = (std::min)((size_t)8, Bytes_.size()); + unsigned MaxInstBytesNum = (std::min)( + STI.getFeatureBits()[AMDGPU::FeatureGFX10] ? (size_t) 20 : + STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal] ? (size_t) 12 : (size_t)8, + Bytes_.size()); Bytes = Bytes_.slice(0, MaxInstBytesNum); DecodeStatus Res = MCDisassembler::Fail; @@ -207,6 +210,18 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address); if (Res) { IsSDWA = true; break; } + Res = tryDecodeInst(DecoderTableSDWA1064, MI, QW, Address); + if (Res) { IsSDWA = true; break; } + + // Some GFX9 subtargets repurposed the v_mad_mix_f32, v_mad_mixlo_f16 and + // v_mad_mixhi_f16 for FMA variants. Try to decode using this special + // table first so we print the correct name. + + if (STI.getFeatureBits()[AMDGPU::FeatureFmaMixInsts]) { + Res = tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address); + if (Res) break; + } + if (STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem]) { Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address); if (Res) @@ -238,6 +253,9 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableGFX932, MI, DW, Address); if (Res) break; + Res = tryDecodeInst(DecoderTableGFX1032, MI, DW, Address); + if (Res) break; + if (Bytes.size() < 4) break; const uint64_t QW = ((uint64_t)eatBytes(Bytes) << 32) | DW; Res = tryDecodeInst(DecoderTableGFX864, MI, QW, Address); @@ -247,12 +265,25 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, if (Res) break; Res = tryDecodeInst(DecoderTableGFX964, MI, QW, Address); + if (Res) break; + + Res = tryDecodeInst(DecoderTableGFX1064, MI, QW, Address); } while (false); + if (Res && (MaxInstBytesNum - Bytes.size()) == 12 && (!HasLiteral || + !(MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3))) { + MaxInstBytesNum = 8; + Bytes = Bytes_.slice(0, MaxInstBytesNum); + eatBytes(Bytes); + } + if (Res && (MI.getOpcode() == AMDGPU::V_MAC_F32_e64_vi || - MI.getOpcode() == AMDGPU::V_MAC_F32_e64_si || + MI.getOpcode() == AMDGPU::V_MAC_F32_e64_gfx6_gfx7 || + MI.getOpcode() == AMDGPU::V_MAC_F32_e64_gfx10 || MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi || - MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_vi)) { + MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_vi || + MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_gfx10 || + MI.getOpcode() == AMDGPU::V_FMAC_F16_e64_gfx10)) { // Insert dummy unused src2_modifiers. insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::src2_modifiers); @@ -265,6 +296,22 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, if (Res && IsSDWA) Res = convertSDWAInst(MI); + int VDstIn_Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::vdst_in); + if (VDstIn_Idx != -1) { + int Tied = MCII->get(MI.getOpcode()).getOperandConstraint(VDstIn_Idx, + MCOI::OperandConstraint::TIED_TO); + if (Tied != -1 && (MI.getNumOperands() <= (unsigned)VDstIn_Idx || + !MI.getOperand(VDstIn_Idx).isReg() || + MI.getOperand(VDstIn_Idx).getReg() != MI.getOperand(Tied).getReg())) { + if (MI.getNumOperands() > (unsigned)VDstIn_Idx) + MI.erase(&MI.getOperand(VDstIn_Idx)); + insertNamedMCOperand(MI, + MCOperand::createReg(MI.getOperand(Tied).getReg()), + AMDGPU::OpName::vdst_in); + } + } + // if the opcode was not recognized we'll assume a Size of 4 bytes // (unless there are fewer bytes left) Size = Res ? (MaxInstBytesNum - Bytes.size()) @@ -273,7 +320,8 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const { - if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) { + if (STI.getFeatureBits()[AMDGPU::FeatureGFX9] || + STI.getFeatureBits()[AMDGPU::FeatureGFX10]) { if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sdst) != -1) // VOPC - insert clamp insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::clamp); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index d9dc000..c72a893 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2819,10 +2819,10 @@ static bool shouldReadExec(const MachineInstr &MI) { if (SIInstrInfo::isVALU(MI)) { switch (MI.getOpcode()) { case AMDGPU::V_READLANE_B32: - case AMDGPU::V_READLANE_B32_si: + case AMDGPU::V_READLANE_B32_gfx6_gfx7: case AMDGPU::V_READLANE_B32_vi: case AMDGPU::V_WRITELANE_B32: - case AMDGPU::V_WRITELANE_B32_si: + case AMDGPU::V_WRITELANE_B32_gfx6_gfx7: case AMDGPU::V_WRITELANE_B32_vi: return false; } diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 6823739..bf4b4b1 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -470,6 +470,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { def _sdwa_gfx10 : VOP_SDWA10_Real(NAME#"_sdwa")>, VOP1_SDWA9Ae(NAME#"_sdwa").Pfl> { + let DecoderNamespace = "SDWA10"; } } } // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 2ce0c60..a000a84 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -190,6 +190,18 @@ multiclass VOP2bInst : + InstAlias , + PredicateControl { +} + +multiclass VOP2bInstAliases { + def : VOP2bInstAlias; +} + multiclass VOP2eInst : + InstAlias , + PredicateControl { +} + +multiclass VOP2eInstAliases { + def : VOP2eInstAlias; +} + class VOP_MADAK : VOPProfile <[vt, vt, vt, vt]> { field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm); field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm); @@ -242,7 +265,7 @@ def VOP_MADMK_F32 : VOP_MADMK ; // FIXME: Remove src2_modifiers. It isn't used, so is wasting memory // and processing time but it makes it easier to convert to mad. -class VOP_MAC : VOPProfile <[vt, vt, vt, vt]> { +class VOP_MAC : VOPProfile <[vt0, vt1, vt1, vt0]> { let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2); let Ins64 = getIns64, 3, 0, HasModifiers, HasModifiers, HasOMod, @@ -259,11 +282,11 @@ class VOP_MAC : VOPProfile <[vt, vt, vt, vt]> { clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel); - let Asm32 = getAsm32<1, 2, vt>.ret; - let Asm64 = getAsm64<1, 2, 0, HasModifiers, HasOMod, vt>.ret; - let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret; - let AsmSDWA = getAsmSDWA<1, 2, vt>.ret; - let AsmSDWA9 = getAsmSDWA9<1, 1, 2, vt>.ret; + let Asm32 = getAsm32<1, 2, vt0>.ret; + let Asm64 = getAsm64<1, 2, 0, HasModifiers, HasOMod, vt0>.ret; + let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt0>.ret; + let AsmSDWA = getAsmSDWA<1, 2, vt0>.ret; + let AsmSDWA9 = getAsmSDWA9<1, 1, 2, vt0>.ret; let HasSrc2 = 0; let HasSrc2Mods = 0; @@ -271,6 +294,7 @@ class VOP_MAC : VOPProfile <[vt, vt, vt, vt]> { let HasExtDPP = 1; let HasExtSDWA = 1; let HasExtSDWA9 = 0; + let TieRegDPP = "$src2"; } def VOP_MAC_F16 : VOP_MAC ; @@ -290,12 +314,6 @@ def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped], 0, /*EnableClamp // Write out to vcc or arbitrary SGPR and read in from vcc or // arbitrary SGPR. def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], 0, /*EnableClamp=*/1> { - // We use VCSrc_b32 to exclude literal constants, even though the - // encoding normally allows them since the implicit VCC use means - // using one would always violate the constant bus - // restriction. SGPRs are still allowed because it should - // technically be possible to use VCC again as src0. - let Src0RC32 = VCSrc_b32; let Asm32 = "$vdst, vcc, $src0, $src1, vcc"; let Asm64 = "$vdst, $sdst, $src0, $src1, $src2$clamp"; let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel"; @@ -326,9 +344,7 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], 0, /*EnableClamp=* } // Read in from vcc or arbitrary SGPR. -// Enable f32 source modifiers on i32 input type. def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], /*EnableF32SrcMods=*/1> { - let Src0RC32 = VCSrc_b32; // See comment in def VOP2b_I32_I1_I32_I32_I1 above. let Asm32 = "$vdst, $src0, $src1, vcc"; let Asm64 = "$vdst, $src0_modifiers, $src1_modifiers, $src2"; let AsmSDWA = "$vdst, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel"; @@ -471,27 +487,19 @@ defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT, AMDGPUpk_i16_i32>; -def : GCNPat< - (AMDGPUadde i32:$src0, i32:$src1, i1:$src2), - (V_ADDC_U32_e64 $src0, $src1, $src2, 0) ->; - -def : GCNPat< - (AMDGPUsube i32:$src0, i32:$src1, i1:$src2), - (V_SUBB_U32_e64 $src0, $src1, $src2, 0) ->; - let SubtargetPredicate = isGFX6GFX7 in { defm V_MIN_LEGACY_F32 : VOP2Inst <"v_min_legacy_f32", VOP_F32_F32_F32, AMDGPUfmin_legacy>; defm V_MAX_LEGACY_F32 : VOP2Inst <"v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfmax_legacy>; +} // End SubtargetPredicate = isGFX6GFX7 +let SubtargetPredicate = isGFX6GFX7GFX10 in { let isCommutable = 1 in { defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_F32_F32_F32>; -defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_PAT_GEN, srl>; -defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_PAT_GEN, sra>; -defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_PAT_GEN, shl>; +defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32>; +defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32>; +defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>; } // End isCommutable = 1 -} // End SubtargetPredicate = isGFX6GFX7 +} // End SubtargetPredicate = isGFX6GFX7GFX10 class DivergentBinOp : GCNPat< @@ -523,10 +531,9 @@ let SubtargetPredicate = HasAddNoCarryInsts in { def : DivergentBinOp; } - +let SubtargetPredicate = isGFX6GFX7GFX8GFX9, Predicates = [isGFX6GFX7GFX8GFX9] in { def : DivergentBinOp; -def : DivergentClampingBinOp; def : DivergentBinOp; def : DivergentBinOp; @@ -534,6 +541,7 @@ def : DivergentBinOp; def : DivergentBinOp; def : DivergentBinOp; def : DivergentBinOp; +} def : DivergentBinOp; def : DivergentBinOp; @@ -607,45 +615,75 @@ defm V_FMAC_F32 : VOP2Inst <"v_fmac_f32", VOP_MAC_F32>; } // End SubtargetPredicate = HasDLInsts -// Note: 16-bit instructions produce a 0 result in the high 16-bits. -multiclass Arithmetic_i16_Pats { +let SubtargetPredicate = isGFX10Plus in { + +def V_FMAMK_F32 : VOP2_Pseudo<"v_fmamk_f32", VOP_MADMK_F32, [], "">; +let FPDPRounding = 1 in +def V_FMAMK_F16 : VOP2_Pseudo <"v_fmamk_f16", VOP_MADMK_F16, [], "">; + +let isCommutable = 1 in { +def V_FMAAK_F32 : VOP2_Pseudo<"v_fmaak_f32", VOP_MADAK_F32, [], "">; +let FPDPRounding = 1 in +def V_FMAAK_F16 : VOP2_Pseudo <"v_fmaak_f16", VOP_MADAK_F16, [], "">; +} // End isCommutable = 1 + +let Constraints = "$vdst = $src2", + DisableEncoding="$src2", + isConvertibleToThreeAddress = 1, + isCommutable = 1 in { +defm V_FMAC_F16 : VOP2Inst <"v_fmac_f16", VOP_MAC_F16>; +} + +defm V_PK_FMAC_F16 : VOP2Inst<"v_pk_fmac_f16", VOP_V2F16_V2F16_V2F16>; + +} // End SubtargetPredicate = isGFX10Plus + +// Note: 16-bit instructions produce a 0 result in the high 16-bits +// on GFX8 and GFX9 and preserve high 16 bits on GFX10+ +def ClearHI16 : OutPatFrag<(ops node:$op), + (V_AND_B32_e64 $op, (V_MOV_B32_e32 (i32 0xffff)))>; + +multiclass Arithmetic_i16_Pats { def : GCNPat< (op i16:$src0, i16:$src1), - (inst $src0, $src1) + !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1)) >; def : GCNPat< (i32 (zext (op i16:$src0, i16:$src1))), - (inst $src0, $src1) + !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1)) >; def : GCNPat< (i64 (zext (op i16:$src0, i16:$src1))), (REG_SEQUENCE VReg_64, - (inst $src0, $src1), sub0, + !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1)), + sub0, (V_MOV_B32_e32 (i32 0)), sub1) >; - } -multiclass Bits_OpsRev_i16_Pats { +multiclass Bits_OpsRev_i16_Pats { def : GCNPat< (op i16:$src0, i16:$src1), - (inst $src1, $src0) + !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0)) >; def : GCNPat< (i32 (zext (op i16:$src0, i16:$src1))), - (inst $src1, $src0) + !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0)) >; def : GCNPat< (i64 (zext (op i16:$src0, i16:$src1))), (REG_SEQUENCE VReg_64, - (inst $src1, $src0), sub0, + !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0)), + sub0, (V_MOV_B32_e32 (i32 0)), sub1) >; } @@ -657,8 +695,9 @@ class ZExt_i16_i1_Pat : GCNPat < $src) >; -let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in { +let Predicates = [Has16BitInsts] in { +let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in { defm : Arithmetic_i16_Pats; defm : Arithmetic_i16_Pats; defm : Arithmetic_i16_Pats; @@ -666,6 +705,17 @@ defm : Arithmetic_i16_Pats; defm : Arithmetic_i16_Pats; defm : Arithmetic_i16_Pats; defm : Arithmetic_i16_Pats; +} + +let Predicates = [Has16BitInsts, isGFX10Plus] in { +defm : Arithmetic_i16_Pats; +defm : Arithmetic_i16_Pats; +defm : Arithmetic_i16_Pats; +defm : Arithmetic_i16_Pats; +defm : Arithmetic_i16_Pats; +defm : Arithmetic_i16_Pats; +defm : Arithmetic_i16_Pats; +} def : GCNPat < (and i16:$src0, i16:$src1), @@ -682,9 +732,17 @@ def : GCNPat < (V_XOR_B32_e64 $src0, $src1) >; +let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in { defm : Bits_OpsRev_i16_Pats; defm : Bits_OpsRev_i16_Pats; defm : Bits_OpsRev_i16_Pats; +} + +let Predicates = [Has16BitInsts, isGFX10Plus] in { +defm : Bits_OpsRev_i16_Pats; +defm : Bits_OpsRev_i16_Pats; +defm : Bits_OpsRev_i16_Pats; +} def : ZExt_i16_i1_Pat; def : ZExt_i16_i1_Pat; @@ -705,103 +763,227 @@ def : GCNPat< } // End Predicates = [Has16BitInsts, isGFX7GFX8GFX9] + //===----------------------------------------------------------------------===// -// SI +// Target-specific instruction encodings. //===----------------------------------------------------------------------===// -let AssemblerPredicates = [isGFX6GFX7], DecoderNamespace = "GFX6GFX7" in { - -multiclass VOP2_Real_si op> { - def _si : - VOP2_Real(NAME), SIEncodingFamily.SI>, - VOP2e(NAME).Pfl>; -} - -multiclass VOP2_Real_MADK_si op> { - def _si : VOP2_Real(NAME), SIEncodingFamily.SI>, - VOP2_MADKe(NAME).Pfl>; -} +class VOP2_DPP op, VOP2_Pseudo ps, + string opName = ps.OpName, VOPProfile p = ps.Pfl> : + VOP_DPP { + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; -multiclass VOP2_Real_e32_si op> { - def _e32_si : - VOP2_Real(NAME#"_e32"), SIEncodingFamily.SI>, - VOP2e(NAME#"_e32").Pfl>; + bits<8> vdst; + bits<8> src1; + let Inst{8-0} = 0xfa; + let Inst{16-9} = !if(p.HasSrc1, src1{7-0}, 0); + let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0); + let Inst{30-25} = op; + let Inst{31} = 0x0; } -multiclass VOP2_Real_e32e64_si op> : VOP2_Real_e32_si { - def _e64_si : - VOP3_Real(NAME#"_e64"), SIEncodingFamily.SI>, - VOP3e_gfx6_gfx7 <{1, 0, 0, op{5-0}}, !cast(NAME#"_e64").Pfl>; -} +//===----------------------------------------------------------------------===// +// GFX10. +//===----------------------------------------------------------------------===// -multiclass VOP2be_Real_e32e64_si op> : VOP2_Real_e32_si { - def _e64_si : - VOP3_Real(NAME#"_e64"), SIEncodingFamily.SI>, - VOP3be_gfx6_gfx7 <{1, 0, 0, op{5-0}}, !cast(NAME#"_e64").Pfl>; -} +let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { + //===------------------------------- VOP2 -------------------------------===// + multiclass VOP2Only_Real_MADK_gfx10 op> { + def _gfx10 : + VOP2_Real(NAME), SIEncodingFamily.GFX10>, + VOP2_MADKe(NAME).Pfl>; + } + multiclass VOP2Only_Real_MADK_gfx10_with_name op, string opName, + string asmName> { + def _gfx10 : + VOP2_Real(opName), SIEncodingFamily.GFX10>, + VOP2_MADKe(opName).Pfl> { + VOP2_Pseudo ps = !cast(opName); + let AsmString = asmName # ps.AsmOperands; + } + } + multiclass VOP2_Real_e32_gfx10 op> { + def _e32_gfx10 : + VOP2_Real(NAME#"_e32"), SIEncodingFamily.GFX10>, + VOP2e(NAME#"_e32").Pfl>; + } + multiclass VOP2_Real_e64_gfx10 op> { + def _e64_gfx10 : + VOP3_Real(NAME#"_e64"), SIEncodingFamily.GFX10>, + VOP3e_gfx10<{0, 1, 0, 0, op{5-0}}, !cast(NAME#"_e64").Pfl>; + } + multiclass VOP2_Real_sdwa_gfx10 op> { + def _sdwa_gfx10 : + VOP_SDWA10_Real(NAME#"_sdwa")>, + VOP2_SDWA9Ae(NAME#"_sdwa").Pfl> { + let DecoderNamespace = "SDWA10"; + } + } -} // End AssemblerPredicates = [isGFX6GFX7], DecoderNamespace = "GFX6GFX7" - -defm V_CNDMASK_B32 : VOP2_Real_e32e64_si <0x0>; -defm V_ADD_F32 : VOP2_Real_e32e64_si <0x3>; -defm V_SUB_F32 : VOP2_Real_e32e64_si <0x4>; -defm V_SUBREV_F32 : VOP2_Real_e32e64_si <0x5>; -defm V_MUL_LEGACY_F32 : VOP2_Real_e32e64_si <0x7>; -defm V_MUL_F32 : VOP2_Real_e32e64_si <0x8>; -defm V_MUL_I32_I24 : VOP2_Real_e32e64_si <0x9>; -defm V_MUL_HI_I32_I24 : VOP2_Real_e32e64_si <0xa>; -defm V_MUL_U32_U24 : VOP2_Real_e32e64_si <0xb>; -defm V_MUL_HI_U32_U24 : VOP2_Real_e32e64_si <0xc>; -defm V_MIN_F32 : VOP2_Real_e32e64_si <0xf>; -defm V_MAX_F32 : VOP2_Real_e32e64_si <0x10>; -defm V_MIN_I32 : VOP2_Real_e32e64_si <0x11>; -defm V_MAX_I32 : VOP2_Real_e32e64_si <0x12>; -defm V_MIN_U32 : VOP2_Real_e32e64_si <0x13>; -defm V_MAX_U32 : VOP2_Real_e32e64_si <0x14>; -defm V_LSHRREV_B32 : VOP2_Real_e32e64_si <0x16>; -defm V_ASHRREV_I32 : VOP2_Real_e32e64_si <0x18>; -defm V_LSHLREV_B32 : VOP2_Real_e32e64_si <0x1a>; -defm V_AND_B32 : VOP2_Real_e32e64_si <0x1b>; -defm V_OR_B32 : VOP2_Real_e32e64_si <0x1c>; -defm V_XOR_B32 : VOP2_Real_e32e64_si <0x1d>; -defm V_MAC_F32 : VOP2_Real_e32e64_si <0x1f>; -defm V_MADMK_F32 : VOP2_Real_MADK_si <0x20>; -defm V_MADAK_F32 : VOP2_Real_MADK_si <0x21>; -defm V_ADD_I32 : VOP2be_Real_e32e64_si <0x25>; -defm V_SUB_I32 : VOP2be_Real_e32e64_si <0x26>; -defm V_SUBREV_I32 : VOP2be_Real_e32e64_si <0x27>; -defm V_ADDC_U32 : VOP2be_Real_e32e64_si <0x28>; -defm V_SUBB_U32 : VOP2be_Real_e32e64_si <0x29>; -defm V_SUBBREV_U32 : VOP2be_Real_e32e64_si <0x2a>; - -defm V_READLANE_B32 : VOP2_Real_si <0x01>; + //===------------------------- VOP2 (with name) -------------------------===// + multiclass VOP2_Real_e32_gfx10_with_name op, string opName, + string asmName> { + def _e32_gfx10 : + VOP2_Real(opName#"_e32"), SIEncodingFamily.GFX10>, + VOP2e(opName#"_e32").Pfl> { + VOP2_Pseudo ps = !cast(opName#"_e32"); + let AsmString = asmName # ps.AsmOperands; + } + } + multiclass VOP2_Real_e64_gfx10_with_name op, string opName, + string asmName> { + def _e64_gfx10 : + VOP3_Real(opName#"_e64"), SIEncodingFamily.GFX10>, + VOP3e_gfx10<{0, 1, 0, 0, op{5-0}}, + !cast(opName#"_e64").Pfl> { + VOP3_Pseudo ps = !cast(opName#"_e64"); + let AsmString = asmName # ps.AsmOperands; + } + } + let DecoderNamespace = "SDWA10" in { + multiclass VOP2_Real_sdwa_gfx10_with_name op, string opName, + string asmName> { + def _sdwa_gfx10 : + VOP_SDWA10_Real(opName#"_sdwa")>, + VOP2_SDWA9Ae(opName#"_sdwa").Pfl> { + VOP2_SDWA_Pseudo ps = !cast(opName#"_sdwa"); + let AsmString = asmName # ps.AsmOperands; + } + } + } // End DecoderNamespace = "SDWA10" + + //===------------------------------ VOP2be ------------------------------===// + multiclass VOP2be_Real_gfx10 op, string opName, string asmName> { + def _e32_gfx10 : + VOP2_Real(opName#"_e32"), SIEncodingFamily.GFX10>, + VOP2e(opName#"_e32").Pfl> { + VOP2_Pseudo Ps = !cast(opName#"_e32"); + let AsmString = asmName # !subst(", vcc", "", Ps.AsmOperands); + } + def _e64_gfx10 : + VOP3_Real(opName#"_e64"), SIEncodingFamily.GFX10>, + VOP3be_gfx10<{0, 1, 0, 0, op{5-0}}, + !cast(opName#"_e64").Pfl> { + VOP3_Pseudo Ps = !cast(opName#"_e64"); + let AsmString = asmName # Ps.AsmOperands; + } + def _sdwa_gfx10 : + VOP_SDWA10_Real(opName#"_sdwa")>, + VOP2_SDWA9Ae(opName#"_sdwa").Pfl> { + VOP2_SDWA_Pseudo Ps = !cast(opName#"_sdwa"); + let AsmString = asmName # !subst(", vcc", "", Ps.AsmOperands); + let DecoderNamespace = "SDWA10"; + } -let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in { -defm V_WRITELANE_B32 : VOP2_Real_si <0x02>; -} + def _sdwa_w64_gfx10 : + Base_VOP_SDWA10_Real(opName#"_sdwa")>, + VOP2_SDWA9Ae(opName#"_sdwa").Pfl> { + VOP2_SDWA_Pseudo Ps = !cast(opName#"_sdwa"); + let AsmString = asmName # Ps.AsmOperands; + let isAsmParserOnly = 1; + let DecoderNamespace = "SDWA10"; + } + } -defm V_MAC_LEGACY_F32 : VOP2_Real_e32e64_si <0x6>; -defm V_MIN_LEGACY_F32 : VOP2_Real_e32e64_si <0xd>; -defm V_MAX_LEGACY_F32 : VOP2_Real_e32e64_si <0xe>; -defm V_LSHR_B32 : VOP2_Real_e32e64_si <0x15>; -defm V_ASHR_I32 : VOP2_Real_e32e64_si <0x17>; -defm V_LSHL_B32 : VOP2_Real_e32e64_si <0x19>; - -defm V_BFM_B32 : VOP2_Real_e32e64_si <0x1e>; -defm V_BCNT_U32_B32 : VOP2_Real_e32e64_si <0x22>; -defm V_MBCNT_LO_U32_B32 : VOP2_Real_e32e64_si <0x23>; -defm V_MBCNT_HI_U32_B32 : VOP2_Real_e32e64_si <0x24>; -defm V_LDEXP_F32 : VOP2_Real_e32e64_si <0x2b>; -defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e32e64_si <0x2c>; -defm V_CVT_PKNORM_I16_F32 : VOP2_Real_e32e64_si <0x2d>; -defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e32e64_si <0x2e>; -defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_e32e64_si <0x2f>; -defm V_CVT_PK_U16_U32 : VOP2_Real_e32e64_si <0x30>; -defm V_CVT_PK_I16_I32 : VOP2_Real_e32e64_si <0x31>; + //===----------------------------- VOP3Only -----------------------------===// + multiclass VOP3Only_Real_gfx10 op> { + def _e64_gfx10 : + VOP3_Real(NAME#"_e64"), SIEncodingFamily.GFX10>, + VOP3e_gfx10(NAME#"_e64").Pfl>; + } + //===---------------------------- VOP3beOnly ----------------------------===// + multiclass VOP3beOnly_Real_gfx10 op, string opName, string asmName> { + def _e64_gfx10 : + VOP3_Real(opName#"_e64"), SIEncodingFamily.GFX10>, + VOP3be_gfx10(opName#"_e64").Pfl> { + VOP3_Pseudo Ps = !cast(opName#"_e64"); + let AsmString = asmName # Ps.AsmOperands; + } + } +} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" + +multiclass Base_VOP2_Real_gfx10 op> : + VOP2_Real_e32_gfx10, VOP2_Real_e64_gfx10; + +multiclass VOP2_Real_gfx10 op> : + VOP2_Real_e32_gfx10, VOP2_Real_e64_gfx10, + VOP2_Real_sdwa_gfx10; + +multiclass VOP2_Real_gfx10_with_name op, string opName, + string asmName> : + VOP2_Real_e32_gfx10_with_name, + VOP2_Real_e64_gfx10_with_name, + VOP2_Real_sdwa_gfx10_with_name; + +defm V_CNDMASK_B32 : Base_VOP2_Real_gfx10<0x001>; +defm V_XNOR_B32 : VOP2_Real_gfx10<0x01e>; +defm V_FMAC_F32 : VOP2_Real_gfx10<0x02b>; +defm V_FMAMK_F32 : VOP2Only_Real_MADK_gfx10<0x02c>; +defm V_FMAAK_F32 : VOP2Only_Real_MADK_gfx10<0x02d>; +defm V_ADD_F16 : VOP2_Real_gfx10<0x032>; +defm V_SUB_F16 : VOP2_Real_gfx10<0x033>; +defm V_SUBREV_F16 : VOP2_Real_gfx10<0x034>; +defm V_MUL_F16 : VOP2_Real_gfx10<0x035>; +defm V_FMAC_F16 : VOP2_Real_gfx10<0x036>; +defm V_FMAMK_F16 : VOP2Only_Real_MADK_gfx10<0x037>; +defm V_FMAAK_F16 : VOP2Only_Real_MADK_gfx10<0x038>; +defm V_MAX_F16 : VOP2_Real_gfx10<0x039>; +defm V_MIN_F16 : VOP2_Real_gfx10<0x03a>; +defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>; +defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>; + +// VOP2 no carry-in, carry-out. +defm V_ADD_NC_U32 : + VOP2_Real_gfx10_with_name<0x025, "V_ADD_U32", "v_add_nc_u32">; +defm V_SUB_NC_U32 : + VOP2_Real_gfx10_with_name<0x026, "V_SUB_U32", "v_sub_nc_u32">; +defm V_SUBREV_NC_U32 : + VOP2_Real_gfx10_with_name<0x027, "V_SUBREV_U32", "v_subrev_nc_u32">; + +// VOP2 carry-in, carry-out. +defm V_ADD_CO_CI_U32 : + VOP2be_Real_gfx10<0x028, "V_ADDC_U32", "v_add_co_ci_u32">; +defm V_SUB_CO_CI_U32 : + VOP2be_Real_gfx10<0x029, "V_SUBB_U32", "v_sub_co_ci_u32">; +defm V_SUBREV_CO_CI_U32 : + VOP2be_Real_gfx10<0x02a, "V_SUBBREV_U32", "v_subrev_co_ci_u32">; + +// VOP3 only. +defm V_BFM_B32 : VOP3Only_Real_gfx10<0x363>; +defm V_BCNT_U32_B32 : VOP3Only_Real_gfx10<0x364>; +defm V_MBCNT_LO_U32_B32 : VOP3Only_Real_gfx10<0x365>; +defm V_MBCNT_HI_U32_B32 : VOP3Only_Real_gfx10<0x366>; +defm V_LDEXP_F32 : VOP3Only_Real_gfx10<0x362>; +defm V_CVT_PKNORM_I16_F32 : VOP3Only_Real_gfx10<0x368>; +defm V_CVT_PKNORM_U16_F32 : VOP3Only_Real_gfx10<0x369>; +defm V_CVT_PK_U16_U32 : VOP3Only_Real_gfx10<0x36a>; +defm V_CVT_PK_I16_I32 : VOP3Only_Real_gfx10<0x36b>; + +// VOP3 carry-in, carry-out. +defm V_ADD_CO_U32 : + VOP3beOnly_Real_gfx10<0x30f, "V_ADD_I32", "v_add_co_u32">; +defm V_SUB_CO_U32 : + VOP3beOnly_Real_gfx10<0x310, "V_SUB_I32", "v_sub_co_u32">; +defm V_SUBREV_CO_U32 : + VOP3beOnly_Real_gfx10<0x319, "V_SUBREV_I32", "v_subrev_co_u32">; + +let SubtargetPredicate = isGFX10Plus in { + defm : VOP2eInstAliases; + + defm : VOP2bInstAliases< + V_ADDC_U32_e32, V_ADD_CO_CI_U32_e32_gfx10, "v_add_co_ci_u32">; + defm : VOP2bInstAliases< + V_SUBB_U32_e32, V_SUB_CO_CI_U32_e32_gfx10, "v_sub_co_ci_u32">; + defm : VOP2bInstAliases< + V_SUBBREV_U32_e32, V_SUBREV_CO_CI_U32_e32_gfx10, "v_subrev_co_ci_u32">; +} // End SubtargetPredicate = isGFX10Plus //===----------------------------------------------------------------------===// -// VI +// GFX6, GFX7, GFX10. //===----------------------------------------------------------------------===// class VOP2_DPPe op, VOP2_DPP_Pseudo ps, VOPProfile P = ps.Pfl> : @@ -815,6 +997,110 @@ class VOP2_DPPe op, VOP2_DPP_Pseudo ps, VOPProfile P = ps.Pfl> : let Inst{31} = 0x0; //encoding } +let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in { + multiclass VOP2Only_Real_gfx6_gfx7 op> { + def _gfx6_gfx7 : + VOP2_Real(NAME), SIEncodingFamily.SI>, + VOP2e(NAME).Pfl>; + } + multiclass VOP2Only_Real_MADK_gfx6_gfx7 op> { + def _gfx6_gfx7 : + VOP2_Real(NAME), SIEncodingFamily.SI>, + VOP2_MADKe(NAME).Pfl>; + } + multiclass VOP2_Real_e32_gfx6_gfx7 op> { + def _e32_gfx6_gfx7 : + VOP2_Real(NAME#"_e32"), SIEncodingFamily.SI>, + VOP2e(NAME#"_e32").Pfl>; + } + multiclass VOP2_Real_e64_gfx6_gfx7 op> { + def _e64_gfx6_gfx7 : + VOP3_Real(NAME#"_e64"), SIEncodingFamily.SI>, + VOP3e_gfx6_gfx7<{1, 0, 0, op{5-0}}, !cast(NAME#"_e64").Pfl>; + } + multiclass VOP2be_Real_e64_gfx6_gfx7 op> { + def _e64_gfx6_gfx7 : + VOP3_Real(NAME#"_e64"), SIEncodingFamily.SI>, + VOP3be_gfx6_gfx7<{1, 0, 0, op{5-0}}, !cast(NAME#"_e64").Pfl>; + } +} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" + +multiclass VOP2Only_Real_MADK_gfx6_gfx7_gfx10 op> : + VOP2Only_Real_MADK_gfx6_gfx7, VOP2Only_Real_MADK_gfx10; + +multiclass VOP2_Real_gfx6_gfx7 op> : + VOP2_Real_e32_gfx6_gfx7, VOP2_Real_e64_gfx6_gfx7; + +multiclass VOP2_Real_gfx6_gfx7_gfx10 op> : + VOP2_Real_gfx6_gfx7, VOP2_Real_gfx10; + +multiclass VOP2be_Real_gfx6_gfx7 op> : + VOP2_Real_e32_gfx6_gfx7, VOP2be_Real_e64_gfx6_gfx7; + +defm V_CNDMASK_B32 : VOP2_Real_gfx6_gfx7<0x000>; +defm V_MIN_LEGACY_F32 : VOP2_Real_gfx6_gfx7<0x00d>; +defm V_MAX_LEGACY_F32 : VOP2_Real_gfx6_gfx7<0x00e>; +defm V_LSHR_B32 : VOP2_Real_gfx6_gfx7<0x015>; +defm V_ASHR_I32 : VOP2_Real_gfx6_gfx7<0x017>; +defm V_LSHL_B32 : VOP2_Real_gfx6_gfx7<0x019>; +defm V_BFM_B32 : VOP2_Real_gfx6_gfx7<0x01e>; +defm V_BCNT_U32_B32 : VOP2_Real_gfx6_gfx7<0x022>; +defm V_MBCNT_LO_U32_B32 : VOP2_Real_gfx6_gfx7<0x023>; +defm V_MBCNT_HI_U32_B32 : VOP2_Real_gfx6_gfx7<0x024>; +defm V_LDEXP_F32 : VOP2_Real_gfx6_gfx7<0x02b>; +defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_gfx6_gfx7<0x02c>; +defm V_CVT_PKNORM_I16_F32 : VOP2_Real_gfx6_gfx7<0x02d>; +defm V_CVT_PKNORM_U16_F32 : VOP2_Real_gfx6_gfx7<0x02e>; +defm V_CVT_PK_U16_U32 : VOP2_Real_gfx6_gfx7<0x030>; +defm V_CVT_PK_I16_I32 : VOP2_Real_gfx6_gfx7<0x031>; +defm V_ADD_I32 : VOP2be_Real_gfx6_gfx7<0x025>; +defm V_SUB_I32 : VOP2be_Real_gfx6_gfx7<0x026>; +defm V_SUBREV_I32 : VOP2be_Real_gfx6_gfx7<0x027>; +defm V_ADDC_U32 : VOP2be_Real_gfx6_gfx7<0x028>; +defm V_SUBB_U32 : VOP2be_Real_gfx6_gfx7<0x029>; +defm V_SUBBREV_U32 : VOP2be_Real_gfx6_gfx7<0x02a>; + +defm V_READLANE_B32 : VOP2Only_Real_gfx6_gfx7<0x001>; + +let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in { + defm V_WRITELANE_B32 : VOP2Only_Real_gfx6_gfx7<0x002>; +} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) + +let SubtargetPredicate = isGFX6GFX7 in { + defm : VOP2eInstAliases; +} // End SubtargetPredicate = isGFX6GFX7 + +defm V_ADD_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x003>; +defm V_SUB_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x004>; +defm V_SUBREV_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x005>; +defm V_MAC_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x006>; +defm V_MUL_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x007>; +defm V_MUL_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x008>; +defm V_MUL_I32_I24 : VOP2_Real_gfx6_gfx7_gfx10<0x009>; +defm V_MUL_HI_I32_I24 : VOP2_Real_gfx6_gfx7_gfx10<0x00a>; +defm V_MUL_U32_U24 : VOP2_Real_gfx6_gfx7_gfx10<0x00b>; +defm V_MUL_HI_U32_U24 : VOP2_Real_gfx6_gfx7_gfx10<0x00c>; +defm V_MIN_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x00f>; +defm V_MAX_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x010>; +defm V_MIN_I32 : VOP2_Real_gfx6_gfx7_gfx10<0x011>; +defm V_MAX_I32 : VOP2_Real_gfx6_gfx7_gfx10<0x012>; +defm V_MIN_U32 : VOP2_Real_gfx6_gfx7_gfx10<0x013>; +defm V_MAX_U32 : VOP2_Real_gfx6_gfx7_gfx10<0x014>; +defm V_LSHRREV_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x016>; +defm V_ASHRREV_I32 : VOP2_Real_gfx6_gfx7_gfx10<0x018>; +defm V_LSHLREV_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01a>; +defm V_AND_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01b>; +defm V_OR_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01c>; +defm V_XOR_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01d>; +defm V_MAC_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x01f>; +defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x02f>; +defm V_MADMK_F32 : VOP2Only_Real_MADK_gfx6_gfx7_gfx10<0x020>; +defm V_MADAK_F32 : VOP2Only_Real_MADK_gfx6_gfx7_gfx10<0x021>; + +//===----------------------------------------------------------------------===// +// GFX8, GFX9 (VI). +//===----------------------------------------------------------------------===// + let AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8" in { multiclass VOP2_Real_MADK_vi op> { @@ -1061,6 +1347,14 @@ def : SI2_VI3Alias <"v_cvt_pknorm_i16_f32", V_CVT_PKNORM_I16_F32_e64_vi>; def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>; def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>; +defm : VOP2eInstAliases; + +defm : VOP2bInstAliases; +defm : VOP2bInstAliases; +defm : VOP2bInstAliases; +defm : VOP2bInstAliases; +defm : VOP2bInstAliases; +defm : VOP2bInstAliases; } // End SubtargetPredicate = isGFX8GFX9 let SubtargetPredicate = HasDLInsts in { diff --git a/llvm/test/CodeGen/AMDGPU/max-sgprs.ll b/llvm/test/CodeGen/AMDGPU/max-sgprs.ll new file mode 100644 index 0000000..7c913dd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/max-sgprs.ll @@ -0,0 +1,25 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}max_sgprs_gfx10: +; GCN: NumSgprs: 108 +define amdgpu_kernel void @max_sgprs_gfx10() #0 { + call void asm sideeffect "", "~{s[0:7]}" () + call void asm sideeffect "", "~{s[8:15]}" () + call void asm sideeffect "", "~{s[16:23]}" () + call void asm sideeffect "", "~{s[24:31]}" () + call void asm sideeffect "", "~{s[32:39]}" () + call void asm sideeffect "", "~{s[40:47]}" () + call void asm sideeffect "", "~{s[48:55]}" () + call void asm sideeffect "", "~{s[56:63]}" () + call void asm sideeffect "", "~{s[64:71]}" () + call void asm sideeffect "", "~{s[72:79]}" () + call void asm sideeffect "", "~{s[80:87]}" () + call void asm sideeffect "", "~{s[88:95]}" () + call void asm sideeffect "", "~{s[96:99]}" () + call void asm sideeffect "", "~{s[100:104]}" () + call void asm sideeffect "", "~{s105}" () + call void asm sideeffect "", "~{vcc}" () + ret void +} + +attributes #0 = { nounwind "target-cpu"="gfx1010" } diff --git a/llvm/test/MC/Disassembler/AMDGPU/null-reg.txt b/llvm/test/MC/Disassembler/AMDGPU/null-reg.txt new file mode 100644 index 0000000..b7e9d57 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/null-reg.txt @@ -0,0 +1,7 @@ +# RUN: llvm-mc -arch=amdgcn -mcpu=gfx1010 -show-encoding --disassemble < %s | FileCheck -check-prefix=GFX10 %s + +0x7d,0x04,0x00,0x10 +# GFX10: v_mul_f32_e32 v0, null, v2 ; encoding: [0x7d,0x04,0x00,0x10] + +0x7d,0x7d,0x7d,0x80 +# GFX10: s_add_u32 null, null, null ; encoding: [0x7d,0x7d,0x7d,0x80]