Use VOP3 add/addc, as usual.
This has some tradeoffs. Inline immediates fold
a little better, but other constants are worse off.
SIShrinkInstructions could be made smarter to handle
these cases.
This avoids selecting scalar adds where we would need
to track the carry in scc and replace its users, which
makes it easier to use the carryless VALU adds.
llvm-svn: 318340
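
As a sketch of the two expansions in play here (register numbers
illustrative, not taken from any test in this patch): a uniform i64
add splits into a scalar pair whose carry lives in scc, while the
divergent form uses the VALU pair with an explicit carry register:

  ; uniform: carry is implicit in scc
  s_add_u32  s0, s0, s2
  s_addc_u32 s1, s1, s3

  ; divergent: carry is an explicit lane mask (vcc here)
  v_add_i32_e32  v0, vcc, v0, v2
  v_addc_u32_e32 v1, vcc, v1, v3, vcc

Keeping the 64-bit add as a pseudo until after selection lets the
backend pick either pair late, without rewriting users of scc.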
// We select i64 ADD here instead of custom-lowering it during
// DAG legalization, so we can fold some i64 ADDs used for address
// calculation into the LOAD and STORE instructions.
- case ISD::ADD:
case ISD::ADDC:
case ISD::ADDE:
- case ISD::SUB:
case ISD::SUBC:
case ISD::SUBE: {
if (N->getValueType(0) != MVT::i64)
MachineBasicBlock::iterator I(&MI);
unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
}
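
For illustration, a minimal IR shape the comment above is concerned
with (function and value names hypothetical): the i64 add produced
for the address should remain visible to selection so it can fold
into the load/store addressing instead of being custom-lowered away:

  define amdgpu_kernel void @fold_addr_example(i32 addrspace(1)* %base, i64 %idx) {
    %gep = getelementptr i32, i32 addrspace(1)* %base, i64 %idx
    %val = load i32, i32 addrspace(1)* %gep
    store i32 %val, i32 addrspace(1)* %base
    ret void
  }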
switch (MI.getOpcode()) {
- case AMDGPU::SI_INIT_M0:
+ case AMDGPU::S_ADD_U64_PSEUDO:
+ case AMDGPU::S_SUB_U64_PSEUDO: {
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ MachineOperand &Dest = MI.getOperand(0);
+ MachineOperand &Src0 = MI.getOperand(1);
+ MachineOperand &Src1 = MI.getOperand(2);
+
+ unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+ MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
+ Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
+ &AMDGPU::SReg_32_XM0RegClass);
+ MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
+ Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
+ &AMDGPU::SReg_32_XM0RegClass);
+
+ MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
+ Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
+ &AMDGPU::SReg_32_XM0RegClass);
+ MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
+ Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
+ &AMDGPU::SReg_32_XM0RegClass);
+
+ bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
+
+ unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
+ unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
+ BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
+ .add(Src0Sub0)
+ .add(Src1Sub0);
+ BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
+ .add(Src0Sub1)
+ .add(Src1Sub1);
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+ MI.eraseFromParent();
+ return BB;
+ }
+ case AMDGPU::SI_INIT_M0: {
BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
.add(MI.getOperand(0));
MI.eraseFromParent();
return BB;
-
+ }
case AMDGPU::SI_INIT_EXEC:
// This should be before all vector instructions.
BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
switch (Opcode) {
default:
break;
+ case AMDGPU::S_ADD_U64_PSEUDO:
+ case AMDGPU::S_SUB_U64_PSEUDO:
+ splitScalar64BitAddSub(Worklist, Inst);
+ Inst.eraseFromParent();
+ continue;
case AMDGPU::S_AND_B64:
splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
Inst.eraseFromParent();
addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
+void SIInstrInfo::splitScalar64BitAddSub(
+ SetVectorType &Worklist, MachineInstr &Inst) const {
+ bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
+
+ MachineBasicBlock &MBB = *Inst.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ unsigned CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ unsigned DeadCarryReg =
+     MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src0 = Inst.getOperand(1);
+ MachineOperand &Src1 = Inst.getOperand(2);
+ const DebugLoc &DL = Inst.getDebugLoc();
+ MachineBasicBlock::iterator MII = Inst;
+
+ const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
+ const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
+ const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
+ const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
+
+ MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+ AMDGPU::sub0, Src0SubRC);
+ MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
+ AMDGPU::sub0, Src1SubRC);
+
+ MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+ AMDGPU::sub1, Src0SubRC);
+ MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
+ AMDGPU::sub1, Src1SubRC);
+
+ unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
+ MachineInstr *LoHalf =
+ BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
+ .addReg(CarryReg, RegState::Define)
+ .add(SrcReg0Sub0)
+ .add(SrcReg1Sub0);
+
+ unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
+ MachineInstr *HiHalf =
+ BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
+ .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
+ .add(SrcReg0Sub1)
+ .add(SrcReg1Sub1)
+ .addReg(CarryReg, RegState::Kill);
+
+ BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+
+ MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+
+ // Try to legalize the operands in case we need to swap the order to keep it
+ // valid.
+ legalizeOperands(*LoHalf);
+ legalizeOperands(*HiHalf);
+
+ // Move all users of this moved value.
+ addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
+}
+
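
As a rough sketch of the sequence this split emits for an
S_ADD_U64_PSEUDO (MIR-shaped, register names illustrative):

  %lo, %carry       = V_ADD_I32_e64  %src0.sub0, %src1.sub0
  %hi, dead %dcarry = V_ADDC_U32_e64 %src0.sub1, %src1.sub1, killed %carry
  %dst              = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1

The carry def of the high half is marked dead, and legalizeOperands
may still commute operands afterwards to satisfy operand constraints.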
void SIInstrInfo::splitScalar64BitBinaryOp(
SetVectorType &Worklist, MachineInstr &Inst,
unsigned Opcode) const {
static unsigned getBranchOpcode(BranchPredicate Cond);
static BranchPredicate getBranchPredicate(unsigned Opcode);
+public:
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI,
MachineRegisterInfo &MRI,
MachineOperand &SuperReg,
const TargetRegisterClass *SuperRC,
unsigned SubIdx,
const TargetRegisterClass *SubRC) const;
-
+private:
void swapOperands(MachineInstr &Inst) const;
void lowerScalarAbs(SetVectorType &Worklist,
void splitScalar64BitUnaryOp(SetVectorType &Worklist,
MachineInstr &Inst, unsigned Opcode) const;
+ void splitScalar64BitAddSub(SetVectorType &Worklist,
+ MachineInstr &Inst) const;
+
void splitScalar64BitBinaryOp(SetVectorType &Worklist,
MachineInstr &Inst, unsigned Opcode) const;
let usesCustomInserter = 1;
}
-// 64-bit vector move instruction. This is mainly used by the SIFoldOperands
-// pass to enable folding of inline immediates.
+// 64-bit vector move instruction. This is mainly used by the
+// SIFoldOperands pass to enable folding of inline immediates.
def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
(ins VSrc_b64:$src0)>;
let Constraints = "$src = $vdst";
}
+
+let usesCustomInserter = 1, Defs = [SCC] in {
+def S_ADD_U64_PSEUDO : SPseudoInstSI <
+ (outs SReg_64:$vdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
+ [(set SReg_64:$vdst, (add i64:$src0, i64:$src1))]
+>;
+
+def S_SUB_U64_PSEUDO : SPseudoInstSI <
+ (outs SReg_64:$vdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
+ [(set SReg_64:$vdst, (sub i64:$src0, i64:$src1))]
+>;
+
+def S_ADDC_U64_PSEUDO : SPseudoInstSI <(outs SReg_64:$vdst, SReg_64:$sdst),
+ (ins SSrc_b64:$src0, SSrc_b64:$src1)>;
+def S_SUBC_U64_PSEUDO : SPseudoInstSI <(outs SReg_64:$vdst, SReg_64:$sdst),
+ (ins SSrc_b64:$src0, SSrc_b64:$src1)>;
+} // End usesCustomInserter = 1, Defs = [SCC]
+
let usesCustomInserter = 1, SALU = 1 in {
def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
[(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
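
Putting the pieces together (test name hypothetical), a uniform
64-bit add such as

  define amdgpu_kernel void @s_add_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
    %sum = add i64 %a, %b
    store i64 %sum, i64 addrspace(1)* %out
    ret void
  }

should now select to S_ADD_U64_PSEUDO, which the custom inserter
expands to s_add_u32/s_addc_u32; if the result is later found to be
divergent, splitScalar64BitAddSub produces the VALU form instead.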
; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
-; VI: flat_load_ushort v[[A_HI:[0-9]+]]
; VI: flat_load_ushort v[[A_LO:[0-9]+]]
-; VI: flat_load_ushort v[[B_HI:[0-9]+]]
+; VI: flat_load_ushort v[[A_HI:[0-9]+]]
; VI: flat_load_ushort v[[B_LO:[0-9]+]]
+; VI: flat_load_ushort v[[B_HI:[0-9]+]]
; VI: v_add_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
; VI-NOT: and
; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp_nnan_src:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
-; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 0, 1.0
define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32:
; SI: buffer_load_dword [[VAL0:v[0-9]+]],
; SI: buffer_load_dword [[VAL1:v[0-9]+]],
-; VI: flat_load_dword [[VAL1:v[0-9]+]],
; VI: flat_load_dword [[VAL0:v[0-9]+]],
+; VI: flat_load_dword [[VAL1:v[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], [[VAL1]], 0
; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
%tid = call i32 @llvm.r600.read.tidig.x()
%in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %tid
%in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %tid
- %val0 = load i32, i32 addrspace(1)* %in0.gep, align 4
- %val1 = load i32, i32 addrspace(1)* %in1.gep, align 4
+ %val0 = load volatile i32, i32 addrspace(1)* %in0.gep, align 4
+ %val1 = load volatile i32, i32 addrspace(1)* %in1.gep, align 4
%ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
%ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone
%add = add i32 %ctpop0, %ctpop1
; GCN-LABEL: {{^}}v_insertelement_v2i16_1:
; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e70000
; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
-; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e70000, [[VEC]]
-; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7
; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]]
+; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[VEC]]
+; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e70000, [[AND]]
+; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+
; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
; GCN-LABEL: {{^}}v_insertelement_v2f16_1:
; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0x45000000
; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
-; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x45000000, [[VEC]]
-; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x4500
; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]]
+; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[VEC]]
+; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x45000000, [[AND]]
+
+; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+
; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
-
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr:
; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
+
; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
-
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
-; VI: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-DAG: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; CIVI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16
; GCN-LABEL: {{^}}mad_i64_i32_sextops_i32_i128:
; CI: v_mad_u64_u32
; CI: v_mad_u64_u32
-; CI: v_mad_u64_u32
; CI: v_mad_i64_i32
+; CI: v_mad_u64_u32
+
; SI-NOT: v_mad_
define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
; SI: s_mul_i32
; SI: v_mul_hi_u32
; SI: s_mul_i32
+; SI: s_mul_i32
; SI-DAG: s_mul_i32
; SI-DAG: v_mul_hi_u32
; SI-DAG: s_mul_i32
; SI-DAG: v_mul_hi_u32
-; SI: s_mul_i32
-; SI: s_mul_i32
-; SI: s_mul_i32
-; SI: s_mul_i32
-; SI: s_mul_i32
-
-
; VI: s_mul_i32
-; VI: v_mul_hi_u32
; VI: v_mad_u64_u32
; VI: s_mul_i32
; VI: v_mul_hi_u32
; VI: v_mad_u64_u32
+; VI: v_mul_hi_u32
; VI: v_mad_u64_u32
+
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 {
%mul = mul i128 %a, %b
; GCN: {{buffer|flat}}_load_dwordx4
; GCN: {{buffer|flat}}_load_dwordx4
-; GCN-DAG: v_mul_lo_i32
-; GCN-DAG: v_mul_hi_u32
-; GCN-DAG: v_mul_hi_u32
-; GCN-DAG: v_mul_lo_i32
-; GCN-DAG: v_mul_hi_u32
-; GCN-DAG: v_mul_hi_u32
-; GCN-DAG: v_mul_lo_i32
-; GCN-DAG: v_mul_lo_i32
-; GCN-DAG: v_add_i32_e32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_add_i32_e32
; SI-DAG: v_mul_hi_u32
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_lo_i32
-; VI-DAG: v_mad_u64_u32
+; VI-DAG: v_mul_lo_i32
+; VI-DAG: v_mul_hi_u32
+; VI: v_mad_u64_u32
; VI: v_mad_u64_u32
; VI: v_mad_u64_u32
; SI: NumVgprs: {{[1-9]$}}
; stores may alias loads
-; VI: NumSgprs: {{[1-5][0-9]$}}
+; VI: NumSgprs: {{[0-9]$}}
; VI: NumVgprs: {{[1-3][0-9]$}}
define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %in_arg, float addrspace(1)* nocapture %out_arg) {
; set in vcc, which is undefined since the low scalar half add sets
; scc instead.
+; FIXME: SIShrinkInstructions should force immediate fold.
+
; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_0:
-; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0x18f, v{{[0-9]+}}
+; SI: s_movk_i32 [[K:s[0-9]+]], 0x18f
+; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, [[K]], v{{[0-9]+}}
; SI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc
define amdgpu_kernel void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %s.val) {
%v.val = load volatile i32, i32 addrspace(1)* %in
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i64:
-; VI: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
+; VI: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
; VI-DAG: v_sub_u16_e32 v[[ADD:[0-9]+]], [[A]], [[B]]
; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
%b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
- %a = load i16, i16 addrspace(1)* %in
+ %a = load i16, i16 addrspace(1)* %in
%b = load i16, i16 addrspace(1)* %b_ptr
%result = sub i16 %a, %b
store i16 %result, i16 addrspace(1)* %out
define amdgpu_kernel void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1
- %a = load <2 x i16>, <2 x i16> addrspace(1) * %in
+ %a = load <2 x i16>, <2 x i16> addrspace(1) * %in
%b = load <2 x i16>, <2 x i16> addrspace(1) * %b_ptr
%result = sub <2 x i16> %a, %b
store <2 x i16> %result, <2 x i16> addrspace(1)* %out
define amdgpu_kernel void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1
- %a = load <4 x i16>, <4 x i16> addrspace(1) * %in
+ %a = load <4 x i16>, <4 x i16> addrspace(1) * %in
%b = load <4 x i16>, <4 x i16> addrspace(1) * %b_ptr
%result = sub <4 x i16> %a, %b
store <4 x i16> %result, <4 x i16> addrspace(1)* %out
}
; FUNC-LABEL: {{^}}v_test_sub_v4i64:
-; SI: v_subrev_i32_e32
+; SI: v_sub_i32_e32
; SI: v_subb_u32_e32
-; SI: v_subrev_i32_e32
+; SI: v_sub_i32_e32
; SI: v_subb_u32_e32
-; SI: v_subrev_i32_e32
+; SI: v_sub_i32_e32
; SI: v_subb_u32_e32
-; SI: v_subrev_i32_e32
+; SI: v_sub_i32_e32
; SI: v_subb_u32_e32
define amdgpu_kernel void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) {
%tid = call i32 @llvm.r600.read.tidig.x() readnone
; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
-; VI: flat_load_ushort v[[A_HI:[0-9]+]]
; VI: flat_load_ushort v[[A_LO:[0-9]+]]
-; VI: flat_load_ushort v[[B_HI:[0-9]+]]
+; VI: flat_load_ushort v[[A_HI:[0-9]+]]
+
; VI: flat_load_ushort v[[B_LO:[0-9]+]]
+; VI: flat_load_ushort v[[B_HI:[0-9]+]]
; VI: v_sub_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
; VI-NOT: and