From: Matt Arsenault
Date: Wed, 15 Nov 2017 21:51:43 +0000 (+0000)
Subject: AMDGPU: Replace i64 add/sub lowering
X-Git-Tag: llvmorg-6.0.0-rc1~3298
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=301162c4fe0daa33b53f5e62c713cc9e42696800;p=platform%2Fupstream%2Fllvm.git

AMDGPU: Replace i64 add/sub lowering

Use VOP3 add/addc as usual. This has some tradeoffs: inline immediates
fold a little better, but other constants are worse off.
SIShrinkInstructions could be made smarter to handle these cases.

This also allows us to avoid selecting scalar adds where we would need
to track the carry in scc and replace its users, which makes it easier
to use the carry-less VALU adds.

llvm-svn: 318340
---

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index c9cefe3..b6449b9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -459,10 +459,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
   // We are selecting i64 ADD here instead of custom lower it during
   // DAG legalization, so we can fold some i64 ADDs used for address
   // calculation into the LOAD and STORE instructions.
-  case ISD::ADD:
   case ISD::ADDC:
   case ISD::ADDE:
-  case ISD::SUB:
   case ISD::SUBC:
   case ISD::SUBE: {
     if (N->getValueType(0) != MVT::i64)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 43c4be3..247a594 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2602,8 +2602,8 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
   MachineBasicBlock::iterator I(&MI);
   unsigned DstReg = MI.getOperand(0).getReg();
-  unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
-  unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+  unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
 
   BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
 
@@ -2954,13 +2954,57 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
   }
 
   switch (MI.getOpcode()) {
-  case AMDGPU::SI_INIT_M0:
+  case AMDGPU::S_ADD_U64_PSEUDO:
+  case AMDGPU::S_SUB_U64_PSEUDO: {
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    const DebugLoc &DL = MI.getDebugLoc();
+
+    MachineOperand &Dest = MI.getOperand(0);
+    MachineOperand &Src0 = MI.getOperand(1);
+    MachineOperand &Src1 = MI.getOperand(2);
+
+    unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+    unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+    MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
+     Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
+     &AMDGPU::SReg_32_XM0RegClass);
+    MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
+     Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
+     &AMDGPU::SReg_32_XM0RegClass);
+
+    MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
+     Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
+     &AMDGPU::SReg_32_XM0RegClass);
+    MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
+     Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
+     &AMDGPU::SReg_32_XM0RegClass);
+
+    bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
+
+    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
+    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
+    BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
+      .add(Src0Sub0)
+      .add(Src1Sub0);
+    BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
+      .add(Src0Sub1)
+      .add(Src1Sub1);
+    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
+      .addReg(DestSub0)
+      .addImm(AMDGPU::sub0)
+      .addReg(DestSub1)
+      .addImm(AMDGPU::sub1);
+    MI.eraseFromParent();
+    return BB;
+  }
+  case AMDGPU::SI_INIT_M0: {
     BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
             AMDGPU::M0)
         .add(MI.getOperand(0));
     MI.eraseFromParent();
     return BB;
-
+  }
   case AMDGPU::SI_INIT_EXEC:
     // This should be before all vector instructions.
     BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 5b85174..40db10b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3608,6 +3608,11 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
     switch (Opcode) {
     default:
       break;
+    case AMDGPU::S_ADD_U64_PSEUDO:
+    case AMDGPU::S_SUB_U64_PSEUDO:
+      splitScalar64BitAddSub(Worklist, Inst);
+      Inst.eraseFromParent();
+      continue;
     case AMDGPU::S_AND_B64:
       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
       Inst.eraseFromParent();
@@ -3956,6 +3961,74 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
 }
 
+void SIInstrInfo::splitScalar64BitAddSub(
+  SetVectorType &Worklist, MachineInstr &Inst) const {
+  bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
+
+  MachineBasicBlock &MBB = *Inst.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+  unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+  unsigned CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+  unsigned DeadCarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+
+  MachineOperand &Dest = Inst.getOperand(0);
+  MachineOperand &Src0 = Inst.getOperand(1);
+  MachineOperand &Src1 = Inst.getOperand(2);
+  const DebugLoc &DL = Inst.getDebugLoc();
+  MachineBasicBlock::iterator MII = Inst;
+
+  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
+  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
+  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
+  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
+
+  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+                                                       AMDGPU::sub0, Src0SubRC);
+  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
+                                                       AMDGPU::sub0, Src1SubRC);
+
+
+  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+                                                       AMDGPU::sub1, Src0SubRC);
+  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
+                                                       AMDGPU::sub1, Src1SubRC);
+
+  unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
+  MachineInstr *LoHalf =
+    BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
+    .addReg(CarryReg, RegState::Define)
+    .add(SrcReg0Sub0)
+    .add(SrcReg1Sub0);
+
+  unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
+  MachineInstr *HiHalf =
+    BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
+    .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
+    .add(SrcReg0Sub1)
+    .add(SrcReg1Sub1)
+    .addReg(CarryReg, RegState::Kill);
+
+  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+    .addReg(DestSub0)
+    .addImm(AMDGPU::sub0)
+    .addReg(DestSub1)
+    .addImm(AMDGPU::sub1);
+
+  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+
+  // Try to legalize the operands in case we need to swap the order to keep it
+  // valid.
+  legalizeOperands(*LoHalf);
+  legalizeOperands(*HiHalf);
+
+  // Move all users of this moved value.
+  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
+}
+
 void SIInstrInfo::splitScalar64BitBinaryOp(
   SetVectorType &Worklist, MachineInstr &Inst,
   unsigned Opcode) const {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 5e84e0c..acab71d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -60,6 +60,7 @@ private:
   static unsigned getBranchOpcode(BranchPredicate Cond);
   static BranchPredicate getBranchPredicate(unsigned Opcode);
 
+public:
  unsigned buildExtractSubReg(MachineBasicBlock::iterator MI,
                              MachineRegisterInfo &MRI,
                              MachineOperand &SuperReg,
@@ -72,7 +73,7 @@ private:
                                          const TargetRegisterClass *SuperRC,
                                          unsigned SubIdx,
                                          const TargetRegisterClass *SubRC) const;
-
+private:
   void swapOperands(MachineInstr &Inst) const;
 
   void lowerScalarAbs(SetVectorType &Worklist,
@@ -84,6 +85,9 @@ private:
   void splitScalar64BitUnaryOp(SetVectorType &Worklist,
                                MachineInstr &Inst, unsigned Opcode) const;
 
+  void splitScalar64BitAddSub(SetVectorType &Worklist,
+                              MachineInstr &Inst) const;
+
   void splitScalar64BitBinaryOp(SetVectorType &Worklist,
                                 MachineInstr &Inst, unsigned Opcode) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 6cee5be..4339c85 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -108,8 +108,8 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
   let usesCustomInserter = 1;
 }
 
-// 64-bit vector move instruction. This is mainly used by the SIFoldOperands
-// pass to enable folding of inline immediates.
+// 64-bit vector move instruction. This is mainly used by the
+// SIFoldOperands pass to enable folding of inline immediates.
 def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
                                       (ins VSrc_b64:$src0)>;
 
@@ -147,6 +147,24 @@ def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
   let Constraints = "$src = $vdst";
 }
 
+
+let usesCustomInserter = 1, Defs = [SCC] in {
+def S_ADD_U64_PSEUDO : SPseudoInstSI <
+  (outs SReg_64:$vdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
+  [(set SReg_64:$vdst, (add i64:$src0, i64:$src1))]
+>;
+
+def S_SUB_U64_PSEUDO : SPseudoInstSI <
+  (outs SReg_64:$vdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
+  [(set SReg_64:$vdst, (sub i64:$src0, i64:$src1))]
+>;
+
+def S_ADDC_U64_PSEUDO : SPseudoInstSI <(outs SReg_64:$vdst, SReg_64:$sdst),
+                                       (ins SSrc_b64:$src0, SSrc_b64:$src1)>;
+def S_SUBC_U64_PSEUDO : SPseudoInstSI <(outs SReg_64:$vdst, SReg_64:$sdst),
+                                       (ins SSrc_b64:$src0, SSrc_b64:$src1)>;
+} // End usesCustomInserter = 1, Defs = [SCC]
+
 let usesCustomInserter = 1, SALU = 1 in {
 def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
   [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index f999db3..a2f647c 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -163,10 +163,10 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(
 ; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
 ; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
 
-; VI: flat_load_ushort v[[A_HI:[0-9]+]]
 ; VI: flat_load_ushort v[[A_LO:[0-9]+]]
-; VI: flat_load_ushort v[[B_HI:[0-9]+]]
+; VI: flat_load_ushort v[[A_HI:[0-9]+]]
 ; VI: flat_load_ushort v[[B_LO:[0-9]+]]
+; VI: flat_load_ushort v[[B_HI:[0-9]+]]
 
 ; VI: v_add_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
 ; VI-NOT: and
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index 216ecf7..6832357 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -398,7 +398,8 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %ou
 
 ; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp_nnan_src:
 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
-; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 0, 1.0
 define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop.ll b/llvm/test/CodeGen/AMDGPU/ctpop.ll
index 5ea3903..2d16d40 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop.ll
@@ -44,8 +44,8 @@ define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrs
 ; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32:
 ; SI: buffer_load_dword [[VAL0:v[0-9]+]],
 ; SI: buffer_load_dword [[VAL1:v[0-9]+]],
-; VI: flat_load_dword [[VAL1:v[0-9]+]],
 ; VI: flat_load_dword [[VAL0:v[0-9]+]],
+; VI: flat_load_dword [[VAL1:v[0-9]+]],
 ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], [[VAL1]], 0
 ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
 ; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
@@ -58,8 +58,8 @@ define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out,
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %tid
   %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %tid
-  %val0 = load i32, i32 addrspace(1)* %in0.gep, align 4
-  %val1 = load i32, i32 addrspace(1)* %in1.gep, align 4
+  %val0 = load volatile i32, i32 addrspace(1)* %in0.gep, align 4
+  %val1 = load volatile i32, i32 addrspace(1)* %in1.gep, align 4
   %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
   %ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone
   %add = add i32 %ctpop0, %ctpop1
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index adfa43f..404ad60 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -261,13 +261,15 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace
 ; GCN-LABEL: {{^}}v_insertelement_v2i16_1:
 ; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e70000
 ; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
-; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e70000, [[VEC]]
-; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7
 ; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]]
 
+; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[VEC]]
+; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e70000, [[AND]]
+; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+
 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
 define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -345,13 +347,16 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspac
 ; GCN-LABEL: {{^}}v_insertelement_v2f16_1:
 ; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0x45000000
 ; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
-; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x45000000, [[VEC]]
-; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x4500
 ; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]]
 
+; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[VEC]]
+; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x45000000, [[AND]]
+
+; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+
 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
 define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -423,11 +428,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspac
 ; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
 ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
 ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
 ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
-
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
 
 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
 ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
@@ -452,11 +455,10 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspac
 ; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr:
 ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
+
 ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
 ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
-
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
 
 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
 ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index 5496976..72aac23 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -8,7 +8,7 @@
 ; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
 ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
 
-; VI: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-DAG: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
 ; CIVI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index b4d9d92..91a53ee 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -69,8 +69,9 @@ define i64 @mad_u64_u32_zextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
 ; GCN-LABEL: {{^}}mad_i64_i32_sextops_i32_i128:
 ; CI: v_mad_u64_u32
 ; CI: v_mad_u64_u32
-; CI: v_mad_u64_u32
 ; CI: v_mad_i64_i32
+; CI: v_mad_u64_u32
+
 ; SI-NOT: v_mad_
 
 define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index 555c65a..b35eefa 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -211,6 +211,7 @@ endif:
 ; SI: s_mul_i32
 ; SI: v_mul_hi_u32
 ; SI: s_mul_i32
+; SI: s_mul_i32
 
 ; SI-DAG: s_mul_i32
 ; SI-DAG: v_mul_hi_u32
@@ -219,22 +220,16 @@ endif:
 ; SI-DAG: s_mul_i32
 ; SI-DAG: v_mul_hi_u32
 
-; SI: s_mul_i32
-; SI: s_mul_i32
-; SI: s_mul_i32
-; SI: s_mul_i32
-; SI: s_mul_i32
-
-
 ; VI: s_mul_i32
-; VI: v_mul_hi_u32
 ; VI: v_mad_u64_u32
 ; VI: s_mul_i32
 ; VI: v_mul_hi_u32
 ; VI: v_mad_u64_u32
+; VI: v_mul_hi_u32
 ; VI: v_mad_u64_u32
 
+
 ; GCN: buffer_store_dwordx4
 define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 {
   %mul = mul i128 %a, %b
@@ -246,15 +241,15 @@ define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b)
 ; GCN: {{buffer|flat}}_load_dwordx4
 ; GCN: {{buffer|flat}}_load_dwordx4
 
-; GCN-DAG: v_mul_lo_i32
-; GCN-DAG: v_mul_hi_u32
-; GCN-DAG: v_mul_hi_u32
-; GCN-DAG: v_mul_lo_i32
-; GCN-DAG: v_mul_hi_u32
-; GCN-DAG: v_mul_hi_u32
-; GCN-DAG: v_mul_lo_i32
-; GCN-DAG: v_mul_lo_i32
-; GCN-DAG: v_add_i32_e32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_add_i32_e32
 
 ; SI-DAG: v_mul_hi_u32
 ; SI-DAG: v_mul_lo_i32
@@ -265,7 +260,9 @@ define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b)
 ; SI-DAG: v_mul_lo_i32
 ; SI-DAG: v_mul_lo_i32
 
-; VI-DAG: v_mad_u64_u32
+; VI-DAG: v_mul_lo_i32
+; VI-DAG: v_mul_hi_u32
+; VI: v_mad_u64_u32
 ; VI: v_mad_u64_u32
 ; VI: v_mad_u64_u32
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
index cae713b..2072cf5 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
@@ -7,7 +7,7 @@
 ; SI: NumVgprs: {{[1-9]$}}
 
 ; stores may alias loads
-; VI: NumSgprs: {{[1-5][0-9]$}}
+; VI: NumSgprs: {{[0-9]$}}
 ; VI: NumVgprs: {{[1-3][0-9]$}}
 
 define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %in_arg, float addrspace(1)* nocapture %out_arg) {
diff --git a/llvm/test/CodeGen/AMDGPU/split-scalar-i64-add.ll b/llvm/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
index 5d7d29d..59ddd61 100644
--- a/llvm/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
+++ b/llvm/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
@@ -7,8 +7,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() readnone
 ; set in vcc, which is undefined since the low scalar half add sets
 ; scc instead.
 
+; FIXME: SIShrinkInstructions should force immediate fold.
+
 ; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_0:
-; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0x18f, v{{[0-9]+}}
+; SI: s_movk_i32 [[K:s[0-9]+]], 0x18f
+; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, [[K]], v{{[0-9]+}}
 ; SI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc
 define amdgpu_kernel void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %s.val) {
   %v.val = load volatile i32, i32 addrspace(1)* %in
diff --git a/llvm/test/CodeGen/AMDGPU/sub.i16.ll b/llvm/test/CodeGen/AMDGPU/sub.i16.ll
index fd70f2b..3ad091a 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.i16.ll
@@ -85,9 +85,9 @@ define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i1
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
 ; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i64:
-; VI: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: flat_load_ushort [[B:v[0-9]+]]
+; VI: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
 ; VI-DAG: v_sub_u16_e32 v[[ADD:[0-9]+]], [[A]], [[B]]
 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
 define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll
index 46f1b12..4c573ac 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.ll
@@ -57,7 +57,7 @@ define amdgpu_kernel void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32
 ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 define amdgpu_kernel void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
-  %a = load i16, i16 addrspace(1)* %in 
+  %a = load i16, i16 addrspace(1)* %in
   %b = load i16, i16 addrspace(1)* %b_ptr
   %result = sub i16 %a, %b
   store i16 %result, i16 addrspace(1)* %out
@@ -71,7 +71,7 @@ define amdgpu_kernel void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)
 
 define amdgpu_kernel void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1
-  %a = load <2 x i16>, <2 x i16> addrspace(1) * %in 
+  %a = load <2 x i16>, <2 x i16> addrspace(1) * %in
   %b = load <2 x i16>, <2 x i16> addrspace(1) * %b_ptr
   %result = sub <2 x i16> %a, %b
   store <2 x i16> %result, <2 x i16> addrspace(1)* %out
@@ -87,7 +87,7 @@ define amdgpu_kernel void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16
 
 define amdgpu_kernel void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1
-  %a = load <4 x i16>, <4 x i16> addrspace(1) * %in 
+  %a = load <4 x i16>, <4 x i16> addrspace(1) * %in
   %b = load <4 x i16>, <4 x i16> addrspace(1) * %b_ptr
   %result = sub <4 x i16> %a, %b
   store <4 x i16> %result, <4 x i16> addrspace(1)* %out
@@ -146,13 +146,13 @@ define amdgpu_kernel void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i
 }
 
 ; FUNC-LABEL: {{^}}v_test_sub_v4i64:
-; SI: v_subrev_i32_e32
+; SI: v_sub_i32_e32
 ; SI: v_subb_u32_e32
-; SI: v_subrev_i32_e32
+; SI: v_sub_i32_e32
 ; SI: v_subb_u32_e32
-; SI: v_subrev_i32_e32
+; SI: v_sub_i32_e32
 ; SI: v_subb_u32_e32
-; SI: v_subrev_i32_e32
+; SI: v_sub_i32_e32
 ; SI: v_subb_u32_e32
 define amdgpu_kernel void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) {
   %tid = call i32 @llvm.r600.read.tidig.x() readnone
diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
index 6e76575..620f38e5 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -160,10 +160,11 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(
 ; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
 ; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
 
-; VI: flat_load_ushort v[[A_HI:[0-9]+]]
 ; VI: flat_load_ushort v[[A_LO:[0-9]+]]
-; VI: flat_load_ushort v[[B_HI:[0-9]+]]
+; VI: flat_load_ushort v[[A_HI:[0-9]+]]
+
 ; VI: flat_load_ushort v[[B_LO:[0-9]+]]
+; VI: flat_load_ushort v[[B_HI:[0-9]+]]
 
 ; VI: v_sub_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
 ; VI-NOT: and
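
For reference, a minimal sketch of the kind of input this lowering targets
(illustrative only; this kernel is not taken from the commit's tests). Both
operands of the add below are SGPR kernel arguments, so the uniform i64 add
matches the (add i64:$src0, i64:$src1) pattern on S_ADD_U64_PSEUDO and the
custom inserter expands it to s_add_u32 + s_addc_u32. If moveToVALU later
finds the operands divergent, splitScalar64BitAddSub instead emits
v_add_i32_e64 + v_addc_u32_e64, carrying through a virtual SReg_64_XEXEC
register.

define amdgpu_kernel void @example_s_add_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
  ; Uniform 64-bit add: selected as S_ADD_U64_PSEUDO by the new patterns.
  %sum = add i64 %a, %b
  store i64 %sum, i64 addrspace(1)* %out
  ret void
}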