Use VOP3 add/addc, as usual.
This has some tradeoffs. Inline immediates fold
a little better, but other constants are worse off.
SIShrinkInstructions could be made smarter to handle
these cases.
This avoids selecting scalar adds where we would need
to track the carry in scc and replace its users, which
makes it easier to use the carryless VALU adds.
llvm-svn: 318340
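
As a sketch of the two expansions in play here (register numbers
illustrative, not taken from any test in this patch): a uniform i64
add splits into a scalar pair whose carry lives in scc, while the
divergent form uses the VALU pair with an explicit carry register:

  ; uniform: carry is implicit in scc
  s_add_u32  s0, s0, s2
  s_addc_u32 s1, s1, s3

  ; divergent: carry is an explicit lane mask (vcc here)
  v_add_i32_e32  v0, vcc, v0, v2
  v_addc_u32_e32 v1, vcc, v1, v3, vcc

Keeping the 64-bit add as a pseudo until after selection lets the
backend pick either pair late, without rewriting users of scc.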
// We select i64 ADD here instead of custom-lowering it during
// DAG legalization, so we can fold some i64 ADDs used for address
// calculation into the LOAD and STORE instructions.
- case ISD::ADD:
case ISD::ADDC:
case ISD::ADDE:
- case ISD::SUB:
case ISD::SUBC:
case ISD::SUBE: {
if (N->getValueType(0) != MVT::i64)
MachineBasicBlock::iterator I(&MI);
unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
}
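
For illustration, a minimal IR shape the comment above is concerned
with (function and value names hypothetical): the i64 add produced
for the address should remain visible to selection so it can fold
into the load/store addressing instead of being custom-lowered away:

  define amdgpu_kernel void @fold_addr_example(i32 addrspace(1)* %base, i64 %idx) {
    %gep = getelementptr i32, i32 addrspace(1)* %base, i64 %idx
    %val = load i32, i32 addrspace(1)* %gep
    store i32 %val, i32 addrspace(1)* %base
    ret void
  }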
switch (MI.getOpcode()) {
- case AMDGPU::SI_INIT_M0:
+ case AMDGPU::S_ADD_U64_PSEUDO:
+ case AMDGPU::S_SUB_U64_PSEUDO: {
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ MachineOperand &Dest = MI.getOperand(0);
+ MachineOperand &Src0 = MI.getOperand(1);
+ MachineOperand &Src1 = MI.getOperand(2);
+
+ unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+ MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
+ Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
+ &AMDGPU::SReg_32_XM0RegClass);
+ MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
+ Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
+ &AMDGPU::SReg_32_XM0RegClass);
+
+ MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
+ Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
+ &AMDGPU::SReg_32_XM0RegClass);
+ MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
+ Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
+ &AMDGPU::SReg_32_XM0RegClass);
+
+ bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
+
+ unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
+ unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
+ BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
+ .add(Src0Sub0)
+ .add(Src1Sub0);
+ BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
+ .add(Src0Sub1)
+ .add(Src1Sub1);
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+ MI.eraseFromParent();
+ return BB;
+ }
+ case AMDGPU::SI_INIT_M0: {
BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
.add(MI.getOperand(0));
MI.eraseFromParent();
return BB;
-
+ }
case AMDGPU::SI_INIT_EXEC:
// This should be before all vector instructions.
BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
switch (Opcode) {
default:
break;
+ case AMDGPU::S_ADD_U64_PSEUDO:
+ case AMDGPU::S_SUB_U64_PSEUDO:
+ splitScalar64BitAddSub(Worklist, Inst);
+ Inst.eraseFromParent();
+ continue;
case AMDGPU::S_AND_B64:
splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
Inst.eraseFromParent();
addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
+void SIInstrInfo::splitScalar64BitAddSub(
+ SetVectorType &Worklist, MachineInstr &Inst) const {
+ bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
+
+ MachineBasicBlock &MBB = *Inst.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ unsigned CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ unsigned DeadCarryReg =
+     MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src0 = Inst.getOperand(1);
+ MachineOperand &Src1 = Inst.getOperand(2);
+ const DebugLoc &DL = Inst.getDebugLoc();
+ MachineBasicBlock::iterator MII = Inst;
+
+ const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
+ const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
+ const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
+ const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
+
+ MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+ AMDGPU::sub0, Src0SubRC);
+ MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
+ AMDGPU::sub0, Src1SubRC);
+
+ MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+ AMDGPU::sub1, Src0SubRC);
+ MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
+ AMDGPU::sub1, Src1SubRC);
+
+ unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
+ MachineInstr *LoHalf =
+ BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
+ .addReg(CarryReg, RegState::Define)
+ .add(SrcReg0Sub0)
+ .add(SrcReg1Sub0);
+
+ unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
+ MachineInstr *HiHalf =
+ BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
+ .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
+ .add(SrcReg0Sub1)
+ .add(SrcReg1Sub1)
+ .addReg(CarryReg, RegState::Kill);
+
+ BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+
+ MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+
+ // Try to legalize the operands in case we need to swap the order to keep it
+ // valid.
+ legalizeOperands(*LoHalf);
+ legalizeOperands(*HiHalf);
+
+ // Move all users of this moved value.
+ addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
+}
+
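
As a rough sketch of the sequence this split emits for an
S_ADD_U64_PSEUDO (MIR-shaped, register names illustrative):

  %lo, %carry       = V_ADD_I32_e64  %src0.sub0, %src1.sub0
  %hi, dead %dcarry = V_ADDC_U32_e64 %src0.sub1, %src1.sub1, killed %carry
  %dst              = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1

The carry def of the high half is marked dead, and legalizeOperands
may still commute operands afterwards to satisfy operand constraints.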
void SIInstrInfo::splitScalar64BitBinaryOp(
SetVectorType &Worklist, MachineInstr &Inst,
unsigned Opcode) const {
static unsigned getBranchOpcode(BranchPredicate Cond);
static BranchPredicate getBranchPredicate(unsigned Opcode);
+public:
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI,
MachineRegisterInfo &MRI,
MachineOperand &SuperReg,
const TargetRegisterClass *SuperRC,
unsigned SubIdx,
const TargetRegisterClass *SubRC) const;
-
+private:
void swapOperands(MachineInstr &Inst) const;
void lowerScalarAbs(SetVectorType &Worklist,
void splitScalar64BitUnaryOp(SetVectorType &Worklist,
MachineInstr &Inst, unsigned Opcode) const;
+ void splitScalar64BitAddSub(SetVectorType &Worklist,
+ MachineInstr &Inst) const;
+
void splitScalar64BitBinaryOp(SetVectorType &Worklist,
MachineInstr &Inst, unsigned Opcode) const;
let usesCustomInserter = 1;
}
-// 64-bit vector move instruction. This is mainly used by the SIFoldOperands
-// pass to enable folding of inline immediates.
+// 64-bit vector move instruction. This is mainly used by the
+// SIFoldOperands pass to enable folding of inline immediates.
def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
(ins VSrc_b64:$src0)>;
let Constraints = "$src = $vdst";
}
+
+let usesCustomInserter = 1, Defs = [SCC] in {
+def S_ADD_U64_PSEUDO : SPseudoInstSI <
+ (outs SReg_64:$vdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
+ [(set SReg_64:$vdst, (add i64:$src0, i64:$src1))]
+>;
+
+def S_SUB_U64_PSEUDO : SPseudoInstSI <
+ (outs SReg_64:$vdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
+ [(set SReg_64:$vdst, (sub i64:$src0, i64:$src1))]
+>;
+
+def S_ADDC_U64_PSEUDO : SPseudoInstSI <(outs SReg_64:$vdst, SReg_64:$sdst),
+ (ins SSrc_b64:$src0, SSrc_b64:$src1)>;
+def S_SUBC_U64_PSEUDO : SPseudoInstSI <(outs SReg_64:$vdst, SReg_64:$sdst),
+ (ins SSrc_b64:$src0, SSrc_b64:$src1)>;
+} // End usesCustomInserter = 1, Defs = [SCC]
+
let usesCustomInserter = 1, SALU = 1 in {
def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
[(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
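
Putting the pieces together (test name hypothetical), a uniform
64-bit add such as

  define amdgpu_kernel void @s_add_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
    %sum = add i64 %a, %b
    store i64 %sum, i64 addrspace(1)* %out
    ret void
  }

should now select to S_ADD_U64_PSEUDO, which the custom inserter
expands to s_add_u32/s_addc_u32; if the result is later found to be
divergent, splitScalar64BitAddSub produces the VALU form instead.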
; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
-; VI: flat_load_ushort v[[A_HI:[0-9]+]]
; VI: flat_load_ushort v[[A_LO:[0-9]+]]
-; VI: flat_load_ushort v[[B_HI:[0-9]+]]
+; VI: flat_load_ushort v[[A_HI:[0-9]+]]
; VI: flat_load_ushort v[[B_LO:[0-9]+]]
+; VI: flat_load_ushort v[[B_HI:[0-9]+]]
; VI: v_add_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
; VI-NOT: and
; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp_nnan_src:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
-; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 0, 1.0
define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32:
; SI: buffer_load_dword [[VAL0:v[0-9]+]],
; SI: buffer_load_dword [[VAL1:v[0-9]+]],
-; VI: flat_load_dword [[VAL1:v[0-9]+]],
; VI: flat_load_dword [[VAL0:v[0-9]+]],
+; VI: flat_load_dword [[VAL1:v[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], [[VAL1]], 0
; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
%tid = call i32 @llvm.r600.read.tidig.x()
%in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %tid
%in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %tid
- %val0 = load i32, i32 addrspace(1)* %in0.gep, align 4
- %val1 = load i32, i32 addrspace(1)* %in1.gep, align 4
+ %val0 = load volatile i32, i32 addrspace(1)* %in0.gep, align 4
+ %val1 = load volatile i32, i32 addrspace(1)* %in1.gep, align 4
%ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
%ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone
%add = add i32 %ctpop0, %ctpop1
; GCN-LABEL: {{^}}v_insertelement_v2i16_1:
; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e70000
; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
-; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e70000, [[VEC]]
-; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7
; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]]
+; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[VEC]]
+; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e70000, [[AND]]
+; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+
; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
; GCN-LABEL: {{^}}v_insertelement_v2f16_1:
; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0x45000000
; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
-; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x45000000, [[VEC]]
-; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x4500
; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]]
+; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[VEC]]
+; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x45000000, [[AND]]
+
+; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+
; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
-
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr:
; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
+
; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
-
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
-; VI: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-DAG: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; CIVI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16
; GCN-LABEL: {{^}}mad_i64_i32_sextops_i32_i128:
; CI: v_mad_u64_u32
; CI: v_mad_u64_u32
-; CI: v_mad_u64_u32
; CI: v_mad_i64_i32
+; CI: v_mad_u64_u32
+
; SI-NOT: v_mad_
define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
; SI: s_mul_i32
; SI: v_mul_hi_u32
; SI: s_mul_i32
+; SI: s_mul_i32
; SI-DAG: s_mul_i32
; SI-DAG: v_mul_hi_u32
; SI-DAG: s_mul_i32
; SI-DAG: v_mul_hi_u32
-; SI: s_mul_i32
-; SI: s_mul_i32
-; SI: s_mul_i32
-; SI: s_mul_i32
-; SI: s_mul_i32
-
-
; VI: s_mul_i32
-; VI: v_mul_hi_u32
; VI: v_mad_u64_u32
; VI: s_mul_i32
; VI: v_mul_hi_u32
; VI: v_mad_u64_u32
+; VI: v_mul_hi_u32
; VI: v_mad_u64_u32
+
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 {
%mul = mul i128 %a, %b
; GCN: {{buffer|flat}}_load_dwordx4
; GCN: {{buffer|flat}}_load_dwordx4
-; GCN-DAG: v_mul_lo_i32
-; GCN-DAG: v_mul_hi_u32
-; GCN-DAG: v_mul_hi_u32
-; GCN-DAG: v_mul_lo_i32
-; GCN-DAG: v_mul_hi_u32
-; GCN-DAG: v_mul_hi_u32
-; GCN-DAG: v_mul_lo_i32
-; GCN-DAG: v_mul_lo_i32
-; GCN-DAG: v_add_i32_e32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_add_i32_e32
; SI-DAG: v_mul_hi_u32
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_lo_i32
-; VI-DAG: v_mad_u64_u32
+; VI-DAG: v_mul_lo_i32
+; VI-DAG: v_mul_hi_u32
+; VI: v_mad_u64_u32
; VI: v_mad_u64_u32
; VI: v_mad_u64_u32
; SI: NumVgprs: {{[1-9]$}}
; stores may alias loads
-; VI: NumSgprs: {{[1-5][0-9]$}}
+; VI: NumSgprs: {{[0-9]$}}
; VI: NumVgprs: {{[1-3][0-9]$}}
define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %in_arg, float addrspace(1)* nocapture %out_arg) {
; set in vcc, which is undefined since the low scalar half add sets
; scc instead.
+; FIXME: SIShrinkInstructions should force immediate fold.
+
; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_0:
-; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0x18f, v{{[0-9]+}}
+; SI: s_movk_i32 [[K:s[0-9]+]], 0x18f
+; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, [[K]], v{{[0-9]+}}
; SI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc
define amdgpu_kernel void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %s.val) {
%v.val = load volatile i32, i32 addrspace(1)* %in
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i64:
-; VI: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
+; VI: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
; VI-DAG: v_sub_u16_e32 v[[ADD:[0-9]+]], [[A]], [[B]]
; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
%b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
- %a = load i16, i16 addrspace(1)* %in
+ %a = load i16, i16 addrspace(1)* %in
%b = load i16, i16 addrspace(1)* %b_ptr
%result = sub i16 %a, %b
store i16 %result, i16 addrspace(1)* %out
define amdgpu_kernel void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1
- %a = load <2 x i16>, <2 x i16> addrspace(1) * %in
+ %a = load <2 x i16>, <2 x i16> addrspace(1) * %in
%b = load <2 x i16>, <2 x i16> addrspace(1) * %b_ptr
%result = sub <2 x i16> %a, %b
store <2 x i16> %result, <2 x i16> addrspace(1)* %out
define amdgpu_kernel void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1
- %a = load <4 x i16>, <4 x i16> addrspace(1) * %in
+ %a = load <4 x i16>, <4 x i16> addrspace(1) * %in
%b = load <4 x i16>, <4 x i16> addrspace(1) * %b_ptr
%result = sub <4 x i16> %a, %b
store <4 x i16> %result, <4 x i16> addrspace(1)* %out
}
; FUNC-LABEL: {{^}}v_test_sub_v4i64:
-; SI: v_subrev_i32_e32
+; SI: v_sub_i32_e32
; SI: v_subb_u32_e32
-; SI: v_subrev_i32_e32
+; SI: v_sub_i32_e32
; SI: v_subb_u32_e32
-; SI: v_subrev_i32_e32
+; SI: v_sub_i32_e32
; SI: v_subb_u32_e32
-; SI: v_subrev_i32_e32
+; SI: v_sub_i32_e32
; SI: v_subb_u32_e32
define amdgpu_kernel void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) {
%tid = call i32 @llvm.r600.read.tidig.x() readnone
; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
-; VI: flat_load_ushort v[[A_HI:[0-9]+]]
; VI: flat_load_ushort v[[A_LO:[0-9]+]]
-; VI: flat_load_ushort v[[B_HI:[0-9]+]]
+; VI: flat_load_ushort v[[A_HI:[0-9]+]]
+
; VI: flat_load_ushort v[[B_LO:[0-9]+]]
+; VI: flat_load_ushort v[[B_HI:[0-9]+]]
; VI: v_sub_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
; VI-NOT: and