AMDGPU: Improve offset folding for register indexing

author Matt Arsenault <Matthew.Arsenault@amd.com>

Sat, 9 Jul 2016 01:13:56 +0000 (01:13 +0000)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Sat, 9 Jul 2016 01:13:56 +0000 (01:13 +0000)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Sat, 9 Jul 2016 01:13:56 +0000 (01:13 +0000)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Sat, 9 Jul 2016 01:13:56 +0000 (01:13 +0000)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

index 157b06b..fcaa6f9 100644 (file)
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -129,6 +129,10 @@ private:
    bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
    bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
    bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const;
+  bool selectMOVRELOffsetImpl(SDValue Index, SDValue &Base,
+                              SDValue &Offset, bool IsInsert) const;
+  bool selectMOVRELSOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
+  bool selectMOVRELDOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
    bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
    bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
    bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
@@ -1189,6 +1193,51 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgpr(SDValue Addr,
           !isa<ConstantSDNode>(Offset);
  }
  
+bool AMDGPUDAGToDAGISel::selectMOVRELOffsetImpl(SDValue Index,
+                                                SDValue &Base,
+                                                SDValue &Offset,
+                                                bool IsInsert) const {
+  SDLoc DL(Index);
+
+  if (CurDAG->isBaseWithConstantOffset(Index)) {
+    SDValue N0 = Index.getOperand(0);
+    SDValue N1 = Index.getOperand(1);
+    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+
+    // (add n0, c0)
+    Base = N0;
+    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
+    return true;
+  }
+
+  if (IsInsert) {
+    if (ConstantSDNode *CBase = dyn_cast<ConstantSDNode>(Index)) {
+      Base = CurDAG->getRegister(AMDGPU::NoRegister, MVT::i32);
+      Offset = CurDAG->getTargetConstant(CBase->getZExtValue(), DL, MVT::i32);
+      return true;
+    }
+  } else {
+    if (isa<ConstantSDNode>(Index))
+      return false;
+  }
+
+  Base = Index;
+  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::selectMOVRELSOffset(SDValue Index,
+                                             SDValue &Base,
+                                             SDValue &Offset) const {
+  return selectMOVRELOffsetImpl(Index, Base, Offset, false);
+}
+
+bool AMDGPUDAGToDAGISel::selectMOVRELDOffset(SDValue Index,
+                                             SDValue &Base,
+                                             SDValue &Offset) const {
+  return selectMOVRELOffsetImpl(Index, Base, Offset, true);
+}
+
  SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, const SDLoc &DL,
                                       SDValue Val, uint32_t Offset,
                                       uint32_t Width) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

index 213cdc3..798ff08 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1712,7 +1712,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
  
      if (RegClass != -1) {
        unsigned Reg = MI.getOperand(i).getReg();
-      if (TargetRegisterInfo::isVirtualRegister(Reg))
+      if (Reg == AMDGPU::NoRegister ||
+          TargetRegisterInfo::isVirtualRegister(Reg))
          continue;
  
        const TargetRegisterClass *RC = RI.getRegClass(RegClass);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td

index 9ae5851..88d133f 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -624,6 +624,9 @@ def SMRDBufferImm   : ComplexPattern<i32, 1, "SelectSMRDBufferImm">;
  def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">;
  def SMRDBufferSgpr  : ComplexPattern<i32, 1, "SelectSMRDBufferSgpr">;
  
+def MOVRELSOffset : ComplexPattern<i32, 2, "selectMOVRELSOffset">;
+def MOVRELDOffset : ComplexPattern<i32, 2, "selectMOVRELDOffset">;
+
  def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">;
  def VOP3NoMods0 : ComplexPattern<untyped, 4, "SelectVOP3NoMods0">;
  def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td

index 78bb1c1..3eebd1b 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2021,12 +2021,11 @@ let Uses = [EXEC], Defs = [EXEC, VCC, M0],
  
  class SI_INDIRECT_SRC<RegisterClass rc> : InstSI <
    (outs VGPR_32:$vdst, SReg_64:$sdst),
-  (ins rc:$src, VSrc_32:$idx, i32imm:$offset)
->;
+  (ins rc:$src, VS_32:$idx, i32imm:$offset)>;
  
  class SI_INDIRECT_DST<RegisterClass rc> : InstSI <
    (outs rc:$vdst, SReg_64:$sdst),
-  (ins unknown:$src, VSrc_32:$idx, i32imm:$offset, VGPR_32:$val)> {
+  (ins unknown:$src, VS_32:$idx, i32imm:$offset, VGPR_32:$val)> {
    let Constraints = "$src = $vdst";
  }
  
@@ -3308,29 +3307,16 @@ def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>;
  /********** ====================== **********/
  
  multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> {
-
-  // 1. Extract with offset
-  def : Pat<
-    (eltvt (extractelt vt:$vec, (add i32:$idx, imm:$off))),
-    (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, imm:$off)
-  >;
-
-  // 2. Extract without offset
-  def : Pat<
-    (eltvt (extractelt vt:$vec, i32:$idx)),
-    (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, 0)
-  >;
-
-  // 3. Insert with offset
+  // Extract with offset
    def : Pat<
-    (insertelt vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)),
-    (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $vec, $idx, imm:$off, $val)
+    (eltvt (extractelt vt:$src, (MOVRELSOffset i32:$idx, (i32 imm:$offset)))),
+    (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $src, $idx, imm:$offset)
    >;
  
-  // 4. Insert without offset
+  // Insert with offset
    def : Pat<
-    (insertelt vt:$vec, eltvt:$val, i32:$idx),
-    (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $vec, $idx, 0, $val)
+    (insertelt vt:$src, eltvt:$val, (MOVRELDOffset i32:$idx, (i32 imm:$offset))),
+    (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $src, $idx, imm:$offset, $val)
    >;
  }
  
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp

index 930fcb9..1ba14cd 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -435,7 +435,7 @@ void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB,
    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
      .addReg(AMDGPU::VCC);
  
-  if (Offset) {
+  if (Offset != 0) {
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
        .addReg(AMDGPU::M0)
        .addImm(Offset);
@@ -463,7 +463,7 @@ bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offs
    const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
  
    if (AMDGPU::SReg_32RegClass.contains(Idx->getReg())) {
-    if (Offset) {
+    if (Offset != 0) {
        BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
          .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()))
          .addImm(Offset);
@@ -520,16 +520,17 @@ bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offs
    return true;
  }
  
-/// \param @VecReg The register which holds element zero of the vector
-///                 being addressed into.
-/// \param[out] @Reg The base register to use in the indirect addressing instruction.
-/// \param[in,out] @Offset As an input, this is the constant offset part of the
-//                         indirect Index. e.g. v0 = v[VecReg + Offset]
-//                         As an output, this is a constant value that needs
-//                         to be added to the value stored in M0.
+/// \param VecReg The register which holds element zero of the vector being
+///               addressed into.
+///
+/// \param Offset The constant offset part of the indirect index,
+///               e.g. v0 = v[VecReg + Offset].
+///
+/// \returns A pair holding the base register to use in the indirect
+///          addressing instruction and the constant value that needs to be
+///          added to the value stored in M0.
+///
+/// The index operand of the movrel instruction is not passed in. It must be a
+/// register, but may be NoRegister; callers check for that themselves.
  std::pair<unsigned, int>
-SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg,
-                                                int Offset) const {
+SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg, int Offset) const {
    unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0);
    if (!SubReg)
      SubReg = VecReg;
@@ -560,42 +561,59 @@ SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg,
  // Return true if a new block was inserted.
  bool SILowerControlFlow::indirectSrc(MachineInstr &MI) {
    MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
+  const DebugLoc &DL = MI.getDebugLoc();
  
    unsigned Dst = MI.getOperand(0).getReg();
    const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
-  int Off = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
+  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
    unsigned Reg;
  
-  std::tie(Reg, Off) = computeIndirectRegAndOffset(SrcVec->getReg(), Off);
+  std::tie(Reg, Offset) = computeIndirectRegAndOffset(SrcVec->getReg(), Offset);
+
+  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+  if (Idx->getReg() == AMDGPU::NoRegister) {
+    // Only had a constant offset, copy the register directly.
+    BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
+      .addReg(Reg, getUndefRegState(SrcVec->isUndef()));
+    MI.eraseFromParent();
+    return false;
+  }
  
    MachineInstr *MovRel =
      BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
      .addReg(Reg, getUndefRegState(SrcVec->isUndef()))
      .addReg(SrcVec->getReg(), RegState::Implicit);
  
-  return loadM0(MI, MovRel, Off);
+  return loadM0(MI, MovRel, Offset);
  }
  
  // Return true if a new block was inserted.
  bool SILowerControlFlow::indirectDst(MachineInstr &MI) {
    MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
+  const DebugLoc &DL = MI.getDebugLoc();
  
    unsigned Dst = MI.getOperand(0).getReg();
-  int Off = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
-  MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
+  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
    unsigned Reg;
  
-  std::tie(Reg, Off) = computeIndirectRegAndOffset(Dst, Off);
+  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
+  std::tie(Reg, Offset) = computeIndirectRegAndOffset(Dst, Offset);
+
+  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+  if (Idx->getReg() == AMDGPU::NoRegister) {
+    // Only had a constant offset, copy the register directly.
+    BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Reg)
+      .addOperand(*Val);
+    MI.eraseFromParent();
+    return false;
+  }
  
    MachineInstr *MovRel =
-    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
-    .addReg(Reg, RegState::Define)
+    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32), Reg)
      .addReg(Val->getReg(), getUndefRegState(Val->isUndef()))
      .addReg(Dst, RegState::Implicit);
  
-  return loadM0(MI, MovRel, Off);
+  return loadM0(MI, MovRel, Offset);
  }
  
  bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll

index 9f57dd2..66cec88 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -452,6 +452,39 @@ entry:
    ret void
  }
  
+; Test that the or is folded into the base address register instead of
+; added to m0
+
+; The constant 1 from the `or` must not reach m0: m0 is loaded straight from
+; the shifted index. An extract reads through the relative-source move
+; (v_movrels), not the relative-destination move (v_movreld).
+; GCN-LABEL: {{^}}extractelement_v4i32_or_index:
+; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
+; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
+; GCN-NOT: [[IDX_SHL]]
+; GCN: s_mov_b32 m0, [[IDX_SHL]]
+; GCN: v_movrels_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+define void @extractelement_v4i32_or_index(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx.in) {
+entry:
+  %ld = load volatile <4 x i32>, <4  x i32> addrspace(1)* %in
+  %idx.shl = shl i32 %idx.in, 2
+  %idx = or i32 %idx.shl, 1
+  %value = extractelement <4 x i32> %ld, i32 %idx
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}insertelement_v4f32_or_index:
+; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
+; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
+; GCN-NOT: [[IDX_SHL]]
+; GCN: s_mov_b32 m0, [[IDX_SHL]]
+; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+define void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %idx.in) nounwind {
+  %idx.shl = shl i32 %idx.in, 2
+  %idx = or i32 %idx.shl, 1
+  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %idx
+  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
+  ret void
+}
+
  declare i32 @llvm.amdgcn.workitem.id.x() #1
  
  attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll

index 93001e4..3e6905f 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -339,9 +339,9 @@ endif:
  ; FIXME: Should be able to manipulate m0 directly instead of add and
  ; copy.
  
-; GCN: s_or_b32 [[IDX1:s[0-9]+]], [[SCALEDIDX]], 1
+; FIXME: Should avoid resetting m0 to same value
  ; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000
-; GCN-DAG: s_mov_b32 m0, [[IDX1]]
+; GCN-DAG: s_mov_b32 m0, [[SCALEDIDX]]
  ; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]]
  
  ; GCN: buffer_store_dwordx4
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Sat, 9 Jul 2016 01:13:56 +0000 (01:13 +0000)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Sat, 9 Jul 2016 01:13:56 +0000 (01:13 +0000)
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp		patch \| blob \| history
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp		patch \| blob \| history
llvm/lib/Target/AMDGPU/SIInstrInfo.td		patch \| blob \| history
llvm/lib/Target/AMDGPU/SIInstructions.td		patch \| blob \| history
llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll		patch \| blob \| history