[AMDGPU] Fix ds_read2/write2 with unaligned offsets

author Jay Foad <jay.foad@amd.com>
Mon, 2 Nov 2020 13:05:15 +0000 (13:05 +0000)
committer Jay Foad <jay.foad@amd.com>
Tue, 3 Nov 2020 15:16:10 +0000 (15:16 +0000)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 994b97b..92bebe6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -200,15 +200,16 @@ private:
    const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
    virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
    virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
-  bool isDSOffsetLegal(SDValue Base, unsigned Offset,
-                       unsigned OffsetBits) const;
+  bool isDSOffsetLegal(SDValue Base, unsigned Offset) const;
+  bool isDSOffset2Legal(SDValue Base, unsigned Offset0, unsigned Offset1,
+                        unsigned Size) const;
    bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
    bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
                                   SDValue &Offset1) const;
    bool SelectDS128Bit8ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
                                    SDValue &Offset1) const;
    bool SelectDSReadWrite2(SDValue Ptr, SDValue &Base, SDValue &Offset0,
-                          SDValue &Offset1, bool IsDS128) const;
+                          SDValue &Offset1, unsigned Size) const;
    bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
                     SDValue &SOffset, SDValue &Offset, SDValue &Offen,
                     SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
@@ -1155,13 +1156,11 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
    CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
  }
  
-bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset,
-                                         unsigned OffsetBits) const {
-  if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
-      (OffsetBits == 8 && !isUInt<8>(Offset)))
+bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
+  if (!isUInt<16>(Offset))
      return false;
  
-  if (Subtarget->hasUsableDSOffset() ||
+  if (!Base || Subtarget->hasUsableDSOffset() ||
        Subtarget->unsafeDSOffsetFoldingEnabled())
      return true;
  
@@ -1177,7 +1176,7 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
      SDValue N0 = Addr.getOperand(0);
      SDValue N1 = Addr.getOperand(1);
      ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
-    if (isDSOffsetLegal(N0, C1->getSExtValue(), 16)) {
+    if (isDSOffsetLegal(N0, C1->getSExtValue())) {
        // (add n0, c0)
        Base = N0;
        Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
@@ -1187,7 +1186,7 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
      // sub C, x -> add (sub 0, x), C
      if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
        int64_t ByteOffset = C->getSExtValue();
-      if (isUInt<16>(ByteOffset)) {
+      if (isDSOffsetLegal(SDValue(), ByteOffset)) {
          SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
  
          // XXX - This is kind of hacky. Create a dummy sub node so we can check
@@ -1196,7 +1195,7 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
          SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
                                        Zero, Addr.getOperand(1));
  
-        if (isDSOffsetLegal(Sub, ByteOffset, 16)) {
+        if (isDSOffsetLegal(Sub, ByteOffset)) {
            SmallVector<SDValue, 3> Opnds;
            Opnds.push_back(Zero);
            Opnds.push_back(Addr.getOperand(1));
@@ -1226,7 +1225,7 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
  
      SDLoc DL(Addr);
  
-    if (isUInt<16>(CAddr->getZExtValue())) {
+    if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
        MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                   DL, MVT::i32, Zero);
@@ -1242,46 +1241,63 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
    return true;
  }
  
+bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
+                                          unsigned Offset1,
+                                          unsigned Size) const {
+  if (Offset0 % Size != 0 || Offset1 % Size != 0)
+    return false;
+  if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
+    return false;
+
+  if (!Base || Subtarget->hasUsableDSOffset() ||
+      Subtarget->unsafeDSOffsetFoldingEnabled())
+    return true;
+
+  // On Southern Islands, instructions with a negative base value and an offset
+  // don't seem to work.
+  return CurDAG->SignBitIsZero(Base);
+}
+
  // TODO: If offset is too big, put low 16-bit into offset.
  bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                     SDValue &Offset0,
                                                     SDValue &Offset1) const {
-  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, false);
+  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
  }
  
  bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
                                                      SDValue &Offset0,
                                                      SDValue &Offset1) const {
-  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, true);
+  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
  }
  
  bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
                                              SDValue &Offset0, SDValue &Offset1,
-                                            bool IsDS128) const {
+                                            unsigned Size) const {
    SDLoc DL(Addr);
-  unsigned Align = IsDS128 ? 8 : 4;
  
    if (CurDAG->isBaseWithConstantOffset(Addr)) {
      SDValue N0 = Addr.getOperand(0);
      SDValue N1 = Addr.getOperand(1);
      ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
-    unsigned OffsetValue0 = C1->getZExtValue() / Align;
-    unsigned OffsetValue1 = OffsetValue0 + 1;
+    unsigned OffsetValue0 = C1->getZExtValue();
+    unsigned OffsetValue1 = OffsetValue0 + Size;
+
      // (add n0, c0)
-    if (isDSOffsetLegal(N0, OffsetValue1, 8)) {
+    if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
        Base = N0;
-      Offset0 = CurDAG->getTargetConstant(OffsetValue0, DL, MVT::i8);
-      Offset1 = CurDAG->getTargetConstant(OffsetValue1, DL, MVT::i8);
+      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
+      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
        return true;
      }
    } else if (Addr.getOpcode() == ISD::SUB) {
      // sub C, x -> add (sub 0, x), C
      if (const ConstantSDNode *C =
              dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
-      unsigned OffsetValue0 = C->getZExtValue() / Align;
-      unsigned OffsetValue1 = OffsetValue0 + 1;
+      unsigned OffsetValue0 = C->getZExtValue();
+      unsigned OffsetValue1 = OffsetValue0 + Size;
  
-      if (isUInt<8>(OffsetValue0)) {
+      if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
          SDLoc DL(Addr);
          SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
  
@@ -1291,7 +1307,7 @@ bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
          SDValue Sub =
              CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
  
-        if (isDSOffsetLegal(Sub, OffsetValue1, 8)) {
+        if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
            SmallVector<SDValue, 3> Opnds;
            Opnds.push_back(Zero);
            Opnds.push_back(Addr.getOperand(1));
@@ -1303,27 +1319,26 @@ bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
            }
  
            MachineSDNode *MachineSub = CurDAG->getMachineNode(
-              SubOp, DL, (IsDS128 ? MVT::i64 : MVT::i32), Opnds);
+              SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);
  
            Base = SDValue(MachineSub, 0);
-          Offset0 = CurDAG->getTargetConstant(OffsetValue0, DL, MVT::i8);
-          Offset1 = CurDAG->getTargetConstant(OffsetValue1, DL, MVT::i8);
+          Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
+          Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
            return true;
          }
        }
      }
    } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
-    unsigned OffsetValue0 = CAddr->getZExtValue() / Align;
-    unsigned OffsetValue1 = OffsetValue0 + 1;
-    bool OffsetIsAligned = Align * OffsetValue0 == CAddr->getZExtValue();
+    unsigned OffsetValue0 = CAddr->getZExtValue();
+    unsigned OffsetValue1 = OffsetValue0 + Size;
  
-    if (isUInt<8>(OffsetValue0) && isUInt<8>(OffsetValue1) && OffsetIsAligned) {
+    if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
        MachineSDNode *MovZero =
            CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
        Base = SDValue(MovZero, 0);
-      Offset0 = CurDAG->getTargetConstant(OffsetValue0, DL, MVT::i8);
-      Offset1 = CurDAG->getTargetConstant(OffsetValue1, DL, MVT::i8);
+      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
+      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
        return true;
      }
    }
@@ -2412,7 +2427,7 @@ void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
      SDValue PtrOffset = Ptr.getOperand(1);
  
      const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue();
-    if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue(), 16)) {
+    if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
        N = glueCopyToM0(N, PtrBase);
        Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
      }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 88863f4..1962c6d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1408,7 +1408,7 @@ bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
    std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
  
    // TODO: Should this try to look through readfirstlane like GWS?
-  if (!isDSOffsetLegal(PtrBase, Offset, 16)) {
+  if (!isDSOffsetLegal(PtrBase, Offset)) {
      PtrBase = MI.getOperand(2).getReg();
      Offset = 0;
    }
@@ -3636,10 +3636,24 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
  }
  
  bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
-                                                int64_t Offset,
-                                                unsigned OffsetBits) const {
-  if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
-      (OffsetBits == 8 && !isUInt<8>(Offset)))
+                                                int64_t Offset) const {
+  if (!isUInt<16>(Offset))
+    return false;
+
+  if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
+    return true;
+
+  // On Southern Islands, instructions with a negative base value and an offset
+  // don't seem to work.
+  return KnownBits->signBitIsZero(Base);
+}
+
+bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
+                                                 int64_t Offset1,
+                                                 unsigned Size) const {
+  if (Offset0 % Size != 0 || Offset1 % Size != 0)
+    return false;
+  if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
      return false;
  
    if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
@@ -3694,7 +3708,7 @@ AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const
      getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
  
    if (Offset) {
-    if (isDSOffsetLegal(PtrBase, Offset, 16)) {
+    if (isDSOffsetLegal(PtrBase, Offset)) {
        // (add n0, c0)
        return std::make_pair(PtrBase, Offset);
      }
@@ -3723,20 +3737,20 @@ AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
  
  InstructionSelector::ComplexRendererFns
  AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
-  return selectDSReadWrite2(Root, false);
+  return selectDSReadWrite2(Root, 4);
  }
  
  InstructionSelector::ComplexRendererFns
  AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
-  return selectDSReadWrite2(Root, true);
+  return selectDSReadWrite2(Root, 8);
  }
  
  InstructionSelector::ComplexRendererFns
  AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
-                                              bool IsDS128) const {
+                                              unsigned Size) const {
    Register Reg;
    unsigned Offset;
-  std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, IsDS128);
+  std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
    return {{
        [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
        [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
@@ -3746,7 +3760,7 @@ AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
  
  std::pair<Register, unsigned>
  AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
-                                                  bool IsDS128) const {
+                                                  unsigned Size) const {
    const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
    if (!RootDef)
      return std::make_pair(Root.getReg(), 0);
@@ -3759,11 +3773,11 @@ AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
      getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
  
    if (Offset) {
-    int64_t OffsetValue0 = Offset / (IsDS128 ? 8 : 4);
-    int64_t OffsetValue1 = OffsetValue0 + 1;
-    if (isDSOffsetLegal(PtrBase, OffsetValue1, (IsDS128 ? 16 : 8))) {
+    int64_t OffsetValue0 = Offset;
+    int64_t OffsetValue1 = Offset + Size;
+    if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
        // (add n0, c0)
-      return std::make_pair(PtrBase, OffsetValue0);
+      return std::make_pair(PtrBase, OffsetValue0 / Size);
      }
    } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
      // TODO
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 9a11c37..9b9a531 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -203,8 +203,9 @@ private:
    InstructionSelector::ComplexRendererFns
    selectMUBUFScratchOffset(MachineOperand &Root) const;
  
-  bool isDSOffsetLegal(Register Base, int64_t Offset,
-                       unsigned OffsetBits) const;
+  bool isDSOffsetLegal(Register Base, int64_t Offset) const;
+  bool isDSOffset2Legal(Register Base, int64_t Offset0, int64_t Offset1,
+                        unsigned Size) const;
  
    std::pair<Register, unsigned>
    selectDS1Addr1OffsetImpl(MachineOperand &Root) const;
@@ -217,10 +218,10 @@ private:
    InstructionSelector::ComplexRendererFns
    selectDS128Bit8ByteAligned(MachineOperand &Root) const;
  
-  std::pair<Register, unsigned>
-  selectDSReadWrite2Impl(MachineOperand &Root, bool IsDS128) const;
+  std::pair<Register, unsigned> selectDSReadWrite2Impl(MachineOperand &Root,
+                                                       unsigned size) const;
    InstructionSelector::ComplexRendererFns
-  selectDSReadWrite2(MachineOperand &Root, bool IsDS128) const;
+  selectDSReadWrite2(MachineOperand &Root, unsigned size) const;
  
    std::pair<Register, int64_t>
    getPtrBaseWithConstantOffset(Register Root,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir
index a844c39..e7c646e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir
@@ -102,8 +102,10 @@ body: |
      ; GFX7-LABEL: name: load_local_v4s32_align_8_offset_320
      ; GFX7: liveins: $vgpr0
      ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4000, implicit $exec
+    ; GFX7: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
      ; GFX7: $m0 = S_MOV_B32 -1
-    ; GFX7: [[DS_READ2_B64_:%[0-9]+]]:vreg_128 = DS_READ2_B64 [[COPY]], 500, 501, 0, implicit $m0, implicit $exec :: (load 16, align 8, addrspace 3)
+    ; GFX7: [[DS_READ2_B64_:%[0-9]+]]:vreg_128 = DS_READ2_B64 %2, 0, 1, 0, implicit $m0, implicit $exec :: (load 16, align 8, addrspace 3)
      ; GFX7: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_]]
      ; GFX9-LABEL: name: load_local_v4s32_align_8_offset_320
      ; GFX9: liveins: $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
index b79359c..11765bf 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
@@ -194,20 +194,20 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 {
  ; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
  ; CI:       ; %bb.0:
  ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0x3fb, v0
  ; CI-NEXT:    v_mov_b32_e32 v1, 0x7b
  ; CI-NEXT:    v_mov_b32_e32 v2, 0
  ; CI-NEXT:    s_mov_b32 m0, -1
-; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset0:254 offset1:255
+; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
  ; CI-NEXT:    s_endpgm
  ;
  ; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
  ; GFX9:       ; %bb.0:
  ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
+; GFX9-NEXT:    v_sub_u32_e32 v0, 0x3fb, v0
  ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7b
  ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset0:254 offset1:255
+; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
  ; GFX9-NEXT:    s_endpgm
    %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
    %neg = sub i32 0, %x.i
@@ -223,7 +223,7 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_
  ; CI:       ; %bb.0:
  ; CI-NEXT:    s_load_dword s0, s[0:1], 0x9
  ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0x3fb, v0
  ; CI-NEXT:    s_mov_b64 vcc, 0
  ; CI-NEXT:    v_mov_b32_e32 v2, 0x7b
  ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -235,7 +235,7 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_
  ; CI-NEXT:    s_mov_b32 s3, 0xf000
  ; CI-NEXT:    s_mov_b32 s2, -1
  ; CI-NEXT:    s_mov_b32 s1, s0
-; CI-NEXT:    ds_write2_b32 v0, v2, v3 offset0:254 offset1:255
+; CI-NEXT:    ds_write2_b32 v0, v2, v3 offset1:1
  ; CI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
  ; CI-NEXT:    s_endpgm
  ;
@@ -244,13 +244,13 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_
  ; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
  ; GFX9-NEXT:    s_mov_b64 vcc, 0
  ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
+; GFX9-NEXT:    v_sub_u32_e32 v0, 0x3fb, v0
  ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
  ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
  ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
  ; GFX9-NEXT:    v_div_fmas_f32 v2, v1, v1, v1
  ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX9-NEXT:    ds_write2_b32 v0, v1, v3 offset0:254 offset1:255
+; GFX9-NEXT:    ds_write2_b32 v0, v1, v3 offset1:1
  ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
  ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
  ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index 15e97e3..9f72a34 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -693,8 +693,8 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(float addrspace(1)* %out,
  ; GFX9-UNALIGNED-NEXT:    s_load_dword s0, s[0:1], 0x2c
  ; GFX9-UNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
  ; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-UNALIGNED-NEXT:    v_add_u32_e32 v0, s0, v2
-; GFX9-UNALIGNED-NEXT:    ds_read2_b32 v[0:1], v0 offset0:1 offset1:2
+; GFX9-UNALIGNED-NEXT:    v_add3_u32 v0, s0, v2, 5
+; GFX9-UNALIGNED-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
  ; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
  ; GFX9-UNALIGNED-NEXT:    v_add_f32_e32 v0, v0, v1
  ; GFX9-UNALIGNED-NEXT:    global_store_dword v2, v0, s[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
index 8346ce9..9c71fa1 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -701,9 +701,11 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(double add
  ; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
  ; GFX9-UNALIGNED-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
  ; GFX9-UNALIGNED-NEXT:    v_add_u32_e32 v2, s0, v2
+; GFX9-UNALIGNED-NEXT:    v_add_u32_e32 v3, 5, v2
+; GFX9-UNALIGNED-NEXT:    v_add_u32_e32 v2, 9, v2
  ; GFX9-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-UNALIGNED-NEXT:    ds_write2_b32 v2, v0, v1 offset0:1 offset1:2
-; GFX9-UNALIGNED-NEXT:    ds_write2_b32 v2, v0, v1 offset0:2 offset1:3
+; GFX9-UNALIGNED-NEXT:    ds_write2_b32 v3, v0, v1 offset1:1
+; GFX9-UNALIGNED-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
  ; GFX9-UNALIGNED-NEXT:    s_endpgm
    %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
    %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
author	Jay Foad <jay.foad@amd.com>
	Mon, 2 Nov 2020 13:05:15 +0000 (13:05 +0000)
committer	Jay Foad <jay.foad@amd.com>
	Tue, 3 Nov 2020 15:16:10 +0000 (15:16 +0000)
Files changed in this commit:
- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir
- llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
- llvm/test/CodeGen/AMDGPU/ds_read2.ll
- llvm/test/CodeGen/AMDGPU/ds_write2.ll