[DAGCombiner] [AMDGPU] Allow vector loads in MatchLoadCombine

author jeff <Jeffrey.Byrnes@amd.com>

Tue, 6 Sep 2022 19:54:53 +0000 (19:54 +0000)

committer Jeffrey Byrnes <Jeffrey.Byrnes@amd.com>

Tue, 4 Oct 2022 19:16:00 +0000 (12:16 -0700)
author jeff <Jeffrey.Byrnes@amd.com>
Tue, 6 Sep 2022 19:54:53 +0000 (19:54 +0000)
committer Jeffrey Byrnes <Jeffrey.Byrnes@amd.com>
Tue, 4 Oct 2022 19:16:00 +0000 (12:16 -0700)
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h b/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h

index e23eebe..7346d23 100644 (file)
--- a/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
@@ -49,6 +49,9 @@ public:
    SDValue getBase() const { return Base; }
    SDValue getIndex() { return Index; }
    SDValue getIndex() const { return Index; }
+  void addToOffset(int64_t VectorOff) {
+    Offset = Offset.value_or(0) + VectorOff; // An unset Offset counts as 0.
+  }
    bool hasValidOffset() const { return Offset.has_value(); }
    int64_t getOffset() const { return *Offset; }
  
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

index e5c267e..51a4bd1 100644 (file)
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7790,25 +7790,28 @@ struct ByteProvider {
    // ByteOffset is the offset of the byte in the value produced by the load.
    LoadSDNode *Load = nullptr;
    unsigned ByteOffset = 0;
+  unsigned VectorOffset = 0;
  
    ByteProvider() = default;
  
-  static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset) {
-    return ByteProvider(Load, ByteOffset);
+  static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset,
+                                unsigned VectorOffset) {
+    return ByteProvider(Load, ByteOffset, VectorOffset);
    }
  
-  static ByteProvider getConstantZero() { return ByteProvider(nullptr, 0); }
+  static ByteProvider getConstantZero() { return ByteProvider(nullptr, 0, 0); }
  
    bool isConstantZero() const { return !Load; }
    bool isMemory() const { return Load; }
  
    bool operator==(const ByteProvider &Other) const {
-    return Other.Load == Load && Other.ByteOffset == ByteOffset;
+    return Other.Load == Load && Other.ByteOffset == ByteOffset &&
+           Other.VectorOffset == VectorOffset;
    }
  
  private:
-  ByteProvider(LoadSDNode *Load, unsigned ByteOffset)
-      : Load(Load), ByteOffset(ByteOffset) {}
+  ByteProvider(LoadSDNode *Load, unsigned ByteOffset, unsigned VectorOffset)
+      : Load(Load), ByteOffset(ByteOffset), VectorOffset(VectorOffset) {}
  };
  
  } // end anonymous namespace
@@ -7816,25 +7819,63 @@ private:
  /// Recursively traverses the expression calculating the origin of the requested
  /// byte of the given value. Returns None if the provider can't be calculated.
  ///
-/// For all the values except the root of the expression verifies that the value
-/// has exactly one use and if it's not true return None. This way if the origin
-/// of the byte is returned it's guaranteed that the values which contribute to
-/// the byte are not used outside of this expression.
+/// For all the values except the root of the expression, we verify that the
+/// value has exactly one use and if not then return None. This way if the
+/// origin of the byte is returned it's guaranteed that the values which
+/// contribute to the byte are not used outside of this expression.
+///
+/// However, there is a special case when dealing with vector loads -- we allow
+/// more than one use if the load is a vector type.  Since the values that
+/// contribute to the byte ultimately come from the ExtractVectorElements of the
+/// Load, we don't care if the Load has uses other than ExtractVectorElements,
+/// because those operations are independent from the pattern to be combined.
+/// For vector loads, we simply care that the ByteProviders are adjacent
+/// positions of the same vector, and their index matches the byte that is being
+/// provided. This is captured by the \p VectorIndex algorithm. \p VectorIndex
+/// is the index used in an ExtractVectorElement, and \p StartingIndex is the
+/// byte position we are trying to provide for the LoadCombine. If these do
+/// not match, then we can not combine the vector loads. \p Index uses the
+/// byte position we are trying to provide for and is matched against the
+/// shl and load size. The \p Index algorithm ensures the requested byte is
+/// provided for by the pattern, and the pattern does not over provide bytes.
  ///
-/// Because the parts of the expression are not allowed to have more than one
-/// use this function iterates over trees, not DAGs. So it never visits the same
-/// node more than once.
+///
+/// The supported LoadCombine pattern for vector loads is as follows
+///                              or
+///                          /        \
+///                         or        shl
+///                       /     \      |
+///                     or      shl   zext
+///                   /    \     |     |
+///                 shl   zext  zext  EVE*
+///                  |     |     |     |
+///                 zext  EVE*  EVE*  LOAD
+///                  |     |     |
+///                 EVE*  LOAD  LOAD
+///                  |
+///                 LOAD
+///
+/// *ExtractVectorElement
  static const Optional<ByteProvider>
  calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth,
-                      bool Root = false) {
+                      Optional<uint64_t> VectorIndex,
+                      unsigned StartingIndex = 0) {
+
    // Typical i64 by i8 pattern requires recursion up to 8 calls depth
    if (Depth == 10)
      return None;
  
-  if (!Root && !Op.hasOneUse())
+  // Only allow multiple uses if the instruction is a vector load (in which
+  // case we will use the load for every ExtractVectorElement)
+  if (Depth && !Op.hasOneUse() &&
+      (Op.getOpcode() != ISD::LOAD || !Op.getValueType().isVector()))
+    return None;
+
+  // Fail to combine if we have encountered anything but a LOAD after handling
+  // an ExtractVectorElement.
+  if (Op.getOpcode() != ISD::LOAD && VectorIndex.has_value())
      return None;
  
-  assert(Op.getValueType().isScalarInteger() && "can't handle other types");
    unsigned BitWidth = Op.getValueSizeInBits();
    if (BitWidth % 8 != 0)
      return None;
@@ -7844,10 +7885,12 @@ calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth,
  
    switch (Op.getOpcode()) {
    case ISD::OR: {
-    auto LHS = calculateByteProvider(Op->getOperand(0), Index, Depth + 1);
+    auto LHS =
+        calculateByteProvider(Op->getOperand(0), Index, Depth + 1, VectorIndex);
      if (!LHS)
        return None;
-    auto RHS = calculateByteProvider(Op->getOperand(1), Index, Depth + 1);
+    auto RHS =
+        calculateByteProvider(Op->getOperand(1), Index, Depth + 1, VectorIndex);
      if (!RHS)
        return None;
  
@@ -7863,14 +7906,18 @@ calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth,
        return None;
  
      uint64_t BitShift = ShiftOp->getZExtValue();
+
      if (BitShift % 8 != 0)
        return None;
      uint64_t ByteShift = BitShift / 8;
  
+    // If we are shifting by an amount greater than the index we are trying to
+    // provide, then do not provide anything. Otherwise, reduce the index by
+    // the amount we shifted by.
      return Index < ByteShift
                 ? ByteProvider::getConstantZero()
                 : calculateByteProvider(Op->getOperand(0), Index - ByteShift,
-                                       Depth + 1);
+                                       Depth + 1, VectorIndex, Index);
    }
    case ISD::ANY_EXTEND:
    case ISD::SIGN_EXTEND:
@@ -7885,11 +7932,39 @@ calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth,
        return Op.getOpcode() == ISD::ZERO_EXTEND
                   ? Optional<ByteProvider>(ByteProvider::getConstantZero())
                   : None;
-    return calculateByteProvider(NarrowOp, Index, Depth + 1);
+    return calculateByteProvider(NarrowOp, Index, Depth + 1, VectorIndex,
+                                 StartingIndex);
    }
    case ISD::BSWAP:
      return calculateByteProvider(Op->getOperand(0), ByteWidth - Index - 1,
-                                 Depth + 1);
+                                 Depth + 1, VectorIndex, StartingIndex);
+  case ISD::EXTRACT_VECTOR_ELT: {
+    auto OffsetOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+    if (!OffsetOp)
+      return None;
+
+    VectorIndex = OffsetOp->getZExtValue();
+
+    SDValue NarrowOp = Op->getOperand(0);
+    unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits();
+    if (NarrowBitWidth % 8 != 0)
+      return None;
+    uint64_t NarrowByteWidth = NarrowBitWidth / 8;
+
+    // Check to see if the position of the element in the vector corresponds
+    // with the byte we are trying to provide for. In the case of a vector of
+    // i8, this simply means the VectorIndex == StartingIndex. For non i8 cases,
+    // the element will provide a range of bytes. For example, if we have a
+    // vector of i16s, each element provides two bytes (V[1] provides byte 2 and
+    // 3).
+    if (VectorIndex.value() * NarrowByteWidth > StartingIndex)
+      return None;
+    if ((VectorIndex.value() + 1) * NarrowByteWidth <= StartingIndex)
+      return None;
+
+    return calculateByteProvider(Op->getOperand(0), Index, Depth + 1,
+                                 VectorIndex, StartingIndex);
+  }
    case ISD::LOAD: {
      auto L = cast<LoadSDNode>(Op.getNode());
      if (!L->isSimple() || L->isIndexed())
@@ -7900,11 +7975,16 @@ calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth,
        return None;
      uint64_t NarrowByteWidth = NarrowBitWidth / 8;
  
+    // If the width of the load does not reach the byte we are trying to provide
+    // for and it is not a ZEXTLOAD, then the load does not provide for the byte
+    // in question.
      if (Index >= NarrowByteWidth)
        return L->getExtensionType() == ISD::ZEXTLOAD
                   ? Optional<ByteProvider>(ByteProvider::getConstantZero())
                   : None;
-    return ByteProvider::getMemory(L, Index);
+
+    unsigned BPVectorIndex = VectorIndex.value_or(0U);
+    return ByteProvider::getMemory(L, Index, BPVectorIndex);
    }
    }
  
@@ -8196,7 +8276,8 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
    bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian();
    auto MemoryByteOffset = [&] (ByteProvider P) {
      assert(P.isMemory() && "Must be a memory byte provider");
-    unsigned LoadBitWidth = P.Load->getMemoryVT().getSizeInBits();
+    unsigned LoadBitWidth = P.Load->getMemoryVT().getScalarSizeInBits();
+
      assert(LoadBitWidth % 8 == 0 &&
             "can only analyze providers for individual bytes not bit");
      unsigned LoadByteWidth = LoadBitWidth / 8;
@@ -8217,7 +8298,8 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
    SmallVector<int64_t, 8> ByteOffsets(ByteWidth);
    unsigned ZeroExtendedBytes = 0;
    for (int i = ByteWidth - 1; i >= 0; --i) {
-    auto P = calculateByteProvider(SDValue(N, 0), i, 0, /*Root=*/true);
+    auto P = calculateByteProvider(SDValue(N, 0), i, 0, /*VectorIndex*/ None,
+                                   /*StartingIndex*/ i);
      if (!P)
        return SDValue();
  
@@ -8231,10 +8313,6 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
      assert(P->isMemory() && "provenance should either be memory or zero");
  
      LoadSDNode *L = P->Load;
-    assert(L->hasNUsesOfValue(1, 0) && L->isSimple() &&
-           !L->isIndexed() &&
-           "Must be enforced by calculateByteProvider");
-    assert(L->getOffset().isUndef() && "Unindexed load must have undef offset");
  
      // All loads must share the same chain
      SDValue LChain = L->getChain();
@@ -8246,8 +8324,25 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
      // Loads must share the same base address
      BaseIndexOffset Ptr = BaseIndexOffset::match(L, DAG);
      int64_t ByteOffsetFromBase = 0;
+
+    // For vector loads, the expected load combine pattern will have an
+    // ExtractElement for each index in the vector. While each of these
+    // ExtractElements will be accessing the same base address as determined
+    // by the load instruction, the actual bytes they interact with will differ
+    // due to different ExtractElement indices. To accurately determine the
+    // byte position of an ExtractElement, we offset the base load ptr with
+    // the index multiplied by the byte size of each element in the vector.
+    if (L->getMemoryVT().isVector()) {
+      unsigned LoadWidthInBit = L->getMemoryVT().getScalarSizeInBits();
+      if (LoadWidthInBit % 8 != 0)
+        return SDValue();
+      unsigned ByteOffsetFromVector = P->VectorOffset * LoadWidthInBit / 8;
+      Ptr.addToOffset(ByteOffsetFromVector);
+    }
+
      if (!Base)
        Base = Ptr;
+
      else if (!Base->equalBaseIndex(Ptr, DAG, ByteOffsetFromBase))
        return SDValue();
  
@@ -8263,6 +8358,7 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
  
      Loads.insert(L);
    }
+
    assert(!Loads.empty() && "All the bytes of the value must be loaded from "
           "memory, so there must be at least one load which produces the value");
    assert(Base && "Base address of the accessed memory location must be set");
diff --git a/llvm/test/CodeGen/AArch64/load-combine.ll b/llvm/test/CodeGen/AArch64/load-combine.ll

index 32d6769..293967b 100644 (file)
--- a/llvm/test/CodeGen/AArch64/load-combine.ll
+++ b/llvm/test/CodeGen/AArch64/load-combine.ll
@@ -562,18 +562,11 @@ define i32 @zext_load_i32_by_i8_bswap_shl_16(i32* %arg) {
    ret i32 %tmp8
  }
  
+; x1 = x0
  define void @short_vector_to_i32(<4 x i8>* %in, i32* %out, i32* %p) {
  ; CHECK-LABEL: short_vector_to_i32:
  ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr s0, [x0]
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    umov w8, v0.h[0]
-; CHECK-NEXT:    umov w9, v0.h[1]
-; CHECK-NEXT:    umov w10, v0.h[2]
-; CHECK-NEXT:    umov w11, v0.h[3]
-; CHECK-NEXT:    bfi w8, w9, #8, #8
-; CHECK-NEXT:    bfi w8, w10, #16, #8
-; CHECK-NEXT:    bfi w8, w11, #24, #8
+; CHECK-NEXT:    ldr w8, [x0]
  ; CHECK-NEXT:    str w8, [x1]
  ; CHECK-NEXT:    ret
    %ld = load <4 x i8>, <4 x i8>* %in, align 4
@@ -638,13 +631,11 @@ define void @short_vector_to_i32_unused_high_i8(<4 x i8>* %in, i32* %out, i32* %
  ; CHECK-LABEL: short_vector_to_i32_unused_high_i8:
  ; CHECK:       // %bb.0:
  ; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    ldrh w9, [x0]
  ; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    umov w8, v0.h[0]
-; CHECK-NEXT:    umov w9, v0.h[1]
-; CHECK-NEXT:    umov w10, v0.h[2]
-; CHECK-NEXT:    bfi w8, w9, #8, #8
-; CHECK-NEXT:    bfi w8, w10, #16, #8
-; CHECK-NEXT:    str w8, [x1]
+; CHECK-NEXT:    umov w8, v0.h[2]
+; CHECK-NEXT:    bfi w9, w8, #16, #8
+; CHECK-NEXT:    str w9, [x1]
  ; CHECK-NEXT:    ret
    %ld = load <4 x i8>, <4 x i8>* %in, align 4
  
@@ -694,14 +685,11 @@ define void @short_vector_to_i32_unused_low_i16(<4 x i8>* %in, i32* %out, i32* %
    ret void
  }
  
+; x1 = x0[0:1]
  define void @short_vector_to_i32_unused_high_i16(<4 x i8>* %in, i32* %out, i32* %p) {
  ; CHECK-LABEL: short_vector_to_i32_unused_high_i16:
  ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr s0, [x0]
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    umov w8, v0.h[0]
-; CHECK-NEXT:    umov w9, v0.h[1]
-; CHECK-NEXT:    bfi w8, w9, #8, #8
+; CHECK-NEXT:    ldrh w8, [x0]
  ; CHECK-NEXT:    str w8, [x1]
  ; CHECK-NEXT:    ret
    %ld = load <4 x i8>, <4 x i8>* %in, align 4
@@ -720,18 +708,11 @@ define void @short_vector_to_i32_unused_high_i16(<4 x i8>* %in, i32* %out, i32*
    ret void
  }
  
+; x1 = x0
  define void @short_vector_to_i64(<4 x i8>* %in, i64* %out, i64* %p) {
  ; CHECK-LABEL: short_vector_to_i64:
  ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr s0, [x0]
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    umov w8, v0.h[0]
-; CHECK-NEXT:    umov w9, v0.h[1]
-; CHECK-NEXT:    umov w10, v0.h[2]
-; CHECK-NEXT:    umov w11, v0.h[3]
-; CHECK-NEXT:    bfi x8, x9, #8, #8
-; CHECK-NEXT:    bfi x8, x10, #16, #8
-; CHECK-NEXT:    bfi x8, x11, #24, #8
+; CHECK-NEXT:    ldr w8, [x0]
  ; CHECK-NEXT:    str x8, [x1]
  ; CHECK-NEXT:    ret
    %ld = load <4 x i8>, <4 x i8>* %in, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll

index d4708ae..a344634 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
@@ -9,15 +9,9 @@ define amdgpu_kernel void @vectorLoadCombine(<4 x i8>* %in, i32* %out) {
  ; GCN-NEXT:    v_mov_b32_e32 v0, s0
  ; GCN-NEXT:    v_mov_b32_e32 v1, s1
  ; GCN-NEXT:    flat_load_dword v2, v[0:1]
-; GCN-NEXT:    s_mov_b32 s0, 0x6050400
  ; GCN-NEXT:    v_mov_b32_e32 v0, s2
  ; GCN-NEXT:    v_mov_b32_e32 v1, s3
  ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_bfe_u32 v3, v2, 8, 8
-; GCN-NEXT:    v_and_b32_e32 v4, 0xff0000, v2
-; GCN-NEXT:    v_perm_b32 v3, v3, v2, s0
-; GCN-NEXT:    v_and_b32_e32 v2, 0xff000000, v2
-; GCN-NEXT:    v_or3_b32 v2, v3, v4, v2
  ; GCN-NEXT:    flat_store_dword v[0:1], v2
  ; GCN-NEXT:    s_endpgm
  entry:
@@ -84,10 +78,7 @@ define i32 @load_2xi16_combine(i16 addrspace(1)* %p) #0 {
  ; GCN:       ; %bb.0:
  ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
  ; GCN-NEXT:    global_load_dword v0, v[0:1], off
-; GCN-NEXT:    s_mov_b32 s4, 0xffff
  ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GCN-NEXT:    v_and_or_b32 v0, v0, s4, v1
  ; GCN-NEXT:    s_setpc_b64 s[30:31]
    %gep.p = getelementptr i16, i16 addrspace(1)* %p, i32 1
    %p.0 = load i16, i16 addrspace(1)* %p, align 4
@@ -162,8 +153,6 @@ define i64 @load_4xi16_combine(i16 addrspace(1)* %p) #0 {
  ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
  ; GCN-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
  ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GCN-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
  ; GCN-NEXT:    s_setpc_b64 s[30:31]
    %gep.p = getelementptr i16, i16 addrspace(1)* %p, i32 1
    %gep.2p = getelementptr i16, i16 addrspace(1)* %p, i32 2
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll

index eec983d..a28baa2 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
@@ -184,10 +184,7 @@ define i32 @global_load_2xi16_align1(i16 addrspace(1)* %p) #0 {
  ; GFX9:       ; %bb.0:
  ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
  ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
  ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
  ; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ;
  ; GFX10-LABEL: global_load_2xi16_align1:
@@ -196,8 +193,6 @@ define i32 @global_load_2xi16_align1(i16 addrspace(1)* %p) #0 {
  ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
  ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
  ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
  ; GFX10-NEXT:    s_setpc_b64 s[30:31]
  ;
  ; GFX11-LABEL: global_load_2xi16_align1:
@@ -206,9 +201,6 @@ define i32 @global_load_2xi16_align1(i16 addrspace(1)* %p) #0 {
  ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
  ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
  ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
  ; GFX11-NEXT:    s_setpc_b64 s[30:31]
    %gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1
    %p.0 = load i16, i16 addrspace(1)* %p, align 1
@@ -293,13 +285,6 @@ define amdgpu_kernel void @global_store_2xi16_align1(i16 addrspace(1)* %p, i16 a
  
  ; Should merge this to a dword load
  define i32 @global_load_2xi16_align4(i16 addrspace(1)* %p) #0 {
-; GFX7-LABEL: load_2xi16_align4:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    flat_load_dword v0, v[0:1]
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
  ; GFX7-ALIGNED-LABEL: global_load_2xi16_align4:
  ; GFX7-ALIGNED:       ; %bb.0:
  ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -318,10 +303,7 @@ define i32 @global_load_2xi16_align4(i16 addrspace(1)* %p) #0 {
  ; GFX9:       ; %bb.0:
  ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
  ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
  ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
  ; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ;
  ; GFX10-LABEL: global_load_2xi16_align4:
@@ -330,8 +312,6 @@ define i32 @global_load_2xi16_align4(i16 addrspace(1)* %p) #0 {
  ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
  ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
  ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
  ; GFX10-NEXT:    s_setpc_b64 s[30:31]
  ;
  ; GFX11-LABEL: global_load_2xi16_align4:
@@ -340,9 +320,6 @@ define i32 @global_load_2xi16_align4(i16 addrspace(1)* %p) #0 {
  ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
  ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
  ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
  ; GFX11-NEXT:    s_setpc_b64 s[30:31]
    %gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1
    %p.0 = load i16, i16 addrspace(1)* %p, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll

index 0e2fae7..8b6e1a6 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
@@ -387,13 +387,6 @@ define void @private_store_2xi16_align1(i16 addrspace(5)* %p, i16 addrspace(5)*
  
  ; Should merge this to a dword load
  define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 {
-; GFX7-LABEL: load_2xi16_align4:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    flat_load_dword v0, v[0:1]
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
  ; GFX7-ALIGNED-LABEL: private_load_2xi16_align4:
  ; GFX7-ALIGNED:       ; %bb.0:
  ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -412,20 +405,14 @@ define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 {
  ; GFX9:       ; %bb.0:
  ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
  ; GFX9-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
  ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
  ; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ;
  ; GFX9-FLASTSCR-LABEL: private_load_2xi16_align4:
  ; GFX9-FLASTSCR:       ; %bb.0:
  ; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
  ; GFX9-FLASTSCR-NEXT:    scratch_load_dword v0, v0, off
-; GFX9-FLASTSCR-NEXT:    s_mov_b32 s0, 0xffff
  ; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLASTSCR-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX9-FLASTSCR-NEXT:    v_and_or_b32 v0, v0, s0, v1
  ; GFX9-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
  ;
  ; GFX10-LABEL: private_load_2xi16_align4:
@@ -434,8 +421,6 @@ define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 {
  ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
  ; GFX10-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
  ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
  ; GFX10-NEXT:    s_setpc_b64 s[30:31]
  ;
  ; GFX10-FLASTSCR-LABEL: private_load_2xi16_align4:
@@ -444,8 +429,6 @@ define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 {
  ; GFX10-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
  ; GFX10-FLASTSCR-NEXT:    scratch_load_dword v0, v0, off
  ; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLASTSCR-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX10-FLASTSCR-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
  ; GFX10-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
  ;
  ; GFX11-LABEL: private_load_2xi16_align4:
@@ -454,9 +437,6 @@ define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 {
  ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
  ; GFX11-NEXT:    scratch_load_b32 v0, v0, off
  ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
  ; GFX11-NEXT:    s_setpc_b64 s[30:31]
  ;
  ; GFX11-FLASTSCR-LABEL: private_load_2xi16_align4:
@@ -465,9 +445,6 @@ define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 {
  ; GFX11-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
  ; GFX11-FLASTSCR-NEXT:    scratch_load_b32 v0, v0, off
  ; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLASTSCR-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX11-FLASTSCR-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLASTSCR-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
  ; GFX11-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
    %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
    %p.0 = load i16, i16 addrspace(5)* %p, align 4
author	jeff <Jeffrey.Byrnes@amd.com>
	Tue, 6 Sep 2022 19:54:53 +0000 (19:54 +0000)
committer	Jeffrey Byrnes <Jeffrey.Byrnes@amd.com>
	Tue, 4 Oct 2022 19:16:00 +0000 (12:16 -0700)
llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h		patch \| blob \| history
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp		patch \| blob \| history
llvm/test/CodeGen/AArch64/load-combine.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll		patch \| blob \| history