From cebec4208982dccb70e724e38fca72823652ec76 Mon Sep 17 00:00:00 2001 From: jeff Date: Tue, 6 Sep 2022 19:54:53 +0000 Subject: [PATCH] [DAGCombiner] [AMDGPU] Allow vector loads in MatchLoadCombine Since SROA chooses promotion based on reaching load / stores of allocas, we may run into scenarios in which we alloca a vector, but promote it to an integer. The result of which is the familiar LoadCombine pattern (i.e. ZEXT, SHL, OR). However, instead of coming directly from distinct loads, the elements to be combined are coming from ExtractVectorElements which stem from a shared load. This patch identifies such a pattern and combines it into a load. Change-Id: I0bc06588f11e88a0a975cde1fd71e9143e6c42dd --- .../llvm/CodeGen/SelectionDAGAddressAnalysis.h | 3 + llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 152 +++++++++++++++++---- llvm/test/CodeGen/AArch64/load-combine.ll | 39 ++---- llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll | 11 -- .../AMDGPU/fast-unaligned-load-store.global.ll | 23 ---- .../AMDGPU/fast-unaligned-load-store.private.ll | 23 ---- 6 files changed, 137 insertions(+), 114 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h b/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h index e23eebe..7346d23 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h @@ -49,6 +49,9 @@ public: SDValue getBase() const { return Base; } SDValue getIndex() { return Index; } SDValue getIndex() const { return Index; } + void addToOffset(int64_t VectorOff) { + Offset = Offset.value_or(0) + VectorOff; + } bool hasValidOffset() const { return Offset.has_value(); } int64_t getOffset() const { return *Offset; } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index e5c267e..51a4bd1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -7790,25 +7790,28 @@ struct ByteProvider { // ByteOffset is the offset of the byte in the value produced by the load. LoadSDNode *Load = nullptr; unsigned ByteOffset = 0; + unsigned VectorOffset = 0; ByteProvider() = default; - static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset) { - return ByteProvider(Load, ByteOffset); + static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset, + unsigned VectorOffset) { + return ByteProvider(Load, ByteOffset, VectorOffset); } - static ByteProvider getConstantZero() { return ByteProvider(nullptr, 0); } + static ByteProvider getConstantZero() { return ByteProvider(nullptr, 0, 0); } bool isConstantZero() const { return !Load; } bool isMemory() const { return Load; } bool operator==(const ByteProvider &Other) const { - return Other.Load == Load && Other.ByteOffset == ByteOffset; + return Other.Load == Load && Other.ByteOffset == ByteOffset && + Other.VectorOffset == VectorOffset; } private: - ByteProvider(LoadSDNode *Load, unsigned ByteOffset) - : Load(Load), ByteOffset(ByteOffset) {} + ByteProvider(LoadSDNode *Load, unsigned ByteOffset, unsigned VectorOffset) + : Load(Load), ByteOffset(ByteOffset), VectorOffset(VectorOffset) {} }; } // end anonymous namespace @@ -7816,25 +7819,63 @@ private: /// Recursively traverses the expression calculating the origin of the requested /// byte of the given value. Returns None if the provider can't be calculated. 
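As a host-level analogy (illustrative C++, not LLVM code; the helper name assembleBytes is invented for this sketch, and a 32-bit unsigned is assumed), the or/shl/zext chain that this combine matches assembles an integer byte by byte. On a little-endian target the result equals a single wide load of the same address, which is the equivalence calculateByteProvider establishes one byte at a time:

#include <cstdio>
#include <cstring>

// Byte i of the result is provided by element i of a 4 x i8 vector in memory,
// so the whole or/shl/zext chain is equivalent to one 32-bit load.
static unsigned assembleBytes(const unsigned char v[4]) {
  return (unsigned)v[0] | ((unsigned)v[1] << 8) | ((unsigned)v[2] << 16) |
         ((unsigned)v[3] << 24);
}

int main() {
  unsigned char buf[4] = {0x44, 0x33, 0x22, 0x11};
  unsigned direct;
  std::memcpy(&direct, buf, sizeof(direct)); // the single load the combine forms
  std::printf("%d\n", assembleBytes(buf) == direct); // 1 on little-endian hosts
  return 0;
}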
/// -/// For all the values except the root of the expression verifies that the value -/// has exactly one use and if it's not true return None. This way if the origin -/// of the byte is returned it's guaranteed that the values which contribute to -/// the byte are not used outside of this expression. +/// For all the values except the root of the expression, we verify that the +/// value has exactly one use and if not then return None. This way if the +/// origin of the byte is returned it's guaranteed that the values which +/// contribute to the byte are not used outside of this expression. + +/// However, there is a special case when dealing with vector loads -- we allow +/// more than one use if the load is a vector type. Since the values that +/// contribute to the byte ultimately come from the ExtractVectorElements of the +/// Load, we don't care if the Load has uses other than ExtractVectorElements, +/// because those operations are independent from the pattern to be combined. +/// For vector loads, we simply care that the ByteProviders are adjacent +/// positions of the same vector, and their index matches the byte that is being +/// provided. This is captured by the \p VectorIndex algorithm. \p VectorIndex +/// is the index used in an ExtractVectorElement, and \p StartingIndex is the +/// byte position we are trying to provide for the LoadCombine. If these do +/// not match, then we can not combine the vector loads. \p Index uses the +/// byte position we are trying to provide for and is matched against the +/// shl and load size. The \p Index algorithm ensures the requested byte is +/// provided for by the pattern, and the pattern does not over provide bytes. /// -/// Because the parts of the expression are not allowed to have more than one -/// use this function iterates over trees, not DAGs. So it never visits the same -/// node more than once. +/// +/// The supported LoadCombine pattern for vector loads is as follows +/// or +/// / \ +/// or shl +/// / \ | +/// or shl zext +/// / \ | | +/// shl zext zext EVE* +/// | | | | +/// zext EVE* EVE* LOAD +/// | | | +/// EVE* LOAD LOAD +/// | +/// LOAD +/// +/// *ExtractVectorElement static const Optional calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, - bool Root = false) { + Optional VectorIndex, + unsigned StartingIndex = 0) { + // Typical i64 by i8 pattern requires recursion up to 8 calls depth if (Depth == 10) return None; - if (!Root && !Op.hasOneUse()) + // Only allow multiple uses if the instruction is a vector load (in which + // case we will use the load for every ExtractVectorElement) + if (Depth && !Op.hasOneUse() && + (Op.getOpcode() != ISD::LOAD || !Op.getValueType().isVector())) + return None; + + // Fail to combine if we have encountered anything but a LOAD after handling + // an ExtractVectorElement. 
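The VectorIndex / StartingIndex matching described above reduces to one range check: the requested byte must fall inside the byte span covered by the extracted element. A standalone sketch of that arithmetic (elementCoversByte is an invented name for illustration; the in-tree check is the pair of comparisons in the EXTRACT_VECTOR_ELT case below):

#include <cstdio>

// Does element VectorIndex of a vector whose elements are NarrowByteWidth
// bytes wide cover the byte at position StartingIndex of the combined value?
static bool elementCoversByte(unsigned VectorIndex, unsigned NarrowByteWidth,
                              unsigned StartingIndex) {
  return VectorIndex * NarrowByteWidth <= StartingIndex &&
         StartingIndex < (VectorIndex + 1) * NarrowByteWidth;
}

int main() {
  // 4 x i8: element i provides exactly byte i.
  std::printf("%d\n", elementCoversByte(2, 1, 2)); // 1
  std::printf("%d\n", elementCoversByte(2, 1, 3)); // 0
  // 4 x i16: element 1 provides bytes 2 and 3 of the result.
  std::printf("%d\n", elementCoversByte(1, 2, 2)); // 1
  std::printf("%d\n", elementCoversByte(1, 2, 3)); // 1
  std::printf("%d\n", elementCoversByte(1, 2, 4)); // 0
  return 0;
}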
+ if (Op.getOpcode() != ISD::LOAD && VectorIndex.has_value()) return None; - assert(Op.getValueType().isScalarInteger() && "can't handle other types"); unsigned BitWidth = Op.getValueSizeInBits(); if (BitWidth % 8 != 0) return None; @@ -7844,10 +7885,12 @@ calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, switch (Op.getOpcode()) { case ISD::OR: { - auto LHS = calculateByteProvider(Op->getOperand(0), Index, Depth + 1); + auto LHS = + calculateByteProvider(Op->getOperand(0), Index, Depth + 1, VectorIndex); if (!LHS) return None; - auto RHS = calculateByteProvider(Op->getOperand(1), Index, Depth + 1); + auto RHS = + calculateByteProvider(Op->getOperand(1), Index, Depth + 1, VectorIndex); if (!RHS) return None; @@ -7863,14 +7906,18 @@ calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, return None; uint64_t BitShift = ShiftOp->getZExtValue(); + if (BitShift % 8 != 0) return None; uint64_t ByteShift = BitShift / 8; + // If we are shifting by an amount greater than the index we are trying to + // provide, then do not provide anything. Otherwise, subtract the index by + // the amount we shifted by. return Index < ByteShift ? ByteProvider::getConstantZero() : calculateByteProvider(Op->getOperand(0), Index - ByteShift, - Depth + 1); + Depth + 1, VectorIndex, Index); } case ISD::ANY_EXTEND: case ISD::SIGN_EXTEND: @@ -7885,11 +7932,39 @@ calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, return Op.getOpcode() == ISD::ZERO_EXTEND ? Optional(ByteProvider::getConstantZero()) : None; - return calculateByteProvider(NarrowOp, Index, Depth + 1); + return calculateByteProvider(NarrowOp, Index, Depth + 1, VectorIndex, + StartingIndex); } case ISD::BSWAP: return calculateByteProvider(Op->getOperand(0), ByteWidth - Index - 1, - Depth + 1); + Depth + 1, VectorIndex, StartingIndex); + case ISD::EXTRACT_VECTOR_ELT: { + auto OffsetOp = dyn_cast(Op->getOperand(1)); + if (!OffsetOp) + return None; + + VectorIndex = OffsetOp->getZExtValue(); + + SDValue NarrowOp = Op->getOperand(0); + unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits(); + if (NarrowBitWidth % 8 != 0) + return None; + uint64_t NarrowByteWidth = NarrowBitWidth / 8; + + // Check to see if the position of the element in the vector corresponds + // with the byte we are trying to provide for. In the case of a vector of + // i8, this simply means the VectorIndex == StartingIndex. For non i8 cases, + // the element will provide a range of bytes. For example, if we have a + // vector of i16s, each element provides two bytes (V[1] provides byte 2 and + // 3). + if (VectorIndex.value() * NarrowByteWidth > StartingIndex) + return None; + if ((VectorIndex.value() + 1) * NarrowByteWidth <= StartingIndex) + return None; + + return calculateByteProvider(Op->getOperand(0), Index, Depth + 1, + VectorIndex, StartingIndex); + } case ISD::LOAD: { auto L = cast(Op.getNode()); if (!L->isSimple() || L->isIndexed()) @@ -7900,11 +7975,16 @@ calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, return None; uint64_t NarrowByteWidth = NarrowBitWidth / 8; + // If the width of the load does not reach byte we are trying to provide for + // and it is not a ZEXTLOAD, then the load does not provide for the byte in + // question if (Index >= NarrowByteWidth) return L->getExtensionType() == ISD::ZEXTLOAD ? 
Optional(ByteProvider::getConstantZero()) : None; - return ByteProvider::getMemory(L, Index); + + unsigned BPVectorIndex = VectorIndex.value_or(0U); + return ByteProvider::getMemory(L, Index, BPVectorIndex); } } @@ -8196,7 +8276,8 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian(); auto MemoryByteOffset = [&] (ByteProvider P) { assert(P.isMemory() && "Must be a memory byte provider"); - unsigned LoadBitWidth = P.Load->getMemoryVT().getSizeInBits(); + unsigned LoadBitWidth = P.Load->getMemoryVT().getScalarSizeInBits(); + assert(LoadBitWidth % 8 == 0 && "can only analyze providers for individual bytes not bit"); unsigned LoadByteWidth = LoadBitWidth / 8; @@ -8217,7 +8298,8 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { SmallVector ByteOffsets(ByteWidth); unsigned ZeroExtendedBytes = 0; for (int i = ByteWidth - 1; i >= 0; --i) { - auto P = calculateByteProvider(SDValue(N, 0), i, 0, /*Root=*/true); + auto P = calculateByteProvider(SDValue(N, 0), i, 0, /*VectorIndex*/ None, + /*StartingIndex*/ i); if (!P) return SDValue(); @@ -8231,10 +8313,6 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { assert(P->isMemory() && "provenance should either be memory or zero"); LoadSDNode *L = P->Load; - assert(L->hasNUsesOfValue(1, 0) && L->isSimple() && - !L->isIndexed() && - "Must be enforced by calculateByteProvider"); - assert(L->getOffset().isUndef() && "Unindexed load must have undef offset"); // All loads must share the same chain SDValue LChain = L->getChain(); @@ -8246,8 +8324,25 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { // Loads must share the same base address BaseIndexOffset Ptr = BaseIndexOffset::match(L, DAG); int64_t ByteOffsetFromBase = 0; + + // For vector loads, the expected load combine pattern will have an + // ExtractElement for each index in the vector. While each of these + // ExtractElements will be accessing the same base address as determined + // by the load instruction, the actual bytes they interact with will differ + // due to different ExtractElement indices. To accurately determine the + // byte position of an ExtractElement, we offset the base load ptr with + // the index multiplied by the byte size of each element in the vector. 
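The adjustment this comment describes is scalar-size arithmetic: the element index recorded in the ByteProvider is scaled by the element byte width and folded into the load's BaseIndexOffset through the new addToOffset helper. A standalone sketch under those assumptions (OffsetSketch is an invented stand-in for the offset handling, not the LLVM class):

#include <cstdint>
#include <cstdio>
#include <optional>

// Models the new BaseIndexOffset::addToOffset helper: an unset offset is
// treated as zero before the vector-element adjustment is added.
struct OffsetSketch {
  std::optional<std::int64_t> Offset;
  void addToOffset(std::int64_t VectorOff) {
    Offset = Offset.value_or(0) + VectorOff;
  }
};

int main() {
  unsigned ScalarSizeInBits = 16; // a 4 x i16 load: 2-byte elements
  unsigned VectorOffset = 3;      // the byte comes from element 3 of the vector
  OffsetSketch Ptr;               // base address of the load, no offset yet

  unsigned ByteOffsetFromVector = VectorOffset * ScalarSizeInBits / 8;
  Ptr.addToOffset(ByteOffsetFromVector);

  // Element 3 of a 4 x i16 vector starts 6 bytes past the load's base address.
  std::printf("%lld\n", (long long)*Ptr.Offset); // 6
  return 0;
}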
+ if (L->getMemoryVT().isVector()) { + unsigned LoadWidthInBit = L->getMemoryVT().getScalarSizeInBits(); + if (LoadWidthInBit % 8 != 0) + return SDValue(); + unsigned ByteOffsetFromVector = P->VectorOffset * LoadWidthInBit / 8; + Ptr.addToOffset(ByteOffsetFromVector); + } + if (!Base) Base = Ptr; + else if (!Base->equalBaseIndex(Ptr, DAG, ByteOffsetFromBase)) return SDValue(); @@ -8263,6 +8358,7 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { Loads.insert(L); } + assert(!Loads.empty() && "All the bytes of the value must be loaded from " "memory, so there must be at least one load which produces the value"); assert(Base && "Base address of the accessed memory location must be set"); diff --git a/llvm/test/CodeGen/AArch64/load-combine.ll b/llvm/test/CodeGen/AArch64/load-combine.ll index 32d6769..293967b 100644 --- a/llvm/test/CodeGen/AArch64/load-combine.ll +++ b/llvm/test/CodeGen/AArch64/load-combine.ll @@ -562,18 +562,11 @@ define i32 @zext_load_i32_by_i8_bswap_shl_16(i32* %arg) { ret i32 %tmp8 } +; x1 = x0 define void @short_vector_to_i32(<4 x i8>* %in, i32* %out, i32* %p) { ; CHECK-LABEL: short_vector_to_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr s0, [x0] -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: umov w8, v0.h[0] -; CHECK-NEXT: umov w9, v0.h[1] -; CHECK-NEXT: umov w10, v0.h[2] -; CHECK-NEXT: umov w11, v0.h[3] -; CHECK-NEXT: bfi w8, w9, #8, #8 -; CHECK-NEXT: bfi w8, w10, #16, #8 -; CHECK-NEXT: bfi w8, w11, #24, #8 +; CHECK-NEXT: ldr w8, [x0] ; CHECK-NEXT: str w8, [x1] ; CHECK-NEXT: ret %ld = load <4 x i8>, <4 x i8>* %in, align 4 @@ -638,13 +631,11 @@ define void @short_vector_to_i32_unused_high_i8(<4 x i8>* %in, i32* %out, i32* % ; CHECK-LABEL: short_vector_to_i32_unused_high_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ldrh w9, [x0] ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: umov w8, v0.h[0] -; CHECK-NEXT: umov w9, v0.h[1] -; CHECK-NEXT: umov w10, v0.h[2] -; CHECK-NEXT: bfi w8, w9, #8, #8 -; CHECK-NEXT: bfi w8, w10, #16, #8 -; CHECK-NEXT: str w8, [x1] +; CHECK-NEXT: umov w8, v0.h[2] +; CHECK-NEXT: bfi w9, w8, #16, #8 +; CHECK-NEXT: str w9, [x1] ; CHECK-NEXT: ret %ld = load <4 x i8>, <4 x i8>* %in, align 4 @@ -694,14 +685,11 @@ define void @short_vector_to_i32_unused_low_i16(<4 x i8>* %in, i32* %out, i32* % ret void } +; x1 = x0[0:1] define void @short_vector_to_i32_unused_high_i16(<4 x i8>* %in, i32* %out, i32* %p) { ; CHECK-LABEL: short_vector_to_i32_unused_high_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr s0, [x0] -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: umov w8, v0.h[0] -; CHECK-NEXT: umov w9, v0.h[1] -; CHECK-NEXT: bfi w8, w9, #8, #8 +; CHECK-NEXT: ldrh w8, [x0] ; CHECK-NEXT: str w8, [x1] ; CHECK-NEXT: ret %ld = load <4 x i8>, <4 x i8>* %in, align 4 @@ -720,18 +708,11 @@ define void @short_vector_to_i32_unused_high_i16(<4 x i8>* %in, i32* %out, i32* ret void } +; x1 = x0 define void @short_vector_to_i64(<4 x i8>* %in, i64* %out, i64* %p) { ; CHECK-LABEL: short_vector_to_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr s0, [x0] -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: umov w8, v0.h[0] -; CHECK-NEXT: umov w9, v0.h[1] -; CHECK-NEXT: umov w10, v0.h[2] -; CHECK-NEXT: umov w11, v0.h[3] -; CHECK-NEXT: bfi x8, x9, #8, #8 -; CHECK-NEXT: bfi x8, x10, #16, #8 -; CHECK-NEXT: bfi x8, x11, #24, #8 +; CHECK-NEXT: ldr w8, [x0] ; CHECK-NEXT: str x8, [x1] ; CHECK-NEXT: ret %ld = load <4 x i8>, <4 x i8>* %in, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll index 
d4708ae..a344634 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll @@ -9,15 +9,9 @@ define amdgpu_kernel void @vectorLoadCombine(<4 x i8>* %in, i32* %out) { ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_load_dword v2, v[0:1] -; GCN-NEXT: s_mov_b32 s0, 0x6050400 ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_bfe_u32 v3, v2, 8, 8 -; GCN-NEXT: v_and_b32_e32 v4, 0xff0000, v2 -; GCN-NEXT: v_perm_b32 v3, v3, v2, s0 -; GCN-NEXT: v_and_b32_e32 v2, 0xff000000, v2 -; GCN-NEXT: v_or3_b32 v2, v3, v4, v2 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm entry: @@ -84,10 +78,7 @@ define i32 @load_2xi16_combine(i16 addrspace(1)* %p) #0 { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: global_load_dword v0, v[0:1], off -; GCN-NEXT: s_mov_b32 s4, 0xffff ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(1)* %p, i32 1 %p.0 = load i16, i16 addrspace(1)* %p, align 4 @@ -162,8 +153,6 @@ define i64 @load_4xi16_combine(i16 addrspace(1)* %p) #0 { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GCN-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GCN-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(1)* %p, i32 1 %gep.2p = getelementptr i16, i16 addrspace(1)* %p, i32 2 diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll index eec983d..a28baa2 100644 --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -184,10 +184,7 @@ define i32 @global_load_2xi16_align1(i16 addrspace(1)* %p) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_load_2xi16_align1: @@ -196,8 +193,6 @@ define i32 @global_load_2xi16_align1(i16 addrspace(1)* %p) #0 { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_load_2xi16_align1: @@ -206,9 +201,6 @@ define i32 @global_load_2xi16_align1(i16 addrspace(1)* %p) #0 { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1 %p.0 = load i16, i16 addrspace(1)* %p, align 1 @@ -293,13 +285,6 @@ define amdgpu_kernel void @global_store_2xi16_align1(i16 addrspace(1)* %p, i16 a ; Should merge this to a dword load define i32 @global_load_2xi16_align4(i16 addrspace(1)* %p) #0 { -; GFX7-LABEL: 
load_2xi16_align4: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_setpc_b64 s[30:31] -; ; GFX7-ALIGNED-LABEL: global_load_2xi16_align4: ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -318,10 +303,7 @@ define i32 @global_load_2xi16_align4(i16 addrspace(1)* %p) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_load_2xi16_align4: @@ -330,8 +312,6 @@ define i32 @global_load_2xi16_align4(i16 addrspace(1)* %p) #0 { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_load_2xi16_align4: @@ -340,9 +320,6 @@ define i32 @global_load_2xi16_align4(i16 addrspace(1)* %p) #0 { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1 %p.0 = load i16, i16 addrspace(1)* %p, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll index 0e2fae7..8b6e1a6 100644 --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll @@ -387,13 +387,6 @@ define void @private_store_2xi16_align1(i16 addrspace(5)* %p, i16 addrspace(5)* ; Should merge this to a dword load define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 { -; GFX7-LABEL: load_2xi16_align4: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_setpc_b64 s[30:31] -; ; GFX7-ALIGNED-LABEL: private_load_2xi16_align4: ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -412,20 +405,14 @@ define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-FLASTSCR-LABEL: private_load_2xi16_align4: ; GFX9-FLASTSCR: ; %bb.0: ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLASTSCR-NEXT: scratch_load_dword v0, v0, off -; GFX9-FLASTSCR-NEXT: s_mov_b32 s0, 0xffff ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLASTSCR-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s0, v1 ; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: private_load_2xi16_align4: @@ -434,8 +421,6 @@ define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_load_dword v0, v0, 
s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-FLASTSCR-LABEL: private_load_2xi16_align4: @@ -444,8 +429,6 @@ define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 { ; GFX10-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-FLASTSCR-NEXT: scratch_load_dword v0, v0, off ; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLASTSCR-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX10-FLASTSCR-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: private_load_2xi16_align4: @@ -454,9 +437,6 @@ define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v0, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FLASTSCR-LABEL: private_load_2xi16_align4: @@ -465,9 +445,6 @@ define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 { ; GFX11-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FLASTSCR-NEXT: scratch_load_b32 v0, v0, off ; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLASTSCR-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX11-FLASTSCR-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FLASTSCR-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1 %p.0 = load i16, i16 addrspace(5)* %p, align 4 -- 2.7.4