From ed891b55616dd07f05c18db593e103f97faba520 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 16 Jun 2015 15:51:48 +0000 Subject: [PATCH] Revert "Revert "Fix merges of non-zero vector stores"" Reapply r239539. Don't assume the collected number of stores is the same vector size. Just take the first N stores to fill the vector. llvm-svn: 239825 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 26 ++++-- llvm/test/CodeGen/AMDGPU/merge-stores.ll | 101 ++++++++++++++++++++- .../CodeGen/X86/2012-11-28-merge-store-alias.ll | 1 + 3 files changed, 120 insertions(+), 8 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 33f5f52..ab72941 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -388,6 +388,13 @@ namespace { unsigned SequenceNum; }; + /// This is a helper function for MergeStoresOfConstantsOrVecElts. Returns a + /// constant build_vector of the stored constant values in Stores. + SDValue getMergedConstantVectorStore(SelectionDAG &DAG, + SDLoc SL, + ArrayRef Stores, + EVT Ty) const; + /// This is a helper function for MergeConsecutiveStores. When the source /// elements of the consecutive stores are all constants or all extracted /// vector elements, try to merge them into one larger store. @@ -10591,6 +10598,18 @@ struct BaseIndexOffset { }; } // namespace +SDValue DAGCombiner::getMergedConstantVectorStore(SelectionDAG &DAG, + SDLoc SL, + ArrayRef Stores, + EVT Ty) const { + SmallVector BuildVector; + + for (unsigned I = 0, E = Ty.getVectorNumElements(); I != E; ++I) + BuildVector.push_back(cast(Stores[I].MemNode)->getValue()); + + return DAG.getNode(ISD::BUILD_VECTOR, SL, Ty, BuildVector); +} + bool DAGCombiner::MergeStoresOfConstantsOrVecElts( SmallVectorImpl &StoreNodes, EVT MemVT, unsigned NumElem, bool IsConstantSrc, bool UseVector) { @@ -10621,12 +10640,7 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem); assert(TLI.isTypeLegal(Ty) && "Illegal vector store"); if (IsConstantSrc) { - // A vector store with a constant source implies that the constant is - // zero; we only handle merging stores of constant zeros because the zero - // can be materialized without a load. - // It may be beneficial to loosen this restriction to allow non-zero - // store merging. - StoredVal = DAG.getConstant(0, DL, Ty); + StoredVal = getMergedConstantVectorStore(DAG, DL, StoreNodes, Ty); } else { SmallVector Ops; for (unsigned i = 0; i < NumElem ; ++i) { diff --git a/llvm/test/CodeGen/AMDGPU/merge-stores.ll b/llvm/test/CodeGen/AMDGPU/merge-stores.ll index dbf9d44..34a2fc7 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-stores.ll +++ b/llvm/test/CodeGen/AMDGPU/merge-stores.ll @@ -89,7 +89,11 @@ define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 { } ; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32: -; GCN: buffer_store_dwordx2 +; SI-DAG: s_mov_b32 [[SLO:s[0-9]+]], 4.0 +; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b{{$}} +; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[SLO]] +; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[SHI]] +; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)* @@ -99,7 +103,11 @@ define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 } ; GCN-LABEL: {{^}}merge_global_store_4_constants_i32: -; GCN: buffer_store_dwordx4 +; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x14d{{$}} +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x1c8{{$}} +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}} +; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}} +; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]{{\]}} define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 @@ -530,6 +538,95 @@ define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { ret void } +; GCN-LABEL: {{^}}merge_global_store_5_constants_i32: +; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}} +; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}} +; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}} +; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}} +; GCN: buffer_store_dword v[[HI]] +define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) { + store i32 9, i32 addrspace(1)* %out, align 4 + %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 + store i32 12, i32 addrspace(1)* %idx1, align 4 + %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2 + store i32 16, i32 addrspace(1)* %idx2, align 4 + %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3 + store i32 -12, i32 addrspace(1)* %idx3, align 4 + %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4 + store i32 11, i32 addrspace(1)* %idx4, align 4 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_6_constants_i32: +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx2 +define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) { + store i32 13, i32 addrspace(1)* %out, align 4 + %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 + store i32 15, i32 addrspace(1)* %idx1, align 4 + %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2 + store i32 62, i32 addrspace(1)* %idx2, align 4 + %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3 + store i32 63, i32 addrspace(1)* %idx3, align 4 + %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4 + store i32 11, i32 addrspace(1)* %idx4, align 4 + %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5 + store i32 123, i32 addrspace(1)* %idx5, align 4 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_7_constants_i32: +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx2 +; GCN: buffer_store_dword v +define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) { + store i32 34, i32 addrspace(1)* %out, align 4 + %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 + store i32 999, i32 addrspace(1)* %idx1, align 4 + %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2 + store i32 65, i32 addrspace(1)* %idx2, align 4 + %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3 + store i32 33, i32 addrspace(1)* %idx3, align 4 + %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4 + store i32 98, i32 addrspace(1)* %idx4, align 4 + %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5 + store i32 91, i32 addrspace(1)* %idx5, align 4 + %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6 + store i32 212, i32 addrspace(1)* %idx6, align 4 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_8_constants_i32: +; XGCN: buffer_store_dwordx4 +; XGCN: buffer_store_dwordx4 + +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) { + store i32 34, i32 addrspace(1)* %out, align 4 + %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 + store i32 999, i32 addrspace(1)* %idx1, align 4 + %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2 + store i32 65, i32 addrspace(1)* %idx2, align 4 + %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3 + store i32 33, i32 addrspace(1)* %idx3, align 4 + %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4 + store i32 98, i32 addrspace(1)* %idx4, align 4 + %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5 + store i32 91, i32 addrspace(1)* %idx5, align 4 + %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6 + store i32 212, i32 addrspace(1)* %idx6, align 4 + %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7 + store i32 999, i32 addrspace(1)* %idx7, align 4 + ret void +} + declare void @llvm.AMDGPU.barrier.local() #1 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/X86/2012-11-28-merge-store-alias.ll b/llvm/test/CodeGen/X86/2012-11-28-merge-store-alias.ll index ed1daad..c16deef 100644 --- a/llvm/test/CodeGen/X86/2012-11-28-merge-store-alias.ll +++ b/llvm/test/CodeGen/X86/2012-11-28-merge-store-alias.ll @@ -3,6 +3,7 @@ ; CHECK: merge_stores_can ; CHECK: callq foo ; CHECK: xorps %xmm0, %xmm0 +; CHECK-NEXT: movl 36(%rsp), %ebp ; CHECK-NEXT: movups %xmm0 ; CHECK: callq foo ; CHECK: ret -- 2.7.4