From d4e37afe450ae1822d65223b297a4b518b9eb268 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 5 Feb 2019 00:22:23 +0000 Subject: [PATCH] [DAGCombiner] Discard pointer info when combining extract_vector_elt of a vector load when the index isn't constant Summary: If the index isn't constant, this transform inserts a multiply and an add on the index to calculate the base pointer for a scalar load. But we still create a memory operand with an offset of 0 and the size of the scalar access. But the access is really to an unknown offset within the original access size. This can cause the machine scheduler to incorrectly calculate dependencies between this load and other accesses. In the case we saw, there was a 32 byte vector store that was split into two 16 byte stores, one with offset 0 and one with offset 16. The size of the memory operand for both was 16. The scheduler correctly detected the alias with the offset 0 store, but not the offset 16 store. This patch discards the pointer info so we don't incorrectly detect aliasing. I wasn't sure if we could keep using the original offset and size without risking some other transform on the load changing the size. I tried to reduce a test case, but there's still a lot of memory operations needed to get the scheduler to do the bad reordering. So it looked pretty fragile to maintain. 
Reviewers: efriedma Reviewed By: efriedma Subscribers: arphaman, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D57616 llvm-svn: 353124 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 ++- llvm/test/CodeGen/X86/vecloadextract.ll | 44 +++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) create mode 100755 llvm/test/CodeGen/X86/vecloadextract.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 16863e6..e6ac8bc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -15656,7 +15656,9 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT, Offset = DAG.getNode( ISD::MUL, DL, PtrType, Offset, DAG.getConstant(VecEltVT.getStoreSize(), DL, PtrType)); - MPI = OriginalLoad->getPointerInfo(); + // Discard the pointer info except the address space because the memory + // operand can't represent this new access since the offset is variable. + MPI = MachinePointerInfo(OriginalLoad->getPointerInfo().getAddrSpace()); } NewPtr = DAG.getNode(ISD::ADD, DL, PtrType, NewPtr, Offset); diff --git a/llvm/test/CodeGen/X86/vecloadextract.ll b/llvm/test/CodeGen/X86/vecloadextract.ll new file mode 100755 index 0000000..94df350 --- /dev/null +++ b/llvm/test/CodeGen/X86/vecloadextract.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +;RUN: llc < %s -mtriple=i686 -mattr=sse4.1 -stop-after=expand-isel-pseudos 2>&1 | FileCheck %s + +; This test makes sure we discard pointer info when we combine a vector load +; and a variable extractelement into a scalar load using an index. There's also +; a test to ensure we don't discard it for the constant index case. 
+ +; CHECK: name: const_index +; CHECK: bb.0 (%ir-block.0): +; CHECK: [[POINTER:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load 4 from %fixed-stack.0) +; CHECK: [[LOAD:%[0-9]+]]:gr32 = MOV32rm killed [[POINTER]], 1, $noreg, 4, $noreg :: (load 4 from %ir.v + 4) +; CHECK: $eax = COPY [[LOAD]] +; CHECK: RET 0, $eax +define i32 @const_index(<8 x i32>* %v) { + %a = load <8 x i32>, <8 x i32>* %v + %b = extractelement <8 x i32> %a, i32 1 + ret i32 %b +} + +; CHECK: name: variable_index +; CHECK: bb.0 (%ir-block.0): +; CHECK: [[INDEX:%[0-9]+]]:gr32_nosp = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load 4 from %fixed-stack.0) +; CHECK: [[POINTER:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load 4 from %fixed-stack.1) +; CHECK: [[LOAD:%[0-9]+]]:gr32 = MOV32rm killed [[POINTER]], 4, killed [[INDEX]], 0, $noreg :: (load 4) +; CHECK: $eax = COPY [[LOAD]] +; CHECK: RET 0, $eax +define i32 @variable_index(<8 x i32>* %v, i32 %i) { + %a = load <8 x i32>, <8 x i32>* %v + %b = extractelement <8 x i32> %a, i32 %i + ret i32 %b +} + +; CHECK: name: variable_index_with_addrspace +; CHECK: bb.0 (%ir-block.0): +; CHECK: [[INDEX:%[0-9]+]]:gr32_nosp = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load 4 from %fixed-stack.0) +; CHECK: [[POINTER:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load 4 from %fixed-stack.1) +; CHECK: [[LOAD:%[0-9]+]]:gr32 = MOV32rm killed [[POINTER]], 4, killed [[INDEX]], 0, $noreg :: (load 4, addrspace 1) +; CHECK: $eax = COPY [[LOAD]] +; CHECK: RET 0, $eax +define i32 @variable_index_with_addrspace(<8 x i32> addrspace(1)* %v, i32 %i) { + %a = load <8 x i32>, <8 x i32> addrspace(1)* %v + %b = extractelement <8 x i32> %a, i32 %i + ret i32 %b +} -- 2.7.4