From 8c7a30baea219e8143b13e3e384ff713d8bb7c76 Mon Sep 17 00:00:00 2001
From: Farhana Aleen
Date: Thu, 19 Jul 2018 16:50:27 +0000
Subject: [PATCH] [LoadStoreVectorizer] Use getMinusSCEV() to compute the
 distance between two pointers.

Summary:
Currently, isConsecutiveAccess() detects two pointers (PtrA and PtrB) as
consecutive by comparing PtrB with BaseDelta + PtrA. This works when both
pointers are factorized or when neither of them is, but it fails when one of
the pointers is factorized and the other one is not. Here is an example:

  PtrA = 4 * (A + B)
  PtrB = 4 + 4A + 4B

This patch uses getMinusSCEV() to compute the distance between the two
pointers instead. getMinusSCEV() folds both expressions, which allows it to
compute the simplified distance.

Author: FarhanaAleen

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D49516

llvm-svn: 337471
---
 .../Vectorize/LoadStoreVectorizer.cpp              |  8 +++
 .../AMDGPU/complex-index.ll                        | 49 +++++++++++++++++++
 2 files changed, 57 insertions(+)
 create mode 100644 llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index ce77ea80a289..8ce408294c03 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -340,6 +340,14 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
   if (X == PtrSCEVB)
     return true;
 
+  // The above check will not catch the cases where one of the pointers is
+  // factorized but the other one is not, such as (C + (S * (A + B))) vs
+  // (AS + BS). Get the minus SCEV. That will allow re-combining the
+  // expressions and getting the simplified difference.
+  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
+  if (C == Dist)
+    return true;
+
   // Sometimes even this doesn't work, because SCEV can't always see through
   // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
   // things the hard way.
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll
new file mode 100644
index 000000000000..220efd21fe19
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll
@@ -0,0 +1,49 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
+
+declare i64 @_Z12get_local_idj(i32)
+
+declare i64 @_Z12get_group_idj(i32)
+
+declare double @llvm.fmuladd.f64(double, double, double)
+
+; CHECK-LABEL: @factorizedVsNonfactorizedAccess(
+; CHECK: load <2 x float>
+; CHECK: store <2 x float>
+define amdgpu_kernel void @factorizedVsNonfactorizedAccess(float addrspace(1)* nocapture %c) {
+entry:
+  %call = tail call i64 @_Z12get_local_idj(i32 0)
+  %call1 = tail call i64 @_Z12get_group_idj(i32 0)
+  %div = lshr i64 %call, 4
+  %div2 = lshr i64 %call1, 3
+  %mul = shl i64 %div2, 7
+  %rem = shl i64 %call, 3
+  %mul3 = and i64 %rem, 120
+  %add = or i64 %mul, %mul3
+  %rem4 = shl i64 %call1, 7
+  %mul5 = and i64 %rem4, 896
+  %mul6 = shl nuw nsw i64 %div, 3
+  %add7 = add nuw i64 %mul5, %mul6
+  %mul9 = shl i64 %add7, 10
+  %add10 = add i64 %mul9, %add
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %c, i64 %add10
+  %load1 = load float, float addrspace(1)* %arrayidx, align 4
+  %conv = fpext float %load1 to double
+  %mul11 = fmul double %conv, 0x3FEAB481D8F35506
+  %conv12 = fptrunc double %mul11 to float
+  %conv18 = fpext float %conv12 to double
+  %storeval1 = tail call double @llvm.fmuladd.f64(double 0x3FF4FFAFBBEC946A, double 0.000000e+00, double %conv18)
+  %cstoreval1 = fptrunc double %storeval1 to float
+  store float %cstoreval1, float addrspace(1)* %arrayidx, align 4
+
+  %add23 = or i64 %add10, 1
+  %arrayidx24 = getelementptr inbounds float, float addrspace(1)* %c, i64 %add23
+  %load2 = load float, float addrspace(1)* %arrayidx24, align 4
+  %conv25 = fpext float %load2 to double
+  %mul26 = fmul double %conv25, 0x3FEAB481D8F35506
+  %conv27 = fptrunc double %mul26 to float
+  %conv34 = fpext float %conv27 to double
+  %storeval2 = tail call double @llvm.fmuladd.f64(double 0x3FF4FFAFBBEC946A, double 0.000000e+00, double %conv34)
+  %cstoreval2 = fptrunc double %storeval2 to float
+  store float %cstoreval2, float addrspace(1)* %arrayidx24, align 4
+  ret void
+}
\ No newline at end of file
-- 
2.34.1
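
The sketch below restates, in isolation, the distance check the summary describes: take the SCEVs of both pointers, subtract them with getMinusSCEV(), and compare the folded result against the expected constant distance. It is a minimal illustration rather than the patch itself; the helper name arePointersAtConstantDistance and its surrounding plumbing are assumptions for the example, while getSCEV(), getConstant(), and getMinusSCEV() are the ScalarEvolution calls the patch actually uses.

// A standalone sketch of the getMinusSCEV()-based distance check.
// Only for illustration; not the in-tree Vectorizer code.
#include "llvm/ADT/APInt.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Value.h"

using namespace llvm;

// Returns true if PtrB lies exactly `Delta` bytes past PtrA according to SCEV.
static bool arePointersAtConstantDistance(Value *PtrA, Value *PtrB,
                                          const APInt &Delta,
                                          ScalarEvolution &SE) {
  const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
  const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
  // Expected distance between the two pointers, as a constant SCEV.
  const SCEV *C = SE.getConstant(Delta);
  // getMinusSCEV() folds both operands, so (C + (S * (A + B))) minus
  // (AS + BS) simplifies to the constant C even though only one of the
  // two expressions is kept in factorized form.
  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
  // SCEV expressions are uniqued, so pointer equality is a valid comparison.
  return Dist == C;
}

In the patch this subtraction is added as one more early-exit in Vectorizer::isConsecutiveAccess(), tried before the more expensive "hard way" decomposition that follows it.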