[LoadStoreVectorizer] Use getMinusScev() to compute the distance between two pointers.

author Farhana Aleen <farhana.aleen@gmail.com>

Thu, 19 Jul 2018 16:50:27 +0000 (16:50 +0000)

committer Farhana Aleen <farhana.aleen@gmail.com>

Thu, 19 Jul 2018 16:50:27 +0000 (16:50 +0000)
author Farhana Aleen <farhana.aleen@gmail.com>
Thu, 19 Jul 2018 16:50:27 +0000 (16:50 +0000)
committer Farhana Aleen <farhana.aleen@gmail.com>
Thu, 19 Jul 2018 16:50:27 +0000 (16:50 +0000)
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp

index ce77ea80a289705054f15f4a45dab5a5ce753de7..8ce408294c037e039d5ba6fa4b601779d4046504 100644 (file)
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -340,6 +340,14 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
    if (X == PtrSCEVB)
      return true;
  
+  // The above check will not catch the cases where one of the pointers is
+  // factorized but the other one is not, such as (C + (S * (A + B))) vs
+  // (AS + BS). Get the minus scev. That will allow re-combining the expresions
+  // and getting the simplified difference.
+  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
+  if (C == Dist)
+    return true;
+
    // Sometimes even this doesn't work, because SCEV can't always see through
    // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
    // things the hard way.
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll

new file mode 100644 (file)

index 0000000..220efd2
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll
@@ -0,0 +1,49 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
+
+declare i64 @_Z12get_local_idj(i32)
+
+declare i64 @_Z12get_group_idj(i32)
+
+declare double @llvm.fmuladd.f64(double, double, double)
+
+; CHECK-LABEL: @factorizedVsNonfactorizedAccess(
+; CHECK: load <2 x float>
+; CHECK: store <2 x float>
+define amdgpu_kernel void @factorizedVsNonfactorizedAccess(float addrspace(1)* nocapture %c) {
+entry:
+  %call = tail call i64 @_Z12get_local_idj(i32 0)
+  %call1 = tail call i64 @_Z12get_group_idj(i32 0)
+  %div = lshr i64 %call, 4
+  %div2 = lshr i64 %call1, 3
+  %mul = shl i64 %div2, 7
+  %rem = shl i64 %call, 3
+  %mul3 = and i64 %rem, 120
+  %add = or i64 %mul, %mul3
+  %rem4 = shl i64 %call1, 7
+  %mul5 = and i64 %rem4, 896
+  %mul6 = shl nuw nsw i64 %div, 3
+  %add7 = add nuw i64 %mul5, %mul6
+  %mul9 = shl i64 %add7, 10
+  %add10 = add i64 %mul9, %add
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %c, i64 %add10
+  %load1 = load float, float addrspace(1)* %arrayidx, align 4
+  %conv = fpext float %load1 to double
+  %mul11 = fmul double %conv, 0x3FEAB481D8F35506
+  %conv12 = fptrunc double %mul11 to float
+  %conv18 = fpext float %conv12 to double
+  %storeval1 = tail call double @llvm.fmuladd.f64(double 0x3FF4FFAFBBEC946A, double 0.000000e+00, double %conv18)
+  %cstoreval1 = fptrunc double %storeval1 to float
+  store float %cstoreval1, float addrspace(1)* %arrayidx, align 4
+
+  %add23 = or i64 %add10, 1
+  %arrayidx24 = getelementptr inbounds float, float addrspace(1)* %c, i64 %add23
+  %load2 = load float, float addrspace(1)* %arrayidx24, align 4
+  %conv25 = fpext float %load2 to double
+  %mul26 = fmul double %conv25, 0x3FEAB481D8F35506
+  %conv27 = fptrunc double %mul26 to float
+  %conv34 = fpext float %conv27 to double
+  %storeval2 = tail call double @llvm.fmuladd.f64(double 0x3FF4FFAFBBEC946A, double 0.000000e+00, double %conv34)
+  %cstoreval2 = fptrunc double %storeval2 to float
+  store float %cstoreval2, float addrspace(1)* %arrayidx24, align 4
+  ret void
+}
+\ No newline at end of file
author	Farhana Aleen <farhana.aleen@gmail.com>
	Thu, 19 Jul 2018 16:50:27 +0000 (16:50 +0000)
committer	Farhana Aleen <farhana.aleen@gmail.com>
	Thu, 19 Jul 2018 16:50:27 +0000 (16:50 +0000)
llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp		patch \| blob \| history
llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll	[new file with mode: 0644]	patch \| blob