From de3445e0ef15c420955ad720fccf08473f460443 Mon Sep 17 00:00:00 2001 From: A-Wadhwani Date: Mon, 12 Sep 2022 09:29:34 -0700 Subject: [PATCH] [SROA] Create additional vector type candidates based on store and load slices This patch adds additional vector types to be considered when doing promotion in SROA, based on the types of the store and load slices. This provides more promotion opportunities, by potentially using an optimal "intermediate" vector type. For example, the following code would currently not be promoted to a vector, since `__m128i` is a `<2 x i64>` vector. ``` __m128i packfoo0(int a, int b, int c, int d) { int r[4] = {a, b, c, d}; __m128i rm; std::memcpy(&rm, r, sizeof(rm)); return rm; } ``` ``` packfoo0(int, int, int, int): mov dword ptr [rsp - 24], edi mov dword ptr [rsp - 20], esi mov dword ptr [rsp - 16], edx mov dword ptr [rsp - 12], ecx movaps xmm0, xmmword ptr [rsp - 24] ret ``` By also considering the types of the elements, we could find that the `<4 x i32>` type would be valid for promotion, hence removing the memory accesses for this function. In other words, we can explore other new vector types, with the same size but different element types based on the load and store instructions from the Slices, which can provide us more promotion opportunities. Additionally, the step for removing duplicate elements from the `CandidateTys` vector was not using an equality comparator, which has been fixed. 
Differential Revision: https://reviews.llvm.org/D132096 --- llvm/lib/Transforms/Scalar/SROA.cpp | 32 ++++++++++-- llvm/test/Transforms/SROA/vector-promotion.ll | 71 +++++++++++++++++++++++++-- 2 files changed, 96 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index cb2c43e..4983273 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -1922,6 +1922,28 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { if (CandidateTys.empty()) return nullptr; + // Generate new candidate type based on load/store size. + for (const Slice &S : P) { + Type *Ty; + if (LoadInst *LI = dyn_cast<LoadInst>(S.getUse()->getUser())) + Ty = LI->getType(); + else if (StoreInst *SI = dyn_cast<StoreInst>(S.getUse()->getUser())) + Ty = SI->getValueOperand()->getType(); + else + continue; + if (isa<VectorType>(Ty)) + continue; + // Create Vector with size of V, and each element of type Ty + VectorType *V = CandidateTys[0]; + uint64_t ElementSize = DL.getTypeStoreSizeInBits(Ty).getFixedSize(); + uint64_t VectorSize = DL.getTypeSizeInBits(V).getFixedSize(); + if ((ElementSize != VectorSize) && (VectorSize % ElementSize == 0)) { + VectorType *VTy = VectorType::get(Ty, VectorSize / ElementSize, false); + CandidateTys.push_back(VTy); + if (CommonEltTy != Ty) + HaveCommonEltTy = false; + } + } // Remove non-integer vector types if we had multiple common element types. 
// FIXME: It'd be nice to replace them with integer vector types, but we can't // do that until all the backends are known to produce good code for all @@ -1949,10 +1971,14 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { return cast<FixedVectorType>(RHSTy)->getNumElements() < cast<FixedVectorType>(LHSTy)->getNumElements(); }; + auto RankVectorTypesEq = [&](VectorType *LHSTy, VectorType *RHSTy) { + return cast<FixedVectorType>(LHSTy)->getNumElements() == + cast<FixedVectorType>(RHSTy)->getNumElements(); + }; llvm::sort(CandidateTys, RankVectorTypes); - CandidateTys.erase( - std::unique(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes), - CandidateTys.end()); + CandidateTys.erase(std::unique(CandidateTys.begin(), CandidateTys.end(), + RankVectorTypesEq), + CandidateTys.end()); } else { // The only way to have the same element type in every vector type is to // have the same vector type. Check that and remove all but one. diff --git a/llvm/test/Transforms/SROA/vector-promotion.ll b/llvm/test/Transforms/SROA/vector-promotion.ll index aed0d50..7ad8e5c 100644 --- a/llvm/test/Transforms/SROA/vector-promotion.ll +++ b/llvm/test/Transforms/SROA/vector-promotion.ll @@ -534,10 +534,9 @@ define <2 x float> @test11(<4 x i16> %x, i32 %y) { ; heuristic for making a deterministic decision. 
; CHECK-LABEL: @test11( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32 [[Y:%.*]] to <2 x i16> -; CHECK-NEXT: [[A_SROA_0_4_VEC_EXPAND:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1> -; CHECK-NEXT: [[A_SROA_0_4_VECBLEND:%.*]] = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x i16> [[A_SROA_0_4_VEC_EXPAND]], <4 x i16> [[X:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[A_SROA_0_4_VECBLEND]] to <2 x float> +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[X:%.*]] to <2 x i32> +; CHECK-NEXT: [[A_SROA_0_4_VEC_INSERT:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[Y:%.*]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[A_SROA_0_4_VEC_INSERT]] to <2 x float> ; CHECK-NEXT: ret <2 x float> [[TMP1]] ; entry: @@ -565,3 +564,67 @@ define <4 x float> @test12(<4 x i32> %val) { ret <4 x float> %vec } + +define <2 x i64> @test13(i32 %a, i32 %b, i32 %c, i32 %d) { +; Ensure that we can promote an alloca that needs to be +; cast to a different vector type +; CHECK-LABEL: @test13( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X_SROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x i32> undef, i32 [[A:%.*]], i32 0 +; CHECK-NEXT: [[X_SROA_0_4_VEC_INSERT:%.*]] = insertelement <4 x i32> [[X_SROA_0_0_VEC_INSERT]], i32 [[B:%.*]], i32 1 +; CHECK-NEXT: [[X_SROA_0_8_VEC_INSERT:%.*]] = insertelement <4 x i32> [[X_SROA_0_4_VEC_INSERT]], i32 [[C:%.*]], i32 2 +; CHECK-NEXT: [[X_SROA_0_12_VEC_INSERT:%.*]] = insertelement <4 x i32> [[X_SROA_0_8_VEC_INSERT]], i32 [[D:%.*]], i32 3 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[X_SROA_0_12_VEC_INSERT]] to <2 x i64> +; CHECK-NEXT: ret <2 x i64> [[TMP0]] +; +entry: + %x = alloca [4 x i32] + + store i32 %a, ptr %x + %x.tmp2 = getelementptr inbounds i32, ptr %x, i64 1 + store i32 %b, ptr %x.tmp2 + %x.tmp3 = getelementptr inbounds i32, ptr %x, i64 2 + store i32 %c, ptr %x.tmp3 + %x.tmp4 = getelementptr inbounds i32, ptr %x, i64 3 + store i32 %d, ptr %x.tmp4 + + + %result = load <2 x i64>, ptr %x + + ret <2 x i64> %result +} + +define i32 
@test14(<2 x i64> %x) { +; Ensure that we can promote an alloca that needs to be +; cast to a different vector type +; CHECK-LABEL: @test14( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[X:%.*]] to <4 x i32> +; CHECK-NEXT: [[X_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0 +; CHECK-NEXT: [[X_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1 +; CHECK-NEXT: [[X_SROA_0_8_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2 +; CHECK-NEXT: [[X_SROA_0_12_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[X_SROA_0_0_VEC_EXTRACT]], [[X_SROA_0_4_VEC_EXTRACT]] +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[X_SROA_0_8_VEC_EXTRACT]], [[X_SROA_0_12_VEC_EXTRACT]] +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD]], [[ADD1]] +; CHECK-NEXT: ret i32 [[ADD2]] +; +entry: + + %x.addr = alloca <2 x i64>, align 16 + store <2 x i64> %x, <2 x i64>* %x.addr, align 16 + %x.cast = bitcast <2 x i64>* %x.addr to i32* + + %a = load i32, ptr %x.cast + %x.tmp2 = getelementptr inbounds i32, ptr %x.cast, i64 1 + %b = load i32, ptr %x.tmp2 + %x.tmp3 = getelementptr inbounds i32, ptr %x.cast, i64 2 + %c = load i32, ptr %x.tmp3 + %x.tmp4 = getelementptr inbounds i32, ptr %x.cast, i64 3 + %d = load i32, ptr %x.tmp4 + + %add = add i32 %a, %b + %add1 = add i32 %c, %d + %add2 = add i32 %add, %add1 + ret i32 %add2 +} -- 2.7.4