From 845b73c06f533e623c8b0eae0b059aec1b5ff4a0 Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Wed, 21 Nov 2012 08:16:30 +0000
Subject: [PATCH] PR14055: Implement support for sub-vector operations in SROA.

Now if we can transform an alloca into a single vector value, but it has
subvector, non-element accesses, we form the appropriate shufflevectors
to allow SROA to proceed. This fixes PR14055 which pointed out a very
common pattern that SROA couldn't handle -- mixed vec3 and vec4
operations on a single alloca.

llvm-svn: 168418
---
 llvm/lib/Transforms/Scalar/SROA.cpp           | 96 +++++++++++++++++++++------
 llvm/test/Transforms/SROA/vector-promotion.ll | 77 ++++++++++++++++++++-
 2 files changed, 150 insertions(+), 23 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 09dff27..8284e14 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -2116,12 +2116,11 @@ static bool isVectorPromotionViable(const DataLayout &TD,
         EndIndex > Ty->getNumElements())
       return false;
 
-    // FIXME: We should build shuffle vector instructions to handle
-    // non-element-sized accesses. See PR14055 for an example of where this
-    // matters.
-    if ((EndOffset - BeginOffset) != ElementSize &&
-        (EndOffset - BeginOffset) != VecSize)
-      return false;
+    assert(EndIndex > BeginIndex && "Empty vector!");
+    uint64_t NumElements = EndIndex - BeginIndex;
+    Type *PartitionTy
+      = (NumElements == 1) ? Ty->getElementType()
+                           : VectorType::get(Ty->getElementType(), NumElements);
 
     if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) {
       if (MI->isVolatile())
@@ -2138,9 +2137,13 @@ static bool isVectorPromotionViable(const DataLayout &TD,
     } else if (LoadInst *LI = dyn_cast<LoadInst>(I->U->getUser())) {
       if (LI->isVolatile())
         return false;
+      if (!canConvertValue(TD, PartitionTy, LI->getType()))
+        return false;
     } else if (StoreInst *SI = dyn_cast<StoreInst>(I->U->getUser())) {
       if (SI->isVolatile())
         return false;
+      if (!canConvertValue(TD, SI->getValueOperand()->getType(), PartitionTy))
+        return false;
     } else {
       return false;
     }
@@ -2448,13 +2451,13 @@ private:
     return getOffsetTypeAlign(Ty, BeginOffset - NewAllocaBeginOffset);
   }
 
-  ConstantInt *getIndex(IRBuilder<> &IRB, uint64_t Offset) {
+  unsigned getIndex(uint64_t Offset) {
     assert(VecTy && "Can only call getIndex when rewriting a vector");
     uint64_t RelOffset = Offset - NewAllocaBeginOffset;
     assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
     uint32_t Index = RelOffset / ElementSize;
     assert(Index * ElementSize == RelOffset);
-    return IRB.getInt32(Index);
+    return Index;
   }
 
   void deleteIfTriviallyDead(Value *V) {
@@ -2466,10 +2469,24 @@ private:
   Value *rewriteVectorizedLoadInst(IRBuilder<> &IRB, LoadInst &LI, Value *OldOp) {
     Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
                                      getName(".load"));
-    if (LI.getType() == VecTy->getElementType() ||
-        BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) {
-      V = IRB.CreateExtractElement(V, getIndex(IRB, BeginOffset),
+    unsigned BeginIndex = getIndex(BeginOffset);
+    unsigned EndIndex = getIndex(EndOffset);
+    assert(EndIndex > BeginIndex && "Empty vector!");
+    unsigned NumElements = EndIndex - BeginIndex;
+    assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+    if (NumElements == 1) {
+      V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
                                    getName(".extract"));
+      DEBUG(dbgs() << "     extract: " << *V << "\n");
+    } else if (NumElements < VecTy->getNumElements()) {
+      SmallVector<Constant*, 8> Mask;
+      Mask.reserve(NumElements);
+      for (unsigned i = BeginIndex; i != EndIndex; ++i)
+        Mask.push_back(IRB.getInt32(i));
+      V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
+                                  ConstantVector::get(Mask),
+                                  getName(".extract"));
+      DEBUG(dbgs() << "     shuffle: " << *V << "\n");
     }
     return V;
   }
@@ -2569,15 +2586,52 @@ private:
 
   bool rewriteVectorizedStoreInst(IRBuilder<> &IRB, Value *V,
                                   StoreInst &SI, Value *OldOp) {
-    if (V->getType() == ElementTy ||
-        BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) {
-      if (V->getType() != ElementTy)
-        V = convertValue(TD, IRB, V, ElementTy);
+    unsigned BeginIndex = getIndex(BeginOffset);
+    unsigned EndIndex = getIndex(EndOffset);
+    assert(EndIndex > BeginIndex && "Empty vector!");
+    unsigned NumElements = EndIndex - BeginIndex;
+    assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+    Type *PartitionTy
+      = (NumElements == 1) ? ElementTy
+                           : VectorType::get(ElementTy, NumElements);
+    if (V->getType() != PartitionTy)
+      V = convertValue(TD, IRB, V, PartitionTy);
+    if (NumElements < VecTy->getNumElements()) {
+      // We need to mix in the existing elements.
       LoadInst *LI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
                                            getName(".load"));
-      V = IRB.CreateInsertElement(LI, V, getIndex(IRB, BeginOffset),
-                                  getName(".insert"));
-    } else if (V->getType() != VecTy) {
+      if (NumElements == 1) {
+        V = IRB.CreateInsertElement(LI, V, IRB.getInt32(BeginIndex),
+                                    getName(".insert"));
+        DEBUG(dbgs() <<  "     insert: " << *V << "\n");
+      } else {
+        // When inserting a smaller vector into the larger to store, we first
+        // use a shuffle vector to widen it with undef elements, and then
+        // a second shuffle vector to select between the loaded vector and the
+        // incoming vector.
+        SmallVector<Constant*, 8> Mask;
+        Mask.reserve(VecTy->getNumElements());
+        for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
+          if (i >= BeginIndex && i < EndIndex)
+            Mask.push_back(IRB.getInt32(i - BeginIndex));
+          else
+            Mask.push_back(UndefValue::get(IRB.getInt32Ty()));
+        V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
+                                    ConstantVector::get(Mask),
+                                    getName(".expand"));
+        DEBUG(dbgs() << "    shuffle1: " << *V << "\n");
+
+        Mask.clear();
+        for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
+          if (i >= BeginIndex && i < EndIndex)
+            Mask.push_back(IRB.getInt32(i));
+          else
+            Mask.push_back(IRB.getInt32(i + VecTy->getNumElements()));
+        V = IRB.CreateShuffleVector(V, LI, ConstantVector::get(Mask),
+                                    getName("insert"));
+        DEBUG(dbgs() << "    shuffle2: " << *V << "\n");
+      }
+    } else {
       V = convertValue(TD, IRB, V, VecTy);
     }
     StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
@@ -2731,7 +2785,7 @@ private:
         IRB.CreateInsertElement(IRB.CreateAlignedLoad(&NewAI,
                                                       NewAI.getAlignment(),
                                                       getName(".load")),
-                                V, getIndex(IRB, BeginOffset),
+                                V, IRB.getInt32(getIndex(BeginOffset)),
                                 getName(".insert")),
         &NewAI, NewAI.getAlignment());
       (void)Store;
@@ -2899,7 +2953,7 @@ private:
       // We have to extract rather than load.
       Src = IRB.CreateExtractElement(
         IRB.CreateAlignedLoad(SrcPtr, Align, getName(".copyload")),
-        getIndex(IRB, BeginOffset),
+        IRB.getInt32(getIndex(BeginOffset)),
         getName(".copyextract"));
     } else if (IntTy && !IsWholeAlloca && !IsDest) {
       Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
@@ -2927,7 +2981,7 @@ private:
       // We have to insert into a loaded copy before storing.
       Src = IRB.CreateInsertElement(
         IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), getName(".load")),
-        Src, getIndex(IRB, BeginOffset),
+        Src, IRB.getInt32(getIndex(BeginOffset)),
         getName(".insert"));
     }
 
diff --git a/llvm/test/Transforms/SROA/vector-promotion.ll b/llvm/test/Transforms/SROA/vector-promotion.ll
index ea28f5d..f1e1189 100644
--- a/llvm/test/Transforms/SROA/vector-promotion.ll
+++ b/llvm/test/Transforms/SROA/vector-promotion.ll
@@ -36,15 +36,15 @@ entry:
 
 define i32 @test2(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK: @test2
-; FIXME: This should be handled!
 entry:
 	%a = alloca [2 x <4 x i32>]
-; CHECK: alloca <4 x i32>
+; CHECK-NOT: alloca
 
   %a.x = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0
   store <4 x i32> %x, <4 x i32>* %a.x
   %a.y = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1
   store <4 x i32> %y, <4 x i32>* %a.y
+; CHECK-NOT: store
 
   %a.tmp1 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2
   %tmp1 = load i32* %a.tmp1
@@ -54,10 +54,18 @@ entry:
   %a.tmp3.cast = bitcast i32* %a.tmp3 to <2 x i32>*
   %tmp3.vec = load <2 x i32>* %a.tmp3.cast
   %tmp3 = extractelement <2 x i32> %tmp3.vec, i32 0
+; CHECK-NOT: load
+; CHECK:      %[[extract1:.*]] = extractelement <4 x i32> %x, i32 2
+; CHECK-NEXT: %[[extract2:.*]] = extractelement <4 x i32> %y, i32 3
+; CHECK-NEXT: %[[extract3:.*]] = shufflevector <4 x i32> %y, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: %[[extract4:.*]] = extractelement <2 x i32> %[[extract3]], i32 0
 
   %tmp4 = add i32 %tmp1, %tmp2
   %tmp5 = add i32 %tmp3, %tmp4
   ret i32 %tmp5
+; CHECK-NEXT: %[[sum1:.*]] = add i32 %[[extract1]], %[[extract2]]
+; CHECK-NEXT: %[[sum2:.*]] = add i32 %[[extract4]], %[[sum1]]
+; CHECK-NEXT: ret i32 %[[sum2]]
 }
 
 define i32 @test3(<4 x i32> %x, <4 x i32> %y) {
@@ -206,6 +214,71 @@ define i64 @test6(<4 x i64> %x, <4 x i64> %y, i64 %n) {
   ret i64 %res
 }
 
+define <4 x i32> @test_subvec_store() {
+; CHECK: @test_subvec_store
+entry:
+  %a = alloca <4 x i32>
+; CHECK-NOT: alloca
+
+  %a.gep0 = getelementptr <4 x i32>* %a, i32 0, i32 0
+  %a.cast0 = bitcast i32* %a.gep0 to <2 x i32>*
+  store <2 x i32> <i32 0, i32 0>, <2 x i32>* %a.cast0
+; CHECK-NOT: store
+; CHECK:      %[[insert1:.*]] = shufflevector <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>, <4 x i32> undef, <4 x i32> <i32 0, i32 1, {{.*}}>
+
+  %a.gep1 = getelementptr <4 x i32>* %a, i32 0, i32 1
+  %a.cast1 = bitcast i32* %a.gep1 to <2 x i32>*
+  store <2 x i32> <i32 1, i32 1>, <2 x i32>* %a.cast1
+; CHECK-NEXT: %[[insert2:.*]] = shufflevector <4 x i32> <i32 undef, i32 1, i32 1, i32 undef>, <4 x i32> %[[insert1]], <4 x i32> <i32 4, i32 1, i32 2, {{.*}}>
+
+  %a.gep2 = getelementptr <4 x i32>* %a, i32 0, i32 2
+  %a.cast2 = bitcast i32* %a.gep2 to <2 x i32>*
+  store <2 x i32> <i32 2, i32 2>, <2 x i32>* %a.cast2
+; CHECK-NEXT: %[[insert3:.*]] = shufflevector <4 x i32> <i32 undef, i32 undef, i32 2, i32 2>, <4 x i32> %[[insert2]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+
+  %a.gep3 = getelementptr <4 x i32>* %a, i32 0, i32 3
+  store i32 3, i32* %a.gep3
+; CHECK-NEXT: %[[insert4:.*]] = insertelement <4 x i32> %[[insert3]], i32 3, i32 3
+
+  %ret = load <4 x i32>* %a
+
+  ret <4 x i32> %ret
+; CHECK-NEXT: ret <4 x i32> %[[insert4]]
+}
+
+define <4 x i32> @test_subvec_load() {
+; CHECK: @test_subvec_load
+entry:
+  %a = alloca <4 x i32>
+; CHECK-NOT: alloca
+  store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32>* %a
+; CHECK-NOT: store
+
+  %a.gep0 = getelementptr <4 x i32>* %a, i32 0, i32 0
+  %a.cast0 = bitcast i32* %a.gep0 to <2 x i32>*
+  %first = load <2 x i32>* %a.cast0
+; CHECK-NOT: load
+; CHECK:      %[[extract1:.*]] = shufflevector <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+  %a.gep1 = getelementptr <4 x i32>* %a, i32 0, i32 1
+  %a.cast1 = bitcast i32* %a.gep1 to <2 x i32>*
+  %second = load <2 x i32>* %a.cast1
+; CHECK-NEXT: %[[extract2:.*]] = shufflevector <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> undef, <2 x i32> <i32 1, i32 2>
+
+  %a.gep2 = getelementptr <4 x i32>* %a, i32 0, i32 2
+  %a.cast2 = bitcast i32* %a.gep2 to <2 x i32>*
+  %third = load <2 x i32>* %a.cast2
+; CHECK-NEXT: %[[extract3:.*]] = shufflevector <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+
+  %tmp = shufflevector <2 x i32> %first, <2 x i32> %second, <2 x i32> <i32 0, i32 2>
+  %ret = shufflevector <2 x i32> %tmp, <2 x i32> %third, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: %[[tmp:.*]] = shufflevector <2 x i32> %[[extract1]], <2 x i32> %[[extract2]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: %[[ret:.*]] = shufflevector <2 x i32> %[[tmp]], <2 x i32> %[[extract3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+
+  ret <4 x i32> %ret
+; CHECK-NEXT: ret <4 x i32> %[[ret]]
+}
+
 define i32 @PR14212() {
 ; CHECK: @PR14212
 ; This caused a crash when "splitting" the load of the i32 in order to promote
-- 
2.7.4