From c1d20a36fb5a6034c76a10bded21b43561ec2a3d Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Sat, 25 Apr 2015 20:55:25 +0000
Subject: [PATCH] [x86] instcombine more cases of insertps into a shufflevector

This is a follow-on to D8833 (insertps optimization when the zero mask is not used).

In this patch, we check for the case where the zmask is used, but both input vectors
to the insertps intrinsic are the same operand or the zmask overrides the destination
lane. This lets us replace the 2nd shuffle input operand with the zero vector.

Differential Revision: http://reviews.llvm.org/D9257

llvm-svn: 235810
---
 .../Transforms/InstCombine/InstCombineCalls.cpp    | 45 +++++++++++++++-------
 llvm/test/Transforms/InstCombine/x86-insertps.ll   | 43 ++++++++++++++++++---
 2 files changed, 69 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 1505ff0..2f83cc8 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -201,7 +201,7 @@ static Value *SimplifyX86insertps(const IntrinsicInst &II,
                                   InstCombiner::BuilderTy &Builder) {
   if (auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
     VectorType *VecTy = cast<VectorType>(II.getType());
-    ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
+    assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
     
     // The immediate permute control byte looks like this:
     //    [3:0] - zero mask for each 32-bit lane
@@ -213,25 +213,42 @@ static Value *SimplifyX86insertps(const IntrinsicInst &II,
     uint8_t DestLane = (Imm >> 4) & 0x3;
     uint8_t SourceLane = (Imm >> 6) & 0x3;
 
+    ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
+
     // If all zero mask bits are set, this was just a weird way to
     // generate a zero vector.
     if (ZMask == 0xf)
       return ZeroVector;
-    
-    // TODO: Model this case as two shuffles or a 'logical and' plus shuffle?
-    if (ZMask)
-      return nullptr;
-
-    assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
 
-    // If we're not zeroing anything, this is a single shuffle.
-    // Replace the selected destination lane with the selected source lane.
-    // For all other lanes, pass the first source bits through.
+    // Initialize by passing all of the first source bits through.
     int ShuffleMask[4] = { 0, 1, 2, 3 };
-    ShuffleMask[DestLane] = SourceLane + 4;
-    
-    return Builder.CreateShuffleVector(II.getArgOperand(0), II.getArgOperand(1),
-                                       ShuffleMask);
+
+    // We may replace the second operand with the zero vector.
+    Value *V1 = II.getArgOperand(1);
+
+    if (ZMask) {
+      // If the zero mask is being used with a single input or the zero mask
+      // overrides the destination lane, this is a shuffle with the zero vector.
+      if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
+          (ZMask & (1 << DestLane))) {
+        V1 = ZeroVector;
+        // We may still move 32-bits of the first source vector from one lane
+        // to another.
+        ShuffleMask[DestLane] = SourceLane;
+        // The zero mask may override the previous insert operation.
+        for (unsigned i = 0; i < 4; ++i)
+          if ((ZMask >> i) & 0x1)
+            ShuffleMask[i] = i + 4;
+      } else {
+        // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
+        return nullptr;
+      }
+    } else {
+      // Replace the selected destination lane with the selected source lane.
+      ShuffleMask[DestLane] = SourceLane + 4;
+    }
+  
+    return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
   }
   return nullptr;
 }
diff --git a/llvm/test/Transforms/InstCombine/x86-insertps.ll b/llvm/test/Transforms/InstCombine/x86-insertps.ll
index 487c727..092bf7c 100644
--- a/llvm/test/Transforms/InstCombine/x86-insertps.ll
+++ b/llvm/test/Transforms/InstCombine/x86-insertps.ll
@@ -30,14 +30,47 @@ define <4 x float> @insertps_0xff(<4 x float> %v1, <4 x float> %v2) {
 ; CHECK-NEXT:  ret <4 x float> zeroinitializer
 }
 
-; If some zero mask bits are set, we do not change anything.
+; If some zero mask bits are set that do not override the insertion, we do not change anything.
 
-define <4 x float> @insertps_0x03(<4 x float> %v1, <4 x float> %v2) {
-  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 3)
+define <4 x float> @insertps_0x0c(<4 x float> %v1, <4 x float> %v2) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 12)
   ret <4 x float> %res
 
-; CHECK-LABEL: @insertps_0x03
-; CHECK-NEXT:  call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 3)
+; CHECK-LABEL: @insertps_0x0c
+; CHECK-NEXT:  call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 12)
+; CHECK-NEXT:  ret <4 x float>
+}
+
+; ...unless both input vectors are the same operand.
+
+define <4 x float> @insertps_0x15_single_input(<4 x float> %v1) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v1, i8 21)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0x15_single_input
+; CHECK-NEXT:  shufflevector <4 x float> %v1, <4 x float> <float 0.000000e+00, float undef, float 0.000000e+00, float undef>, <4 x i32> <i32 4, i32 0, i32 6, i32 3>
+; CHECK-NEXT:  ret <4 x float>
+}
+
+; The zero mask overrides the insertion lane.
+
+define <4 x float> @insertps_0x1a_single_input(<4 x float> %v1) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v1, i8 26)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0x1a_single_input
+; CHECK-NEXT:  shufflevector <4 x float> %v1, <4 x float> <float undef, float 0.000000e+00, float undef, float 0.000000e+00>, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:  ret <4 x float>
+}
+
+; The zero mask overrides the insertion lane, so the second input vector is not used.
+
+define <4 x float> @insertps_0xc1(<4 x float> %v1, <4 x float> %v2) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 193)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0xc1
+; CHECK-NEXT:  shufflevector <4 x float> %v1, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:  ret <4 x float>
 }
 
-- 
2.7.4