[x86] instcombine more cases of insertps into a shufflevector

author Sanjay Patel <spatel@rotateright.com>
Sat, 25 Apr 2015 20:55:25 +0000 (20:55 +0000)
committer Sanjay Patel <spatel@rotateright.com>
Sat, 25 Apr 2015 20:55:25 +0000 (20:55 +0000)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp

index 1505ff0..2f83cc8 100644 (file)
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -201,7 +201,7 @@ static Value *SimplifyX86insertps(const IntrinsicInst &II,
                                    InstCombiner::BuilderTy &Builder) {
    if (auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
      VectorType *VecTy = cast<VectorType>(II.getType());
-    ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
+    assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
      
      // The immediate permute control byte looks like this:
      //    [3:0] - zero mask for each 32-bit lane
@@ -213,25 +213,42 @@ static Value *SimplifyX86insertps(const IntrinsicInst &II,
      uint8_t DestLane = (Imm >> 4) & 0x3;
      uint8_t SourceLane = (Imm >> 6) & 0x3;
  
+    ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
+
      // If all zero mask bits are set, this was just a weird way to
      // generate a zero vector.
      if (ZMask == 0xf)
        return ZeroVector;
-    
-    // TODO: Model this case as two shuffles or a 'logical and' plus shuffle?
-    if (ZMask)
-      return nullptr;
-
-    assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
  
-    // If we're not zeroing anything, this is a single shuffle.
-    // Replace the selected destination lane with the selected source lane.
-    // For all other lanes, pass the first source bits through.
+    // Initialize by passing all of the first source bits through.
      int ShuffleMask[4] = { 0, 1, 2, 3 };
-    ShuffleMask[DestLane] = SourceLane + 4;
-    
-    return Builder.CreateShuffleVector(II.getArgOperand(0), II.getArgOperand(1),
-                                       ShuffleMask);
+
+    // We may replace the second operand with the zero vector.
+    Value *V1 = II.getArgOperand(1);
+
+    if (ZMask) {
+      // If the zero mask is being used with a single input or the zero mask
+      // overrides the destination lane, this is a shuffle with the zero vector.
+      if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
+          (ZMask & (1 << DestLane))) {
+        V1 = ZeroVector;
+        // We may still move 32-bits of the first source vector from one lane
+        // to another.
+        ShuffleMask[DestLane] = SourceLane;
+        // The zero mask may override the previous insert operation.
+        for (unsigned i = 0; i < 4; ++i)
+          if ((ZMask >> i) & 0x1)
+            ShuffleMask[i] = i + 4;
+      } else {
+        // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
+        return nullptr;
+      }
+    } else {
+      // Replace the selected destination lane with the selected source lane.
+      ShuffleMask[DestLane] = SourceLane + 4;
+    }
+  
+    return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
    }
    return nullptr;
  }
diff --git a/llvm/test/Transforms/InstCombine/x86-insertps.ll b/llvm/test/Transforms/InstCombine/x86-insertps.ll

index 487c727..092bf7c 100644 (file)
--- a/llvm/test/Transforms/InstCombine/x86-insertps.ll
+++ b/llvm/test/Transforms/InstCombine/x86-insertps.ll
@@ -30,14 +30,47 @@ define <4 x float> @insertps_0xff(<4 x float> %v1, <4 x float> %v2) {
  ; CHECK-NEXT:  ret <4 x float> zeroinitializer
  }
  
-; If some zero mask bits are set, we do not change anything.
+; If some zero mask bits are set that do not override the insertion, we do not change anything.
  
-define <4 x float> @insertps_0x03(<4 x float> %v1, <4 x float> %v2) {
-  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 3)
+define <4 x float> @insertps_0x0c(<4 x float> %v1, <4 x float> %v2) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 12)
    ret <4 x float> %res
  
-; CHECK-LABEL: @insertps_0x03
-; CHECK-NEXT:  call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 3)
+; CHECK-LABEL: @insertps_0x0c
+; CHECK-NEXT:  call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 12)
+; CHECK-NEXT:  ret <4 x float>
+}
+
+; ...unless both input vectors are the same operand.
+
+define <4 x float> @insertps_0x15_single_input(<4 x float> %v1) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v1, i8 21)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0x15_single_input
+; CHECK-NEXT:  shufflevector <4 x float> %v1, <4 x float> <float 0.000000e+00, float undef, float 0.000000e+00, float undef>, <4 x i32> <i32 4, i32 0, i32 6, i32 3>
+; CHECK-NEXT:  ret <4 x float>
+}
+
+; The zero mask overrides the insertion lane.
+
+define <4 x float> @insertps_0x1a_single_input(<4 x float> %v1) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v1, i8 26)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0x1a_single_input
+; CHECK-NEXT:  shufflevector <4 x float> %v1, <4 x float> <float undef, float 0.000000e+00, float undef, float 0.000000e+00>, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:  ret <4 x float>
+}
+
+; The zero mask overrides the insertion lane, so the second input vector is not used.
+
+define <4 x float> @insertps_0xc1(<4 x float> %v1, <4 x float> %v2) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 193)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0xc1
+; CHECK-NEXT:  shufflevector <4 x float> %v1, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ; CHECK-NEXT:  ret <4 x float>
  }
author	Sanjay Patel <spatel@rotateright.com>
	Sat, 25 Apr 2015 20:55:25 +0000 (20:55 +0000)
committer	Sanjay Patel <spatel@rotateright.com>
	Sat, 25 Apr 2015 20:55:25 +0000 (20:55 +0000)
llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp		patch | blob | history
llvm/test/Transforms/InstCombine/x86-insertps.ll		patch | blob | history