InstCombiner::BuilderTy &Builder) {
if (auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
VectorType *VecTy = cast<VectorType>(II.getType());
- ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
+ assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
// The immediate permute control byte looks like this:
// [3:0] - zero mask for each 32-bit lane
uint8_t DestLane = (Imm >> 4) & 0x3;
uint8_t SourceLane = (Imm >> 6) & 0x3;
+ ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
+
// If all zero mask bits are set, this was just a weird way to
// generate a zero vector.
if (ZMask == 0xf)
return ZeroVector;
-
- // TODO: Model this case as two shuffles or a 'logical and' plus shuffle?
- if (ZMask)
- return nullptr;
-
- assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
- // If we're not zeroing anything, this is a single shuffle.
- // Replace the selected destination lane with the selected source lane.
- // For all other lanes, pass the first source bits through.
+ // Initialize by passing all of the first source bits through.
int ShuffleMask[4] = { 0, 1, 2, 3 };
- ShuffleMask[DestLane] = SourceLane + 4;
-
- return Builder.CreateShuffleVector(II.getArgOperand(0), II.getArgOperand(1),
- ShuffleMask);
+
+ // We may replace the second operand with the zero vector.
+ Value *V1 = II.getArgOperand(1);
+
+ if (ZMask) {
+ // If the zero mask is being used with a single input or the zero mask
+ // overrides the destination lane, this is a shuffle with the zero vector.
+ if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
+ (ZMask & (1 << DestLane))) {
+ V1 = ZeroVector;
+ // We may still move 32 bits of the first source vector from one lane
+ // to another.
+ ShuffleMask[DestLane] = SourceLane;
+ // The zero mask may override the previous insert operation.
+ for (unsigned i = 0; i < 4; ++i)
+ if ((ZMask >> i) & 0x1)
+ ShuffleMask[i] = i + 4;
+ } else {
+ // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
+ return nullptr;
+ }
+ } else {
+ // Replace the selected destination lane with the selected source lane.
+ ShuffleMask[DestLane] = SourceLane + 4;
+ }
+
+ return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
}
return nullptr;
}
; CHECK-NEXT: ret <4 x float> zeroinitializer
}
-; If some zero mask bits are set, we do not change anything.
+; If some zero mask bits are set that do not override the insertion, we do not change anything.
-define <4 x float> @insertps_0x03(<4 x float> %v1, <4 x float> %v2) {
- %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 3)
+define <4 x float> @insertps_0x0c(<4 x float> %v1, <4 x float> %v2) {
+ %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 12)
ret <4 x float> %res
-; CHECK-LABEL: @insertps_0x03
-; CHECK-NEXT: call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 3)
+; CHECK-LABEL: @insertps_0x0c
+; CHECK-NEXT: call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 12)
+; CHECK-NEXT: ret <4 x float>
+}
+
+; ...unless both input vectors are the same operand.
+
+define <4 x float> @insertps_0x15_single_input(<4 x float> %v1) {
+ %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v1, i8 21)
+ ret <4 x float> %res
+
+; Immediate 21 = 0x15 = 0b00010101: source lane 0, destination lane 1,
+; zero mask 0b0101 (lanes 0 and 2).
+; Both operands are %v1, so the expected result is one shuffle of %v1 with a
+; zero vector: lane 1 takes element 0 of %v1, lanes 0 and 2 take zeros
+; (indices 4 and 6), and lane 3 passes through.
+; CHECK-LABEL: @insertps_0x15_single_input
+; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> <float 0.000000e+00, float undef, float 0.000000e+00, float undef>, <4 x i32> <i32 4, i32 0, i32 6, i32 3>
+; CHECK-NEXT: ret <4 x float>
+}
+
+; The zero mask overrides the insertion lane.
+
+define <4 x float> @insertps_0x1a_single_input(<4 x float> %v1) {
+ %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v1, i8 26)
+ ret <4 x float> %res
+
+; Immediate 26 = 0x1a = 0b00011010: source lane 0, destination lane 1,
+; zero mask 0b1010 (lanes 1 and 3).
+; The zero mask covers the destination lane, so the inserted element is
+; discarded: lanes 1 and 3 take zeros (indices 5 and 7) while lanes 0 and 2
+; pass through from %v1.
+; CHECK-LABEL: @insertps_0x1a_single_input
+; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> <float undef, float 0.000000e+00, float undef, float 0.000000e+00>, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: ret <4 x float>
+}
+
+; The zero mask overrides the insertion lane, so the second input vector is not used.
+
+define <4 x float> @insertps_0xc1(<4 x float> %v1, <4 x float> %v2) {
+ %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 193)
+ ret <4 x float> %res
+
+; Immediate 193 = 0xc1 = 0b11000001: source lane 3, destination lane 0,
+; zero mask 0b0001 (lane 0).
+; The destination lane is zeroed, so the element selected from %v2 never
+; reaches the result: the expected shuffle uses only %v1 and a zero vector,
+; with lane 0 taking zero (index 4) and lanes 1-3 passing through.
+; CHECK-LABEL: @insertps_0xc1
+; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
; CHECK-NEXT: ret <4 x float>
}