if (!ArgTy->isVectorTy())
return nullptr;
- if (auto *C = dyn_cast<Constant>(Arg)) {
- // Extract signbits of the vector input and pack into integer result.
- APInt Result(ResTy->getPrimitiveSizeInBits(), 0);
- for (unsigned I = 0, E = ArgTy->getVectorNumElements(); I != E; ++I) {
- auto *COp = C->getAggregateElement(I);
- if (!COp)
- return nullptr;
- if (isa<UndefValue>(COp))
- continue;
-
- auto *CInt = dyn_cast<ConstantInt>(COp);
- auto *CFp = dyn_cast<ConstantFP>(COp);
- if (!CInt && !CFp)
- return nullptr;
-
- if ((CInt && CInt->isNegative()) || (CFp && CFp->isNegative()))
- Result.setBit(I);
- }
- return Constant::getIntegerValue(ResTy, Result);
- }
-
- // Look for a sign-extended boolean source vector as the argument to this
- // movmsk. If the argument is bitcast, look through that, but make sure the
- // source of that bitcast is still a vector with the same number of elements.
- // TODO: We can also convert a bitcast with wider elements, but that requires
- // duplicating the bool source sign bits to match the number of elements
- // expected by the movmsk call.
- Arg = peekThroughBitcast(Arg);
- Value *X;
- if (Arg->getType()->isVectorTy() &&
- Arg->getType()->getVectorNumElements() == ArgTy->getVectorNumElements() &&
- match(Arg, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
- // call iM movmsk(sext <N x i1> X) --> zext (bitcast <N x i1> X to iN) to iM
- unsigned NumElts = X->getType()->getVectorNumElements();
- Type *ScalarTy = Type::getIntNTy(Arg->getContext(), NumElts);
- Value *BC = Builder.CreateBitCast(X, ScalarTy);
- return Builder.CreateZExtOrTrunc(BC, ResTy);
- }
-
- return nullptr;
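+ // Expand MOVMSK to compare/bitcast/zext. The operand is first bitcast to
+ // the integer vector type of the same shape (a no-op for integer inputs),
+ // then each element's sign bit is tested with a signed less-than-zero
+ // compare and the resulting <N x i1> mask is packed into an iN scalar:
+ // e.g. PMOVMSKB(v16i8 x):
+ // %cmp = icmp slt <16 x i8> %x, zeroinitializer
+ // %int = bitcast <16 x i1> %cmp to i16
+ // %res = zext i16 %int to i32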
+ unsigned NumElts = ArgTy->getVectorNumElements();
+ Type *IntegerVecTy = VectorType::getInteger(cast<VectorType>(ArgTy));
+ Type *IntegerTy = Builder.getIntNTy(NumElts);
+
+ Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy);
+ Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy));
+ Res = Builder.CreateBitCast(Res, IntegerTy);
+ Res = Builder.CreateZExtOrTrunc(Res, ResTy);
+ return Res;
}
static Value *simplifyX86addcarry(const IntrinsicInst &II,
define i32 @test_upper_x86_sse_movmsk_ps(<4 x float> %a0) {
; CHECK-LABEL: @test_upper_x86_sse_movmsk_ps(
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> [[A0:%.*]])
-; CHECK-NEXT: ret i32 [[TMP1]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A0:%.*]] to <4 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i1> [[TMP2]] to i4
+; CHECK-NEXT: [[TMP4:%.*]] = zext i4 [[TMP3]] to i32
+; CHECK-NEXT: ret i32 [[TMP4]]
;
%1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
%2 = and i32 %1, 15
define i32 @test_upper_x86_sse2_movmsk_pd(<2 x double> %a0) {
; CHECK-LABEL: @test_upper_x86_sse2_movmsk_pd(
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> [[A0:%.*]])
-; CHECK-NEXT: ret i32 [[TMP1]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[A0:%.*]] to <2 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <2 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i1> [[TMP2]] to i2
+; CHECK-NEXT: [[TMP4:%.*]] = zext i2 [[TMP3]] to i32
+; CHECK-NEXT: ret i32 [[TMP4]]
;
%1 = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0)
%2 = and i32 %1, 3
define i32 @test_upper_x86_sse2_pmovmskb_128(<16 x i8> %a0) {
; CHECK-LABEL: @test_upper_x86_sse2_pmovmskb_128(
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> [[A0:%.*]])
-; CHECK-NEXT: ret i32 [[TMP1]]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <16 x i8> [[A0:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i1> [[TMP1]] to i16
+; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; CHECK-NEXT: ret i32 [[TMP3]]
;
%1 = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0)
%2 = and i32 %1, 65535
define i32 @test_upper_x86_avx_movmsk_ps_256(<8 x float> %a0) {
; CHECK-LABEL: @test_upper_x86_avx_movmsk_ps_256(
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> [[A0:%.*]])
-; CHECK-NEXT: ret i32 [[TMP1]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float> [[A0:%.*]] to <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <8 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i1> [[TMP2]] to i8
+; CHECK-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i32
+; CHECK-NEXT: ret i32 [[TMP4]]
;
%1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
%2 = and i32 %1, 255
define i32 @test_upper_x86_avx_movmsk_pd_256(<4 x double> %a0) {
; CHECK-LABEL: @test_upper_x86_avx_movmsk_pd_256(
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> [[A0:%.*]])
-; CHECK-NEXT: ret i32 [[TMP1]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x double> [[A0:%.*]] to <4 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <4 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i1> [[TMP2]] to i4
+; CHECK-NEXT: [[TMP4:%.*]] = zext i4 [[TMP3]] to i32
+; CHECK-NEXT: ret i32 [[TMP4]]
;
%1 = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
%2 = and i32 %1, 15
ret i32 %r
}
-; Negative test - bitcast from scalar.
+; Bitcast from a sign-extended scalar - movmsk is still expanded to a sign-bit compare.
define i32 @sext_sse_movmsk_ps_scalar_source(i1 %x) {
; CHECK-LABEL: @sext_sse_movmsk_ps_scalar_source(
; CHECK-NEXT: [[SEXT:%.*]] = sext i1 [[X:%.*]] to i128
-; CHECK-NEXT: [[BC:%.*]] = bitcast i128 [[SEXT]] to <4 x float>
-; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> [[BC]])
-; CHECK-NEXT: ret i32 [[R]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[SEXT]] to <4 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i1> [[TMP2]] to i4
+; CHECK-NEXT: [[TMP4:%.*]] = zext i4 [[TMP3]] to i32
+; CHECK-NEXT: ret i32 [[TMP4]]
;
%sext = sext i1 %x to i128
%bc = bitcast i128 %sext to <4 x float>
ret i32 %r
}
-; Negative test - bitcast from vector type with more elements.
+; Bitcast from a vector type with more elements than the movmsk operand.
define i32 @sext_sse_movmsk_ps_too_many_elts(<8 x i1> %x) {
; CHECK-LABEL: @sext_sse_movmsk_ps_too_many_elts(
; CHECK-NEXT: [[SEXT:%.*]] = sext <8 x i1> [[X:%.*]] to <8 x i16>
-; CHECK-NEXT: [[BC:%.*]] = bitcast <8 x i16> [[SEXT]] to <4 x float>
-; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> [[BC]])
-; CHECK-NEXT: ret i32 [[R]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SEXT]] to <4 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i1> [[TMP2]] to i4
+; CHECK-NEXT: [[TMP4:%.*]] = zext i4 [[TMP3]] to i32
+; CHECK-NEXT: ret i32 [[TMP4]]
;
%sext = sext <8 x i1> %x to <8 x i16>
%bc = bitcast <8 x i16> %sext to <4 x float>
ret i32 %r
}
-; TODO: We could handle this by doing a bitcasted sign-bit test after the sext?
-; But need to make sure the backend handles that correctly.
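+; The sext elements are wider than the movmsk elements, so each source sign bit
+; must be replicated. Handle this by doing a bitcasted sign-bit test after the sext.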
define i32 @sext_sse_movmsk_ps_must_replicate_bits(<2 x i1> %x) {
; CHECK-LABEL: @sext_sse_movmsk_ps_must_replicate_bits(
; CHECK-NEXT: [[SEXT:%.*]] = sext <2 x i1> [[X:%.*]] to <2 x i64>
-; CHECK-NEXT: [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <4 x float>
-; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> [[BC]])
-; CHECK-NEXT: ret i32 [[R]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[SEXT]] to <4 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i1> [[TMP2]] to i4
+; CHECK-NEXT: [[TMP4:%.*]] = zext i4 [[TMP3]] to i32
+; CHECK-NEXT: ret i32 [[TMP4]]
;
%sext = sext <2 x i1> %x to <2 x i64>
%bc = bitcast <2 x i64> %sext to <4 x float>