return ConstantVector::get(Vals);
}
+// Try to express an X86 rounding intrinsic as the generic llvm.floor or
+// llvm.ceil intrinsic.
+//
+// Handles the SSE4.1/AVX round forms and the AVX-512 masked rndscale forms.
+// The fold only fires when the rounding-control operand is the constant 1
+// (floor) or 2 (ceil) and, for the forms that carry a trailing SAE operand,
+// that operand is the constant 4. Returns the replacement value, or nullptr
+// when the call has to be left untouched.
+static Value *simplifyX86round(IntrinsicInst &II,
+                               InstCombiner::BuilderTy &Builder) {
+  const Intrinsic::ID IID = II.getIntrinsicID();
+
+  // Classify the call once; operand positions differ between the flavours.
+  const bool IsSSE41Scalar = IID == Intrinsic::x86_sse41_round_ss ||
+                             IID == Intrinsic::x86_sse41_round_sd;
+  const bool IsMaskedScalar = IID == Intrinsic::x86_avx512_mask_rndscale_ss ||
+                              IID == Intrinsic::x86_avx512_mask_rndscale_sd;
+  const bool IsMaskedPacked512 =
+      IID == Intrinsic::x86_avx512_mask_rndscale_ps_512 ||
+      IID == Intrinsic::x86_avx512_mask_rndscale_pd_512;
+  const bool IsMaskedPacked =
+      IsMaskedPacked512 ||
+      IID == Intrinsic::x86_avx512_mask_rndscale_ps_128 ||
+      IID == Intrinsic::x86_avx512_mask_rndscale_ps_256 ||
+      IID == Intrinsic::x86_avx512_mask_rndscale_pd_128 ||
+      IID == Intrinsic::x86_avx512_mask_rndscale_pd_256;
+  const bool IsScalar = IsSSE41Scalar || IsMaskedScalar;
+
+  // Locate the rounding-control immediate; it must be a compile-time constant.
+  unsigned RoundOpIdx = 1;
+  if (IsSSE41Scalar)
+    RoundOpIdx = 2;
+  else if (IsMaskedScalar)
+    RoundOpIdx = 4;
+  auto *RoundImm = dyn_cast<ConstantInt>(II.getArgOperand(RoundOpIdx));
+  if (!RoundImm)
+    return nullptr;
+  const unsigned RoundControl = RoundImm->getZExtValue();
+
+  // Only the 512-bit packed and the masked scalar forms have an SAE operand.
+  // When present it must be the constant 4; the remaining forms behave as if
+  // it were 4.
+  if (IsMaskedPacked512 || IsMaskedScalar) {
+    auto *SAEImm =
+        dyn_cast<ConstantInt>(II.getArgOperand(IsMaskedPacked512 ? 4 : 5));
+    if (!SAEImm)
+      return nullptr;
+    const unsigned SAE = SAEImm->getZExtValue();
+    if (SAE != 4)
+      return nullptr;
+  }
+
+  const bool IsCeil = RoundControl == 2;
+  if (!IsCeil && RoundControl != 1 /*floor*/)
+    return nullptr;
+
+  // Gather the value to round, the pass-through/destination value and the
+  // write mask. Mask stays null for the unmasked scalar forms.
+  Value *Src = nullptr;
+  Value *Dst = nullptr;
+  Value *Mask = nullptr;
+  if (IsMaskedScalar) {
+    // Only bit 0 of the mask operand governs the scalar lane.
+    Value *K = II.getArgOperand(3);
+    Value *Zero = Constant::getNullValue(K->getType());
+    Value *LowBit = Builder.CreateAnd(K, 1);
+    Mask = Builder.CreateICmp(ICmpInst::ICMP_NE, LowBit, Zero);
+    Dst = II.getArgOperand(2);
+    Src = Builder.CreateExtractElement(II.getArgOperand(1), uint64_t(0));
+  } else if (IsSSE41Scalar) {
+    Dst = II.getArgOperand(0);
+    Src = Builder.CreateExtractElement(II.getArgOperand(1), uint64_t(0));
+  } else {
+    Src = II.getArgOperand(0);
+    if (IsMaskedPacked) {
+      Dst = II.getArgOperand(2);
+      Mask = II.getArgOperand(3);
+    } else {
+      // Unmasked packed forms: model them as having an all-ones mask with one
+      // bit per vector element.
+      Dst = Src;
+      Mask = ConstantInt::getAllOnesValue(
+          Builder.getIntNTy(Src->getType()->getVectorNumElements()));
+    }
+  }
+
+  const Intrinsic::ID RoundID = IsCeil ? Intrinsic::ceil : Intrinsic::floor;
+  Value *Res = Builder.CreateIntrinsic(RoundID, {Src}, &II);
+
+  if (!IsScalar) {
+    // A constant all-ones mask selects every lane, so no select is needed.
+    auto *ConstMask = dyn_cast<Constant>(Mask);
+    if (ConstMask && ConstMask->isAllOnesValue())
+      return Res;
+
+    // Reinterpret the integer mask as a vector of i1, one element per bit.
+    const unsigned MaskBits =
+        cast<IntegerType>(Mask->getType())->getBitWidth();
+    auto *MaskVecTy = VectorType::get(Builder.getInt1Ty(), MaskBits);
+    Mask = Builder.CreateBitCast(Mask, MaskVecTy);
+
+    const unsigned NumElts = Src->getType()->getVectorNumElements();
+    if (MaskVecTy->getVectorNumElements() > NumElts) {
+      // The mask has more bits than the vector has lanes: keep the low ones.
+      // NOTE(review): Indices holds four entries, so this relies on
+      // NumElts <= 4 whenever the mask is wider than the vector.
+      uint32_t Indices[4];
+      for (unsigned Idx = 0; Idx != NumElts; ++Idx)
+        Indices[Idx] = Idx;
+      Mask = Builder.CreateShuffleVector(Mask, Mask,
+                                         makeArrayRef(Indices, NumElts));
+    }
+    return Builder.CreateSelect(Mask, Res, Dst);
+  }
+
+  if (IsMaskedScalar) {
+    // Choose between the rounded value and element 0 of the pass-through
+    // operand, then place the result into operand 0.
+    Value *PassThru = Builder.CreateExtractElement(Dst, uint64_t(0));
+    Res = Builder.CreateSelect(Mask, Res, PassThru);
+    Dst = II.getArgOperand(0);
+  }
+  return Builder.CreateInsertElement(Dst, Res, uint64_t(0));
+}
+
static Value *simplifyX86movmsk(const IntrinsicInst &II) {
Value *Arg = II.getArgOperand(0);
Type *ResTy = II.getType();
break;
}
+ // Packed round and masked rndscale intrinsics: replace the call with the
+ // value produced by simplifyX86round when it returns one; otherwise leave
+ // the call as it is.
+ case Intrinsic::x86_sse41_round_ps:
+ case Intrinsic::x86_sse41_round_pd:
+ case Intrinsic::x86_avx_round_ps_256:
+ case Intrinsic::x86_avx_round_pd_256:
+ case Intrinsic::x86_avx512_mask_rndscale_ps_128:
+ case Intrinsic::x86_avx512_mask_rndscale_ps_256:
+ case Intrinsic::x86_avx512_mask_rndscale_ps_512:
+ case Intrinsic::x86_avx512_mask_rndscale_pd_128:
+ case Intrinsic::x86_avx512_mask_rndscale_pd_256:
+ case Intrinsic::x86_avx512_mask_rndscale_pd_512:
+ case Intrinsic::x86_avx512_mask_rndscale_ss:
+ case Intrinsic::x86_avx512_mask_rndscale_sd:
+ if (Value *V = simplifyX86round(*II, Builder))
+ return replaceInstUsesWith(*II, V);
+ break;
+
case Intrinsic::x86_mmx_pmovmskb:
case Intrinsic::x86_sse_movmsk_ps:
case Intrinsic::x86_sse2_movmsk_pd:
case Intrinsic::x86_sse2_cmp_sd:
case Intrinsic::x86_sse2_min_sd:
case Intrinsic::x86_sse2_max_sd:
- case Intrinsic::x86_sse41_round_ss:
- case Intrinsic::x86_sse41_round_sd:
case Intrinsic::x86_xop_vfrcz_ss:
case Intrinsic::x86_xop_vfrcz_sd: {
unsigned VWidth = II->getType()->getVectorNumElements();
}
break;
}
+  case Intrinsic::x86_sse41_round_ss:
+  case Intrinsic::x86_sse41_round_sd: {
+    // Run demanded-elements simplification with every lane demanded first;
+    // the floor/ceil fold is only attempted when that changes nothing.
+    const unsigned NumElts = II->getType()->getVectorNumElements();
+    APInt UndefElts(NumElts, 0);
+    APInt DemandAll(APInt::getAllOnesValue(NumElts));
+    Value *Simplified = SimplifyDemandedVectorElts(II, DemandAll, UndefElts);
+    if (Simplified) {
+      // Getting II itself back means the call was updated in place.
+      if (Simplified == II)
+        return II;
+      return replaceInstUsesWith(*II, Simplified);
+    }
+    if (Value *Rounded = simplifyX86round(*II, Builder))
+      return replaceInstUsesWith(*II, Rounded);
+    break;
+  }
// Constant fold ashr( <A x Bi>, Ci ).
// Constant fold lshr( <A x Bi>, Ci ).
declare i32 @llvm.x86.avx512.cvttsd2usi(<2 x double>, i32)
declare i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double>, i32)
+declare <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32, i32)
+declare <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32, i32)
+declare <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float>, i32, <4 x float>, i8)
+declare <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float>, i32, <8 x float>, i8)
+declare <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float>, i32, <16 x float>, i16, i32)
+declare <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double>, i32, <2 x double>, i8)
+declare <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double>, i32, <4 x double>, i8)
+declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
+
+; rndscale.ss with rounding immediate 1 and last operand 4: expect llvm.floor.f32 on element 0 of %src1,
+; selected against element 0 of %dst by bit 0 of %k, then inserted into element 0 of %src0.
+define <4 x float> @test_rndscale_ss_floor(<4 x float> %src0, <4 x float> %src1, <4 x float> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_ss_floor(
+; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[K:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[SRC1:%.*]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.floor.f32(float [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[DST:%.*]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP2]], float [[TMP5]], float [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[SRC0:%.*]], float [[TMP6]], i64 0
+; CHECK-NEXT: ret <4 x float> [[TMP7]]
+;
+ %1 = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %src0, <4 x float> %src1, <4 x float> %dst, i8 %k, i32 1, i32 4)
+ ret <4 x float> %1
+}
+
+; rndscale.ss with rounding immediate 2 and last operand 4: expect llvm.ceil.f32 on element 0 of %src1,
+; selected against element 0 of %dst by bit 0 of %k, then inserted into element 0 of %src0.
+define <4 x float> @test_rndscale_ss_ceil(<4 x float> %src0, <4 x float> %src1, <4 x float> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_ss_ceil(
+; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[K:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[SRC1:%.*]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.ceil.f32(float [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[DST:%.*]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP2]], float [[TMP5]], float [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[SRC0:%.*]], float [[TMP6]], i64 0
+; CHECK-NEXT: ret <4 x float> [[TMP7]]
+;
+ %1 = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %src0, <4 x float> %src1, <4 x float> %dst, i8 %k, i32 2, i32 4)
+ ret <4 x float> %1
+}
+
+; rndscale.sd with rounding immediate 1 and last operand 4: expect llvm.floor.f64 on element 0 of %src1,
+; selected against element 0 of %dst by bit 0 of %k, then inserted into element 0 of %src0.
+define <2 x double> @test_rndscale_sd_floor(<2 x double> %src0, <2 x double> %src1, <2 x double> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_sd_floor(
+; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[K:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[SRC1:%.*]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.floor.f64(double [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[DST:%.*]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP2]], double [[TMP5]], double [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[SRC0:%.*]], double [[TMP6]], i64 0
+; CHECK-NEXT: ret <2 x double> [[TMP7]]
+;
+ %1 = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %src0, <2 x double> %src1, <2 x double> %dst, i8 %k, i32 1, i32 4)
+ ret <2 x double> %1
+}
+
+; rndscale.sd with rounding immediate 2 and last operand 4: expect llvm.ceil.f64 on element 0 of %src1,
+; selected against element 0 of %dst by bit 0 of %k, then inserted into element 0 of %src0.
+define <2 x double> @test_rndscale_sd_ceil(<2 x double> %src0, <2 x double> %src1, <2 x double> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_sd_ceil(
+; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[K:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[SRC1:%.*]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.ceil.f64(double [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[DST:%.*]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP2]], double [[TMP5]], double [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[SRC0:%.*]], double [[TMP6]], i64 0
+; CHECK-NEXT: ret <2 x double> [[TMP7]]
+;
+ %1 = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %src0, <2 x double> %src1, <2 x double> %dst, i8 %k, i32 2, i32 4)
+ ret <2 x double> %1
+}
+
+; rndscale.ps.128 with rounding immediate 1: expect llvm.floor.v4f32 and a select against %dst
+; controlled by the low 4 bits of the i8 mask %k (bitcast to <8 x i1>, then shufflevector).
+define <4 x float> @test_rndscale_ps_128_floor(<4 x float> %src, <4 x float> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_ps_128_floor(
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[SRC:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP1]], <4 x float> [[DST:%.*]]
+; CHECK-NEXT: ret <4 x float> [[TMP4]]
+;
+ %1 = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> %src, i32 1, <4 x float> %dst, i8 %k)
+ ret <4 x float> %1
+}
+
+; rndscale.ps.128 with rounding immediate 2: expect llvm.ceil.v4f32 and a select against %dst
+; controlled by the low 4 bits of the i8 mask %k (bitcast to <8 x i1>, then shufflevector).
+define <4 x float> @test_rndscale_ps_128_ceil(<4 x float> %src, <4 x float> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_ps_128_ceil(
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[SRC:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP1]], <4 x float> [[DST:%.*]]
+; CHECK-NEXT: ret <4 x float> [[TMP4]]
+;
+ %1 = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> %src, i32 2, <4 x float> %dst, i8 %k)
+ ret <4 x float> %1
+}
+
+; rndscale.ps.256 with rounding immediate 1: expect llvm.floor.v8f32 and a select against %dst on
+; %k bitcast to <8 x i1>; no shufflevector is expected because the mask has one bit per lane.
+define <8 x float> @test_rndscale_ps_256_floor(<8 x float> %src, <8 x float> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_ps_256_floor(
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[SRC:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[DST:%.*]]
+; CHECK-NEXT: ret <8 x float> [[TMP3]]
+;
+ %1 = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> %src, i32 1, <8 x float> %dst, i8 %k)
+ ret <8 x float> %1
+}
+
+; rndscale.ps.256 with rounding immediate 2: expect llvm.ceil.v8f32 and a select against %dst on
+; %k bitcast to <8 x i1>; no shufflevector is expected because the mask has one bit per lane.
+define <8 x float> @test_rndscale_ps_256_ceil(<8 x float> %src, <8 x float> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_ps_256_ceil(
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[SRC:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[DST:%.*]]
+; CHECK-NEXT: ret <8 x float> [[TMP3]]
+;
+ %1 = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> %src, i32 2, <8 x float> %dst, i8 %k)
+ ret <8 x float> %1
+}
+
+; rndscale.ps.512 with rounding immediate 1 and last operand 4: expect llvm.floor.v16f32 and a
+; select against %dst on the i16 mask %k bitcast to <16 x i1>.
+define <16 x float> @test_rndscale_ps_512_floor(<16 x float> %src, <16 x float> %dst, i16 %k) {
+; CHECK-LABEL: @test_rndscale_ps_512_floor(
+; CHECK-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.floor.v16f32(<16 x float> [[SRC:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[K:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[DST:%.*]]
+; CHECK-NEXT: ret <16 x float> [[TMP3]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %src, i32 1, <16 x float> %dst, i16 %k, i32 4)
+ ret <16 x float> %1
+}
+
+; rndscale.ps.512 with rounding immediate 2 and last operand 4: expect llvm.ceil.v16f32 and a
+; select against %dst on the i16 mask %k bitcast to <16 x i1>.
+define <16 x float> @test_rndscale_ps_512_ceil(<16 x float> %src, <16 x float> %dst, i16 %k) {
+; CHECK-LABEL: @test_rndscale_ps_512_ceil(
+; CHECK-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.ceil.v16f32(<16 x float> [[SRC:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[K:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[DST:%.*]]
+; CHECK-NEXT: ret <16 x float> [[TMP3]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %src, i32 2, <16 x float> %dst, i16 %k, i32 4)
+ ret <16 x float> %1
+}
+
+; rndscale.pd.128 with rounding immediate 1: expect llvm.floor.v2f64 and a select against %dst
+; controlled by the low 2 bits of the i8 mask %k (bitcast to <8 x i1>, then shufflevector).
+define <2 x double> @test_rndscale_pd_128_floor(<2 x double> %src, <2 x double> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_pd_128_floor(
+; CHECK-NEXT: [[TMP1:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[SRC:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = select <2 x i1> [[TMP3]], <2 x double> [[TMP1]], <2 x double> [[DST:%.*]]
+; CHECK-NEXT: ret <2 x double> [[TMP4]]
+;
+ %1 = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> %src, i32 1, <2 x double> %dst, i8 %k)
+ ret <2 x double> %1
+}
+
+; rndscale.pd.128 with rounding immediate 2: expect llvm.ceil.v2f64 and a select against %dst
+; controlled by the low 2 bits of the i8 mask %k (bitcast to <8 x i1>, then shufflevector).
+define <2 x double> @test_rndscale_pd_128_ceil(<2 x double> %src, <2 x double> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_pd_128_ceil(
+; CHECK-NEXT: [[TMP1:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[SRC:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = select <2 x i1> [[TMP3]], <2 x double> [[TMP1]], <2 x double> [[DST:%.*]]
+; CHECK-NEXT: ret <2 x double> [[TMP4]]
+;
+ %1 = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> %src, i32 2, <2 x double> %dst, i8 %k)
+ ret <2 x double> %1
+}
+
+; rndscale.pd.256 with rounding immediate 1: expect llvm.floor.v4f64 and a select against %dst
+; controlled by the low 4 bits of the i8 mask %k (bitcast to <8 x i1>, then shufflevector).
+define <4 x double> @test_rndscale_pd_256_floor(<4 x double> %src, <4 x double> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_pd_256_floor(
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[SRC:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x double> [[TMP1]], <4 x double> [[DST:%.*]]
+; CHECK-NEXT: ret <4 x double> [[TMP4]]
+;
+ %1 = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> %src, i32 1, <4 x double> %dst, i8 %k)
+ ret <4 x double> %1
+}
+
+; rndscale.pd.256 with rounding immediate 2: expect llvm.ceil.v4f64 and a select against %dst
+; controlled by the low 4 bits of the i8 mask %k (bitcast to <8 x i1>, then shufflevector).
+define <4 x double> @test_rndscale_pd_256_ceil(<4 x double> %src, <4 x double> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_pd_256_ceil(
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[SRC:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x double> [[TMP1]], <4 x double> [[DST:%.*]]
+; CHECK-NEXT: ret <4 x double> [[TMP4]]
+;
+ %1 = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> %src, i32 2, <4 x double> %dst, i8 %k)
+ ret <4 x double> %1
+}
+
+; rndscale.pd.512 with rounding immediate 1 and last operand 4: expect llvm.floor.v8f64 and a
+; select against %dst on the i8 mask %k bitcast to <8 x i1>.
+define <8 x double> @test_rndscale_pd_512_floor(<8 x double> %src, <8 x double> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_pd_512_floor(
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x double> @llvm.floor.v8f64(<8 x double> [[SRC:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[DST:%.*]]
+; CHECK-NEXT: ret <8 x double> [[TMP3]]
+;
+ %1 = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %src, i32 1, <8 x double> %dst, i8 %k, i32 4)
+ ret <8 x double> %1
+}
+
+; rndscale.pd.512 with rounding immediate 2 and last operand 4: expect llvm.ceil.v8f64 and a
+; select against %dst on the i8 mask %k bitcast to <8 x i1>.
+define <8 x double> @test_rndscale_pd_512_ceil(<8 x double> %src, <8 x double> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_pd_512_ceil(
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x double> @llvm.ceil.v8f64(<8 x double> [[SRC:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[DST:%.*]]
+; CHECK-NEXT: ret <8 x double> [[TMP3]]
+;
+ %1 = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %src, i32 2, <8 x double> %dst, i8 %k, i32 4)
+ ret <8 x double> %1
+}
+
declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
define <4 x float> @test_mask_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {