GCCBuiltin<"__builtin_ia32_compressstoredi128_mask">,
Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty,
llvm_i8_ty], [IntrReadWriteArgMem]>;
+
+// expand
+ def int_x86_avx512_mask_expand_ps_512 :
+ GCCBuiltin<"__builtin_ia32_expandsf512_mask">,
+ Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
+ llvm_i16_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_expand_pd_512 :
+ GCCBuiltin<"__builtin_ia32_expanddf512_mask">,
+ Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
+ llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_expand_ps_256 :
+ GCCBuiltin<"__builtin_ia32_expandsf256_mask">,
+ Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
+ llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_expand_pd_256 :
+ GCCBuiltin<"__builtin_ia32_expanddf256_mask">,
+ Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
+ llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_expand_ps_128 :
+ GCCBuiltin<"__builtin_ia32_expandsf128_mask">,
+ Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
+ llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_expand_pd_128 :
+ GCCBuiltin<"__builtin_ia32_expanddf128_mask">,
+ Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
+ llvm_i8_ty], [IntrNoMem]>;
+
+ def int_x86_avx512_mask_expand_load_ps_512 :
+ GCCBuiltin<"__builtin_ia32_expandloadsf512_mask">,
+ Intrinsic<[llvm_v16f32_ty], [llvm_ptr_ty, llvm_v16f32_ty,
+ llvm_i16_ty], [IntrReadArgMem]>;
+ def int_x86_avx512_mask_expand_load_pd_512 :
+ GCCBuiltin<"__builtin_ia32_expandloaddf512_mask">,
+ Intrinsic<[llvm_v8f64_ty], [llvm_ptr_ty, llvm_v8f64_ty,
+ llvm_i8_ty], [IntrReadArgMem]>;
+ def int_x86_avx512_mask_expand_load_ps_256 :
+ GCCBuiltin<"__builtin_ia32_expandloadsf256_mask">,
+ Intrinsic<[llvm_v8f32_ty], [llvm_ptr_ty, llvm_v8f32_ty,
+ llvm_i8_ty], [IntrReadArgMem]>;
+ def int_x86_avx512_mask_expand_load_pd_256 :
+ GCCBuiltin<"__builtin_ia32_expandloaddf256_mask">,
+ Intrinsic<[llvm_v4f64_ty], [llvm_ptr_ty, llvm_v4f64_ty,
+ llvm_i8_ty], [IntrReadArgMem]>;
+ def int_x86_avx512_mask_expand_load_ps_128 :
+ GCCBuiltin<"__builtin_ia32_expandloadsf128_mask">,
+ Intrinsic<[llvm_v4f32_ty], [llvm_ptr_ty, llvm_v4f32_ty,
+ llvm_i8_ty], [IntrReadArgMem]>;
+ def int_x86_avx512_mask_expand_load_pd_128 :
+ GCCBuiltin<"__builtin_ia32_expandloaddf128_mask">,
+ Intrinsic<[llvm_v2f64_ty], [llvm_ptr_ty, llvm_v2f64_ty,
+ llvm_i8_ty], [IntrReadArgMem]>;
+
+ def int_x86_avx512_mask_expand_d_512 :
+ GCCBuiltin<"__builtin_ia32_expandsi512_mask">,
+ Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
+ llvm_i16_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_expand_q_512 :
+ GCCBuiltin<"__builtin_ia32_expanddi512_mask">,
+ Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
+ llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_expand_d_256 :
+ GCCBuiltin<"__builtin_ia32_expandsi256_mask">,
+ Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
+ llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_expand_q_256 :
+ GCCBuiltin<"__builtin_ia32_expanddi256_mask">,
+ Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty,
+ llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_expand_d_128 :
+ GCCBuiltin<"__builtin_ia32_expandsi128_mask">,
+ Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
+ llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_expand_q_128 :
+ GCCBuiltin<"__builtin_ia32_expanddi128_mask">,
+ Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
+ llvm_i8_ty], [IntrNoMem]>;
+
+ def int_x86_avx512_mask_expand_load_d_512 :
+ GCCBuiltin<"__builtin_ia32_expandloadsi512_mask">,
+ Intrinsic<[llvm_v16i32_ty], [llvm_ptr_ty, llvm_v16i32_ty,
+ llvm_i16_ty], [IntrReadArgMem]>;
+ def int_x86_avx512_mask_expand_load_q_512 :
+ GCCBuiltin<"__builtin_ia32_expandloaddi512_mask">,
+ Intrinsic<[llvm_v8i64_ty], [llvm_ptr_ty, llvm_v8i64_ty,
+ llvm_i8_ty], [IntrReadArgMem]>;
+ def int_x86_avx512_mask_expand_load_d_256 :
+ GCCBuiltin<"__builtin_ia32_expandloadsi256_mask">,
+ Intrinsic<[llvm_v8i32_ty], [llvm_ptr_ty, llvm_v8i32_ty,
+ llvm_i8_ty], [IntrReadArgMem]>;
+ def int_x86_avx512_mask_expand_load_q_256 :
+ GCCBuiltin<"__builtin_ia32_expandloaddi256_mask">,
+ Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty, llvm_v4i64_ty,
+ llvm_i8_ty], [IntrReadArgMem]>;
+ def int_x86_avx512_mask_expand_load_d_128 :
+ GCCBuiltin<"__builtin_ia32_expandloadsi128_mask">,
+ Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_v4i32_ty,
+ llvm_i8_ty], [IntrReadArgMem]>;
+ def int_x86_avx512_mask_expand_load_q_128 :
+ GCCBuiltin<"__builtin_ia32_expandloaddi128_mask">,
+ Intrinsic<[llvm_v2i64_ty], [llvm_ptr_ty, llvm_v2i64_ty,
+ llvm_i8_ty], [IntrReadArgMem]>;
+
}
// Misc.
let TargetPrefix = "x86" in {
Op.getOperand(2), DAG),
Op.getOperand(4), Op.getOperand(3), Subtarget,
DAG);
- case COMPRESS_TO_REG: {
+ case COMPRESS_EXPAND_IN_REG: {
SDValue Mask = Op.getOperand(3);
SDValue DataToCompress = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
return DAG.getStore(Chain, dl, Compressed, Addr,
MachinePointerInfo(), false, false, 0);
}
+ case EXPAND_FROM_MEM: {
+ SDLoc dl(Op);
+ SDValue Mask = Op.getOperand(4);
+ SDValue PathThru = Op.getOperand(3);
+ SDValue Addr = Op.getOperand(2);
+ SDValue Chain = Op.getOperand(0);
+ EVT VT = Op.getValueType();
+
+ if (isAllOnes(Mask)) // return just a load
+ return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false,
+ false, 0);
+ EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ VT.getVectorNumElements());
+ EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ Mask.getValueType().getSizeInBits());
+ SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+ DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
+ DAG.getIntPtrConstant(0));
+
+ SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(),
+ false, false, false, 0);
+
+ SmallVector<SDValue, 2> Results;
+ Results.push_back(DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand,
+ PathThru));
+ Results.push_back(Chain);
+ return DAG.getMergeValues(Results, dl);
+ }
}
}
case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
case X86ISD::XTEST: return "X86ISD::XTEST";
case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
+ case X86ISD::EXPAND: return "X86ISD::EXPAND";
}
}
defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", avx512vl_f64_info>,
EVEX, VEX_W;
+// expand
+multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _,
+ string OpcodeStr> {
+ def rrkz : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src),
+ OpcodeStr # "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ [(set _.RC:$dst, (_.VT (X86expand _.KRCWM:$mask, (_.VT _.RC:$src),
+ _.ImmAllZerosV)))]>, EVEX_KZ;
+
+ let Constraints = "$src0 = $dst" in
+ def rrk : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src),
+ OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}",
+ [(set _.RC:$dst, (_.VT (X86expand _.KRCWM:$mask,
+ (_.VT _.RC:$src), _.RC:$src0)))]>, EVEX_K;
+
+ let mayLoad = 1, Constraints = "$src0 = $dst" in
+ def rmk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.RC:$src0, _.KRCWM:$mask, _.MemOp:$src),
+ OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}",
+ [(set _.RC:$dst, (_.VT (X86expand _.KRCWM:$mask,
+ (_.VT (bitconvert
+ (_.LdFrag addr:$src))),
+ _.RC:$src0)))]>,
+ EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>;
+
+ let mayLoad = 1 in
+ def rmkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.MemOp:$src),
+ OpcodeStr # "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ [(set _.RC:$dst, (_.VT (X86expand _.KRCWM:$mask,
+ (_.VT (bitconvert (_.LdFrag addr:$src))),
+ _.ImmAllZerosV)))]>,
+ EVEX_KZ, EVEX_CD8<_.EltSize, CD8VT1>;
+
+}
+
+multiclass expand_by_elt_width<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo VTInfo> {
+ defm Z : expand_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512;
+
+ let Predicates = [HasVLX] in {
+ defm Z256 : expand_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256;
+ defm Z128 : expand_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128;
+ }
+}
+
+defm VPEXPANDD : expand_by_elt_width <0x89, "vpexpandd", avx512vl_i32_info>,
+ EVEX;
+defm VPEXPANDQ : expand_by_elt_width <0x89, "vpexpandq", avx512vl_i64_info>,
+ EVEX, VEX_W;
+defm VEXPANDPS : expand_by_elt_width <0x88, "vexpandps", avx512vl_f32_info>,
+ EVEX;
+defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", avx512vl_f64_info>,
+ EVEX, VEX_W;
def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 3,
[SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>,
SDTCisVec<3>, SDTCisVec<1>, SDTCisInt<1>]>, []>;
+def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 3>,
+ SDTCisVec<3>, SDTCisVec<1>, SDTCisInt<1>]>, []>;
//===----------------------------------------------------------------------===//
// SSE Complex Patterns
INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP,
CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI,
INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_SCALAR_MASK_RM,
- COMPRESS_TO_REG, COMPRESS_TO_MEM
+ COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, EXPAND_FROM_MEM
};
struct IntrinsicData {
COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_store_q_512,
COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
-
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_d_128,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_d_256,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_d_512,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_128,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_256,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_512,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_128,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_256,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_512,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_q_128,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_q_256,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0),
X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0),
X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_w_128, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_w_256, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_w_512, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_d_128, COMPRESS_TO_REG,
+ X86_INTRINSIC_DATA(avx512_mask_compress_d_128, COMPRESS_EXPAND_IN_REG,
X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_d_256, COMPRESS_TO_REG,
+ X86_INTRINSIC_DATA(avx512_mask_compress_d_256, COMPRESS_EXPAND_IN_REG,
X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_d_512, COMPRESS_TO_REG,
+ X86_INTRINSIC_DATA(avx512_mask_compress_d_512, COMPRESS_EXPAND_IN_REG,
X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_pd_128, COMPRESS_TO_REG,
+ X86_INTRINSIC_DATA(avx512_mask_compress_pd_128, COMPRESS_EXPAND_IN_REG,
X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_pd_256, COMPRESS_TO_REG,
+ X86_INTRINSIC_DATA(avx512_mask_compress_pd_256, COMPRESS_EXPAND_IN_REG,
X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_pd_512, COMPRESS_TO_REG,
+ X86_INTRINSIC_DATA(avx512_mask_compress_pd_512, COMPRESS_EXPAND_IN_REG,
X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_ps_128, COMPRESS_TO_REG,
+ X86_INTRINSIC_DATA(avx512_mask_compress_ps_128, COMPRESS_EXPAND_IN_REG,
X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_ps_256, COMPRESS_TO_REG,
+ X86_INTRINSIC_DATA(avx512_mask_compress_ps_256, COMPRESS_EXPAND_IN_REG,
X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_ps_512, COMPRESS_TO_REG,
+ X86_INTRINSIC_DATA(avx512_mask_compress_ps_512, COMPRESS_EXPAND_IN_REG,
X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_q_128, COMPRESS_TO_REG,
+ X86_INTRINSIC_DATA(avx512_mask_compress_q_128, COMPRESS_EXPAND_IN_REG,
X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_q_256, COMPRESS_TO_REG,
+ X86_INTRINSIC_DATA(avx512_mask_compress_q_256, COMPRESS_EXPAND_IN_REG,
X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_q_512, COMPRESS_TO_REG,
+ X86_INTRINSIC_DATA(avx512_mask_compress_q_512, COMPRESS_EXPAND_IN_REG,
X86ISD::COMPRESS, 0),
+
+ X86_INTRINSIC_DATA(avx512_mask_expand_d_128, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_d_256, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_d_512, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_pd_128, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_pd_256, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_pd_512, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_ps_128, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_ps_256, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_ps_512, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_q_128, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_q_256, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_q_512, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_128, CMP_MASK, X86ISD::PCMPEQM, 0),
X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_256, CMP_MASK, X86ISD::PCMPEQM, 0),
X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_512, CMP_MASK, X86ISD::PCMPEQM, 0),
}
declare <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %data, <4 x i32> %src0, i8 %mask)
+
+; Expand
+
+; CHECK-LABEL: expand1
+; CHECK: vexpandpd (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x88,0x07]
+define <8 x double> @expand1(i8* %addr, <8 x double> %data, i8 %mask) {
+ %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 %mask)
+ ret <8 x double> %res
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 %mask)
+
+; CHECK-LABEL: expand2
+; CHECK: vexpandpd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x88,0x07]
+define <4 x double> @expand2(i8* %addr, <4 x double> %data, i8 %mask) {
+ %res = call <4 x double> @llvm.x86.avx512.mask.expand.load.pd.256(i8* %addr, <4 x double> %data, i8 %mask)
+ ret <4 x double> %res
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.expand.load.pd.256(i8* %addr, <4 x double> %data, i8 %mask)
+
+; CHECK-LABEL: expand3
+; CHECK: vexpandps (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x88,0x07]
+define <4 x float> @expand3(i8* %addr, <4 x float> %data, i8 %mask) {
+ %res = call <4 x float> @llvm.x86.avx512.mask.expand.load.ps.128(i8* %addr, <4 x float> %data, i8 %mask)
+ ret <4 x float> %res
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.expand.load.ps.128(i8* %addr, <4 x float> %data, i8 %mask)
+
+; CHECK-LABEL: expand4
+; CHECK: vexpandpd %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x88,0xc0]
+define <8 x double> @expand4(i8* %addr, <8 x double> %data, i8 %mask) {
+ %res = call <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask)
+ ret <8 x double> %res
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask)
+
+; CHECK-LABEL: expand5
+; CHECK: vexpandpd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x88,0xc8]
+define <4 x double> @expand5(<4 x double> %data, <4 x double> %src0, i8 %mask) {
+ %res = call <4 x double> @llvm.x86.avx512.mask.expand.pd.256( <4 x double> %data, <4 x double> %src0, i8 %mask)
+ ret <4 x double> %res
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.expand.pd.256(<4 x double> %data, <4 x double> %src0, i8 %mask)
+
+; CHECK-LABEL: expand6
+; CHECK: vexpandps %xmm0
+define <4 x float> @expand6(<4 x float> %data, i8 %mask) {
+ %res = call <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 %mask)
+ ret <4 x float> %res
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float> %src0, i8 %mask)
+
+; CHECK-LABEL: expand7
+; CHECK-NOT: vexpand
+; CHECK: vmovapd
+define <8 x double> @expand7(i8* %addr, <8 x double> %data) {
+ %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 -1)
+ ret <8 x double> %res
+}
+
+; CHECK-LABEL: expand8
+; CHECK-NOT: vexpandps %xmm0
+define <4 x float> @expand8(<4 x float> %data) {
+ %res = call <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 -1)
+ ret <4 x float> %res
+}
+
+; CHECK-LABEL: expand9
+; CHECK: vpexpandq (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x89,0x07]
+define <8 x i64> @expand9(i8* %addr, <8 x i64> %data, i8 %mask) {
+ %res = call <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(i8* %addr, <8 x i64> %data, i8 %mask)
+ ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(i8* %addr, <8 x i64> %data, i8 %mask)
+
+; CHECK-LABEL: expand10
+; CHECK: vpexpandd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x89,0xc0]
+define <4 x i32> @expand10(<4 x i32> %data, i8 %mask) {
+ %res = call <4 x i32> @llvm.x86.avx512.mask.expand.d.128(<4 x i32> %data, <4 x i32>zeroinitializer, i8 %mask)
+ ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.expand.d.128(<4 x i32> %data, <4 x i32> %src0, i8 %mask)
+
+