From 18bcf93efb3b5c99fba6d95e197b8f26538e5a48 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 3 Feb 2016 09:41:59 +0000 Subject: [PATCH] [X86][AVX] Add support for 64-bit VZEXT_LOAD of 256/512-bit vectors to EltsFromConsecutiveLoads Follow-up to D16217 and D16729. This change uncovered an odd pattern where VZEXT_LOAD v4i64 was being lowered to a load of the lower v2i64 (so the 2nd i64 destination element wasn't being zeroed). I can't find any use/reason for this, so I have removed the pattern and replaced it so that only the 1st i64 element is loaded and the upper bits are all zeroed. This matches the description for X86ISD::VZEXT_LOAD. A minimal IR reduction of the affected pattern is shown after the patch. Differential Revision: http://reviews.llvm.org/D16768 llvm-svn: 259635 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 9 +- llvm/lib/Target/X86/X86InstrAVX512.td | 222 +++++++++++---------- llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 5 - llvm/lib/Target/X86/X86InstrSSE.td | 9 +- .../CodeGen/X86/merge-consecutive-loads-256.ll | 24 +-- .../CodeGen/X86/merge-consecutive-loads-512.ll | 16 -- 6 files changed, 125 insertions(+), 160 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 8f52296..511c734 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -5639,11 +5639,12 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits(); // VZEXT_LOAD - consecutive load/undefs followed by zeros/undefs. - // TODO: The code below fires only for for loading the low 64-bits of a - // of a 128-bit vector. It's probably worth generalizing more. if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 64 && - (VT.is128BitVector() && TLI.isTypeLegal(MVT::v2i64))) { - SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); + ((VT.is128BitVector() && TLI.isTypeLegal(MVT::v2i64)) || + (VT.is256BitVector() && TLI.isTypeLegal(MVT::v4i64)) || + (VT.is512BitVector() && TLI.isTypeLegal(MVT::v8i64)))) { + MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); + SDVTList Tys = DAG.getVTList(VecVT, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64, diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index c3cc8fb..ad19cf7 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -1997,9 +1997,9 @@ multiclass avx512_vector_fpclass_all opcVec, bits<8> opcScalar, SDNode VecOpNode, SDNode ScalarOpNode, Predicate prd>{ - defm PS : avx512_vector_fpclass_all, EVEX_CD8<32, CD8VF>; - defm PD : avx512_vector_fpclass_all,EVEX_CD8<64, CD8VF> , VEX_W; defm SS : avx512_scalar_fpclass, EVEX_CD8<32, CD8VT1>; @@ -2113,12 +2113,12 @@ let Predicates = [HasAVX512, NoDQI] in { def : Pat<(store VK4:$src, addr:$dst), (MOV8mr addr:$dst, (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK4:$src, VK16)), - sub_8bit))>; + sub_8bit))>; def : Pat<(store VK8:$src, addr:$dst), (MOV8mr addr:$dst, (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)), sub_8bit))>; - + def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst), (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>; def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))), @@ -2596,7 +2596,7 @@ multiclass avx512_load opc, string OpcodeStr, X86VectorVTInfo _, def rrkz : AVX512PI, @@ -2919,24 +2919,24 @@ def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst), // AVX-512
MOVSS, MOVSD //===----------------------------------------------------------------------===// -multiclass avx512_move_scalar { - defm rr_Int : AVX512_maskable_scalar<0x10, MRMSrcReg, _, (outs _.RC:$dst), + defm rr_Int : AVX512_maskable_scalar<0x10, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), - asm, "$src2, $src1","$src1, $src2", + asm, "$src2, $src1","$src1, $src2", (_.VT (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))), IIC_SSE_MOV_S_RR>, EVEX_4V; let Constraints = "$src1 = $dst" , mayLoad = 1 in defm rm_Int : AVX512_maskable_3src_scalar<0x10, MRMSrcMem, _, - (outs _.RC:$dst), + (outs _.RC:$dst), (ins _.ScalarMemOp:$src), asm,"$src","$src", - (_.VT (OpNode (_.VT _.RC:$src1), - (_.VT (scalar_to_vector + (_.VT (OpNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src)))))>, EVEX; let isCodeGenOnly = 1 in { - def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), + def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src1, _.FRC:$src2), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, @@ -2953,7 +2953,7 @@ multiclass avx512_move_scalar , EVEX; - def mrk: AVX512PI<0x11, MRMDestMem, (outs), + def mrk: AVX512PI<0x11, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, VK1WM:$mask, _.FRC:$src), !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), [], _.ExeDomain, IIC_SSE_MOV_S_MR>, EVEX, EVEX_K; @@ -3175,6 +3175,12 @@ let Predicates = [HasAVX512] in { def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>; + def : Pat<(v4i64 (X86vzload addr:$src)), + (SUBREG_TO_REG (i64 0), (VMOVZPQILo2PQIZrm addr:$src), sub_xmm)>; + + // Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext. + def : Pat<(v8i64 (X86vzload addr:$src)), + (SUBREG_TO_REG (i64 0), (VMOVZPQILo2PQIZrm addr:$src), sub_xmm)>; } def : Pat<(v16i32 (X86Vinsert (v16i32 immAllZerosV), GR32:$src2, (iPTR 0))), @@ -3429,7 +3435,7 @@ defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrsw", X86mulhrs, SSE_INTMUL_I defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg, SSE_INTALU_ITINS_P, HasBWI, 1>; -multiclass avx512_binop_all opc, string OpcodeStr, OpndItins itins, +multiclass avx512_binop_all opc, string OpcodeStr, OpndItins itins, AVX512VLVectorVTInfo _SrcVTInfo, AVX512VLVectorVTInfo _DstVTInfo, SDNode OpNode, Predicate prd, bit IsCommutable = 0> { let Predicates = [prd] in @@ -3439,11 +3445,11 @@ multiclass avx512_binop_all opc, string OpcodeStr, OpndItins itins, EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W; let Predicates = [HasVLX, prd] in { defm NAME#Z256 : avx512_binop_rm2, EVEX_V256, EVEX_CD8<64, CD8VF>, VEX_W; defm NAME#Z128 : avx512_binop_rm2, EVEX_V128, EVEX_CD8<64, CD8VF>, VEX_W; } @@ -3452,7 +3458,7 @@ multiclass avx512_binop_all opc, string OpcodeStr, OpndItins itins, defm VPMULDQ : avx512_binop_all<0x28, "vpmuldq", SSE_INTALU_ITINS_P, avx512vl_i32_info, avx512vl_i64_info, X86pmuldq, HasAVX512, 1>,T8PD; -defm VPMULUDQ : avx512_binop_all<0xF4, "vpmuludq", SSE_INTMUL_ITINS_P, +defm VPMULUDQ : avx512_binop_all<0xF4, "vpmuludq", SSE_INTMUL_ITINS_P, avx512vl_i32_info, avx512vl_i64_info, X86pmuludq, HasAVX512, 1>; defm VPMULTISHIFTQB : avx512_binop_all<0x83, "vpmultishiftqb", SSE_INTALU_ITINS_P, @@ -3875,15 +3881,15 @@ multiclass avx512_vptest_mb opc, string OpcodeStr, SDNode OpNode, EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; } -// Use 512bit version to implement 128/256 bit in case NoVLX. 
+// Use 512bit version to implement 128/256 bit in case NoVLX. multiclass avx512_vptest_lowering { def : Pat<(_.KVT (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))), (_.KVT (COPY_TO_REGCLASS (!cast(NAME # Suffix # "Zrr") - (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), + (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), _.RC:$src1, _.SubRegIdx), - (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), + (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), _.RC:$src2, _.SubRegIdx)), _.KRC))>; } @@ -3903,7 +3909,7 @@ multiclass avx512_vptest_dq_sizes opc, string OpcodeStr, SDNode OpNode, let Predicates = [HasAVX512, NoVLX] in { defm Z256_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info256, Suffix>; defm Z128_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info128, Suffix>; - } + } } multiclass avx512_vptest_dq opc, string OpcodeStr, SDNode OpNode> { @@ -3932,13 +3938,13 @@ multiclass avx512_vptest_wb opc, string OpcodeStr, defm BZ128: avx512_vptest, EVEX_V128; } - + let Predicates = [HasAVX512, NoVLX] in { defm BZ256_Alt : avx512_vptest_lowering< OpNode, v64i8_info, v32i8x_info, "B">; defm BZ128_Alt : avx512_vptest_lowering< OpNode, v64i8_info, v16i8x_info, "B">; defm WZ256_Alt : avx512_vptest_lowering< OpNode, v32i16_info, v16i16x_info, "W">; defm WZ128_Alt : avx512_vptest_lowering< OpNode, v32i16_info, v8i16x_info, "W">; - } + } } @@ -4136,20 +4142,20 @@ multiclass avx512_var_shift_types opc, string OpcodeStr, avx512vl_i64_info>, VEX_W; } -// Use 512bit version to implement 128/256 bit in case NoVLX. +// Use 512bit version to implement 128/256 bit in case NoVLX. multiclass avx512_var_shift_w_lowering { let Predicates = [HasBWI, NoVLX] in { - def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1), + def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1), (_.info256.VT _.info256.RC:$src2))), - (EXTRACT_SUBREG + (EXTRACT_SUBREG (!cast(NAME#"WZrr") (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)), sub_ymm)>; - def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1), + def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1), (_.info128.VT _.info128.RC:$src2))), - (EXTRACT_SUBREG + (EXTRACT_SUBREG (!cast(NAME#"WZrr") (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)), @@ -4247,7 +4253,7 @@ defm VPERMPD : avx512_vpermi_dq_sizes<0x01, MRMSrcReg, MRMSrcMem, "vpermpd", X86VPermi, avx512vl_f64_info>, EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W; //===----------------------------------------------------------------------===// -// AVX-512 - VPERMIL +// AVX-512 - VPERMIL //===----------------------------------------------------------------------===// multiclass avx512_permil_vec OpcVar, string OpcodeStr, SDNode OpNode, @@ -4932,7 +4938,7 @@ def : Pat<(f64 (uint_to_fp GR64:$src)), //===----------------------------------------------------------------------===// // AVX-512 Scalar convert from float/double to integer //===----------------------------------------------------------------------===// -multiclass avx512_cvt_s_int_round opc, RegisterClass SrcRC, +multiclass avx512_cvt_s_int_round opc, RegisterClass SrcRC, RegisterClass DstRC, Intrinsic Int, Operand memop, ComplexPattern mem_cpat, string asm> { let hasSideEffects = 0, Predicates = [HasAVX512] in { @@ -4940,23 +4946,23 @@ multiclass avx512_cvt_s_int_round opc, RegisterClass SrcRC, !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set 
DstRC:$dst, (Int SrcRC:$src))]>, EVEX, VEX_LIG; def rb : SI, + !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"), []>, EVEX, VEX_LIG, EVEX_B, EVEX_RC; let mayLoad = 1 in def rm : SI, EVEX, VEX_LIG; - } // hasSideEffects = 0, Predicates = [HasAVX512] + } // hasSideEffects = 0, Predicates = [HasAVX512] } // Convert float/double to signed/unsigned int 32/64 defm VCVTSS2SIZ: avx512_cvt_s_int_round<0x2D, VR128X, GR32, int_x86_sse_cvtss2si, ssmem, sse_load_f32, "cvtss2si">, XS, EVEX_CD8<32, CD8VT1>; -defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, VR128X, GR64, +defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, VR128X, GR64, int_x86_sse_cvtss2si64, ssmem, sse_load_f32, "cvtss2si">, XS, VEX_W, EVEX_CD8<32, CD8VT1>; -defm VCVTSS2USIZ: avx512_cvt_s_int_round<0x79, VR128X, GR32, +defm VCVTSS2USIZ: avx512_cvt_s_int_round<0x79, VR128X, GR32, int_x86_avx512_cvtss2usi, ssmem, sse_load_f32, "cvtss2usi">, XS, EVEX_CD8<32, CD8VT1>; @@ -4967,11 +4973,11 @@ defm VCVTSS2USI64Z: avx512_cvt_s_int_round<0x79, VR128X, GR64, defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, VR128X, GR32, int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si">, XD, EVEX_CD8<64, CD8VT1>; -defm VCVTSD2SI64Z: avx512_cvt_s_int_round<0x2D, VR128X, GR64, +defm VCVTSD2SI64Z: avx512_cvt_s_int_round<0x2D, VR128X, GR64, int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si">, XD, VEX_W, EVEX_CD8<64, CD8VT1>; -defm VCVTSD2USIZ: avx512_cvt_s_int_round<0x79, VR128X, GR32, +defm VCVTSD2USIZ: avx512_cvt_s_int_round<0x79, VR128X, GR32, int_x86_avx512_cvtsd2usi, sdmem, sse_load_f64, "cvtsd2usi">, XD, EVEX_CD8<64, CD8VT1>; @@ -5000,8 +5006,8 @@ let isCodeGenOnly = 1 , Predicates = [HasAVX512] in { } // isCodeGenOnly = 1, Predicates = [HasAVX512] // Convert float/double to signed/unsigned int 32/64 with truncation -multiclass avx512_cvt_s_all opc, string asm, X86VectorVTInfo _SrcRC, - X86VectorVTInfo _DstRC, SDNode OpNode, +multiclass avx512_cvt_s_all opc, string asm, X86VectorVTInfo _SrcRC, + X86VectorVTInfo _DstRC, SDNode OpNode, SDNode OpNodeRnd>{ let Predicates = [HasAVX512] in { def rr : SI, EVEX, EVEX_B; def rm : SI, + [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>, EVEX; let isCodeGenOnly = 1,hasSideEffects = 0 in { @@ -5022,11 +5028,11 @@ let Predicates = [HasAVX512] in { (i32 FROUND_CURRENT)))]>, EVEX, VEX_LIG; def rb_Int : SI, + [(set _DstRC.RC:$dst, (OpNodeRnd _SrcRC.RC:$src, + (i32 FROUND_NO_EXC)))]>, EVEX,VEX_LIG , EVEX_B; let mayLoad = 1 in - def rm_Int : SI, EVEX, VEX_LIG; @@ -5036,30 +5042,30 @@ let Predicates = [HasAVX512] in { } -defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i32x_info, - fp_to_sint,X86cvttss2IntRnd>, +defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i32x_info, + fp_to_sint,X86cvttss2IntRnd>, XS, EVEX_CD8<32, CD8VT1>; -defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i64x_info, - fp_to_sint,X86cvttss2IntRnd>, +defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i64x_info, + fp_to_sint,X86cvttss2IntRnd>, VEX_W, XS, EVEX_CD8<32, CD8VT1>; -defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i32x_info, +defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i32x_info, fp_to_sint,X86cvttsd2IntRnd>, XD, EVEX_CD8<64, CD8VT1>; -defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i64x_info, - fp_to_sint,X86cvttsd2IntRnd>, +defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i64x_info, + fp_to_sint,X86cvttsd2IntRnd>, VEX_W, XD, EVEX_CD8<64, CD8VT1>; -defm VCVTTSS2USIZ: 
avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i32x_info, - fp_to_uint,X86cvttss2UIntRnd>, +defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i32x_info, + fp_to_uint,X86cvttss2UIntRnd>, XS, EVEX_CD8<32, CD8VT1>; -defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i64x_info, - fp_to_uint,X86cvttss2UIntRnd>, +defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i64x_info, + fp_to_uint,X86cvttss2UIntRnd>, XS,VEX_W, EVEX_CD8<32, CD8VT1>; -defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i32x_info, - fp_to_uint,X86cvttsd2UIntRnd>, +defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i32x_info, + fp_to_uint,X86cvttsd2UIntRnd>, XD, EVEX_CD8<64, CD8VT1>; -defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i64x_info, - fp_to_uint,X86cvttsd2UIntRnd>, +defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i64x_info, + fp_to_uint,X86cvttsd2UIntRnd>, XD, VEX_W, EVEX_CD8<64, CD8VT1>; let Predicates = [HasAVX512] in { def : Pat<(i32 (int_x86_sse_cvttss2si (v4f32 VR128X:$src))), @@ -5078,17 +5084,17 @@ let Predicates = [HasAVX512] in { multiclass avx512_cvt_fp_scalar opc, string OpcodeStr, X86VectorVTInfo _, X86VectorVTInfo _Src, SDNode OpNode> { defm rr : AVX512_maskable_scalar, + (_Src.VT _Src.RC:$src2)))>, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>; defm rm : AVX512_maskable_scalar, + (_.VT (OpNode (_Src.VT _Src.RC:$src1), + (_Src.VT (scalar_to_vector + (_Src.ScalarLdFrag addr:$src2)))))>, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>; } @@ -5098,7 +5104,7 @@ multiclass avx512_cvt_fp_sae_scalar opc, string OpcodeStr, X86VectorVTIn defm rrb : AVX512_maskable_scalar, EVEX_4V, VEX_LIG, EVEX_B; @@ -5110,13 +5116,13 @@ multiclass avx512_cvt_fp_rc_scalar opc, string OpcodeStr, X86VectorVTInf defm rrb : AVX512_maskable_scalar, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>, EVEX_B, EVEX_RC; } -multiclass avx512_cvt_fp_scalar_sd2ss opc, string OpcodeStr, SDNode OpNode, - SDNode OpNodeRnd, X86VectorVTInfo _src, +multiclass avx512_cvt_fp_scalar_sd2ss opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, X86VectorVTInfo _src, X86VectorVTInfo _dst> { let Predicates = [HasAVX512] in { defm Z : avx512_cvt_fp_scalar, @@ -5126,22 +5132,22 @@ multiclass avx512_cvt_fp_scalar_sd2ss opc, string OpcodeStr, SDNode OpNo } } -multiclass avx512_cvt_fp_scalar_ss2sd opc, string OpcodeStr, SDNode OpNode, - SDNode OpNodeRnd, X86VectorVTInfo _src, +multiclass avx512_cvt_fp_scalar_ss2sd opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, X86VectorVTInfo _src, X86VectorVTInfo _dst> { let Predicates = [HasAVX512] in { defm Z : avx512_cvt_fp_scalar, - avx512_cvt_fp_sae_scalar, + avx512_cvt_fp_sae_scalar, EVEX_CD8<32, CD8VT1>, XS, EVEX_V512; } } defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss", X86fround, X86froundRnd, f64x_info, f32x_info>; -defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpext, +defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpext, X86fpextRnd,f32x_info, f64x_info >; -def : Pat<(f64 (fextend FR32X:$src)), - (COPY_TO_REGCLASS (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, VR128X), +def : Pat<(f64 (fextend FR32X:$src)), + (COPY_TO_REGCLASS (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, VR128X), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>, Requires<[HasAVX512]>; def : Pat<(f64 (fextend (loadf32 addr:$src))), @@ -5153,12 +5159,12 @@ def : Pat<(f64 (extloadf32 addr:$src)), Requires<[HasAVX512, OptForSize]>; def : Pat<(f64 (extloadf32 
addr:$src)), - (COPY_TO_REGCLASS (VCVTSS2SDZrr (v4f32 (IMPLICIT_DEF)), + (COPY_TO_REGCLASS (VCVTSS2SDZrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)), VR128X)>, Requires<[HasAVX512, OptForSpeed]>; -def : Pat<(f32 (fround FR64X:$src)), - (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X), +def : Pat<(f32 (fround FR64X:$src)), + (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X), (COPY_TO_REGCLASS FR64X:$src, VR128X)), VR128X)>, Requires<[HasAVX512]>; //===----------------------------------------------------------------------===// @@ -5575,7 +5581,7 @@ let Predicates = [HasAVX512] in { //===----------------------------------------------------------------------===// // Half precision conversion instructions //===----------------------------------------------------------------------===// -multiclass avx512_cvtph2ps { defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src), "vcvtph2ps", "$src", "$src", @@ -5583,7 +5589,7 @@ multiclass avx512_cvtph2ps, T8PD; let hasSideEffects = 0, mayLoad = 1 in { defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), (ins x86memop:$src), - "vcvtph2ps", "$src", "$src", + "vcvtph2ps", "$src", "$src", (X86cvtph2ps (_src.VT (bitconvert (ld_frag addr:$src))), (i32 FROUND_CURRENT))>, T8PD; } @@ -5599,43 +5605,43 @@ multiclass avx512_cvtph2ps_sae { let Predicates = [HasAVX512] in { defm VCVTPH2PSZ : avx512_cvtph2ps, - avx512_cvtph2ps_sae, + avx512_cvtph2ps_sae, EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>; let Predicates = [HasVLX] in { - defm VCVTPH2PSZ256 : avx512_cvtph2ps,EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>; defm VCVTPH2PSZ128 : avx512_cvtph2ps, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>; } } -multiclass avx512_cvtps2ph { defm rr : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src1, i32u8imm:$src2), - "vcvtps2ph", "$src2, $src1", "$src1, $src2", + "vcvtps2ph", "$src2, $src1", "$src1, $src2", (X86cvtps2ph (_src.VT _src.RC:$src1), - (i32 imm:$src2), + (i32 imm:$src2), (i32 FROUND_CURRENT))>, AVX512AIi8Base; let hasSideEffects = 0, mayStore = 1 in { def mr : AVX512AIi8<0x1D, MRMDestMem, (outs), (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2), - "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(store (_dest.VT (X86cvtps2ph (_src.VT _src.RC:$src1), (i32 imm:$src2), (i32 FROUND_CURRENT) )), addr:$dst)]>; def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs), (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2), - "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", + "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", []>, EVEX_K; } } multiclass avx512_cvtps2ph_sae { defm rb : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src1, i32u8imm:$src2), - "vcvtps2ph", "$src2, {sae}, $src1", "$src1, $src2, {sae}", + "vcvtps2ph", "$src2, {sae}, $src1", "$src1, $src2, {sae}", (X86cvtps2ph (_src.VT _src.RC:$src1), - (i32 imm:$src2), + (i32 imm:$src2), (i32 FROUND_NO_EXC))>, EVEX_B, AVX512AIi8Base; } let Predicates = [HasAVX512] in { @@ -5655,7 +5661,7 @@ multiclass avx512_ord_cmp_sae opc, X86VectorVTInfo _, SDNode OpNode, string OpcodeStr> { def rb: AVX512, EVEX, EVEX_B, VEX_LIG, EVEX_V128, Sched<[WriteFAdd]>; @@ -6660,14 +6666,14 @@ multiclass convert_vector_to_mask_common opc, X86VectorVTInfo _, string [(set _.KRC:$dst, (X86cvt2mask (_.VT _.RC:$src)))]>, EVEX; } -// Use 512bit version to implement 
128/256 bit in case NoVLX. -multiclass convert_vector_to_mask_lowering { def : Pat<(_.KVT (X86cvt2mask (_.VT _.RC:$src))), (_.KVT (COPY_TO_REGCLASS (!cast(NAME#"Zrr") - (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), + (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), _.RC:$src, _.SubRegIdx)), _.KRC))>; } @@ -7449,29 +7455,29 @@ multiclass avx512_shift_packed opc, SDNode OpNode, Format MRMr, def rm : AVX512; } -multiclass avx512_shift_packed_all opc, SDNode OpNode, Format MRMr, +multiclass avx512_shift_packed_all opc, SDNode OpNode, Format MRMr, Format MRMm, string OpcodeStr, Predicate prd>{ let Predicates = [prd] in - defm Z512 : avx512_shift_packed, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_shift_packed, EVEX_V256; - defm Z128 : avx512_shift_packed, EVEX_V128; } } -defm VPSLLDQ : avx512_shift_packed_all<0x73, X86vshldq, MRM7r, MRM7m, "vpslldq", +defm VPSLLDQ : avx512_shift_packed_all<0x73, X86vshldq, MRM7r, MRM7m, "vpslldq", HasBWI>, AVX512PDIi8Base, EVEX_4V; -defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq", +defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq", HasBWI>, AVX512PDIi8Base, EVEX_4V; -multiclass avx512_psadbw_packed opc, SDNode OpNode, +multiclass avx512_psadbw_packed opc, SDNode OpNode, string OpcodeStr, X86VectorVTInfo _dst, X86VectorVTInfo _src>{ def rr : AVX512BI opc, SDNode OpNode, (_src.LdFrag addr:$src2))))))]>; } -multiclass avx512_psadbw_packed_all opc, SDNode OpNode, +multiclass avx512_psadbw_packed_all opc, SDNode OpNode, string OpcodeStr, Predicate prd> { let Predicates = [prd] in defm Z512 : avx512_psadbw_packed opc, SDNode OpNode, } } -defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw", +defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw", HasBWI>, EVEX_4V; multiclass avx512_ternlog opc, string OpcodeStr, SDNode OpNode, @@ -7592,7 +7598,7 @@ multiclass avx512_fixupimm_packed_sae opc, string OpcodeStr, let Constraints = "$src1 = $dst" in { defm rrib : AVX512_maskable_3src{ } } -defm VFIXUPIMMSS : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar, - f32x_info, v4i32x_info>, +defm VFIXUPIMMSS : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar, + f32x_info, v4i32x_info>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; -defm VFIXUPIMMSD : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar, - f64x_info, v2i64x_info>, +defm VFIXUPIMMSD : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar, + f64x_info, v2i64x_info>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; -defm VFIXUPIMMPS : avx512_fixupimm_packed_all, +defm VFIXUPIMMPS : avx512_fixupimm_packed_all, EVEX_CD8<32, CD8VF>; -defm VFIXUPIMMPD : avx512_fixupimm_packed_all, +defm VFIXUPIMMPD : avx512_fixupimm_packed_all, EVEX_CD8<64, CD8VF>, VEX_W; diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 27c2329..9ad49f2 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -685,11 +685,6 @@ def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ return cast(N)->getAlignment() >= 16; }]>; -// Like 'X86vzload', but always requires 128-bit vector alignment. -def alignedX86vzload : PatFrag<(ops node:$ptr), (X86vzload node:$ptr), [{ - return cast(N)->getAlignment() >= 16; -}]>; - // Like 'load', but always requires 256-bit vector alignment. 
def alignedload256 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ return cast(N)->getAlignment() >= 32; diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 722ea0f..45f9230 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -5058,6 +5058,8 @@ let Predicates = [UseAVX], AddedComplexity = 20 in { def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrm addr:$src), sub_xmm)>; + def : Pat<(v4i64 (X86vzload addr:$src)), + (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrm addr:$src), sub_xmm)>; } let Predicates = [UseSSE2], AddedComplexity = 20 in { @@ -5066,13 +5068,6 @@ let Predicates = [UseSSE2], AddedComplexity = 20 in { def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>; } -let Predicates = [HasAVX] in { -def : Pat<(v4i64 (alignedX86vzload addr:$src)), - (SUBREG_TO_REG (i32 0), (VMOVAPSrm addr:$src), sub_xmm)>; -def : Pat<(v4i64 (X86vzload addr:$src)), - (SUBREG_TO_REG (i32 0), (VMOVUPSrm addr:$src), sub_xmm)>; -} - //===---------------------------------------------------------------------===// // Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in // IA32 document. movq xmm1, xmm2 does clear the high bits. diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll index fe69183..2f43ada 100644 --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll @@ -239,26 +239,10 @@ define <8 x float> @merge_8f32_4f32_z2(<4 x float>* %ptr) nounwind uwtable noinl } define <8 x float> @merge_8f32_f32_12zzuuzz(float* %ptr) nounwind uwtable noinline ssp { -; AVX1-LABEL: merge_8f32_f32_12zzuuzz: -; AVX1: # BB#0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: merge_8f32_f32_12zzuuzz: -; AVX2: # BB#0: -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: merge_8f32_f32_12zzuuzz: -; AVX512F: # BB#0: -; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: retq +; AVX-LABEL: merge_8f32_f32_12zzuuzz: +; AVX: # BB#0: +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: retq %ptr0 = getelementptr inbounds float, float* %ptr, i64 1 %ptr1 = getelementptr inbounds float, float* %ptr, i64 2 %val0 = load float, float* %ptr0 diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll index 3ffdd2c..811d3da 100644 --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll @@ -187,10 +187,6 @@ define <16 x float> @merge_16f32_f32_89zzzuuuuuuuuuuuz(float* %ptr) nounwind uwt ; ALL-LABEL: merge_16f32_f32_89zzzuuuuuuuuuuuz: ; ALL: # BB#0: ; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1 -; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: retq %ptr0 = getelementptr inbounds float, float* %ptr, i64 8 %ptr1 = getelementptr inbounds float, float* %ptr, i64 9 @@ -282,10 +278,6 @@ define <16 x i32> 
@merge_16i32_i32_12zzzuuuuuuuuuuuz(i32* %ptr) nounwind uwtable ; ALL-LABEL: merge_16i32_i32_12zzzuuuuuuuuuuuz: ; ALL: # BB#0: ; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: retq %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 1 %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 2 @@ -383,8 +375,6 @@ define <32 x i16> @merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz(i16* %ptr) n ; AVX512BW-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz: ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512BW-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 1 %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 2 @@ -454,18 +444,12 @@ define <64 x i8> @merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuu ; AVX512F-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz: ; AVX512F: # BB#0: ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz: ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 1 %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 2 -- 2.7.4
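For illustration, here is a minimal IR reduction of the pattern this patch improves, distilled from the merge_8f32_f32_12zzuuzz test updated above; the function name and the comments are mine and not part of the patch. Two consecutive float loads fill the low 64 bits of an <8 x float>, and every other defined lane is zero, so the whole build_vector can now be matched as a single VZEXT_LOAD:

; Illustrative sketch only (hypothetical function name); it mirrors the existing
; merge_8f32_f32_12zzuuzz test, where the lanes are [load, load, 0, 0, undef, undef, 0, 0].
define <8 x float> @vzload_example_12zzuuzz(float* %ptr) nounwind {
  %ptr0 = getelementptr inbounds float, float* %ptr, i64 1
  %ptr1 = getelementptr inbounds float, float* %ptr, i64 2
  %val0 = load float, float* %ptr0   ; low 32 bits of the consecutive 64-bit region
  %val1 = load float, float* %ptr1   ; high 32 bits of the consecutive 64-bit region
  %res0 = insertelement <8 x float> undef, float %val0, i32 0
  %res1 = insertelement <8 x float> %res0, float %val1, i32 1
  %res2 = insertelement <8 x float> %res1, float 0.0, i32 2
  %res3 = insertelement <8 x float> %res2, float 0.0, i32 3
  %res6 = insertelement <8 x float> %res3, float 0.0, i32 6
  %res7 = insertelement <8 x float> %res6, float 0.0, i32 7
  ret <8 x float> %res7
}

As the updated CHECK lines in merge-consecutive-loads-256.ll show, such functions are now expected to compile to a single vmovq (which already zeroes every bit above the loaded 64 bits) followed by retq, instead of a vmovq plus an explicit xor/insert sequence to clear the upper 128 bits.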