From: Simon Pilgrim
Date: Thu, 3 Mar 2016 18:13:53 +0000 (+0000)
Subject: [X86][AVX] Better support for the variable mask form of VPERMILPD/VPERMILPS
X-Git-Tag: llvmorg-3.9.0-rc1~12550
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=abcee45b7ad3a07359ac92cc2954f4ef489367ae;p=platform%2Fupstream%2Fllvm.git

[X86][AVX] Better support for the variable mask form of VPERMILPD/VPERMILPS

The variable mask form of VPERMILPD/VPERMILPS was only partially
implemented, with much of it still performed as an intrinsic.

This patch properly defines the instructions in terms of
X86ISD::VPERMILPV, permitting the opcode to be easily combined as a
target shuffle.

Differential Revision: http://reviews.llvm.org/D17681

llvm-svn: 262635
---

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 802988c..2a11edd 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3976,6 +3976,7 @@ static bool isTargetShuffle(unsigned Opcode) {
   case X86ISD::UNPCKL:
   case X86ISD::UNPCKH:
   case X86ISD::VPERMILPI:
+  case X86ISD::VPERMILPV:
   case X86ISD::VPERM2X128:
   case X86ISD::VPERMI:
   case X86ISD::VPERMV:
@@ -5008,6 +5009,16 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
     DecodeZeroMoveLowMask(VT, Mask);
     IsUnary = true;
     break;
+  case X86ISD::VPERMILPV: {
+    IsUnary = true;
+    SDValue MaskNode = N->getOperand(1);
+    if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
+      unsigned MaskEltSize = VT.getScalarSizeInBits();
+      DecodeVPERMILPMask(C, MaskEltSize, Mask);
+      break;
+    }
+    return false;
+  }
   case X86ISD::PSHUFB: {
     IsUnary = true;
     SDValue MaskNode = N->getOperand(1);
@@ -29107,6 +29118,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::MOVSS:
   case X86ISD::MOVSD:
   case X86ISD::VPERMILPI:
+  case X86ISD::VPERMILPV:
   case X86ISD::VPERM2X128:
   case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
   case ISD::FMA:            return combineFMA(N, DAG, Subtarget);
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 0822f1f..70bcc2b 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -8093,45 +8093,45 @@ defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
 multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
                       RegisterClass RC, X86MemOperand x86memop_f,
                       X86MemOperand x86memop_i, PatFrag i_frag,
-                      Intrinsic IntVar, ValueType vt> {
-  def rr : AVX8I, VEX_4V,
-      Sched<[WriteFShuffle]>;
-  def rm : AVX8I, VEX_4V,
-      Sched<[WriteFShuffleLd, ReadAfterLd]>;
-
+                      ValueType f_vt, ValueType i_vt> {
   let Predicates = [HasAVX, NoVLX] in {
+    def rr : AVX8I, VEX_4V,
+        Sched<[WriteFShuffle]>;
+    def rm : AVX8I, VEX_4V,
+        Sched<[WriteFShuffleLd, ReadAfterLd]>;
+
     def ri : AVXAIi8, VEX,
+        [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX,
         Sched<[WriteFShuffle]>;
     def mi : AVXAIi8, VEX,
+        (f_vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX,
         Sched<[WriteFShuffleLd]>;
   }// Predicates = [HasAVX, NoVLX]
 }

 let ExeDomain = SSEPackedSingle in {
   defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
-                              loadv2i64, int_x86_avx_vpermilvar_ps, v4f32>;
+                              loadv2i64, v4f32, v4i32>;
   defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
-                              loadv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>, VEX_L;
+                              loadv4i64, v8f32, v8i32>, VEX_L;
 }
 let ExeDomain = SSEPackedDouble in {
   defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
-                              loadv2i64, int_x86_avx_vpermilvar_pd, v2f64>;
+                              loadv2i64, v2f64, v2i64>;
   defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
-                              loadv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L;
+                              loadv4i64, v4f64, v4i64>, VEX_L;
 }

 let Predicates = [HasAVX, NoVLX] in {
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 2d6660e..65aea3d 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -328,6 +328,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx_vperm2f128_pd_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
   X86_INTRINSIC_DATA(avx_vperm2f128_ps_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
   X86_INTRINSIC_DATA(avx_vperm2f128_si_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
+  X86_INTRINSIC_DATA(avx_vpermilvar_pd, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
+  X86_INTRINSIC_DATA(avx_vpermilvar_pd_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
+  X86_INTRINSIC_DATA(avx_vpermilvar_ps, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
+  X86_INTRINSIC_DATA(avx_vpermilvar_ps_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
   X86_INTRINSIC_DATA(avx2_packssdw, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index 758f2be..7d4d892a 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -22,9 +22,6 @@ declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>,
 define <4 x float> @combine_vpermilvar_4f32(<4 x float> %a0) {
 ; ALL-LABEL: combine_vpermilvar_4f32:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovaps {{.*#+}} xmm1 = [3,2,1,0]
-; ALL-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
-; ALL-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
 ; ALL-NEXT:    retq
   %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> )
   %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> )
@@ -34,9 +31,6 @@ define <4 x float> @combine_vpermilvar_4f32(<4 x float> %a0) {
 define <8 x float> @combine_vpermilvar_8f32(<8 x float> %a0) {
 ; ALL-LABEL: combine_vpermilvar_8f32:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovaps {{.*#+}} ymm1 = [3,2,1,0,2,3,0,1]
-; ALL-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
-; ALL-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
 ; ALL-NEXT:    retq
   %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> )
   %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> )
@@ -59,9 +53,6 @@ define <2 x double> @combine_vpermilvar_2f64(<2 x double> %a0) {
 define <4 x double> @combine_vpermilvar_4f64(<4 x double> %a0) {
 ; ALL-LABEL: combine_vpermilvar_4f64:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovapd {{.*#+}} ymm1 = [2,0,2,0]
-; ALL-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
-; ALL-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
 ; ALL-NEXT:    retq
   %1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> )
   %2 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %1, <4 x i64> )
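
For illustration only (not part of the commit; the function name and mask values below are hypothetical), a minimal LLVM IR sketch of the kind of fold that mapping llvm.x86.avx.vpermilvar.* to X86ISD::VPERMILPV enables. vpermilvar.ps picks elements within each 128-bit lane from the low bits of each mask element, so applying the element-reversing mask twice is an identity, and once both calls are decoded as target shuffles the DAG combiner can collapse the pair, as the updated tests above check.

; Sketch: two variable permutes with the reversing mask undo each other, so
; the combined shuffle reduces to returning %a0 unchanged.
declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>)

define <4 x float> @fold_repeated_vpermilvar(<4 x float> %a0) {
  %1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
  %2 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
  ret <4 x float> %2
}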