From 4a8dd89128eb563d50ac9dd2c9f8017fc0cfc495 Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Wed, 9 Mar 2016 22:12:08 +0000
Subject: [PATCH] [x86, AVX] optimize masked loads with constant masks

Instead of a variable-blend instruction, form a blend with immediate because
those are always cheaper.

Differential Revision: http://reviews.llvm.org/D17899

llvm-svn: 263067
---
 llvm/lib/Target/X86/X86ISelLowering.cpp |  46 ++++++++++++++-
 llvm/test/CodeGen/X86/masked_memop.ll   | 100 ++++++++++++++++++++++++--------
 2 files changed, 121 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6ab1daa..ea438a8 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -27323,12 +27323,54 @@ reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
   return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
 }
 
+/// Convert a masked load with a constant mask into a masked load and a shuffle.
+/// This allows the blend operation to use a faster kind of shuffle instruction
+/// (for example, vblendvps -> vblendps).
+static SDValue
+combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
+                              TargetLowering::DAGCombinerInfo &DCI) {
+  // Don't try this if the pass-through operand is already undefined. That would
+  // cause an infinite loop because that's what we're about to create.
+  if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()) ||
+      ML->getSrc0().getOpcode() == ISD::UNDEF)
+    return SDValue();
+
+  // Convert the masked load's mask into a blend mask for a vector shuffle node.
+  EVT VT = ML->getValueType(0);
+  unsigned NumElts = VT.getVectorNumElements();
+  BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
+  SmallVector<int, 8> ShufMask(NumElts, SM_SentinelUndef);
+  for (unsigned i = 0; i < NumElts; ++i) {
+    // If this mask bit of the masked load is false, the pass-through vector
+    // (Src0) element will be selected for that vector lane.
+    if (MaskBV->getOperand(i).getOpcode() != ISD::UNDEF)
+      ShufMask[i] = isNullConstant(MaskBV->getOperand(i)) ? i + NumElts : i;
+  }
+
+  // The new masked load has an undef pass-through operand. The shuffle uses the
+  // original pass-through operand.
+  SDLoc DL(ML);
+  SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
+                                    ML->getMask(), DAG.getUNDEF(VT),
+                                    ML->getMemoryVT(), ML->getMemOperand(),
+                                    ML->getExtensionType());
+  SDValue Blend = DAG.getVectorShuffle(VT, DL, NewML, ML->getSrc0(), ShufMask);
+
+  return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
+}
+
 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const X86Subtarget &Subtarget) {
   MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
-  if (Mld->getExtensionType() == ISD::NON_EXTLOAD)
-    return reduceMaskedLoadToScalarLoad(Mld, DAG, DCI);
+  if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
+    if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
+      return ScalarLoad;
+    // TODO: Do some AVX512 subsets benefit from this transform?
+    if (!Subtarget.hasAVX512())
+      if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
+        return Blend;
+  }
 
   if (Mld->getExtensionType() != ISD::SEXTLOAD)
     return SDValue();
diff --git a/llvm/test/CodeGen/X86/masked_memop.ll b/llvm/test/CodeGen/X86/masked_memop.ll
index 247560b..f3f36e4 100644
--- a/llvm/test/CodeGen/X86/masked_memop.ll
+++ b/llvm/test/CodeGen/X86/masked_memop.ll
@@ -943,8 +943,8 @@ define <4 x float> @mload_constmask_v4f32(<4 x float>* %addr, <4 x float> %dst)
 ; AVX-LABEL: mload_constmask_v4f32:
 ; AVX: ## BB#0:
 ; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [4294967295,0,4294967295,4294967295]
-; AVX-NEXT: vmaskmovps (%rdi), %xmm1, %xmm2
-; AVX-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vmaskmovps (%rdi), %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
 ; AVX-NEXT: retq
 ;
 ; AVX512F-LABEL: mload_constmask_v4f32:
@@ -970,15 +970,15 @@ define <4 x i32> @mload_constmask_v4i32(<4 x i32>* %addr, <4 x i32> %dst) {
 ; AVX1-LABEL: mload_constmask_v4i32:
 ; AVX1: ## BB#0:
 ; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
-; AVX1-NEXT: vmaskmovps (%rdi), %xmm1, %xmm2
-; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmaskmovps (%rdi), %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: mload_constmask_v4i32:
 ; AVX2: ## BB#0:
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
-; AVX2-NEXT: vpmaskmovd (%rdi), %xmm1, %xmm2
-; AVX2-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpmaskmovd (%rdi), %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: mload_constmask_v4i32:
@@ -1004,8 +1004,8 @@ define <8 x float> @mload_constmask_v8f32(<8 x float>* %addr, <8 x float> %dst)
 ; AVX-LABEL: mload_constmask_v8f32:
 ; AVX: ## BB#0:
 ; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,0,0,0,0,0]
-; AVX-NEXT: vmaskmovps (%rdi), %ymm1, %ymm2
-; AVX-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vmaskmovps (%rdi), %ymm1, %ymm1
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
 ; AVX-NEXT: retq
 ;
 ; AVX512F-LABEL: mload_constmask_v8f32:
@@ -1029,8 +1029,8 @@ define <4 x double> @mload_constmask_v4f64(<4 x double>* %addr, <4 x double> %ds
 ; AVX-LABEL: mload_constmask_v4f64:
 ; AVX: ## BB#0:
 ; AVX-NEXT: vmovapd {{.*#+}} ymm1 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
-; AVX-NEXT: vmaskmovpd (%rdi), %ymm1, %ymm2
-; AVX-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vmaskmovpd (%rdi), %ymm1, %ymm1
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
 ; AVX-NEXT: retq
 ;
 ; AVX512F-LABEL: mload_constmask_v4f64:
@@ -1056,15 +1056,15 @@ define <8 x i32> @mload_constmask_v8i32(<8 x i32>* %addr, <8 x i32> %dst) {
 ; AVX1-LABEL: mload_constmask_v8i32:
 ; AVX1: ## BB#0:
 ; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,0,0,0,0,4294967295]
-; AVX1-NEXT: vmaskmovps (%rdi), %ymm1, %ymm2
-; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vmaskmovps (%rdi), %ymm1, %ymm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: mload_constmask_v8i32:
 ; AVX2: ## BB#0:
 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,0,0,0,0,4294967295]
-; AVX2-NEXT: vpmaskmovd (%rdi), %ymm1, %ymm2
-; AVX2-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpmaskmovd (%rdi), %ymm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7]
 ; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: mload_constmask_v8i32:
@@ -1088,15 +1088,15 @@ define <4 x i64> @mload_constmask_v4i64(<4 x i64>* %addr, <4 x i64> %dst) {
 ; AVX1-LABEL: mload_constmask_v4i64:
 ; AVX1: ## BB#0:
 ; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [18446744073709551615,0,0,18446744073709551615]
-; AVX1-NEXT: vmaskmovpd (%rdi), %ymm1, %ymm2
-; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vmaskmovpd (%rdi), %ymm1, %ymm1
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: mload_constmask_v4i64:
 ; AVX2: ## BB#0:
 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [18446744073709551615,0,0,18446744073709551615]
-; AVX2-NEXT: vpmaskmovq (%rdi), %ymm1, %ymm2
-; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpmaskmovq (%rdi), %ymm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
 ; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: mload_constmask_v4i64:
@@ -1122,11 +1122,11 @@ define <8 x double> @mload_constmask_v8f64(<8 x double>* %addr, <8 x double> %ds
 ; AVX-LABEL: mload_constmask_v8f64:
 ; AVX: ## BB#0:
 ; AVX-NEXT: vmovapd {{.*#+}} ymm2 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
-; AVX-NEXT: vmaskmovpd (%rdi), %ymm2, %ymm3
-; AVX-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
+; AVX-NEXT: vmaskmovpd (%rdi), %ymm2, %ymm2
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3]
 ; AVX-NEXT: vmovapd {{.*#+}} ymm2 = [0,0,0,18446744073709551615]
-; AVX-NEXT: vmaskmovpd 32(%rdi), %ymm2, %ymm3
-; AVX-NEXT: vblendvpd %ymm2, %ymm3, %ymm1, %ymm1
+; AVX-NEXT: vmaskmovpd 32(%rdi), %ymm2, %ymm2
+; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3]
 ; AVX-NEXT: retq
 ;
 ; AVX512-LABEL: mload_constmask_v8f64:
@@ -1139,6 +1139,60 @@ define <8 x double> @mload_constmask_v8f64(<8 x double>* %addr, <8 x double> %ds
   ret <8 x double> %res
 }
 
+; If the pass-through operand is undef, no blend is needed.
+
+define <4 x double> @mload_constmask_v4f64_undef_passthrough(<4 x double>* %addr) {
+; AVX-LABEL: mload_constmask_v4f64_undef_passthrough:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovapd {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
+; AVX-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: mload_constmask_v4f64_undef_passthrough:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vmovapd {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
+; AVX512F-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: mload_constmask_v4f64_undef_passthrough:
+; SKX: ## BB#0:
+; SKX-NEXT: movb $7, %al
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vmovupd (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+  %res = call <4 x double> @llvm.masked.load.v4f64(<4 x double>* %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> undef)
+  ret <4 x double> %res
+}
+
+define <4 x i64> @mload_constmask_v4i64_undef_passthrough(<4 x i64>* %addr) {
+; AVX1-LABEL: mload_constmask_v4i64_undef_passthrough:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
+; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: mload_constmask_v4i64_undef_passthrough:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
+; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: mload_constmask_v4i64_undef_passthrough:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
+; AVX512F-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: mload_constmask_v4i64_undef_passthrough:
+; SKX: ## BB#0:
+; SKX-NEXT: movb $6, %al
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+  %res = call <4 x i64> @llvm.masked.load.v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x i64> undef)
+  ret <4 x i64> %res
+}
+
 define void @test21(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
 ; AVX1-LABEL: test21:
 ; AVX1: ## BB#0:
@@ -2313,4 +2367,4 @@ define void @test_mask_store_32xi16(<32 x i1> %mask, <32 x i16>* %addr, <32 x i1
   call void @llvm.masked.store.v32i16(<32 x i16> %val, <32 x i16>* %addr, i32 4, <32 x i1>%mask)
   ret void
 }
-declare void @llvm.masked.store.v32i16(<32 x i16>, <32 x i16>*, i32, <32 x i1>)
\ No newline at end of file
+declare void @llvm.masked.store.v32i16(<32 x i16>, <32 x i16>*, i32, <32 x i1>)
-- 
2.7.4
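
For reference, a minimal standalone reproducer sketch for the new combine. It is
not part of the patch: the file name, function name, and llc invocation are
assumptions, and the before/after instruction sequences in the comments simply
mirror the mload_constmask_v4f32 hunk above (constant mask 1,0,1,1; plain AVX,
no AVX512).

; constmask_demo.ll (hypothetical file name)
; Run with something like:
;   llc -mtriple=x86_64-unknown-unknown -mattr=+avx < constmask_demo.ll
;
; Before this patch, the constant-mask load selected a variable blend:
;   vmaskmovps (%rdi), %xmm1, %xmm2
;   vblendvps %xmm1, %xmm2, %xmm0, %xmm0
; With the combine, the blend control becomes an immediate
; (lanes: xmm0 = xmm1[0],xmm0[1],xmm1[2,3]):
;   vmaskmovps (%rdi), %xmm1, %xmm1
;   vblendps $0xd-style immediate blend of xmm1 and xmm0
define <4 x float> @constmask_blend_demo(<4 x float>* %addr, <4 x float> %dst) {
  %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x float> %dst)
  ret <4 x float> %res
}
declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)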