From 02e56e2533027833ff2ca0042fae9cfaa2d85aa6 Mon Sep 17 00:00:00 2001 From: aqjune Date: Mon, 27 Jun 2022 17:44:51 +0900 Subject: [PATCH] [CodeGen] Generate efficient assembly for freeze(poison) version of `mm*_cast*` intel intrinsics This patch makes the variants of `mm*_cast*` intel intrinsics that use `shufflevector(freeze(poison), ..)` emit efficient assembly. (These intrinsics are planned to use `shufflevector(freeze(poison), ..)` after shufflevector's semantics update; relevant thread: D103874) To do so, this patch 1. Updates `LowerAVXCONCAT_VECTORS` in X86ISelLowering.cpp to recognize `FREEZE(UNDEF)` operand of `CONCAT_VECTOR` in addition to `UNDEF` 2. Updates X86InstrVecCompiler.td to recognize `insert_subvector` of `FREEZE(UNDEF)` vector as its first operand. Reviewed By: craig.topper Differential Revision: https://reviews.llvm.org/D130339 --- llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 3 ++ llvm/include/llvm/Target/TargetSelectionDAG.td | 7 ++++ llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 4 ++ llvm/lib/Target/X86/X86ISelLowering.cpp | 8 +++- llvm/lib/Target/X86/X86InstrVecCompiler.td | 2 +- llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll | 3 -- llvm/test/CodeGen/X86/avx-intrinsics-x86.ll | 45 ++++++----------------- llvm/test/CodeGen/X86/avx512-intrinsics.ll | 3 -- llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll | 2 - 9 files changed, 33 insertions(+), 44 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 810e571..fad6359 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -122,6 +122,9 @@ bool isBuildVectorOfConstantFPSDNodes(const SDNode *N); /// specified node are ISD::UNDEF. bool allOperandsUndef(const SDNode *N); +/// Return true if the specified node is FREEZE(UNDEF). +bool isFreezeUndef(const SDNode *N); + } // end namespace ISD //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index bce8f2b..e882dd70 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -175,6 +175,9 @@ def SDTExtInvec : SDTypeProfile<1, 1, [ // sext_invec SDTCisInt<0>, SDTCisVec<0>, SDTCisInt<1>, SDTCisVec<1>, SDTCisOpSmallerThanOp<1, 0> ]>; +def SDTFreeze : SDTypeProfile<1, 1, [ + SDTCisSameAs<0, 1> +]>; def SDTSetCC : SDTypeProfile<1, 3, [ // setcc SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT> @@ -453,6 +456,7 @@ def anyext : SDNode<"ISD::ANY_EXTEND" , SDTIntExtendOp>; def trunc : SDNode<"ISD::TRUNCATE" , SDTIntTruncOp>; def bitconvert : SDNode<"ISD::BITCAST" , SDTUnaryOp>; def addrspacecast : SDNode<"ISD::ADDRSPACECAST", SDTUnaryOp>; +def freeze : SDNode<"ISD::FREEZE" , SDTFreeze>; def extractelt : SDNode<"ISD::EXTRACT_VECTOR_ELT", SDTVecExtract>; def insertelt : SDNode<"ISD::INSERT_VECTOR_ELT", SDTVecInsert>; @@ -1300,6 +1304,9 @@ def post_truncstvi16 : PatFrag<(ops node:$val, node:$base, node:$offset), let ScalarMemoryVT = i16; } +// A helper for matching undef or freeze undef +def undef_or_freeze_undef : PatFrags<(ops), [(undef), (freeze undef)]>; + // TODO: Split these into volatile and unordered flavors to enable // selectively legal optimizations for each. (See D66309) def simple_load : PatFrag<(ops node:$ptr), diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index a212582..e3e9914 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -300,6 +300,10 @@ bool ISD::allOperandsUndef(const SDNode *N) { return all_of(N->op_values(), [](SDValue Op) { return Op.isUndef(); }); } +bool ISD::isFreezeUndef(const SDNode *N) { + return N->getOpcode() == ISD::FREEZE && N->getOperand(0).isUndef(); +} + bool ISD::matchUnaryPredicate(SDValue Op, std::function Match, bool AllowUndefs) { diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 00f42c2..102055c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -11461,6 +11461,7 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide"); unsigned NumOperands = Op.getNumOperands(); + unsigned NumFreezeUndef = 0; unsigned NumZero = 0; unsigned NumNonZero = 0; unsigned NonZeros = 0; @@ -11468,7 +11469,9 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, SDValue SubVec = Op.getOperand(i); if (SubVec.isUndef()) continue; - if (ISD::isBuildVectorAllZeros(SubVec.getNode())) + if (ISD::isFreezeUndef(SubVec.getNode()) && SubVec.hasOneUse()) + ++NumFreezeUndef; + else if (ISD::isBuildVectorAllZeros(SubVec.getNode())) ++NumZero; else { assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range. @@ -11490,7 +11493,8 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, // Otherwise, build it up through insert_subvectors. SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl) - : DAG.getUNDEF(ResVT); + : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT)) + : DAG.getUNDEF(ResVT)); MVT SubVT = Op.getOperand(0).getSimpleValueType(); unsigned NumSubElems = SubVT.getVectorNumElements(); diff --git a/llvm/lib/Target/X86/X86InstrVecCompiler.td b/llvm/lib/Target/X86/X86InstrVecCompiler.td index e6ecbb6..70bd77b 100644 --- a/llvm/lib/Target/X86/X86InstrVecCompiler.td +++ b/llvm/lib/Target/X86/X86InstrVecCompiler.td @@ -68,7 +68,7 @@ multiclass subvector_subreg_lowering; - def : Pat<(VT (insert_subvector undef, subRC:$src, (iPTR 0))), + def : Pat<(VT (insert_subvector undef_or_freeze_undef, subRC:$src, (iPTR 0))), (VT (INSERT_SUBREG (IMPLICIT_DEF), subRC:$src, subIdx))>; } diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll index c6f5d79..728c15c 100644 --- a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll @@ -258,7 +258,6 @@ define <4 x double> @test_mm256_castpd128_pd256_freeze(<2 x double> %a0) nounwin ; CHECK-LABEL: test_mm256_castpd128_pd256_freeze: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %a1 = freeze <2 x double> poison %res = shufflevector <2 x double> %a0, <2 x double> %a1, <4 x i32> @@ -304,7 +303,6 @@ define <8 x float> @test_mm256_castps128_ps256_freeze(<4 x float> %a0) nounwind ; CHECK-LABEL: test_mm256_castps128_ps256_freeze: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %a1 = freeze <4 x float> poison %res = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> @@ -334,7 +332,6 @@ define <4 x i64> @test_mm256_castsi128_si256_freeze(<2 x i64> %a0) nounwind { ; CHECK-LABEL: test_mm256_castsi128_si256_freeze: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %a1 = freeze <2 x i64> poison %res = shufflevector <2 x i64> %a0, <2 x i64> %a1, <4 x i32> diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll index ebbcb53..80ca682 100644 --- a/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll @@ -1033,17 +1033,10 @@ declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnon define <4 x double> @test_mm256_castpd128_pd256_freeze(<2 x double> %a0) nounwind { -; AVX-LABEL: test_mm256_castpd128_pd256_freeze: -; AVX: # %bb.0: -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] -; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] -; -; AVX512VL-LABEL: test_mm256_castpd128_pd256_freeze: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] -; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; CHECK-LABEL: test_mm256_castpd128_pd256_freeze: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %a1 = freeze <2 x double> poison %res = shufflevector <2 x double> %a0, <2 x double> %a1, <4 x i32> ret <4 x double> %res @@ -1051,17 +1044,10 @@ define <4 x double> @test_mm256_castpd128_pd256_freeze(<2 x double> %a0) nounwin define <8 x float> @test_mm256_castps128_ps256_freeze(<4 x float> %a0) nounwind { -; AVX-LABEL: test_mm256_castps128_ps256_freeze: -; AVX: # %bb.0: -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] -; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] -; -; AVX512VL-LABEL: test_mm256_castps128_ps256_freeze: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] -; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; CHECK-LABEL: test_mm256_castps128_ps256_freeze: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %a1 = freeze <4 x float> poison %res = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> ret <8 x float> %res @@ -1069,17 +1055,10 @@ define <8 x float> @test_mm256_castps128_ps256_freeze(<4 x float> %a0) nounwind define <4 x i64> @test_mm256_castsi128_si256_freeze(<2 x i64> %a0) nounwind { -; AVX-LABEL: test_mm256_castsi128_si256_freeze: -; AVX: # %bb.0: -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] -; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] -; -; AVX512VL-LABEL: test_mm256_castsi128_si256_freeze: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] -; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; CHECK-LABEL: test_mm256_castsi128_si256_freeze: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %a1 = freeze <2 x i64> poison %res = shufflevector <2 x i64> %a0, <2 x i64> %a1, <4 x i32> ret <4 x i64> %res diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll index 3339549..8455121 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -7510,7 +7510,6 @@ define <8 x double> @test_mm256_castpd256_pd256_freeze(<4 x double> %a0) nounwin ; CHECK-LABEL: test_mm256_castpd256_pd256_freeze: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %a1 = freeze <4 x double> poison %res = shufflevector <4 x double> %a0, <4 x double> %a1, <8 x i32> @@ -7536,7 +7535,6 @@ define <16 x float> @test_mm256_castps256_ps512_freeze(<8 x float> %a0) nounwind ; CHECK-LABEL: test_mm256_castps256_ps512_freeze: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %a1 = freeze <8 x float> poison %res = shufflevector <8 x float> %a0, <8 x float> %a1, <16x i32> @@ -7562,7 +7560,6 @@ define <8 x i64> @test_mm512_castsi256_si512_pd256_freeze(<4 x i64> %a0) nounwin ; CHECK-LABEL: test_mm512_castsi256_si512_pd256_freeze: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %a1 = freeze <4 x i64> poison %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <8 x i32> diff --git a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll index 7681b7b..6a46a68 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll @@ -1221,7 +1221,6 @@ define <16 x half> @test_mm256_castph128_ph256_freeze(<8 x half> %a0) nounwind { ; CHECK-LABEL: test_mm256_castph128_ph256_freeze: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; CHECK-NEXT: retq %a1 = freeze <8 x half> poison %res = shufflevector <8 x half> %a0, <8 x half> %a1, <16 x i32> @@ -1247,7 +1246,6 @@ define <32 x half> @test_mm512_castph256_ph512_freeze(<16 x half> %a0) nounwind ; CHECK-LABEL: test_mm512_castph256_ph512_freeze: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 ; CHECK-NEXT: retq %a1 = freeze <16 x half> poison %res = shufflevector <16 x half> %a0, <16 x half> %a1, <32 x i32> -- 2.7.4