From 487fa6f8c3af87232f7ff9484568be7782f7f8b2 Mon Sep 17 00:00:00 2001
From: David Sherwood
Date: Fri, 29 Jul 2022 09:32:08 +0100
Subject: [PATCH] [AArch64][DAGCombine] Add performBuildVectorCombine
 'extract_elt ~> anyext'

A build vector of two extracted elements is equivalent to an extract
subvector where the inner vector is any-extended to the
extract_vector_elt VT, because extract_vector_elt has the effect of an
any-extend.

  (build_vector (extract_elt_i16_to_i32 vec Idx+0)
                (extract_elt_i16_to_i32 vec Idx+1))
  => (extract_subvector (anyext_i16_to_i32 vec) Idx)

Depends on D130697

Differential Revision: https://reviews.llvm.org/D130698
---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp    |  47 +++++++-
 llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll |   6 +-
 .../AArch64/sve-fixed-length-extract-subvector.ll  |   7 +-
 .../AArch64/sve-fixed-length-masked-gather.ll      |   5 +-
 .../AArch64/sve-fixed-length-masked-loads.ll       |   5 +-
 .../AArch64/sve-fixed-length-masked-scatter.ll     |   5 +-
 .../AArch64/sve-fixed-length-masked-stores.ll      |   5 +-
 llvm/test/CodeGen/AArch64/vector-fcvt.ll           | 134 ++++++++-------------
 8 files changed, 100 insertions(+), 114 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c21470d..7ac8326 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -888,7 +888,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setTargetDAGCombine({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND,
                        ISD::VECTOR_SPLICE, ISD::SIGN_EXTEND_INREG,
                        ISD::CONCAT_VECTORS, ISD::EXTRACT_SUBVECTOR,
-                       ISD::INSERT_SUBVECTOR, ISD::STORE});
+                       ISD::INSERT_SUBVECTOR, ISD::STORE, ISD::BUILD_VECTOR});
 
   if (Subtarget->supportsAddressTopByteIgnored())
     setTargetDAGCombine(ISD::LOAD);
@@ -16031,6 +16031,49 @@ static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG) {
   return SDValue();
 }
 
+static SDValue performBuildVectorCombine(SDNode *N,
+                                         TargetLowering::DAGCombinerInfo &DCI,
+                                         SelectionDAG &DAG) {
+  SDLoc DL(N);
+
+  // A build vector of two extracted elements is equivalent to an
+  // extract subvector where the inner vector is any-extended to the
+  // extract_vector_elt VT.
+  //   (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
+  //                 (extract_elt_iXX_to_i32 vec Idx+1))
+  // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
+
+  // For now, only consider the v2i32 case, which arises as a result of
+  // legalization.
+  if (N->getValueType(0) != MVT::v2i32)
+    return SDValue();
+
+  SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
+  // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
+  if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+      Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+      // Constant index.
+      isa<ConstantSDNode>(Elt0->getOperand(1)) &&
+      isa<ConstantSDNode>(Elt1->getOperand(1)) &&
+      // Both EXTRACT_VECTOR_ELT from same vector...
+      Elt0->getOperand(0) == Elt1->getOperand(0) &&
+      // ... and contiguous. First element's index +1 == second element's index.
+      Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1)) {
+    SDValue VecToExtend = Elt0->getOperand(0);
+    EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
+    if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
+      return SDValue();
+
+    SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
+
+    SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
+                       SubvectorIdx);
+  }
+
+  return SDValue();
+}
+
 static SDValue performAddSubCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     SelectionDAG &DAG) {
@@ -19500,6 +19543,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::ADD:
   case ISD::SUB:
     return performAddSubCombine(N, DCI, DAG);
+  case ISD::BUILD_VECTOR:
+    return performBuildVectorCombine(N, DCI, DAG);
   case AArch64ISD::ANDS:
     return performFlagSettingCombine(N, DCI, ISD::AND);
   case AArch64ISD::ADC:
diff --git a/llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll b/llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll
index 78b71f4..864ddc2 100644
--- a/llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll
@@ -6,11 +6,7 @@
 define <2 x i16> @bitcast_v2i16_v2f16(<2 x half> %x) {
 ; CHECK-LABEL: bitcast_v2i16_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    umov w8, v0.h[0]
-; CHECK-NEXT:    umov w9, v0.h[1]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    mov v0.s[1], w9
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
   %y = bitcast <2 x half> %x to <2 x i16>
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
index 540627b..f1057fc 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
@@ -101,11 +101,8 @@ define void @extract_subvector_v256i8(<256 x i8>* %a, <128 x i8>* %b) vscale_ran
 define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extract_subvector_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    umov w8, v0.h[2]
-; CHECK-NEXT:    umov w9, v0.h[3]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    mov v0.s[1], w9
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
   %ret = call <2 x i16> @llvm.vector.extract.v2i16.v4i16(<4 x i16> %op, i64 2)
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
index bf53482..11323ac 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
@@ -575,10 +575,7 @@ define void @masked_gather_v2f16(<2 x half>* %a, <2 x half*>* %b) vscale_range(2
 ; CHECK-NEXT:    movi d0, #0000000000000000
 ; CHECK-NEXT:    ptrue p0.d, vl4
 ; CHECK-NEXT:    fcmeq v1.4h, v1.4h, #0.0
-; CHECK-NEXT:    umov w8, v1.h[0]
-; CHECK-NEXT:    umov w9, v1.h[1]
-; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    mov v1.s[1], w9
+; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
 ; CHECK-NEXT:    shl v1.2s, v1.2s, #16
 ; CHECK-NEXT:    sshr v1.2s, v1.2s, #16
 ; CHECK-NEXT:    fmov w8, s1
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
index 28e442e..eef6d60 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
@@ -17,10 +17,7 @@ define <2 x half> @masked_load_v2f16(<2 x half>* %ap, <2 x half>* %bp) vscale_ra
 ; CHECK-NEXT:    ldr s2, [x1]
 ; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    fcmeq v1.4h, v1.4h, v2.4h
-; CHECK-NEXT:    umov w8, v1.h[0]
-; CHECK-NEXT:    umov w9, v1.h[1]
-; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    mov v1.s[1], w9
+; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
 ; CHECK-NEXT:    shl v1.2s, v1.2s, #16
 ; CHECK-NEXT:    sshr v1.2s, v1.2s, #16
 ; CHECK-NEXT:    fmov w8, s1
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
index 58834bf..54b1603 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
@@ -538,10 +538,7 @@ define void @masked_scatter_v2f16(<2 x half>* %a, <2 x half*>* %b) vscale_range(
 ; CHECK-NEXT:    ptrue p0.d, vl4
 ; CHECK-NEXT:    fcmeq v2.4h, v1.4h, #0.0
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    umov w8, v2.h[0]
-; CHECK-NEXT:    umov w9, v2.h[1]
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    mov v2.s[1], w9
+; CHECK-NEXT:    ushll v2.4s, v2.4h, #0
 ; CHECK-NEXT:    shl v2.2s, v2.2s, #16
 ; CHECK-NEXT:    sshr v2.2s, v2.2s, #16
 ; CHECK-NEXT:    fmov w8, s2
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
index 3d6099e..e57523e 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
@@ -17,10 +17,7 @@ define void @masked_store_v2f16(<2 x half>* %ap, <2 x half>* %bp) vscale_range(2
 ; CHECK-NEXT:    ldr s2, [x1]
 ; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    fcmeq v2.4h, v1.4h, v2.4h
-; CHECK-NEXT:    umov w8, v2.h[0]
-; CHECK-NEXT:    umov w9, v2.h[1]
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    mov v2.s[1], w9
+; CHECK-NEXT:    ushll v2.4s, v2.4h, #0
 ; CHECK-NEXT:    shl v2.2s, v2.2s, #16
 ; CHECK-NEXT:    sshr v2.2s, v2.2s, #16
 ; CHECK-NEXT:    fmov w8, s2
diff --git a/llvm/test/CodeGen/AArch64/vector-fcvt.ll b/llvm/test/CodeGen/AArch64/vector-fcvt.ll
index 4b7736eb..7a47a3e 100644
--- a/llvm/test/CodeGen/AArch64/vector-fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/vector-fcvt.ll
@@ -194,23 +194,16 @@ define <8 x float> @uitofp_i64_float(<8 x i64> %a) {
 define <4 x double> @sitofp_v4i8_double(<4 x i8> %a) {
 ; CHECK-LABEL: sitofp_v4i8_double:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    umov w8, v0.h[0]
-; CHECK-NEXT:    umov w9, v0.h[2]
-; CHECK-NEXT:    umov w10, v0.h[1]
-; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    umov w8, v0.h[3]
-; CHECK-NEXT:    fmov s0, w9
-; CHECK-NEXT:    mov v1.s[1], w10
-; CHECK-NEXT:    mov v0.s[1], w8
-; CHECK-NEXT:    shl v1.2s, v1.2s, #24
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    shl v0.2s, v0.2s, #24
-; CHECK-NEXT:    sshr v1.2s, v1.2s, #24
 ; CHECK-NEXT:    sshr v0.2s, v0.2s, #24
+; CHECK-NEXT:    shl v1.2s, v1.2s, #24
+; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
+; CHECK-NEXT:    sshr v1.2s, v1.2s, #24
+; CHECK-NEXT:    scvtf v0.2d, v0.2d
 ; CHECK-NEXT:    sshll v1.2d, v1.2s, #0
-; CHECK-NEXT:    sshll v2.2d, v0.2s, #0
-; CHECK-NEXT:    scvtf v0.2d, v1.2d
-; CHECK-NEXT:    scvtf v1.2d, v2.2d
+; CHECK-NEXT:    scvtf v1.2d, v1.2d
 ; CHECK-NEXT:    ret
   %1 = sitofp <4 x i8> %a to <4 x double>
   ret <4 x double> %1
@@ -333,39 +326,26 @@ define <16 x double> @sitofp_v16i8_double(<16 x i8> %a) {
 define <8 x double> @sitofp_i16_double(<8 x i16> %a) {
 ; CHECK-LABEL: sitofp_i16_double:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    umov w8, v0.h[0]
-; CHECK-NEXT:    umov w9, v0.h[2]
-; CHECK-NEXT:    umov w11, v0.h[1]
-; CHECK-NEXT:    umov w10, v1.h[0]
-; CHECK-NEXT:    umov w12, v1.h[2]
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    umov w8, v0.h[3]
-; CHECK-NEXT:    fmov s0, w9
-; CHECK-NEXT:    umov w9, v1.h[1]
-; CHECK-NEXT:    fmov s3, w10
-; CHECK-NEXT:    umov w10, v1.h[3]
-; CHECK-NEXT:    fmov s1, w12
-; CHECK-NEXT:    mov v0.s[1], w8
-; CHECK-NEXT:    mov v2.s[1], w11
-; CHECK-NEXT:    mov v3.s[1], w9
-; CHECK-NEXT:    mov v1.s[1], w10
-; CHECK-NEXT:    shl v0.2s, v0.2s, #16
-; CHECK-NEXT:    shl v2.2s, v2.2s, #16
-; CHECK-NEXT:    sshr v0.2s, v0.2s, #16
-; CHECK-NEXT:    shl v3.2s, v3.2s, #16
-; CHECK-NEXT:    shl v1.2s, v1.2s, #16
+; CHECK-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEXT:    shl v2.2s, v1.2s, #16
+; CHECK-NEXT:    shl v3.2s, v0.2s, #16
+; CHECK-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
 ; CHECK-NEXT:    sshr v2.2s, v2.2s, #16
-; CHECK-NEXT:    sshll v4.2d, v0.2s, #0
-; CHECK-NEXT:    sshr v0.2s, v3.2s, #16
-; CHECK-NEXT:    sshr v1.2s, v1.2s, #16
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    sshr v3.2s, v3.2s, #16
 ; CHECK-NEXT:    sshll v2.2d, v2.2s, #0
-; CHECK-NEXT:    sshll v3.2d, v0.2s, #0
-; CHECK-NEXT:    sshll v5.2d, v1.2s, #0
-; CHECK-NEXT:    scvtf v0.2d, v2.2d
-; CHECK-NEXT:    scvtf v1.2d, v4.2d
-; CHECK-NEXT:    scvtf v2.2d, v3.2d
-; CHECK-NEXT:    scvtf v3.2d, v5.2d
+; CHECK-NEXT:    shl v1.2s, v1.2s, #16
+; CHECK-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-NEXT:    scvtf v2.2d, v2.2d
+; CHECK-NEXT:    sshr v1.2s, v1.2s, #16
+; CHECK-NEXT:    sshr v0.2s, v0.2s, #16
+; CHECK-NEXT:    sshll v3.2d, v3.2s, #0
+; CHECK-NEXT:    sshll v4.2d, v1.2s, #0
+; CHECK-NEXT:    sshll v1.2d, v0.2s, #0
+; CHECK-NEXT:    scvtf v0.2d, v3.2d
+; CHECK-NEXT:    scvtf v1.2d, v1.2d
+; CHECK-NEXT:    scvtf v3.2d, v4.2d
 ; CHECK-NEXT:    ret
   %1 = sitofp <8 x i16> %a to <8 x double>
   ret <8 x double> %1
@@ -402,22 +382,15 @@ define <8 x double> @sitofp_i64_double(<8 x i64> %a) {
 define <4 x double> @uitofp_v4i8_double(<4 x i8> %a) {
 ; CHECK-LABEL: uitofp_v4i8_double:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    umov w8, v0.h[0]
-; CHECK-NEXT:    umov w9, v0.h[2]
-; CHECK-NEXT:    umov w10, v0.h[1]
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-NEXT:    movi d1, #0x0000ff000000ff
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    umov w8, v0.h[3]
-; CHECK-NEXT:    fmov s0, w9
-; CHECK-NEXT:    mov v2.s[1], w10
-; CHECK-NEXT:    mov v0.s[1], w8
-; CHECK-NEXT:    and v2.8b, v2.8b, v1.8b
+; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ushll v1.2d, v2.2s, #0
-; CHECK-NEXT:    ushll v2.2d, v0.2s, #0
-; CHECK-NEXT:    ucvtf v0.2d, v1.2d
-; CHECK-NEXT:    ucvtf v1.2d, v2.2d
+; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-NEXT:    and v1.8b, v2.8b, v1.8b
+; CHECK-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-NEXT:    ucvtf v1.2d, v1.2d
 ; CHECK-NEXT:    ret
   %1 = uitofp <4 x i8> %a to <4 x double>
   ret <4 x double> %1
@@ -530,36 +503,23 @@ define <16 x double> @uitofp_v16i8_double(<16 x i8> %a) {
 define <8 x double> @uitofp_i16_double(<8 x i16> %a) {
 ; CHECK-LABEL: uitofp_i16_double:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    umov w8, v0.h[0]
-; CHECK-NEXT:    umov w9, v0.h[2]
-; CHECK-NEXT:    umov w11, v0.h[1]
 ; CHECK-NEXT:    movi d1, #0x00ffff0000ffff
-; CHECK-NEXT:    umov w10, v2.h[0]
-; CHECK-NEXT:    umov w12, v2.h[2]
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    umov w8, v0.h[3]
-; CHECK-NEXT:    fmov s0, w9
-; CHECK-NEXT:    umov w9, v2.h[1]
-; CHECK-NEXT:    fmov s4, w10
-; CHECK-NEXT:    umov w10, v2.h[3]
-; CHECK-NEXT:    fmov s2, w12
-; CHECK-NEXT:    mov v0.s[1], w8
-; CHECK-NEXT:    mov v3.s[1], w11
-; CHECK-NEXT:    mov v4.s[1], w9
-; CHECK-NEXT:    mov v2.s[1], w10
-; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    and v3.8b, v3.8b, v1.8b
-; CHECK-NEXT:    ushll v5.2d, v0.2s, #0
-; CHECK-NEXT:    and v0.8b, v4.8b, v1.8b
-; CHECK-NEXT:    and v1.8b, v2.8b, v1.8b
+; CHECK-NEXT:    ushll2 v2.4s, v0.8h, #0
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEXT:    and v3.8b, v2.8b, v1.8b
+; CHECK-NEXT:    and v4.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    ushll v3.2d, v3.2s, #0
-; CHECK-NEXT:    ushll v2.2d, v0.2s, #0
-; CHECK-NEXT:    ushll v4.2d, v1.2s, #0
-; CHECK-NEXT:    ucvtf v0.2d, v3.2d
-; CHECK-NEXT:    ucvtf v1.2d, v5.2d
-; CHECK-NEXT:    ucvtf v2.2d, v2.2d
-; CHECK-NEXT:    ucvtf v3.2d, v4.2d
+; CHECK-NEXT:    ushll v4.2d, v4.2s, #0
+; CHECK-NEXT:    and v2.8b, v2.8b, v1.8b
+; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ushll v5.2d, v2.2s, #0
+; CHECK-NEXT:    ucvtf v2.2d, v3.2d
+; CHECK-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-NEXT:    ucvtf v0.2d, v4.2d
+; CHECK-NEXT:    ucvtf v1.2d, v1.2d
+; CHECK-NEXT:    ucvtf v3.2d, v5.2d
 ; CHECK-NEXT:    ret
   %1 = uitofp <8 x i16> %a to <8 x double>
   ret <8 x double> %1
-- 
2.7.4
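
A worked illustration of the combine (not part of the patch; the
function name below is made up, and the IR simply mirrors the updated
extract_subvector_v4i16 test above): extracting the high <2 x i16>
half of a <4 x i16> is legalized into a v2i32 build_vector of two
contiguous extract_vector_elts, which the combine rewrites to
(extract_subvector (anyext v4i16 to v4i32) 2) - a single ushll for the
any-extend plus an ext for the subvector move, replacing the old
umov/fmov/mov round trip through general-purpose registers.

  define <2 x i16> @extract_high_half(<4 x i16> %op) {
    %ret = call <2 x i16> @llvm.vector.extract.v2i16.v4i16(<4 x i16> %op, i64 2)
    ret <2 x i16> %ret
  }
  declare <2 x i16> @llvm.vector.extract.v2i16.v4i16(<4 x i16>, i64)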