From ba78cae20f1467ebba6bd1005ef3e48e1fd96dee Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 28 Oct 2020 19:48:20 +0000 Subject: [PATCH] [AArch64] Use DUP for BUILD_VECTOR with few different elements. If most elements of BUILD_VECTOR are the same, with a few different elements, it is better to use DUP for the common elements and INSERT_VECTOR_ELT for the different elements. Currently this transform is guarded quite restrictively to only trigger in clearly beneficial cases. With D90176, patterns originating from code like `float32x4_t y = {a,a,a,0};` (common in 3D apps) are lowered even better (the unnecessary fmov is removed). Reviewed By: efriedma Differential Revision: https://reviews.llvm.org/D90233 --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 37 ++++++++++- .../test/CodeGen/AArch64/arm64-vector-insertion.ll | 73 ++++++---------------- 2 files changed, 54 insertions(+), 56 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 4cf07e9..1579a28 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8991,14 +8991,18 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, bool isConstant = true; bool AllLanesExtractElt = true; unsigned NumConstantLanes = 0; + unsigned NumDifferentLanes = 0; + unsigned NumUndefLanes = 0; SDValue Value; SDValue ConstantValue; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) AllLanesExtractElt = false; - if (V.isUndef()) + if (V.isUndef()) { + ++NumUndefLanes; continue; + } if (i > 0) isOnlyLowElement = false; if (!isa(V) && !isa(V)) @@ -9014,8 +9018,10 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, if (!Value.getNode()) Value = V; - else if (V != Value) + else if (V != Value) { usesOnlyOneValue = false; + ++NumDifferentLanes; + } } if (!Value.getNode()) { @@ -9141,11 +9147,20 
@@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, } } + // If we need to insert a small number of different non-constant elements and + // the vector width is sufficiently large, prefer using DUP with the common + // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred, + // skip the constant lane handling below. + bool PreferDUPAndInsert = + !isConstant && NumDifferentLanes >= 1 && + NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) && + NumDifferentLanes >= NumConstantLanes; + // If there was only one constant value used and for more than one lane, // start by splatting that value, then replace the non-constant lanes. This // is better than the default, which will perform a separate initialization // for each lane. - if (NumConstantLanes > 0 && usesOnlyOneConstantValue) { + if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) { // Firstly, try to materialize the splat constant. SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue), Val = ConstantBuildVector(Vec, DAG); @@ -9181,6 +9196,22 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, return shuffle; } + if (PreferDUPAndInsert) { + // First, build a constant vector with the common element. + SmallVector Ops; + for (unsigned I = 0; I < NumElts; ++I) + Ops.push_back(Value); + SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG); + // Next, insert the elements that do not match the common value. + for (unsigned I = 0; I < NumElts; ++I) + if (Op.getOperand(I) != Value) + NewVector = + DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector, + Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64)); + + return NewVector; + } + // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we // know the default expansion would otherwise fall back on something even // worse. 
For a vector with one or two non-undef values, that's diff --git a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll index 8d7cd46..0ddb87b 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll @@ -31,8 +31,6 @@ entry: ret void } -; TODO: This should jsut be a dup + clearing lane 4. - define <16 x i8> @test_insert_v16i8_insert_1(i8 %a) { ; CHECK-LABEL: test_insert_v16i8_insert_1: ; CHECK: // %bb.0: @@ -58,20 +56,9 @@ define <16 x i8> @test_insert_v16i8_insert_2(i8 %a) { define <16 x i8> @test_insert_v16i8_insert_2_undef_base(i8 %a) { ; CHECK-LABEL: test_insert_v16i8_insert_2_undef_base: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.2d v0, #0000000000000000 -; CHECK-NEXT: mov.b v0[0], w0 -; CHECK-NEXT: mov.b v0[1], w0 -; CHECK-NEXT: mov.b v0[2], w0 -; CHECK-NEXT: mov.b v0[3], w0 -; CHECK-NEXT: mov.b v0[4], w0 -; CHECK-NEXT: mov.b v0[6], w0 -; CHECK-NEXT: mov.b v0[7], w0 -; CHECK-NEXT: mov.b v0[10], w0 -; CHECK-NEXT: mov.b v0[11], w0 -; CHECK-NEXT: mov.b v0[12], w0 -; CHECK-NEXT: mov.b v0[13], w0 -; CHECK-NEXT: mov.b v0[14], w0 -; CHECK-NEXT: mov.b v0[15], w0 +; CHECK-NEXT: dup.16b v0, w0 +; CHECK-NEXT: mov.b v0[5], wzr +; CHECK-NEXT: mov.b v0[9], wzr ; CHECK-NEXT: ret %v.0 = insertelement <16 x i8> , i8 %a, i32 0 %v.1 = insertelement <16 x i8> %v.0, i8 %a, i32 1 @@ -93,19 +80,12 @@ define <16 x i8> @test_insert_v16i8_insert_2_undef_base(i8 %a) { define <16 x i8> @test_insert_v16i8_insert_2_undef_base_different_valeus(i8 %a, i8 %b) { ; CHECK-LABEL: test_insert_v16i8_insert_2_undef_base_different_valeus: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.2d v0, #0000000000000000 -; CHECK-NEXT: mov.b v0[0], w0 -; CHECK-NEXT: mov.b v0[1], w0 +; CHECK-NEXT: dup.16b v0, w0 ; CHECK-NEXT: mov.b v0[2], w1 -; CHECK-NEXT: mov.b v0[3], w0 -; CHECK-NEXT: mov.b v0[4], w0 -; CHECK-NEXT: mov.b v0[6], w0 +; CHECK-NEXT: mov.b v0[5], wzr ; CHECK-NEXT: mov.b v0[7], w1 -; 
CHECK-NEXT: mov.b v0[10], w0 -; CHECK-NEXT: mov.b v0[11], w0 +; CHECK-NEXT: mov.b v0[9], wzr ; CHECK-NEXT: mov.b v0[12], w1 -; CHECK-NEXT: mov.b v0[13], w0 -; CHECK-NEXT: mov.b v0[14], w0 ; CHECK-NEXT: mov.b v0[15], w1 ; CHECK-NEXT: ret %v.0 = insertelement <16 x i8> , i8 %a, i32 0 @@ -128,16 +108,11 @@ define <16 x i8> @test_insert_v16i8_insert_2_undef_base_different_valeus(i8 %a, define <8 x half> @test_insert_v8f16_insert_1(half %a) { ; CHECK-LABEL: test_insert_v8f16_insert_1: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.2d v1, #0000000000000000 ; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0 -; CHECK-NEXT: mov.h v1[0], v0[0] -; CHECK-NEXT: mov.h v1[1], v0[0] -; CHECK-NEXT: mov.h v1[2], v0[0] -; CHECK-NEXT: mov.h v1[3], v0[0] -; CHECK-NEXT: mov.h v1[4], v0[0] -; CHECK-NEXT: mov.h v1[5], v0[0] -; CHECK-NEXT: mov.h v1[6], v0[0] -; CHECK-NEXT: mov.16b v0, v1 +; CHECK-NEXT: adrp x8, .LCPI6_0 +; CHECK-NEXT: dup.8h v0, v0[0] +; CHECK-NEXT: add x8, x8, :lo12:.LCPI6_0 +; CHECK-NEXT: ld1.h { v0 }[7], [x8] ; CHECK-NEXT: ret %v.0 = insertelement <8 x half> , half %a, i32 0 %v.1 = insertelement <8 x half> %v.0, half %a, i32 1 @@ -167,13 +142,9 @@ define <8 x half> @test_insert_v8f16_insert_2(half %a) { define <8 x i16> @test_insert_v8i16_insert_2(i16 %a) { ; CHECK-LABEL: test_insert_v8i16_insert_2: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.2d v0, #0000000000000000 -; CHECK-NEXT: mov.h v0[0], w0 -; CHECK-NEXT: mov.h v0[1], w0 -; CHECK-NEXT: mov.h v0[2], w0 -; CHECK-NEXT: mov.h v0[4], w0 -; CHECK-NEXT: mov.h v0[5], w0 -; CHECK-NEXT: mov.h v0[6], w0 +; CHECK-NEXT: dup.8h v0, w0 +; CHECK-NEXT: mov.h v0[3], wzr +; CHECK-NEXT: mov.h v0[7], wzr ; CHECK-NEXT: ret %v.0 = insertelement <8 x i16> , i16 %a, i32 0 %v.1 = insertelement <8 x i16> %v.0, i16 %a, i32 1 @@ -187,12 +158,10 @@ define <8 x i16> @test_insert_v8i16_insert_2(i16 %a) { define <8 x i16> @test_insert_v8i16_insert_3(i16 %a) { ; CHECK-LABEL: test_insert_v8i16_insert_3: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.2d v0, 
#0000000000000000 -; CHECK-NEXT: mov.h v0[0], w0 -; CHECK-NEXT: mov.h v0[2], w0 -; CHECK-NEXT: mov.h v0[4], w0 -; CHECK-NEXT: mov.h v0[5], w0 -; CHECK-NEXT: mov.h v0[6], w0 +; CHECK-NEXT: dup.8h v0, w0 +; CHECK-NEXT: mov.h v0[1], wzr +; CHECK-NEXT: mov.h v0[3], wzr +; CHECK-NEXT: mov.h v0[7], wzr ; CHECK-NEXT: ret %v.0 = insertelement <8 x i16> , i16 %a, i32 0 %v.2 = insertelement <8 x i16> %v.0, i16 %a, i32 2 @@ -247,12 +216,10 @@ define <2 x float> @test_insert_v2f32_undef_zero_vector(float %a) { define <4 x float> @test_insert_3_f32_undef_zero_vector(float %a) { ; CHECK-LABEL: test_insert_3_f32_undef_zero_vector: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.2d v1, #0000000000000000 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: mov.s v1[0], v0[0] -; CHECK-NEXT: mov.s v1[1], v0[0] -; CHECK-NEXT: mov.s v1[2], v0[0] -; CHECK-NEXT: mov.16b v0, v1 +; CHECK-NEXT: fmov s1, wzr +; CHECK-NEXT: dup.4s v0, v0[0] +; CHECK-NEXT: mov.s v0[3], v1[0] ; CHECK-NEXT: ret %v.0 = insertelement <4 x float> , float %a, i32 0 %v.1 = insertelement <4 x float> %v.0, float %a, i32 1 -- 2.7.4