From c33af715d7762dee25e3b80720d84f21fedcbbe8 Mon Sep 17 00:00:00 2001
From: Sebastian Pop
Date: Thu, 1 Mar 2018 15:47:39 +0000
Subject: [PATCH] [AArch64] generate vuzp instead of mov

When a BUILD_VECTOR is created out of a sequence of EXTRACT_VECTOR_ELT
nodes whose indices follow one of the patterns <0, 2, 4, ...> or
<1, 3, 5, ...>, replace the BUILD_VECTOR with a uzp1 or uzp2 node,
respectively.

With this patch LLVM generates the following code for the first
function fun1 in the testcase:

	adrp	x8, .LCPI0_0
	ldr	q0, [x8, :lo12:.LCPI0_0]
	tbl	v0.16b, { v0.16b }, v0.16b
	ext	v1.16b, v0.16b, v0.16b, #8
	uzp1	v0.8b, v0.8b, v1.8b
	str	d0, [x8]
	ret

Without this patch LLVM currently generates this code:

	adrp	x8, .LCPI0_0
	ldr	q0, [x8, :lo12:.LCPI0_0]
	tbl	v0.16b, { v0.16b }, v0.16b
	mov	v1.16b, v0.16b
	mov	v1.b[1], v0.b[2]
	mov	v1.b[2], v0.b[4]
	mov	v1.b[3], v0.b[6]
	mov	v1.b[4], v0.b[8]
	mov	v1.b[5], v0.b[10]
	mov	v1.b[6], v0.b[12]
	mov	v1.b[7], v0.b[14]
	str	d1, [x8]
	ret

llvm-svn: 326443
---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 59 +++++++++++++++++++++++++
 llvm/test/CodeGen/AArch64/aarch64-vuzp.ll       | 51 +++++++++++++++++++++
 2 files changed, 110 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/aarch64-vuzp.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index eb21585..983d538 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6706,16 +6706,20 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
   //    select the values we'll be overwriting for the non-constant
   //    lanes such that we can directly materialize the vector
   //    some other way (MOVI, e.g.), we can be sneaky.
+  // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
   unsigned NumElts = VT.getVectorNumElements();
   bool isOnlyLowElement = true;
   bool usesOnlyOneValue = true;
   bool usesOnlyOneConstantValue = true;
   bool isConstant = true;
+  bool AllLanesExtractElt = true;
   unsigned NumConstantLanes = 0;
   SDValue Value;
   SDValue ConstantValue;
   for (unsigned i = 0; i < NumElts; ++i) {
     SDValue V = Op.getOperand(i);
+    if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      AllLanesExtractElt = false;
     if (V.isUndef())
       continue;
     if (i > 0)
@@ -6748,6 +6752,61 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
   }
 
+  if (AllLanesExtractElt) {
+    SDNode *Vector = nullptr;
+    bool Even = false;
+    bool Odd = false;
+    // Check whether the extract elements match the Even pattern <0,2,4,...> or
+    // the Odd pattern <1,3,5,...>.
+    for (unsigned i = 0; i < NumElts; ++i) {
+      SDValue V = Op.getOperand(i);
+      const SDNode *N = V.getNode();
+      if (!isa<ConstantSDNode>(N->getOperand(1)))
+        break;
+
+      // All elements are extracted from the same vector.
+      if (!Vector)
+        Vector = N->getOperand(0).getNode();
+      else if (Vector != N->getOperand(0).getNode()) {
+        Odd = false;
+        Even = false;
+        break;
+      }
+
+      // Extracted values are either at Even indices <0,2,4,...> or at Odd
+      // indices <1,3,5,...>.
+      uint64_t Val = N->getConstantOperandVal(1);
+      if (Val == 2 * i) {
+        Even = true;
+        continue;
+      }
+      if (Val - 1 == 2 * i) {
+        Odd = true;
+        continue;
+      }
+
+      // Something does not match: abort.
+      Odd = false;
+      Even = false;
+      break;
+    }
+    if (Even || Odd) {
+      SDValue LHS =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
+                      DAG.getConstant(0, dl, MVT::i64));
+      SDValue RHS =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
+                      DAG.getConstant(NumElts, dl, MVT::i64));
+
+      if (Even && !Odd)
+        return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
+                           RHS);
+      if (Odd && !Even)
+        return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
+                           RHS);
+    }
+  }
+
   // Use DUP for non-constant splats. For f32 constant splats, reduce to
   // i32 and try again.
   if (usesOnlyOneValue) {
diff --git a/llvm/test/CodeGen/AArch64/aarch64-vuzp.ll b/llvm/test/CodeGen/AArch64/aarch64-vuzp.ll
new file mode 100644
index 0000000..51866fa
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-vuzp.ll
@@ -0,0 +1,51 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+; CHECK-LABEL: fun1:
+; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-NOT: mov
+define i32 @fun1() {
+entry:
+  %vtbl1.i.1 = tail call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> <i8 0, i8 16, i8 19, i8 4, i8 -65, i8 -65, i8 -71, i8 -71, i8 2, i8 18, i8 21, i8 6, i8 -67, i8 -67, i8 -73, i8 -73>, <16 x i8> undef)
+  %vuzp.i212.1 = shufflevector <16 x i8> %vtbl1.i.1, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %scevgep = getelementptr <8 x i8>, <8 x i8>* undef, i64 1
+  store <8 x i8> %vuzp.i212.1, <8 x i8>* %scevgep, align 1
+  ret i32 undef
+}
+
+; CHECK-LABEL: fun2:
+; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-NOT: mov
+define i32 @fun2() {
+entry:
+  %vtbl1.i.1 = tail call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> <i8 0, i8 16, i8 19, i8 4, i8 -65, i8 -65, i8 -71, i8 -71, i8 2, i8 18, i8 21, i8 6, i8 -67, i8 -67, i8 -73, i8 -73>, <16 x i8> undef)
+  %vuzp.i212.1 = shufflevector <16 x i8> %vtbl1.i.1, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %scevgep = getelementptr <8 x i8>, <8 x i8>* undef, i64 1
+  store <8 x i8> %vuzp.i212.1, <8 x i8>* %scevgep, align 1
+  ret i32 undef
+}
+
+; CHECK-LABEL: fun3:
+; CHECK-NOT: uzp1
+; CHECK: mov
+define i32 @fun3() {
+entry:
+  %vtbl1.i.1 = tail call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> <i8 0, i8 16, i8 19, i8 4, i8 -65, i8 -65, i8 -71, i8 -71, i8 2, i8 18, i8 21, i8 6, i8 -67, i8 -67, i8 -73, i8 -73>, <16 x i8> undef)
+  %vuzp.i212.1 = shufflevector <16 x i8> %vtbl1.i.1, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 13>
+  %scevgep = getelementptr <8 x i8>, <8 x i8>* undef, i64 1
+  store <8 x i8> %vuzp.i212.1, <8 x i8>* %scevgep, align 1
+  ret i32 undef
+}
+
+; CHECK-LABEL: fun4:
+; CHECK-NOT: uzp2
+; CHECK: mov
+define i32 @fun4() {
+entry:
+  %vtbl1.i.1 = tail call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> <i8 0, i8 16, i8 19, i8 4, i8 -65, i8 -65, i8 -71, i8 -71, i8 2, i8 18, i8 21, i8 6, i8 -67, i8 -67, i8 -73, i8 -73>, <16 x i8> undef)
+  %vuzp.i212.1 = shufflevector <16 x i8> %vtbl1.i.1, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 14>
+  %scevgep = getelementptr <8 x i8>, <8 x i8>* undef, i64 1
+  store <8 x i8> %vuzp.i212.1, <8 x i8>* %scevgep, align 1
+  ret i32 undef
+}
+
+declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>)
-- 
2.7.4
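
Note: as an illustration of the pattern the new lowering matches, here is a
minimal IR sketch (not part of the patch). A chain of insertelements reading
the even lanes of a single source vector reaches the backend as a
BUILD_VECTOR whose operands are all EXTRACT_VECTOR_ELT from the same vector,
which the code above now lowers to a single uzp1 instead of one mov per lane.
The function name @even_lanes and the v16i8-to-v8i8 shapes are illustrative
assumptions, not taken from the patch.

    ; Sketch: result lane i is source lane 2*i, i.e. the Even pattern
    ; <0,2,4,...>. @even_lanes is a hypothetical name for illustration.
    define <8 x i8> @even_lanes(<16 x i8> %v) {
      %e0 = extractelement <16 x i8> %v, i64 0
      %e1 = extractelement <16 x i8> %v, i64 2
      %e2 = extractelement <16 x i8> %v, i64 4
      %e3 = extractelement <16 x i8> %v, i64 6
      %e4 = extractelement <16 x i8> %v, i64 8
      %e5 = extractelement <16 x i8> %v, i64 10
      %e6 = extractelement <16 x i8> %v, i64 12
      %e7 = extractelement <16 x i8> %v, i64 14
      ; Rebuild the narrow vector lane by lane; DAGCombine turns this chain
      ; into a BUILD_VECTOR of EXTRACT_VECTOR_ELT nodes.
      %r0 = insertelement <8 x i8> undef, i8 %e0, i64 0
      %r1 = insertelement <8 x i8> %r0, i8 %e1, i64 1
      %r2 = insertelement <8 x i8> %r1, i8 %e2, i64 2
      %r3 = insertelement <8 x i8> %r2, i8 %e3, i64 3
      %r4 = insertelement <8 x i8> %r3, i8 %e4, i64 4
      %r5 = insertelement <8 x i8> %r4, i8 %e5, i64 5
      %r6 = insertelement <8 x i8> %r5, i8 %e6, i64 6
      %r7 = insertelement <8 x i8> %r6, i8 %e7, i64 7
      ret <8 x i8> %r7
    }

Using odd indices 1, 3, 5, ... instead would exercise the UZP2 path in the
same way.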