From: David Green
Date: Wed, 21 Dec 2022 14:59:59 +0000 (+0000)
Subject: [AArch64] Combine Trunc(DUP) -> DUP
X-Git-Tag: upstream/17.0.6~22959
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=3e65ad7482e9e612abcc115f8fb2ed379fcad612;p=platform%2Fupstream%2Fllvm.git

[AArch64] Combine Trunc(DUP) -> DUP

This adds a simple fold of TRUNCATE(AArch64ISD::DUP) -> AArch64ISD::DUP,
which can help generate more optimal UMULL sequences, and seems useful in
general.

Differential Revision: https://reviews.llvm.org/D140289
---

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index faee3f8..058a124 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -930,6 +930,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
                        ISD::VECTOR_SPLICE, ISD::SIGN_EXTEND_INREG,
                        ISD::CONCAT_VECTORS, ISD::EXTRACT_SUBVECTOR,
                        ISD::INSERT_SUBVECTOR, ISD::STORE, ISD::BUILD_VECTOR});
+  setTargetDAGCombine(ISD::TRUNCATE);
   setTargetDAGCombine(ISD::LOAD);
 
   setTargetDAGCombine(ISD::MSTORE);
@@ -17392,6 +17393,22 @@ static SDValue performBuildVectorCombine(SDNode *N,
   return SDValue();
 }
 
+static SDValue performTruncateCombine(SDNode *N,
+                                      SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  SDValue N0 = N->getOperand(0);
+  if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
+      N0.getOpcode() == AArch64ISD::DUP) {
+    SDValue Op = N0.getOperand(0);
+    if (VT.getScalarType() == MVT::i32 &&
+        N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
+      Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i32, Op);
+    return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Op);
+  }
+
+  return SDValue();
+}
+
 // Check an node is an extend or shift operand
 static bool isExtendOrShiftOperand(SDValue N) {
   unsigned Opcode = N.getOpcode();
@@ -21192,6 +21209,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performAddSubCombine(N, DCI, DAG);
   case ISD::BUILD_VECTOR:
     return performBuildVectorCombine(N, DCI, DAG);
+  case ISD::TRUNCATE:
+    return performTruncateCombine(N, DAG);
   case AArch64ISD::ANDS:
     return performFlagSettingCombine(N, DCI, ISD::AND);
   case AArch64ISD::ADC:
diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
index 197c08dc..f02d3f1 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
@@ -116,12 +116,10 @@ entry:
 define <2 x i64> @dupzext_v2i16_v2i64(i16 %src, <2 x i16> %b) {
 ; CHECK-LABEL: dupzext_v2i16_v2i64:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT: and x8, x0, #0xffff
+; CHECK-NEXT: and w8, w0, #0xffff
 ; CHECK-NEXT: movi d1, #0x00ffff0000ffff
-; CHECK-NEXT: dup v2.2d, x8
+; CHECK-NEXT: dup v2.2s, w8
 ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: xtn v2.2s, v2.2d
 ; CHECK-NEXT: umull v0.2d, v2.2s, v0.2s
 ; CHECK-NEXT: ret
 entry:
@@ -223,9 +221,8 @@ define <8 x i16> @typei1_v8i1_v8i16(i1 %src, <8 x i1> %b) {
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: and w8, w0, #0x1
 ; CHECK-NEXT: movi v1.8b, #1
-; CHECK-NEXT: dup v2.8h, w8
+; CHECK-NEXT: dup v2.8b, w8
 ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: xtn v2.8b, v2.8h
 ; CHECK-NEXT: umull v0.8h, v2.8b, v0.8b
 ; CHECK-NEXT: ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index 7dd2ae1..da0e428 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -1038,11 +1038,9 @@ define <8 x i32> @umull_and_v8i32_dup(<8 x i16> %src1, i32 %src2) {
 ; CHECK-LABEL: umull_and_v8i32_dup:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: and w8, w0, #0xff
-; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: dup v2.4s, w8
-; CHECK-NEXT: xtn v2.4h, v2.4s
+; CHECK-NEXT: dup v2.8h, w8
+; CHECK-NEXT: umull2 v1.4s, v0.8h, v2.8h
 ; CHECK-NEXT: umull v0.4s, v0.4h, v2.4h
-; CHECK-NEXT: umull v1.4s, v1.4h, v2.4h
 ; CHECK-NEXT: ret
 entry:
   %in1 = zext <8 x i16> %src1 to <8 x i32>
@@ -1090,12 +1088,10 @@ entry:
 define <4 x i64> @umull_and_v4i64_dup(<4 x i32> %src1, i64 %src2) {
 ; CHECK-LABEL: umull_and_v4i64_dup:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: and x8, x0, #0xff
-; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: dup v2.2d, x8
-; CHECK-NEXT: xtn v2.2s, v2.2d
+; CHECK-NEXT: and w8, w0, #0xff
+; CHECK-NEXT: dup v2.4s, w8
+; CHECK-NEXT: umull2 v1.2d, v0.4s, v2.4s
 ; CHECK-NEXT: umull v0.2d, v0.2s, v2.2s
-; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
 ; CHECK-NEXT: ret
 entry:
   %in1 = zext <4 x i32> %src1 to <4 x i64>
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-trunc-build-vec.ll b/llvm/test/CodeGen/AArch64/dag-combine-trunc-build-vec.ll
index 4fccf4d..f1bca14 100644
--- a/llvm/test/CodeGen/AArch64/dag-combine-trunc-build-vec.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-trunc-build-vec.ll
@@ -11,8 +11,7 @@ define void @no_combine(i32 %p) local_unnamed_addr {
 ; CHECK-LABEL: no_combine:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: movi v0.4h, #4
-; CHECK-NEXT: dup v1.4s, w0
-; CHECK-NEXT: xtn v1.4h, v1.4s
+; CHECK-NEXT: dup v1.4h, w0
 ; CHECK-NEXT: mov v1.d[1], v0.d[0]
 ; CHECK-NEXT: uzp1 v0.16b, v1.16b, v1.16b
 ; CHECK-NEXT: str q0, [x8]