From 3c0c24e0c1d37e903eab484b34ef854ec785346b Mon Sep 17 00:00:00 2001
From: David Green
Date: Tue, 20 Dec 2022 13:50:34 +0000
Subject: [PATCH] [AArch64] Combine to UMULL if top bits are known zero

Given mul(zext(a), b), we can convert to a umull so long as we know that
the top bits of b are zero. This uses MaskedValueIsZero to detect that
case for NEON UMULL patterns.
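
For example (an illustrative sketch based on the umull_and_v8i16 test
below; the and/mul IR lines paraphrase that test rather than quote it):

  ; paraphrased from the umull_and_v8i16 test below, not verbatim
  %in1 = zext <8 x i8> %src1 to <8 x i16>
  %in2 = and <8 x i16> %src2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %out = mul <8 x i16> %in1, %in2

The and guarantees that the top bits of %in2 are zero, so rather than
widening %src1 with ushll and using a full-width mul, the masked operand
can be truncated back to <8 x i8> and the multiply selected as umull, as
in the updated CHECK lines below:

  bic   v1.8h, #255, lsl #8
  xtn   v1.8b, v1.8h
  umull v0.8h, v0.8b, v1.8b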

Differential Revision: https://reviews.llvm.org/D140287
---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 34 +++++++++
 llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll    | 20 +++---
 llvm/test/CodeGen/AArch64/aarch64-smull.ll      | 92 +++++++++----------
 3 files changed, 74 insertions(+), 72 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c0887ec..4ee8407 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4567,8 +4567,42 @@ static unsigned selectUmullSmull(SDNode *&N0, SDNode *&N1, SelectionDAG &DAG,
       return AArch64ISD::SMULL;
     }
   }
+
+  // Select UMULL if we can replace the other operand with an extend.
+  if (IsN0ZExt || IsN1ZExt) {
+    EVT VT = N0->getValueType(0);
+    APInt Mask = APInt::getHighBitsSet(VT.getScalarSizeInBits(),
+                                       VT.getScalarSizeInBits() / 2);
+    if (DAG.MaskedValueIsZero(SDValue(IsN0ZExt ? N1 : N0, 0), Mask)) {
+      EVT HalfVT;
+      switch (VT.getSimpleVT().SimpleTy) {
+      case MVT::v2i64:
+        HalfVT = MVT::v2i32;
+        break;
+      case MVT::v4i32:
+        HalfVT = MVT::v4i16;
+        break;
+      case MVT::v8i16:
+        HalfVT = MVT::v8i8;
+        break;
+      default:
+        return 0;
+      }
+      // Truncate and then extend the result.
+      SDValue NewExt = DAG.getNode(ISD::TRUNCATE, DL, HalfVT,
+                                   SDValue(IsN0ZExt ? N1 : N0, 0));
+      NewExt = DAG.getZExtOrTrunc(NewExt, DL, VT);
+      if (IsN0ZExt)
+        N1 = NewExt.getNode();
+      else
+        N0 = NewExt.getNode();
+      return AArch64ISD::UMULL;
+    }
+  }
+
   if (!IsN1SExt && !IsN1ZExt)
     return 0;
+
   // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
   // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
   if (IsN1SExt && isAddSubSExt(N0, DAG)) {
diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
index ea21af5..197c08dc 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
@@ -116,17 +116,13 @@ entry:
 define <2 x i64> @dupzext_v2i16_v2i64(i16 %src, <2 x i16> %b) {
 ; CHECK-LABEL: dupzext_v2i16_v2i64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi d1, #0x00ffff0000ffff
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT:    and x8, x0, #0xffff
+; CHECK-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-NEXT:    dup v2.2d, x8
 ; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-NEXT:    fmov x9, d0
-; CHECK-NEXT:    mov x10, v0.d[1]
-; CHECK-NEXT:    mul x9, x8, x9
-; CHECK-NEXT:    mul x8, x8, x10
-; CHECK-NEXT:    fmov d0, x9
-; CHECK-NEXT:    mov v0.d[1], x8
+; CHECK-NEXT:    xtn v2.2s, v2.2d
+; CHECK-NEXT:    umull v0.2d, v2.2s, v0.2s
 ; CHECK-NEXT:    ret
 entry:
   %in = zext i16 %src to i64
@@ -225,12 +221,12 @@ define void @typei1_orig(i64 %a, ptr %p, ptr %q) {
 define <8 x i16> @typei1_v8i1_v8i16(i1 %src, <8 x i1> %b) {
 ; CHECK-LABEL: typei1_v8i1_v8i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v1.8b, #1
 ; CHECK-NEXT:    and w8, w0, #0x1
+; CHECK-NEXT:    movi v1.8b, #1
+; CHECK-NEXT:    dup v2.8h, w8
 ; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    dup v1.8h, w8
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    mul v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    xtn v2.8b, v2.8h
+; CHECK-NEXT:    umull v0.8h, v2.8b, v0.8b
 ; CHECK-NEXT:    ret
 entry:
   %in = zext i1 %src to i16
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index 590a944..7dd2ae1 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -932,9 +932,9 @@ define <4 x i64> @amull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
 define <8 x i16> @umull_and_v8i16(<8 x i8> %src1, <8 x i16> %src2) {
 ; CHECK-LABEL: umull_and_v8i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-NEXT:    bic v1.8h, #255, lsl #8
-; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    xtn v1.8b, v1.8h
+; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
 ; CHECK-NEXT:    ret
 entry:
   %in1 = zext <8 x i8> %src1 to <8 x i16>
@@ -946,9 +946,9 @@ entry:
 define <8 x i16> @umull_and_v8i16_c(<8 x i8> %src1, <8 x i16> %src2) {
 ; CHECK-LABEL: umull_and_v8i16_c:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-NEXT:    bic v1.8h, #255, lsl #8
-; CHECK-NEXT:    mul v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    xtn v1.8b, v1.8h
+; CHECK-NEXT:    umull v0.8h, v1.8b, v0.8b
 ; CHECK-NEXT:    ret
 entry:
   %in1 = zext <8 x i8> %src1 to <8 x i16>
@@ -989,9 +989,9 @@ define <8 x i16> @umull_smaller_v8i16(<8 x i4> %src1, <8 x i16> %src2) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v2.8b, #15
 ; CHECK-NEXT:    bic v1.8h, #255, lsl #8
+; CHECK-NEXT:    xtn v1.8b, v1.8h
 ; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
 ; CHECK-NEXT:    ret
 entry:
   %in1 = zext <8 x i4> %src1 to <8 x i16>
@@ -1004,9 +1004,9 @@ define <4 x i32> @umull_and_v4i32(<4 x i16> %src1, <4 x i32> %src2) {
 ; CHECK-LABEL: umull_and_v4i32:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v2.2d, #0x0000ff000000ff
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v1.4h, v1.4s
+; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
 ; CHECK-NEXT:    ret
 entry:
   %in1 = zext <4 x i16> %src1 to <4 x i32>
@@ -1019,12 +1019,13 @@ define <8 x i32> @umull_and_v8i32(<8 x i16> %src1, <8 x i32> %src2) {
 ; CHECK-LABEL: umull_and_v8i32:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v3.2d, #0x0000ff000000ff
-; CHECK-NEXT:    ushll v4.4s, v0.4h, #0
-; CHECK-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-NEXT:    and v5.16b, v1.16b, v3.16b
-; CHECK-NEXT:    and v1.16b, v2.16b, v3.16b
-; CHECK-NEXT:    mul v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    mul v0.4s, v4.4s, v5.4s
+; CHECK-NEXT:    ext v4.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    and v2.16b, v2.16b, v3.16b
+; CHECK-NEXT:    and v1.16b, v1.16b, v3.16b
+; CHECK-NEXT:    xtn v1.4h, v1.4s
+; CHECK-NEXT:    xtn v2.4h, v2.4s
+; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
+; CHECK-NEXT:    umull v1.4s, v4.4h, v2.4h
 ; CHECK-NEXT:    ret
 entry:
   %in1 = zext <8 x i16> %src1 to <8 x i32>
@@ -1037,11 +1038,11 @@ define <8 x i32> @umull_and_v8i32_dup(<8 x i16> %src1, i32 %src2) {
 ; CHECK-LABEL: umull_and_v8i32_dup:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    and w8, w0, #0xff
-; CHECK-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-NEXT:    ushll2 v3.4s, v0.8h, #0
+; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    dup v2.4s, w8
-; CHECK-NEXT:    mul v0.4s, v1.4s, v2.4s
-; CHECK-NEXT:    mul v1.4s, v3.4s, v2.4s
+; CHECK-NEXT:    xtn v2.4h, v2.4s
+; CHECK-NEXT:    umull v0.4s, v0.4h, v2.4h
+; CHECK-NEXT:    umull v1.4s, v1.4h, v2.4h
 ; CHECK-NEXT:    ret
 entry:
   %in1 = zext <8 x i16> %src1 to <8 x i32>
@@ -1056,16 +1057,9 @@ define <2 x i64> @umull_and_v2i64(<2 x i32> %src1, <2 x i64> %src2) {
 ; CHECK-LABEL: umull_and_v2i64:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v2.2d, #0x000000000000ff
-; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-NEXT:    fmov x10, d0
 ; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    fmov x9, d1
-; CHECK-NEXT:    mov x8, v1.d[1]
-; CHECK-NEXT:    mov x11, v0.d[1]
-; CHECK-NEXT:    mul x9, x10, x9
-; CHECK-NEXT:    mul x8, x11, x8
-; CHECK-NEXT:    fmov d0, x9
-; CHECK-NEXT:    mov v0.d[1], x8
+; CHECK-NEXT:    xtn v1.2s, v1.2d
+; CHECK-NEXT:    umull v0.2d, v0.2s, v1.2s
 ; CHECK-NEXT:    ret
 entry:
   %in1 = zext <2 x i32> %src1 to <2 x i64>
@@ -1078,26 +1072,13 @@ define <4 x i64> @umull_and_v4i64(<4 x i32> %src1, <4 x i64> %src2) {
 ; CHECK-LABEL: umull_and_v4i64:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v3.2d, #0x000000000000ff
-; CHECK-NEXT:    ushll v4.2d, v0.2s, #0
-; CHECK-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-NEXT:    fmov x14, d4
+; CHECK-NEXT:    ext v4.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    and v2.16b, v2.16b, v3.16b
-; CHECK-NEXT:    fmov x11, d0
-; CHECK-NEXT:    mov x9, v0.d[1]
-; CHECK-NEXT:    and v0.16b, v1.16b, v3.16b
-; CHECK-NEXT:    fmov x10, d2
-; CHECK-NEXT:    fmov x13, d0
-; CHECK-NEXT:    mov x8, v2.d[1]
-; CHECK-NEXT:    mov x12, v0.d[1]
-; CHECK-NEXT:    mul x10, x11, x10
-; CHECK-NEXT:    mov x15, v4.d[1]
-; CHECK-NEXT:    mul x11, x14, x13
-; CHECK-NEXT:    mul x8, x9, x8
-; CHECK-NEXT:    fmov d1, x10
-; CHECK-NEXT:    mul x9, x15, x12
-; CHECK-NEXT:    fmov d0, x11
-; CHECK-NEXT:    mov v1.d[1], x8
-; CHECK-NEXT:    mov v0.d[1], x9
+; CHECK-NEXT:    and v1.16b, v1.16b, v3.16b
+; CHECK-NEXT:    xtn v1.2s, v1.2d
+; CHECK-NEXT:    xtn v2.2s, v2.2d
+; CHECK-NEXT:    umull v0.2d, v0.2s, v1.2s
+; CHECK-NEXT:    umull v1.2d, v4.2s, v2.2s
 ; CHECK-NEXT:    ret
 entry:
   %in1 = zext <4 x i32> %src1 to <4 x i64>
@@ -1109,21 +1090,12 @@ entry:
 define <4 x i64> @umull_and_v4i64_dup(<4 x i32> %src1, i64 %src2) {
 ; CHECK-LABEL: umull_and_v4i64_dup:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll2 v1.2d, v0.4s, #0
 ; CHECK-NEXT:    and x8, x0, #0xff
-; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-NEXT:    fmov x9, d1
-; CHECK-NEXT:    fmov x11, d0
-; CHECK-NEXT:    mov x10, v1.d[1]
-; CHECK-NEXT:    mov x12, v0.d[1]
-; CHECK-NEXT:    mul x9, x9, x8
-; CHECK-NEXT:    mul x11, x11, x8
-; CHECK-NEXT:    mul x10, x10, x8
-; CHECK-NEXT:    mul x8, x12, x8
-; CHECK-NEXT:    fmov d1, x9
-; CHECK-NEXT:    fmov d0, x11
-; CHECK-NEXT:    mov v1.d[1], x10
-; CHECK-NEXT:    mov v0.d[1], x8
+; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    dup v2.2d, x8
+; CHECK-NEXT:    xtn v2.2s, v2.2d
+; CHECK-NEXT:    umull v0.2d, v0.2s, v2.2s
+; CHECK-NEXT:    umull v1.2d, v1.2s, v2.2s
 ; CHECK-NEXT:    ret
 entry:
   %in1 = zext <4 x i32> %src1 to <4 x i64>
-- 
2.7.4