From a1e799288d33182d52554d3b1440255279863371 Mon Sep 17 00:00:00 2001 From: David Green Date: Fri, 11 Nov 2022 08:27:44 +0000 Subject: [PATCH] [AArch64] Add smull sinking extract-and-splat tests and regenerate neon-vmull-high-p8.ll. NFC --- llvm/test/CodeGen/AArch64/neon-vmull-high-p8.ll | 22 ++++-- llvm/test/CodeGen/AArch64/sinksplat.ll | 101 ++++++++++++++++++------ 2 files changed, 93 insertions(+), 30 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/neon-vmull-high-p8.ll b/llvm/test/CodeGen/AArch64/neon-vmull-high-p8.ll index 137a865..7d07b41 100644 --- a/llvm/test/CodeGen/AArch64/neon-vmull-high-p8.ll +++ b/llvm/test/CodeGen/AArch64/neon-vmull-high-p8.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -asm-verbose=0 -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s ; Check that pmull2 instruction is used for vmull_high_p8 intrinsic ; even if shufflevector instructions are located in different basic blocks, @@ -8,10 +8,12 @@ define <8 x i16> @test_pmull2_sink(<16 x i8> %a, <16 x i8> %b, <8 x i16> %c, i1 %t) { ; CHECK-LABEL: test_pmull2_sink: -; CHECK: tbz w0, #0, .LBB0_2 +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: tbz w0, #0, .LBB0_2 +; CHECK-NEXT: // %bb.1: // %if.then ; CHECK-NEXT: pmull2 v0.8h, v0.16b, v1.16b ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: .LBB0_2: // %cleanup ; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret entry: @@ -30,10 +32,12 @@ cleanup: define <8 x i16> @test_pmull2_sink2(<16 x i8> %a, <16 x i8> %b, <8 x i16> %c, i1 %t) { ; CHECK-LABEL: test_pmull2_sink2: -; CHECK: tbz w0, #0, .LBB1_2 +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: tbz w0, #0, .LBB1_2 +; CHECK-NEXT: // %bb.1: // %if.then ; CHECK-NEXT: pmull2 v0.8h, v0.16b, v0.16b ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB1_2: +; CHECK-NEXT: .LBB1_2: // %cleanup ; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret entry: @@ -52,12 
+56,14 @@ cleanup: define <8 x i16> @test_pmull2_sink3(<16 x i8> %a, <16 x i8> %b, <8 x i16> %c, i1 %t, i1 %t2) { ; CHECK-LABEL: test_pmull2_sink3: -; CHECK: tbz w0, #0, .LBB2_2 +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: tbz w0, #0, .LBB2_2 +; CHECK-NEXT: // %bb.1: // %if.then ; CHECK-NEXT: tbz w1, #0, .LBB2_3 -; CHECK-NEXT: .LBB2_2: +; CHECK-NEXT: .LBB2_2: // %if.then.2 ; CHECK-NEXT: pmull2 v0.8h, v0.16b, v1.16b ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB2_3: +; CHECK-NEXT: .LBB2_3: // %cleanup ; CHECK-NEXT: pmull2 v0.8h, v0.16b, v0.16b ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/sinksplat.ll b/llvm/test/CodeGen/AArch64/sinksplat.ll index e897d4a9..59a9ba9 100644 --- a/llvm/test/CodeGen/AArch64/sinksplat.ll +++ b/llvm/test/CodeGen/AArch64/sinksplat.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s -define <4 x i32> @smull(<4 x i16> %x, <4 x i16> *%y) { +define <4 x i32> @smull(<4 x i16> %x, ptr %y) { ; CHECK-LABEL: smull: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: fmov d1, d0 @@ -22,7 +22,7 @@ entry: l1: %p = phi i32 [ 0, %entry ], [ %pa, %l1 ] %q = phi <4 x i32> [ zeroinitializer, %entry ], [ %c, %l1 ] - %l = load <4 x i16>, <4 x i16> *%y + %l = load <4 x i16>, ptr %y %b = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %l, <4 x i16> %a) %c = add nsw <4 x i32> %q, %b %pa = add i32 %p, 1 @@ -33,7 +33,7 @@ l2: ret <4 x i32> %c } -define <4 x i32> @umull(<4 x i16> %x, <4 x i16> *%y) { +define <4 x i32> @umull(<4 x i16> %x, ptr %y) { ; CHECK-LABEL: umull: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: fmov d1, d0 @@ -54,7 +54,7 @@ entry: l1: %p = phi i32 [ 0, %entry ], [ %pa, %l1 ] %q = phi <4 x i32> [ zeroinitializer, %entry ], [ %c, %l1 ] - %l = load <4 x i16>, <4 x i16> *%y + %l = load <4 x i16>, ptr %y %b = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %l, <4 x i16> %a) %c = add nsw <4 x i32> %q, %b %pa = add i32 %p, 
1 @@ -65,7 +65,7 @@ l2: ret <4 x i32> %c } -define <4 x i32> @sqadd(<4 x i32> %x, <4 x i32> *%y) { +define <4 x i32> @sqadd(<4 x i32> %x, ptr %y) { ; CHECK-LABEL: sqadd: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov v1.16b, v0.16b @@ -87,7 +87,7 @@ entry: l1: %p = phi i32 [ 0, %entry ], [ %pa, %l1 ] %q = phi <4 x i32> [ zeroinitializer, %entry ], [ %c, %l1 ] - %l = load <4 x i32>, <4 x i32> *%y + %l = load <4 x i32>, ptr %y %b = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %l, <4 x i32> %a) %c = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %q, <4 x i32> %b) %pa = add i32 %p, 1 @@ -98,7 +98,7 @@ l2: ret <4 x i32> %c } -define <4 x i32> @sqsub(<4 x i32> %x, <4 x i32> *%y) { +define <4 x i32> @sqsub(<4 x i32> %x, ptr %y) { ; CHECK-LABEL: sqsub: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov v1.16b, v0.16b @@ -120,7 +120,7 @@ entry: l1: %p = phi i32 [ 0, %entry ], [ %pa, %l1 ] %q = phi <4 x i32> [ zeroinitializer, %entry ], [ %c, %l1 ] - %l = load <4 x i32>, <4 x i32> *%y + %l = load <4 x i32>, ptr %y %b = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %l, <4 x i32> %a) %c = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %q, <4 x i32> %b) %pa = add i32 %p, 1 @@ -131,7 +131,7 @@ l2: ret <4 x i32> %c } -define <4 x i32> @sqdmulh(<4 x i32> %x, <4 x i32> *%y) { +define <4 x i32> @sqdmulh(<4 x i32> %x, ptr %y) { ; CHECK-LABEL: sqdmulh: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov v1.16b, v0.16b @@ -153,7 +153,7 @@ entry: l1: %p = phi i32 [ 0, %entry ], [ %pa, %l1 ] %q = phi <4 x i32> [ zeroinitializer, %entry ], [ %c, %l1 ] - %l = load <4 x i32>, <4 x i32> *%y + %l = load <4 x i32>, ptr %y %b = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %l, <4 x i32> %a) %c = add nsw <4 x i32> %q, %b %pa = add i32 %p, 1 @@ -164,7 +164,7 @@ l2: ret <4 x i32> %c } -define <4 x i32> @sqdmull(<4 x i16> %x, <4 x i16> *%y) { +define <4 x i32> @sqdmull(<4 x i16> %x, ptr %y) { ; CHECK-LABEL: sqdmull: ; CHECK: 
// %bb.0: // %entry ; CHECK-NEXT: fmov d1, d0 @@ -186,7 +186,7 @@ entry: l1: %p = phi i32 [ 0, %entry ], [ %pa, %l1 ] %q = phi <4 x i32> [ zeroinitializer, %entry ], [ %c, %l1 ] - %l = load <4 x i16>, <4 x i16> *%y + %l = load <4 x i16>, ptr %y %b = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %l, <4 x i16> %a) %c = add nsw <4 x i32> %q, %b %pa = add i32 %p, 1 @@ -197,7 +197,7 @@ l2: ret <4 x i32> %c } -define <4 x i32> @mlal(<4 x i32> %x, <4 x i32> *%y) { +define <4 x i32> @mlal(<4 x i32> %x, ptr %y) { ; CHECK-LABEL: mlal: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov v1.16b, v0.16b @@ -219,7 +219,7 @@ entry: l1: %p = phi i32 [ 0, %entry ], [ %pa, %l1 ] %q = phi <4 x i32> [ zeroinitializer, %entry ], [ %c, %l1 ] - %l = load <4 x i32>, <4 x i32> *%y + %l = load <4 x i32>, ptr %y %b = mul <4 x i32> %l, %a %c = add <4 x i32> %q, %b %pa = add i32 %p, 1 @@ -230,7 +230,7 @@ l2: ret <4 x i32> %c } -define <4 x float> @fmul(<4 x float> %x, <4 x float> *%y) { +define <4 x float> @fmul(<4 x float> %x, ptr %y) { ; CHECK-LABEL: fmul: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov v1.16b, v0.16b @@ -252,7 +252,7 @@ entry: l1: %p = phi i32 [ 0, %entry ], [ %pa, %l1 ] %q = phi <4 x float> [ zeroinitializer, %entry ], [ %c, %l1 ] - %l = load <4 x float>, <4 x float> *%y + %l = load <4 x float>, ptr %y %b = fmul <4 x float> %l, %a %c = fadd <4 x float> %b, %q %pa = add i32 %p, 1 @@ -263,7 +263,7 @@ l2: ret <4 x float> %c } -define <4 x float> @fmuladd(<4 x float> %x, <4 x float> *%y) { +define <4 x float> @fmuladd(<4 x float> %x, ptr %y) { ; CHECK-LABEL: fmuladd: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov v1.16b, v0.16b @@ -285,7 +285,7 @@ entry: l1: %p = phi i32 [ 0, %entry ], [ %pa, %l1 ] %q = phi <4 x float> [ zeroinitializer, %entry ], [ %c, %l1 ] - %l = load <4 x float>, <4 x float> *%y + %l = load <4 x float>, ptr %y %b = fmul fast <4 x float> %l, %a %c = fadd fast <4 x float> %b, %q %pa = add i32 %p, 1 @@ -296,7 +296,7 @@ l2: ret <4 x float> %c } 
-define <4 x float> @fma(<4 x float> %x, <4 x float> *%y) { +define <4 x float> @fma(<4 x float> %x, ptr %y) { ; CHECK-LABEL: fma: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov v1.16b, v0.16b @@ -320,7 +320,7 @@ entry: l1: %p = phi i32 [ 0, %entry ], [ %pa, %l1 ] %q = phi <4 x float> [ zeroinitializer, %entry ], [ %c, %l1 ] - %l = load <4 x float>, <4 x float> *%y + %l = load <4 x float>, ptr %y %c = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %l, <4 x float> %q, <4 x float> %a) %pa = add i32 %p, 1 %c1 = icmp eq i32 %p, 0 @@ -330,7 +330,7 @@ l2: ret <4 x float> %c } -define <4 x i32> @smull_nonsplat(<4 x i16> %x, <4 x i16> *%y) { +define <4 x i32> @smull_nonsplat(<4 x i16> %x, ptr %y) { ; CHECK-LABEL: smull_nonsplat: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: fmov d1, d0 @@ -353,7 +353,7 @@ entry: l1: %p = phi i32 [ 0, %entry ], [ %pa, %l1 ] %q = phi <4 x i32> [ zeroinitializer, %entry ], [ %c, %l1 ] - %l = load <4 x i16>, <4 x i16> *%y + %l = load <4 x i16>, ptr %y %b = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %l, <4 x i16> %a) %c = add nsw <4 x i32> %q, %b %pa = add i32 %p, 1 @@ -364,6 +364,63 @@ l2: ret <4 x i32> %c } +define <4 x i32> @smull_splat_and_extract(<4 x i16> %x, <8 x i16> %l, ptr %y, i1 %co) { +; CHECK-LABEL: smull_splat_and_extract: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov d2, d0 +; CHECK-NEXT: smull v0.4s, v1.4h, v2.h[3] +; CHECK-NEXT: tbz w1, #0, .LBB11_2 +; CHECK-NEXT: // %bb.1: // %l1 +; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: smlal v0.4s, v1.4h, v2.h[3] +; CHECK-NEXT: .LBB11_2: // %l2 +; CHECK-NEXT: ret +entry: + %a = shufflevector <4 x i16> %x, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %e1 = shufflevector <8 x i16> %l, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %e2 = shufflevector <8 x i16> %l, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %b = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %e1, <4 x i16> %a) + br i1 %co, label %l1, label %l2 + +l1: + %b2 = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>
%e2, <4 x i16> %a) + %c2 = add nsw <4 x i32> %b, %b2 + br label %l2 + +l2: + %r = phi <4 x i32> [ %b, %entry ], [ %c2, %l1 ] + ret <4 x i32> %r +} + +define <4 x i32> @umull_splat_and_extract(<4 x i16> %x, <8 x i16> %l, ptr %y, i1 %co) { +; CHECK-LABEL: umull_splat_and_extract: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov d2, d0 +; CHECK-NEXT: umull v0.4s, v1.4h, v2.h[3] +; CHECK-NEXT: tbz w1, #0, .LBB12_2 +; CHECK-NEXT: // %bb.1: // %l1 +; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: umlal v0.4s, v1.4h, v2.h[3] +; CHECK-NEXT: .LBB12_2: // %l2 +; CHECK-NEXT: ret +entry: + %a = shufflevector <4 x i16> %x, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %e1 = shufflevector <8 x i16> %l, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %e2 = shufflevector <8 x i16> %l, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %b = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %e1, <4 x i16> %a) + br i1 %co, label %l1, label %l2 + +l1: + %b2 = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %e2, <4 x i16> %a) + %c2 = add nsw <4 x i32> %b, %b2 + br label %l2 + +l2: + %r = phi <4 x i32> [ %b, %entry ], [ %c2, %l1 ] + ret <4 x i32> %r +} + + declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>) -- 2.7.4