From 0849f0a5a38a16f52d7c698b7b024bd58942fab7 Mon Sep 17 00:00:00 2001 From: Mingming Liu Date: Sun, 4 Sep 2022 19:30:28 -0700 Subject: [PATCH] [AArch64] Pre-commit test case to show sub-optimal codegen for Github issue #57502 Pre-commit test cases to show cases when UZP1 (TRUNC, TRUNC) could be combined into TRUNC (UZP1) (with some proper bit conversions in the middle) to generate more efficient code. Differential Revision: https://reviews.llvm.org/D133280 --- llvm/test/CodeGen/AArch64/aarch64-uzp1-combine.ll | 313 ++++++++++++++++++++++ 1 file changed, 313 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/aarch64-uzp1-combine.ll diff --git a/llvm/test/CodeGen/AArch64/aarch64-uzp1-combine.ll b/llvm/test/CodeGen/AArch64/aarch64-uzp1-combine.ll new file mode 100644 index 0000000..7f81ff6 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-uzp1-combine.ll @@ -0,0 +1,313 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK-LE %s +; RUN: llc < %s -mtriple aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK-BE %s + +; Test cases to show when UZP1 (TRUNC, TRUNC) could be combined to TRUNC (UZP1) but not yet implemented. + +define <4 x i16> @test_combine_v4i16_v2i64(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LE-LABEL: test_combine_v4i16_v2i64: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: xtn v0.2s, v0.2d +; CHECK-LE-NEXT: xtn v1.2s, v1.2d +; CHECK-LE-NEXT: uzp1 v0.4h, v0.4h, v1.4h +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: test_combine_v4i16_v2i64: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: xtn v1.2s, v1.2d +; CHECK-BE-NEXT: xtn v0.2s, v0.2d +; CHECK-BE-NEXT: rev32 v1.4h, v1.4h +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: uzp1 v0.4h, v0.4h, v1.4h +; CHECK-BE-NEXT: rev64 v0.4h, v0.4h +; CHECK-BE-NEXT: ret + %a1 = trunc <2 x i64> %a to <2 x i32> + %b1 = trunc <2 x i64> %b to <2 x i32> + + %a2 = bitcast <2 x i32> %a1 to <4 x i16> + %b2 = bitcast <2 x i32> %b1 to <4 x i16> + + %ab = shufflevector <4 x i16> %a2, <4 x i16> %b2, <4 x i32> + ret <4 x i16> %ab +} + +define <4 x i16> @test_combine_v4i16_v4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LE-LABEL: test_combine_v4i16_v4i32: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: xtn v0.4h, v0.4s +; CHECK-LE-NEXT: xtn v1.4h, v1.4s +; CHECK-LE-NEXT: uzp1 v0.4h, v0.4h, v1.4h +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: test_combine_v4i16_v4i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: rev64 v1.4s, v1.4s +; CHECK-BE-NEXT: rev64 v0.4s, v0.4s +; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: xtn v1.4h, v1.4s +; CHECK-BE-NEXT: xtn v0.4h, v0.4s +; CHECK-BE-NEXT: uzp1 v0.4h, v0.4h, v1.4h +; CHECK-BE-NEXT: rev64 v0.4h, v0.4h +; CHECK-BE-NEXT: ret + %a1 = trunc <4 x i32> %a to <4 x i16> + %b1 = trunc <4 x i32> %b to <4 x i16> + + %ab = shufflevector <4 x i16> %a1, <4 x i16> %b1, <4 x i32> + ret <4 x i16> %ab +} + +define <4 x i16> @test_combine_v4i16_v8i16(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LE-LABEL: test_combine_v4i16_v8i16: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: xtn v0.8b, v0.8h +; CHECK-LE-NEXT: xtn v1.8b, v1.8h +; CHECK-LE-NEXT: uzp1 v0.4h, v0.4h, v1.4h +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: test_combine_v4i16_v8i16: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: rev64 v1.8h, v1.8h +; CHECK-BE-NEXT: rev64 v0.8h, v0.8h +; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: xtn v1.8b, v1.8h +; CHECK-BE-NEXT: xtn v0.8b, v0.8h +; CHECK-BE-NEXT: rev16 v1.8b, v1.8b +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: uzp1 v0.4h, v0.4h, v1.4h +; CHECK-BE-NEXT: rev64 v0.4h, v0.4h +; CHECK-BE-NEXT: ret + %a1 = trunc <8 x i16> %a to <8 x i8> + %b1 = trunc <8 x i16> %b to <8 x i8> + + %a2 = bitcast <8 x i8> %a1 to <4 x i16> + %b2 = bitcast <8 x i8> %b1 to <4 x i16> + + %ab = shufflevector <4 x i16> %a2, <4 x i16> %b2, <4 x i32> + ret <4 x i16> %ab +} + + +define <8 x i8> @test_combine_v8i8_v2i64(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LE-LABEL: test_combine_v8i8_v2i64: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: xtn v0.2s, v0.2d +; CHECK-LE-NEXT: xtn v1.2s, v1.2d +; CHECK-LE-NEXT: uzp1 v0.8b, v0.8b, v1.8b +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: test_combine_v8i8_v2i64: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: xtn v1.2s, v1.2d +; CHECK-BE-NEXT: xtn v0.2s, v0.2d +; CHECK-BE-NEXT: rev32 v1.8b, v1.8b +; CHECK-BE-NEXT: rev32 v0.8b, v0.8b +; CHECK-BE-NEXT: uzp1 v0.8b, v0.8b, v1.8b +; CHECK-BE-NEXT: rev64 v0.8b, v0.8b +; CHECK-BE-NEXT: ret + %a1 = trunc <2 x i64> %a to <2 x i32> + %b1 = trunc <2 x i64> %b to <2 x i32> + + %a2 = bitcast <2 x i32> %a1 to <8 x i8> + %b2 = bitcast <2 x i32> %b1 to <8 x i8> + + %ab = shufflevector <8 x i8> %a2, <8 x i8> %b2, <8 x i32> + ret <8 x i8> %ab +} + +define <8 x i8> @test_combine_v8i8_v4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LE-LABEL: test_combine_v8i8_v4i32: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: xtn v0.4h, v0.4s +; CHECK-LE-NEXT: xtn v1.4h, v1.4s +; CHECK-LE-NEXT: uzp1 v0.8b, v0.8b, v1.8b +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: test_combine_v8i8_v4i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: rev64 v1.4s, v1.4s +; CHECK-BE-NEXT: rev64 v0.4s, v0.4s +; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: xtn v1.4h, v1.4s +; CHECK-BE-NEXT: xtn v0.4h, v0.4s +; CHECK-BE-NEXT: rev16 v1.8b, v1.8b +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: uzp1 v0.8b, v0.8b, v1.8b +; CHECK-BE-NEXT: rev64 v0.8b, v0.8b +; CHECK-BE-NEXT: ret + %a1 = trunc <4 x i32> %a to <4 x i16> + %b1 = trunc <4 x i32> %b to <4 x i16> + + %a2 = bitcast <4 x i16> %a1 to <8 x i8> + %b2 = bitcast <4 x i16> %b1 to <8 x i8> + + %ab = shufflevector <8 x i8> %a2, <8 x i8> %b2, <8 x i32> + ret <8 x i8> %ab +} + +define <8 x i8> @test_combine_v8i8_v8i16(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LE-LABEL: test_combine_v8i8_v8i16: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: xtn v0.8b, v0.8h +; CHECK-LE-NEXT: xtn v1.8b, v1.8h +; CHECK-LE-NEXT: uzp1 v0.8b, v0.8b, v1.8b +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: test_combine_v8i8_v8i16: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: rev64 v1.8h, v1.8h +; CHECK-BE-NEXT: rev64 v0.8h, v0.8h +; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: xtn v1.8b, v1.8h +; CHECK-BE-NEXT: xtn v0.8b, v0.8h +; CHECK-BE-NEXT: uzp1 v0.8b, v0.8b, v1.8b +; CHECK-BE-NEXT: rev64 v0.8b, v0.8b +; CHECK-BE-NEXT: ret + %a1 = trunc <8 x i16> %a to <8 x i8> + %b1 = trunc <8 x i16> %b to <8 x i8> + + %ab = shufflevector <8 x i8> %a1, <8 x i8> %b1, <8 x i32> + ret <8 x i8> %ab +} + +define <2 x i32> @test_combine_v2i32_v2i64(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LE-LABEL: test_combine_v2i32_v2i64: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: xtn v0.2s, v0.2d +; CHECK-LE-NEXT: xtn v1.2s, v1.2d +; CHECK-LE-NEXT: zip1 v0.2s, v0.2s, v1.2s +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: test_combine_v2i32_v2i64: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: xtn v1.2s, v1.2d +; CHECK-BE-NEXT: xtn v0.2s, v0.2d +; CHECK-BE-NEXT: zip1 v0.2s, v0.2s, v1.2s +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %a1 = trunc <2 x i64> %a to <2 x i32> + %b1 = trunc <2 x i64> %b to <2 x i32> + + %ab = shufflevector <2 x i32> %a1, <2 x i32> %b1, <2 x i32> + ret <2 x i32> %ab +} + +define <2 x i32> @test_combine_v2i32_v4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LE-LABEL: test_combine_v2i32_v4i32: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: xtn v0.4h, v0.4s +; CHECK-LE-NEXT: xtn v1.4h, v1.4s +; CHECK-LE-NEXT: zip1 v0.2s, v0.2s, v1.2s +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: test_combine_v2i32_v4i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: rev64 v1.4s, v1.4s +; CHECK-BE-NEXT: rev64 v0.4s, v0.4s +; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: xtn v1.4h, v1.4s +; CHECK-BE-NEXT: xtn v0.4h, v0.4s +; CHECK-BE-NEXT: rev32 v1.4h, v1.4h +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: zip1 v0.2s, v0.2s, v1.2s +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %a1 = trunc <4 x i32> %a to <4 x i16> + %b1 = trunc <4 x i32> %b to <4 x i16> + + %a2 = bitcast <4 x i16> %a1 to <2 x i32> + %b2 = bitcast <4 x i16> %b1 to <2 x i32> + + %ab = shufflevector <2 x i32> %a2, <2 x i32> %b2, <2 x i32> + ret <2 x i32> %ab +} + +define <2 x i32> @test_combine_v2i32_v8i16(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LE-LABEL: test_combine_v2i32_v8i16: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: xtn v0.8b, v0.8h +; CHECK-LE-NEXT: xtn v1.8b, v1.8h +; CHECK-LE-NEXT: zip1 v0.2s, v0.2s, v1.2s +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: test_combine_v2i32_v8i16: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: rev64 v1.8h, v1.8h +; CHECK-BE-NEXT: rev64 v0.8h, v0.8h +; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: xtn v1.8b, v1.8h +; CHECK-BE-NEXT: xtn v0.8b, v0.8h +; CHECK-BE-NEXT: rev32 v1.8b, v1.8b +; CHECK-BE-NEXT: rev32 v0.8b, v0.8b +; CHECK-BE-NEXT: zip1 v0.2s, v0.2s, v1.2s +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %a1 = trunc <8 x i16> %a to <8 x i8> + %b1 = trunc <8 x i16> %b to <8 x i8> + + %a2 = bitcast <8 x i8> %a1 to <2 x i32> + %b2 = bitcast <8 x i8> %b1 to <2 x i32> + + %ab = shufflevector <2 x i32> %a2, <2 x i32> %b2, <2 x i32> + ret <2 x i32> %ab +} + +define i8 @trunc_v4i64_v4i8(<4 x i64> %input) { +; CHECK-LE-LABEL: trunc_v4i64_v4i8: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: xtn v1.2s, v1.2d +; CHECK-LE-NEXT: xtn v0.2s, v0.2d +; CHECK-LE-NEXT: uzp1 v0.4h, v0.4h, v1.4h +; CHECK-LE-NEXT: addv h0, v0.4h +; CHECK-LE-NEXT: fmov w0, s0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: trunc_v4i64_v4i8: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-BE-NEXT: xtn v0.2s, v0.2d +; CHECK-BE-NEXT: xtn v1.2s, v1.2d +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: rev32 v1.4h, v1.4h +; CHECK-BE-NEXT: uzp1 v0.4h, v0.4h, v1.4h +; CHECK-BE-NEXT: addv h0, v0.4h +; CHECK-BE-NEXT: fmov w0, s0 +; CHECK-BE-NEXT: ret + %var = trunc <4 x i64> %input to <4 x i8> + ; llvm.vector.reduce.add.v4i8 is needed to reproduce the codegen (see `trunc_v4i64_v4i8_ret` below as a comparison) + %res = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %var) + ret i8 %res +} + +define <4 x i8> @trunc_v4i64_v4i8_ret(<4 x i64> %input) { +; CHECK-LE-LABEL: trunc_v4i64_v4i8_ret: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-LE-NEXT: xtn v0.4h, v0.4s +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: trunc_v4i64_v4i8_ret: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-BE-NEXT: xtn v0.4h, v0.4s +; CHECK-BE-NEXT: rev64 v0.4h, v0.4h +; CHECK-BE-NEXT: ret + %var = trunc <4 x i64> %input to <4 x i8> + ret <4 x i8> %var +} + +declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>) -- 2.7.4