From ea6693d4c840272b5f9b83518b474ed7b2449744 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Fri, 18 Nov 2022 07:35:03 -0800 Subject: [PATCH] [Hexagon] Add missing patterns for mulhs/mulhu --- llvm/lib/Target/Hexagon/HexagonPatterns.td | 74 +++--- llvm/test/CodeGen/Hexagon/isel/mulh-scalar.ll | 315 ++++++++++++++++++++++++++ 2 files changed, 360 insertions(+), 29 deletions(-) create mode 100644 llvm/test/CodeGen/Hexagon/isel/mulh-scalar.ll diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td index bf6303f..ac4a352 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatterns.td +++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td @@ -1546,47 +1546,63 @@ def: Pat<(v2i32 (mulhu V2I32:$Rss, V2I32:$Rtt)), (Combinew (M2_mpyu_up (HiReg $Rss), (HiReg $Rtt)), (M2_mpyu_up (LoReg $Rss), (LoReg $Rtt)))>; -def: Pat<(v2i32 (mulhs V2I32:$Rs, V2I32:$Rt)), - (Combinew (M2_mpy_up (HiReg $Rs), (HiReg $Rt)), - (M2_mpy_up (LoReg $Rt), (LoReg $Rt)))>; +def: Pat<(v2i32 (mulhs V2I32:$Rss, V2I32:$Rtt)), + (Combinew (M2_mpy_up (HiReg $Rss), (HiReg $Rtt)), + (M2_mpy_up (LoReg $Rss), (LoReg $Rtt)))>; -def Mulhub: +def Mulhub4: + OutPatFrag<(ops node:$Rs, node:$Rt), (S2_vtrunohb (M5_vmpybuu $Rs, $Rt))>; +def Mulhub8: OutPatFrag<(ops node:$Rss, node:$Rtt), - (Combinew (S2_vtrunohb (M5_vmpybuu (HiReg $Rss), (HiReg $Rtt))), - (S2_vtrunohb (M5_vmpybuu (LoReg $Rss), (LoReg $Rtt))))>; + (Combinew (Mulhub4 (HiReg $Rss), (HiReg $Rtt)), + (Mulhub4 (LoReg $Rss), (LoReg $Rtt)))>; -// Equivalent of byte-wise arithmetic shift right by 7 in v8i8. -def Asr7: - OutPatFrag<(ops node:$Rss), (C2_mask (C2_not (A4_vcmpbgti $Rss, 0)))>; +// (mux (x >= 0), 0, y) +def Negbytes8: + OutPatFrag<(ops node:$Rss, node:$Rtt), + (C2_vmux (A4_vcmpbgti $Rss, -1), (A2_tfrpi 0), $Rtt)>; + +def: Pat<(v4i8 (mulhu V4I8:$Rs, V4I8:$Rt)), (Mulhub4 $Rs, $Rt)>; +def: Pat<(v8i8 (mulhu V8I8:$Rss, V8I8:$Rtt)), (Mulhub8 $Rss, $Rtt)>; -def: Pat<(v8i8 (mulhu V8I8:$Rss, V8I8:$Rtt)), - (Mulhub $Rss, $Rtt)>; +// (Mulhs x, y) = (Mulhu x, y) - (x < 0 ? y : 0) - (y < 0 ? 
x : 0) +def Mulhsb8: + OutPatFrag<(ops node:$Rss, node:$Rtt), + (A2_vsubub (Mulhub8 $Rss, $Rtt), + (A2_vaddub (Negbytes8 $Rss, $Rtt), + (Negbytes8 $Rtt, $Rss)))>; -def: Pat<(v8i8 (mulhs V8I8:$Rss, V8I8:$Rtt)), - (A2_vsubub - (Mulhub $Rss, $Rtt), - (A2_vaddub (A2_andp V8I8:$Rss, (Asr7 $Rtt)), - (A2_andp V8I8:$Rtt, (Asr7 $Rss))))>; +def: Pat<(v4i8 (mulhs V4I8:$Rs, V4I8:$Rt)), + (LoReg (Mulhsb8 (v8i8 (ToAext64 $Rs)), (v8i8 (ToAext64 $Rt))))>; +def: Pat<(v8i8 (mulhs V8I8:$Rss, V8I8:$Rtt)), (Mulhsb8 $Rss, $Rtt)>; -def Mpysh: +// v2i16 *s v2i16 -> v2i32 +def Muli16: OutPatFrag<(ops node:$Rs, node:$Rt), (M2_vmpy2s_s0 $Rs, $Rt)>; -def Mpyshh: - OutPatFrag<(ops node:$Rss, node:$Rtt), (Mpysh (HiReg $Rss), (HiReg $Rtt))>; -def Mpyshl: - OutPatFrag<(ops node:$Rss, node:$Rtt), (Mpysh (LoReg $Rss), (LoReg $Rtt))>; -def Mulhsh: +def Mulhsh2: + OutPatFrag<(ops node:$Rs, node:$Rt), + (A2_combine_hh (HiReg (Muli16 $Rs, $Rt)), + (LoReg (Muli16 $Rs, $Rt)))>; +def Mulhsh4: OutPatFrag<(ops node:$Rss, node:$Rtt), - (Combinew (A2_combine_hh (HiReg (Mpyshh $Rss, $Rtt)), - (LoReg (Mpyshh $Rss, $Rtt))), - (A2_combine_hh (HiReg (Mpyshl $Rss, $Rtt)), - (LoReg (Mpyshl $Rss, $Rtt))))>; + (Combinew (Mulhsh2 (HiReg $Rss), (HiReg $Rtt)), + (Mulhsh2 (LoReg $Rss), (LoReg $Rtt)))>; + +def: Pat<(v2i16 (mulhs V2I16:$Rs, V2I16:$Rt)), (Mulhsh2 $Rs, $Rt)>; +def: Pat<(v4i16 (mulhs V4I16:$Rss, V4I16:$Rtt)), (Mulhsh4 $Rss, $Rtt)>; -def: Pat<(v4i16 (mulhs V4I16:$Rss, V4I16:$Rtt)), (Mulhsh $Rss, $Rtt)>; +def: Pat<(v2i16 (mulhu V2I16:$Rs, V2I16:$Rt)), + (A2_svaddh + (Mulhsh2 $Rs, $Rt), + (A2_svaddh (LoReg (A2_andp (Combinew $Rt, $Rs), + (S2_asr_i_vh (Combinew $Rs, $Rt), 15))), + (HiReg (A2_andp (Combinew $Rt, $Rs), + (S2_asr_i_vh (Combinew $Rs, $Rt), 15)))))>; def: Pat<(v4i16 (mulhu V4I16:$Rss, V4I16:$Rtt)), (A2_vaddh - (Mulhsh $Rss, $Rtt), + (Mulhsh4 $Rss, $Rtt), (A2_vaddh (A2_andp V4I16:$Rss, (S2_asr_i_vh $Rtt, 15)), (A2_andp V4I16:$Rtt, (S2_asr_i_vh $Rss, 15))))>; @@ -1783,7 +1799,7 @@ def: Pat<(sub V4I8:$Rs, V4I8:$Rt), // half-words, and saturates the result to a 32-bit value, except the // saturation never happens (it can only occur with scaling). 
 def: Pat<(v2i16 (mul V2I16:$Rs, V2I16:$Rt)),
- (LoReg (S2_vtrunewh (A2_combineii 0, 0),
+ (LoReg (S2_vtrunewh (IMPLICIT_DEF),
 (M2_vmpy2s_s0 V2I16:$Rs, V2I16:$Rt)))>;
 def: Pat<(v4i16 (mul V4I16:$Rs, V4I16:$Rt)),
 (S2_vtrunewh (M2_vmpy2s_s0 (HiReg $Rs), (HiReg $Rt)),
diff --git a/llvm/test/CodeGen/Hexagon/isel/mulh-scalar.ll b/llvm/test/CodeGen/Hexagon/isel/mulh-scalar.ll
new file mode 100644
index 0000000..3364a3c
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/isel/mulh-scalar.ll
@@ -0,0 +1,315 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+define <4 x i8> @f0(<4 x i8> %a0, <4 x i8> %a1) #0 {
+; CHECK-LABEL: f0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r2 = r1
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r7:6 = combine(#0,#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r5:4 = vmpybu(r0,r1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: p1 = vcmpb.gt(r1:0,#-1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: p0 = vcmpb.gt(r3:2,#-1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r3:2 = vmux(p1,r7:6,r3:2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r1:0 = vmux(p0,r7:6,r1:0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r4 = vtrunohb(r5:4)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r7:6 = vmpybu(r0,r0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r1:0 = vaddub(r3:2,r1:0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r5 = vtrunohb(r7:6)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r1:0 = vsubub(r5:4,r1:0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: }
+ %v0 = sext <4 x i8> %a0 to <4 x i16>
+ %v1 = sext <4 x i8> %a1 to <4 x i16>
+ %v2 = mul <4 x i16> %v0, %v1
+ %v3 = lshr <4 x i16> %v2, <i16 8, i16 8, i16 8, i16 8>
+ %v4 = trunc <4 x i16> %v3 to <4 x i8>
+ ret <4 x i8> %v4
+}
+
+define <4 x i8> @f1(<4 x i8> %a0, <4 x i8> %a1) #0 {
+; CHECK-LABEL: f1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r1:0 = vmpybu(r0,r1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 = vtrunohb(r1:0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: }
+ %v0 = zext <4 x i8> %a0 to <4 x i16>
+ %v1 = zext <4 x i8> %a1 to <4 x i16>
+ %v2 = mul <4 x i16> %v0, %v1
+ %v3 = lshr <4 x i16> %v2, <i16 8, i16 8, i16 8, i16 8>
+ %v4 = trunc <4 x i16> %v3 to <4 x i8>
+ ret <4 x i8> %v4
+}
+
+define <8 x i8> @f2(<8 x i8> %a0, <8 x i8> %a1) #0 {
+; CHECK-LABEL: f2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r7:6 = combine(#0,#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: p0 = vcmpb.gt(r3:2,#-1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r5:4 = vmpybu(r0,r2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r9:8 = vmux(p0,r7:6,r1:0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: p0 = vcmpb.gt(r1:0,#-1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r1:0 = vmpybu(r1,r3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r7:6 = vmux(p0,r7:6,r3:2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r4 = vtrunohb(r5:4)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r3:2 = vaddub(r7:6,r9:8)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r5 = vtrunohb(r1:0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r1:0 = vsubub(r5:4,r3:2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: }
+ %v0 = sext <8 x i8> %a0 to <8 x i16>
+ %v1 = sext <8 x i8> %a1 to <8 x i16>
+ %v2 = mul <8 x i16> %v0, %v1
+ %v3 = lshr <8 x i16> %v2, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %v4 = trunc <8 x i16> %v3 to <8 x i8>
+ ret <8 x i8> %v4
+}
+
+define <8 x i8> @f3(<8 x i8> %a0, <8 x i8> %a1) #0 {
+; CHECK-LABEL: f3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r5:4 = vmpybu(r0,r2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r7:6 = vmpybu(r1,r3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 = vtrunohb(r5:4)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r1 = vtrunohb(r7:6)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: }
+ %v0 = zext <8 x i8> %a0 to <8 x i16>
+ %v1 = zext <8 x i8> %a1 to <8 x i16>
+ %v2 = mul <8 x i16> %v0, %v1
+ %v3 = lshr <8 x i16> %v2, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %v4 = trunc <8 x i16> %v3 to <8 x i8>
+ ret <8 x i8> %v4
+}
+
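f0-f3 above pin down the byte patterns: f1 and f3 are the direct Mulhub4/Mulhub8 expansions, while f0 and f2 go through Mulhsb8 and the identity quoted in the HexagonPatterns.td comment, mulhs(x, y) = mulhu(x, y) - (x < 0 ? y : 0) - (y < 0 ? x : 0), with every add and subtract wrapping per byte. A minimal host-C sketch checking one lane of that identity exhaustively (plain C, not Hexagon intrinsics; the mulhu8/mulhs8 names are mine):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  for (int x = -128; x < 128; ++x) {
    for (int y = -128; y < 128; ++y) {
      uint8_t ux = (uint8_t)x, uy = (uint8_t)y;
      /* Unsigned high byte: one lane of M5_vmpybuu + S2_vtrunohb. */
      uint8_t mulhu8 = (uint8_t)((ux * uy) >> 8);
      /* Reference signed high byte: sext, mul, lshr 8, trunc, as in f0/f2.
         Reducing the product modulo 2^16 first keeps the shift operating
         on a non-negative value. */
      uint8_t mulhs8 = (uint8_t)(((x * y) & 0xFFFF) >> 8);
      /* Negbytes8 terms: (x < 0 ? y : 0) and (y < 0 ? x : 0). */
      uint8_t corr = (uint8_t)((x < 0 ? uy : 0) + (y < 0 ? ux : 0));
      /* A2_vsubub wraps modulo 256 per byte, and so does this subtract. */
      if ((uint8_t)(mulhu8 - corr) != mulhs8) {
        printf("mismatch at %d * %d\n", x, y);
        return 1;
      }
    }
  }
  puts("byte mulhs identity holds for all 65536 pairs");
  return 0;
}

The modulo-256 wrap of A2_vsubub/A2_vaddub is exactly what lets the correction be applied lane-wise without widening.
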
+define <2 x i16> @f4(<2 x i16> %a0, <2 x i16> %a1) #0 {
+; CHECK-LABEL: f4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r1:0 = vmpyh(r0,r1):sat
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 = combine(r1.h,r0.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: }
+ %v0 = sext <2 x i16> %a0 to <2 x i32>
+ %v1 = sext <2 x i16> %a1 to <2 x i32>
+ %v2 = mul <2 x i32> %v0, %v1
+ %v3 = lshr <2 x i32> %v2, <i32 16, i32 16>
+ %v4 = trunc <2 x i32> %v3 to <2 x i16>
+ ret <2 x i16> %v4
+}
+
+define <2 x i16> @f5(<2 x i16> %a0, <2 x i16> %a1) #0 {
+; CHECK-LABEL: f5:
+; CHECK: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r3:2 = combine(r0,r1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r1:0 = vasrh(r3:2,#15)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r5:4 = vmpyh(r3,r2):sat
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 = and(r3,r0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r1 = and(r2,r1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r4 = combine(r5.h,r4.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 = vaddh(r0,r1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 = vaddh(r4,r0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: }
+ %v0 = zext <2 x i16> %a0 to <2 x i32>
+ %v1 = zext <2 x i16> %a1 to <2 x i32>
+ %v2 = mul <2 x i32> %v0, %v1
+ %v3 = lshr <2 x i32> %v2, <i32 16, i32 16>
+ %v4 = trunc <2 x i32> %v3 to <2 x i16>
+ ret <2 x i16> %v4
+}
+
+define <4 x i16> @f6(<4 x i16> %a0, <4 x i16> %a1) #0 {
+; CHECK-LABEL: f6:
+; CHECK: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r5:4 = vmpyh(r0,r2):sat
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r7:6 = vmpyh(r1,r3):sat
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 = combine(r5.h,r4.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r1 = combine(r7.h,r6.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: }
+ %v0 = sext <4 x i16> %a0 to <4 x i32>
+ %v1 = sext <4 x i16> %a1 to <4 x i32>
+ %v2 = mul <4 x i32> %v0, %v1
+ %v3 = lshr <4 x i32> %v2, <i32 16, i32 16, i32 16, i32 16>
+ %v4 = trunc <4 x i32> %v3 to <4 x i16>
+ ret <4 x i16> %v4
+}
+
+define <4 x i16> @f7(<4 x i16> %a0, <4 x i16> %a1) #0 {
+; CHECK-LABEL: f7:
+; CHECK: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r7:6 = vasrh(r1:0,#15)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r9:8 = vasrh(r3:2,#15)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r5:4 = vmpyh(r0,r2):sat
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r7:6 = and(r3:2,r7:6)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r3:2 = vmpyh(r1,r3):sat
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r1:0 = and(r1:0,r9:8)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r4 = combine(r5.h,r4.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r5 = combine(r3.h,r2.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r1:0 = vaddh(r1:0,r7:6)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r1:0 = vaddh(r5:4,r1:0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: }
+ %v0 = zext <4 x i16> %a0 to <4 x i32>
+ %v1 = zext <4 x i16> %a1 to <4 x i32>
+ %v2 = mul <4 x i32> %v0, %v1
+ %v3 = lshr <4 x i32> %v2, <i32 16, i32 16, i32 16, i32 16>
+ %v4 = trunc <4 x i32> %v3 to <4 x i16>
+ ret <4 x i16> %v4
+}
+
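f5 and f7 go the other way: unsigned halfword mulh is derived from the signed Mulhsh2/Mulhsh4 by adding back x & (y >> 15) and y & (x >> 15), where the arithmetic shift by 15 yields an all-ones mask for negative lanes. A randomized host-C check of one lane, assuming (as the .td comment states) that M2_vmpy2s_s0's saturation never fires without scaling; the helper names are mine:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* One lane of Mulhsh2: high half of the signed 32-bit product.  The cast
   through uint32_t keeps the right shift well defined in portable C. */
static uint16_t mulhs16(int16_t x, int16_t y) {
  return (uint16_t)((uint32_t)((int32_t)x * y) >> 16);
}

/* Full-range 16-bit sample; rand() alone may cover only 15 bits. */
static int16_t rnd16(void) {
  return (int16_t)(((unsigned)rand() << 8) ^ (unsigned)rand());
}

int main(void) {
  for (long i = 0; i < 1000000; ++i) {
    int16_t x = rnd16(), y = rnd16();
    uint16_t ux = (uint16_t)x, uy = (uint16_t)y;
    /* Reference unsigned high half: zext, mul, lshr 16, trunc, as in f5/f7. */
    uint16_t mulhu16 = (uint16_t)(((uint32_t)ux * uy) >> 16);
    /* S2_asr_i_vh by 15: all-ones mask for a negative lane, else zero. */
    uint16_t mx = (uint16_t)(x < 0 ? 0xFFFF : 0);
    uint16_t my = (uint16_t)(y < 0 ? 0xFFFF : 0);
    /* The two A2_andp/A2_vaddh correction terms, wrapping modulo 2^16. */
    uint16_t corr = (uint16_t)((ux & my) + (uy & mx));
    if ((uint16_t)(mulhs16(x, y) + corr) != mulhu16) {
      printf("mismatch at %d * %d\n", x, y);
      return 1;
    }
  }
  puts("halfword mulhu identity holds on sampled pairs");
  return 0;
}
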
+define <2 x i32> @f8(<2 x i32> %a0, <2 x i32> %a1) #0 {
+; CHECK-LABEL: f8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 = mpy(r0,r2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r1 = mpy(r1,r3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: }
+ %v0 = sext <2 x i32> %a0 to <2 x i64>
+ %v1 = sext <2 x i32> %a1 to <2 x i64>
+ %v2 = mul <2 x i64> %v0, %v1
+ %v3 = lshr <2 x i64> %v2, <i64 32, i64 32>
+ %v4 = trunc <2 x i64> %v3 to <2 x i32>
+ ret <2 x i32> %v4
+}
+
+define <2 x i32> @f9(<2 x i32> %a0, <2 x i32> %a1) #0 {
+; CHECK-LABEL: f9:
+; CHECK: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 = mpyu(r0,r2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r1 = mpyu(r1,r3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: }
+ %v0 = zext <2 x i32> %a0 to <2 x i64>
+ %v1 = zext <2 x i32> %a1 to <2 x i64>
+ %v2 = mul <2 x i64> %v0, %v1
+ %v3 = lshr <2 x i64> %v2, <i64 32, i64 32>
+ %v4 = trunc <2 x i64> %v3 to <2 x i32>
+ ret <2 x i32> %v4
+}
+
+attributes #0 = { nounwind memory(none) "target-features"="-packets" }
-- 
2.7.4
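The word cases f8/f9 need no correction at all: each lane is simply the upper 32 bits of a 64-bit product, which is what the generated mpy/mpyu instructions compute. A host-C model of one lane, ending with the same signed/unsigned bridge the byte and halfword patterns rely on (helper names are mine, not Hexagon APIs):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* One lane of f8 (r0 = mpy(r0,r2)): high word of the signed 64-bit
   product; the cast through uint64_t keeps the shift well defined. */
static int32_t mulhs32(int32_t x, int32_t y) {
  return (int32_t)((uint64_t)((int64_t)x * y) >> 32);
}

/* One lane of f9 (r0 = mpyu(r0,r2)): high word of the unsigned product. */
static uint32_t mulhu32(uint32_t x, uint32_t y) {
  return (uint32_t)(((uint64_t)x * y) >> 32);
}

int main(void) {
  /* The same signed/unsigned bridge as the byte and halfword patterns,
     now modulo 2^32: mulhu(x,y) == mulhs(x,y) + (x<0 ? y : 0) + (y<0 ? x : 0). */
  int32_t x = -7, y = 100000;
  uint32_t corr = (uint32_t)(x < 0 ? y : 0) + (uint32_t)(y < 0 ? x : 0);
  assert(mulhu32((uint32_t)x, (uint32_t)y) == (uint32_t)mulhs32(x, y) + corr);
  printf("mulhu32(-7, 100000) = %u\n", mulhu32((uint32_t)x, (uint32_t)y));
  return 0;
}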