From 62f97123fbefced9091c621b4a0bd642a05a5a85 Mon Sep 17 00:00:00 2001
From: Sam Parker
Date: Wed, 15 Apr 2020 11:10:09 +0100
Subject: [PATCH] [ARM][MVE] Add patterns for VRHADD

Add patterns which use standard add nodes along with ARM vshr imm nodes.

Differential Revision: https://reviews.llvm.org/D77069
---
 llvm/lib/Target/ARM/ARMInstrMVE.td      |  71 +++++---
 llvm/test/CodeGen/Thumb2/mve-halving.ll | 288 ++++++++++++++++++++++++++++++++
 2 files changed, 339 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 8b9917a..5e4a3d5 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -2015,6 +2015,26 @@ class MVE_VRHADD_Base size, list pattern=[]>
   let validForTailPredication = 1;
 }

+def addnuw : PatFrag<(ops node:$lhs, node:$rhs),
+                     (add node:$lhs, node:$rhs), [{
+  return N->getFlags().hasNoUnsignedWrap();
+}]>;
+
+def addnsw : PatFrag<(ops node:$lhs, node:$rhs),
+                     (add node:$lhs, node:$rhs), [{
+  return N->getFlags().hasNoSignedWrap();
+}]>;
+
+def subnuw : PatFrag<(ops node:$lhs, node:$rhs),
+                     (sub node:$lhs, node:$rhs), [{
+  return N->getFlags().hasNoUnsignedWrap();
+}]>;
+
+def subnsw : PatFrag<(ops node:$lhs, node:$rhs),
+                     (sub node:$lhs, node:$rhs), [{
+  return N->getFlags().hasNoSignedWrap();
+}]>;
+
 multiclass MVE_VRHADD_m {
   def "" : MVE_VRHADD_Base;
@@ -2046,6 +2066,37 @@ defm MVE_VRHADDu8 : MVE_VRHADD;
 defm MVE_VRHADDu16 : MVE_VRHADD;
 defm MVE_VRHADDu32 : MVE_VRHADD;
+// Rounding Halving Add performs the arithmetic operation with an extra bit of
+// precision, before performing the shift, to avoid clipping errors. We're not
+// modelling that here with these patterns, but we're using no wrap forms of
+// add to ensure that the extra bit of information is not needed for the
+// arithmetic or the rounding.
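+// For example, for i8 inputs x = y = 100, VRHADD.S8 computes
+// (100 + 100 + 1) >> 1 = 100, whereas if the i8 add had been allowed to
+// wrap, (x + y + 1) >> 1 would evaluate to -28. The patterns below therefore
+// only match when the relevant no-wrap flag is present on both adds.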
+def : Pat<(v16i8 (ARMvshrsImm (addnsw (addnsw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)),
+                                      (v16i8 (ARMvmovImm (i32 3585)))),
+                              (i32 1))),
+          (MVE_VRHADDs8 MQPR:$Qm, MQPR:$Qn)>;
+def : Pat<(v8i16 (ARMvshrsImm (addnsw (addnsw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)),
+                                      (v8i16 (ARMvmovImm (i32 2049)))),
+                              (i32 1))),
+          (MVE_VRHADDs16 MQPR:$Qm, MQPR:$Qn)>;
+def : Pat<(v4i32 (ARMvshrsImm (addnsw (addnsw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)),
+                                      (v4i32 (ARMvmovImm (i32 1)))),
+                              (i32 1))),
+          (MVE_VRHADDs32 MQPR:$Qm, MQPR:$Qn)>;
+def : Pat<(v16i8 (ARMvshruImm (addnuw (addnuw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)),
+                                      (v16i8 (ARMvmovImm (i32 3585)))),
+                              (i32 1))),
+          (MVE_VRHADDu8 MQPR:$Qm, MQPR:$Qn)>;
+def : Pat<(v8i16 (ARMvshruImm (addnuw (addnuw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)),
+                                      (v8i16 (ARMvmovImm (i32 2049)))),
+                              (i32 1))),
+          (MVE_VRHADDu16 MQPR:$Qm, MQPR:$Qn)>;
+def : Pat<(v4i32 (ARMvshruImm (addnuw (addnuw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)),
+                                      (v4i32 (ARMvmovImm (i32 1)))),
+                              (i32 1))),
+          (MVE_VRHADDu32 MQPR:$Qm, MQPR:$Qn)>;
+
+
 class MVE_VHADDSUB size, list pattern=[]> : MVE_int {
@@ -2095,26 +2146,6 @@ multiclass MVE_VHADD : MVE_VHADD_m;
-def addnuw : PatFrag<(ops node:$lhs, node:$rhs),
-                     (add node:$lhs, node:$rhs), [{
-  return N->getFlags().hasNoUnsignedWrap();
-}]>;
-
-def addnsw : PatFrag<(ops node:$lhs, node:$rhs),
-                     (add node:$lhs, node:$rhs), [{
-  return N->getFlags().hasNoSignedWrap();
-}]>;
-
-def subnuw : PatFrag<(ops node:$lhs, node:$rhs),
-                     (sub node:$lhs, node:$rhs), [{
-  return N->getFlags().hasNoUnsignedWrap();
-}]>;
-
-def subnsw : PatFrag<(ops node:$lhs, node:$rhs),
-                     (sub node:$lhs, node:$rhs), [{
-  return N->getFlags().hasNoSignedWrap();
-}]>;
-
 // Halving add/sub perform the arithemtic operation with an extra bit of
 // precision, before performing the shift, to void clipping errors. We're not
 // modelling that here with these patterns, but we're using no wrap forms of
diff --git a/llvm/test/CodeGen/Thumb2/mve-halving.ll b/llvm/test/CodeGen/Thumb2/mve-halving.ll
index 84f4f9a..08877d4 100644
--- a/llvm/test/CodeGen/Thumb2/mve-halving.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-halving.ll
@@ -230,3 +230,291 @@ define arm_aapcs_vfpcc <4 x i32> @vhsubu_v4i32_nw(<4 x i32> %x, <4 x i32> %y) {
   %half = lshr <4 x i32> %sub, <i32 1, i32 1, i32 1, i32 1>
   ret <4 x i32> %half
 }
+define arm_aapcs_vfpcc <16 x i8> @vrhadds_v16i8(<16 x i8> %x, <16 x i8> %y) {
+; CHECK-LABEL: vrhadds_v16i8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vadd.i8 q0, q0, q1
+; CHECK-NEXT:    vmov.i8 q1, #0x1
+; CHECK-NEXT:    vadd.i8 q0, q0, q1
+; CHECK-NEXT:    vshr.s8 q0, q0, #1
+; CHECK-NEXT:    bx lr
+  %add = add <16 x i8> %x, %y
+  %round = add <16 x i8> %add, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %half = ashr <16 x i8> %round, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %half
+}
+define arm_aapcs_vfpcc <16 x i8> @vrhaddu_v16i8(<16 x i8> %x, <16 x i8> %y) {
+; CHECK-LABEL: vrhaddu_v16i8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vadd.i8 q0, q0, q1
+; CHECK-NEXT:    vmov.i8 q1, #0x1
+; CHECK-NEXT:    vadd.i8 q0, q0, q1
+; CHECK-NEXT:    vshr.u8 q0, q0, #1
+; CHECK-NEXT:    bx lr
+  %add = add <16 x i8> %x, %y
+  %round = add <16 x i8> %add, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %half = lshr <16 x i8> %round, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %half
+}
+define arm_aapcs_vfpcc <8 x i16> @vrhadds_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: vrhadds_v8i16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vadd.i16 q0, q0, q1
+; CHECK-NEXT:    vmov.i16 q1, #0x1
+; CHECK-NEXT:    vadd.i16 q0, q0, q1
+; CHECK-NEXT:    vshr.s16 q0, q0, #1
+; CHECK-NEXT:    bx lr
+  %add = add <8 x i16> %x, %y
+  %round = add <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %half = ashr <8 x i16> %round, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %half
+}
+define arm_aapcs_vfpcc <8 x i16> @vrhaddu_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: vrhaddu_v8i16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vadd.i16 q0, q0, q1
+; CHECK-NEXT:    vmov.i16 q1, #0x1
+; CHECK-NEXT:    vadd.i16 q0, q0, q1
+; CHECK-NEXT:    vshr.u16 q0, q0, #1
+; CHECK-NEXT:    bx lr
+  %add = add <8 x i16> %x, %y
+  %round = add <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %half = lshr <8 x i16> %round, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %half
+}
+define arm_aapcs_vfpcc <4 x i32> @vrhadds_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vrhadds_v4i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vmov.i32 q1, #0x1
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vshr.s32 q0, q0, #1
+; CHECK-NEXT:    bx lr
+  %add = add <4 x i32> %x, %y
+  %round = add <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
+  %half = ashr <4 x i32> %round, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %half
+}
+define arm_aapcs_vfpcc <4 x i32> @vrhaddu_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vrhaddu_v4i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vmov.i32 q1, #0x1
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vshr.u32 q0, q0, #1
+; CHECK-NEXT:    bx lr
+  %add = add <4 x i32> %x, %y
+  %round = add <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
+  %half = lshr <4 x i32> %round, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %half
+}
+define arm_aapcs_vfpcc <16 x i8> @vrhadds_v16i8_nwop(<16 x i8> %x, <16 x i8> %y) {
+; CHECK-LABEL: vrhadds_v16i8_nwop:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vadd.i8 q0, q0, q1
+; CHECK-NEXT:    vmov.i8 q1, #0x1
+; CHECK-NEXT:    vadd.i8 q0, q0, q1
+; CHECK-NEXT:    vshr.s8 q0, q0, #1
+; CHECK-NEXT:    bx lr
+  %add = add nsw <16 x i8> %x, %y
+  %round = add <16 x i8> %add, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %half = ashr <16 x i8> %round, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %half
+}
+define arm_aapcs_vfpcc <16 x i8> @vrhaddu_v16i8_nwop(<16 x i8> %x, <16 x i8> %y) {
+; CHECK-LABEL: vrhaddu_v16i8_nwop:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vadd.i8 q0, q0, q1
+; CHECK-NEXT:    vmov.i8 q1, #0x1
+; CHECK-NEXT:    vadd.i8 q0, q0, q1
+; CHECK-NEXT:    vshr.u8 q0, q0, #1
+; CHECK-NEXT:    bx lr
+  %add = add nuw <16 x i8> %x, %y
+  %round = add <16 x i8> %add, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %half = lshr <16 x i8> %round, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %half
+}
+define arm_aapcs_vfpcc <8 x i16> @vrhadds_v8i16_nwop(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: vrhadds_v8i16_nwop:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vadd.i16 q0, q0, q1
+; CHECK-NEXT:    vmov.i16 q1, #0x1
+; CHECK-NEXT:    vadd.i16 q0, q0, q1
+; CHECK-NEXT:    vshr.s16 q0, q0, #1
+; CHECK-NEXT:    bx lr
+  %add = add nsw <8 x i16> %x, %y
+  %round = add <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %half = ashr <8 x i16> %round, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %half
+}
+define arm_aapcs_vfpcc <8 x i16> @vrhaddu_v8i16_nwop(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: vrhaddu_v8i16_nwop:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vadd.i16 q0, q0, q1
+; CHECK-NEXT:    vmov.i16 q1, #0x1
+; CHECK-NEXT:    vadd.i16 q0, q0, q1
+; CHECK-NEXT:    vshr.u16 q0, q0, #1
+; CHECK-NEXT:    bx lr
+  %add = add nuw <8 x i16> %x, %y
+  %round = add <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %half = lshr <8 x i16> %round, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %half
+}
+define arm_aapcs_vfpcc <4 x i32> @vrhadds_v4i32_nwop(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vrhadds_v4i32_nwop:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vmov.i32 q1, #0x1
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vshr.s32 q0, q0, #1
+; CHECK-NEXT:    bx lr
+  %add = add nsw <4 x i32> %x, %y
+  %round = add <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
+  %half = ashr <4 x i32> %round, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %half
+}
+define arm_aapcs_vfpcc <4 x i32> @vrhaddu_v4i32_nwop(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vrhaddu_v4i32_nwop:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vmov.i32 q1, #0x1
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vshr.u32 q0, q0, #1
+; CHECK-NEXT:    bx lr
+  %add = add nuw <4 x i32> %x, %y
+  %round = add <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
+  %half = lshr <4 x i32> %round, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %half
+}
+define arm_aapcs_vfpcc <16 x i8> @vrhadds_v16i8_nwrnd(<16 x i8> %x, <16 x i8> %y) {
+; CHECK-LABEL: vrhadds_v16i8_nwrnd:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vadd.i8 q0, q0, q1
+; CHECK-NEXT:    vmov.i8 q1, #0x1
+; CHECK-NEXT:    vhadd.s8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+  %add = add <16 x i8> %x, %y
+  %round = add nsw <16 x i8> %add, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %half = ashr <16 x i8> %round, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %half
+}
+define arm_aapcs_vfpcc <16 x i8> @vrhaddu_v16i8_nwrnd(<16 x i8> %x, <16 x i8> %y) {
+; CHECK-LABEL: vrhaddu_v16i8_nwrnd:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vadd.i8 q0, q0, q1
+; CHECK-NEXT:    vmov.i8 q1, #0x1
+; CHECK-NEXT:    vhadd.u8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+  %add = add <16 x i8> %x, %y
+  %round = add nuw <16 x i8> %add, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %half = lshr <16 x i8> %round, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %half
+}
+define arm_aapcs_vfpcc <8 x i16> @vrhadds_v8i16_nwrnd(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: vrhadds_v8i16_nwrnd:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vadd.i16 q0, q0, q1
+; CHECK-NEXT:    vmov.i16 q1, #0x1
+; CHECK-NEXT:    vhadd.s16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+  %add = add <8 x i16> %x, %y
+  %round = add nsw <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %half = ashr <8 x i16> %round, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %half
+}
+define arm_aapcs_vfpcc <8 x i16> @vrhaddu_v8i16_nwrnd(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: vrhaddu_v8i16_nwrnd:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vadd.i16 q0, q0, q1
+; CHECK-NEXT:    vmov.i16 q1, #0x1
+; CHECK-NEXT:    vhadd.u16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+  %add = add <8 x i16> %x, %y
+  %round = add nuw <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %half = lshr <8 x i16> %round, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %half
+}
+define arm_aapcs_vfpcc <4 x i32> @vrhadds_v4i32_nwrnd(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vrhadds_v4i32_nwrnd:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vmov.i32 q1, #0x1
+; CHECK-NEXT:    vhadd.s32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+  %add = add <4 x i32> %x, %y
+  %round = add nsw <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
+  %half = ashr <4 x i32> %round, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %half
+}
+define arm_aapcs_vfpcc <4 x i32> @vrhaddu_v4i32_nwrnd(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vrhaddu_v4i32_nwrnd:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vmov.i32 q1, #0x1
+; CHECK-NEXT:    vhadd.u32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+  %add = add <4 x i32> %x, %y
+  %round = add nuw <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
+  %half = lshr <4 x i32> %round, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %half
+}
+define arm_aapcs_vfpcc <16 x i8> @vrhadds_v16i8_both_nw(<16 x i8> %x, <16 x i8> %y) {
+; CHECK-LABEL: vrhadds_v16i8_both_nw:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vrhadd.s8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+  %add = add nsw <16 x i8> %x, %y
+  %round = add nsw <16 x i8> %add, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %half = ashr <16 x i8> %round, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %half
+}
+define arm_aapcs_vfpcc <16 x i8> @vrhaddu_v16i8_both_nw(<16 x i8> %x, <16 x i8> %y) {
+; CHECK-LABEL: vrhaddu_v16i8_both_nw:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vrhadd.u8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+  %add = add nuw <16 x i8> %x, %y
+  %round = add nuw <16 x i8> %add, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %half = lshr <16 x i8> %round, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %half
+}
+define arm_aapcs_vfpcc <8 x i16> @vrhadds_v8i16_both_nw(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: vrhadds_v8i16_both_nw:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vrhadd.s16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+  %add = add nsw <8 x i16> %x, %y
+  %round = add nsw <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %half = ashr <8 x i16> %round, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %half
+}
+define arm_aapcs_vfpcc <8 x i16> @vrhaddu_v8i16_both_nw(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: vrhaddu_v8i16_both_nw:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vrhadd.u16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+  %add = add nuw <8 x i16> %x, %y
+  %round = add nuw <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %half = lshr <8 x i16> %round, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %half
+}
+define arm_aapcs_vfpcc <4 x i32> @vrhadds_v4i32_both_nw(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vrhadds_v4i32_both_nw:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vrhadd.s32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+  %add = add nsw <4 x i32> %x, %y
+  %round = add nsw <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
+  %half = ashr <4 x i32> %round, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %half
+}
+define arm_aapcs_vfpcc <4 x i32> @vrhaddu_v4i32_both_nw(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vrhaddu_v4i32_both_nw:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vrhadd.u32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+  %add = add nuw <4 x i32> %x, %y
+  %round = add nuw <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
+  %half = lshr <4 x i32> %round, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %half
+}
--
2.7.4