aarch64: Don't include vec_select high-half in SIMD multiply cost

author Jonathan Wright <jonathan.wright@arm.com>

Mon, 19 Jul 2021 09:19:30 +0000 (10:19 +0100)

committer Jonathan Wright <jonathan.wright@arm.com>

Wed, 4 Aug 2021 15:58:26 +0000 (16:58 +0100)
author Jonathan Wright <jonathan.wright@arm.com>
Mon, 19 Jul 2021 09:19:30 +0000 (10:19 +0100)
committer Jonathan Wright <jonathan.wright@arm.com>
Wed, 4 Aug 2021 15:58:26 +0000 (16:58 +0100)
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c

index 23829bb..e02cbcb 100644 (file)
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -78,6 +78,7 @@
  #include "gimple-pretty-print.h"
  #include "tree-ssa-loop-niter.h"
  #include "fractional-cost.h"
+#include "rtlanal.h"
  
  /* This file should be included last.  */
  #include "target-def.h"
@@ -12046,6 +12047,22 @@ aarch64_strip_extend (rtx x, bool strip_shift)
    return x;
  }
  
+/* Helper function for rtx cost calculation. Strip extension as well as any
+   inner VEC_SELECT high-half from X. Returns the inner vector operand if
+   successful, or the original expression on failure.  */
+static rtx
+aarch64_strip_extend_vec_half (rtx x)
+{
+  if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
+    {
+      x = XEXP (x, 0);
+      if (GET_CODE (x) == VEC_SELECT
+         && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)),
+                                   XEXP (x, 1)))
+       x = XEXP (x, 0);
+    }
+  return x;
+}
  
  /* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as
     any subsequent extend and VEC_SELECT from X. Returns the inner scalar
@@ -12133,6 +12150,11 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
        unsigned int vec_flags = aarch64_classify_vector_mode (mode);
        if (vec_flags & VEC_ADVSIMD)
         {
+         /* The select-operand-high-half versions of the instruction have the
+            same cost as the three vector version - don't add the costs of the
+            extension or selection into the costs of the multiply.  */
+         op0 = aarch64_strip_extend_vec_half (op0);
+         op1 = aarch64_strip_extend_vec_half (op1);
           /* The by-element versions of the instruction have the same costs as
              the normal 3-vector version.  We make an assumption that the input
              to the VEC_DUPLICATE is already on the FP & SIMD side.  This means
diff --git a/gcc/rtlanal.c b/gcc/rtlanal.c

index f7f3acb..d37f778 100644 (file)
--- a/gcc/rtlanal.c
+++ b/gcc/rtlanal.c
@@ -6957,6 +6957,25 @@ register_asm_p (const_rtx x)
  
       (vec_select:RESULT_MODE OP SEL)
  
+   is equivalent to the highpart RESULT_MODE of OP.  */
+
+bool
+vec_series_highpart_p (machine_mode result_mode, machine_mode op_mode, rtx sel)
+{
+  int nunits;
+  if (GET_MODE_NUNITS (op_mode).is_constant (&nunits)
+      && targetm.can_change_mode_class (op_mode, result_mode, ALL_REGS))
+    {
+      int offset = BYTES_BIG_ENDIAN ? 0 : nunits - XVECLEN (sel, 0);
+      return rtvec_series_p (XVEC (sel, 0), offset);
+    }
+  return false;
+}
+
+/* Return true if, for all OP of mode OP_MODE:
+
+     (vec_select:RESULT_MODE OP SEL)
+
     is equivalent to the lowpart RESULT_MODE of OP.  */
  
  bool
diff --git a/gcc/rtlanal.h b/gcc/rtlanal.h

index e164242..542dc78 100644 (file)
--- a/gcc/rtlanal.h
+++ b/gcc/rtlanal.h
@@ -332,6 +332,10 @@ inline vec_rtx_properties_base::~vec_rtx_properties_base ()
  using vec_rtx_properties = growing_rtx_properties<vec_rtx_properties_base>;
  
  bool
+vec_series_highpart_p (machine_mode result_mode, machine_mode op_mode,
+                      rtx sel);
+
+bool
  vec_series_lowpart_p (machine_mode result_mode, machine_mode op_mode, rtx sel);
  
  #endif
diff --git a/gcc/testsuite/gcc.target/aarch64/vmul_high_cost.c b/gcc/testsuite/gcc.target/aarch64/vmul_high_cost.c

new file mode 100644 (file)

index 0000000..ecc02e6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vmul_high_cost.c
@@ -0,0 +1,85 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+#define TEST_MULL_VEC(name, rettype, intype, ts, rs) \
+  rettype test_ ## name ## _ ## ts (intype a, intype b, intype c) \
+       { \
+               rettype t0 = name ## _ ## ts (vget_high_ ## ts (a), \
+                                             vget_high_ ## ts (c)); \
+               rettype t1 = name ## _ ## ts (vget_high_ ## ts (b), \
+                                             vget_high_ ## ts (c)); \
+               return vqaddq ## _ ## rs (t0, t1); \
+       }
+
+TEST_MULL_VEC (vmull, int16x8_t, int8x16_t, s8, s16)
+TEST_MULL_VEC (vmull, uint16x8_t, uint8x16_t, u8, u16)
+TEST_MULL_VEC (vmull, int32x4_t, int16x8_t, s16, s32)
+TEST_MULL_VEC (vmull, uint32x4_t, uint16x8_t, u16, u32)
+TEST_MULL_VEC (vmull, int64x2_t, int32x4_t, s32, s64)
+TEST_MULL_VEC (vmull, uint64x2_t, uint32x4_t, u32, u64)
+
+TEST_MULL_VEC (vqdmull, int32x4_t, int16x8_t, s16, s32)
+TEST_MULL_VEC (vqdmull, int64x2_t, int32x4_t, s32, s64)
+
+#define TEST_MULL_N(name, rettype, intype, ts, rs) \
+  rettype test_ ## name ## _ ## ts (intype a, intype b, intype c) \
+       { \
+               rettype t0 = name ## _ ## ts (vget_high_ ## ts (a), b[1]); \
+               rettype t1 = name ## _ ## ts (vget_high_ ## ts (a), c[1]); \
+               return vqaddq ## _ ## rs (t0, t1); \
+       }
+
+TEST_MULL_N (vmull_n, int32x4_t, int16x8_t, s16, s32)
+TEST_MULL_N (vmull_n, uint32x4_t, uint16x8_t, u16, u32)
+TEST_MULL_N (vmull_n, int64x2_t, int32x4_t, s32, s64)
+TEST_MULL_N (vmull_n, uint64x2_t, uint32x4_t, u32, u64)
+
+TEST_MULL_N (vqdmull_n, int32x4_t, int16x8_t, s16, s32)
+TEST_MULL_N (vqdmull_n, int64x2_t, int32x4_t, s32, s64)
+
+#define TEST_MLXL_VEC(name, rettype, intype, ts) \
+  rettype test_ ## name ## _ ## ts (rettype acc, intype a, intype b, \
+                                   intype c) \
+       { \
+               acc = name ## _ ## ts (acc, vget_high_ ## ts (a), \
+                                           vget_high_ ## ts (b)); \
+               return name ## _ ## ts (acc, vget_high_ ## ts (a), \
+                                            vget_high_ ## ts (c)); \
+       }
+
+TEST_MLXL_VEC (vmlal, int16x8_t, int8x16_t, s8)
+TEST_MLXL_VEC (vmlal, uint16x8_t, uint8x16_t, u8)
+TEST_MLXL_VEC (vmlal, int32x4_t, int16x8_t, s16)
+TEST_MLXL_VEC (vmlal, uint32x4_t, uint16x8_t, u16)
+
+TEST_MLXL_VEC (vmlsl, int16x8_t, int8x16_t, s8)
+TEST_MLXL_VEC (vmlsl, uint16x8_t, uint8x16_t, u8)
+TEST_MLXL_VEC (vmlsl, int32x4_t, int16x8_t, s16)
+TEST_MLXL_VEC (vmlsl, uint32x4_t, uint16x8_t, u16)
+
+#define TEST_MLXL_N(name, rettype, intype, ts) \
+  rettype test_ ## name ## _ ## ts (rettype acc, intype a, intype b) \
+       { \
+               acc = name ## _ ## ts (acc, vget_high_ ## ts (a), b[1]); \
+               return name ## _ ## ts (acc, vget_high_ ## ts (a), b[1]); \
+       }
+
+TEST_MLXL_N (vmlal_n, int32x4_t, int16x8_t, s16)
+TEST_MLXL_N (vmlal_n, uint32x4_t, uint16x8_t, u16)
+TEST_MLXL_N (vmlal_n, int64x2_t, int32x4_t, s32)
+TEST_MLXL_N (vmlal_n, uint64x2_t, uint32x4_t, u32)
+
+TEST_MLXL_N (vmlsl_n, int32x4_t, int16x8_t, s16)
+TEST_MLXL_N (vmlsl_n, uint32x4_t, uint16x8_t, u16)
+TEST_MLXL_N (vmlsl_n, int64x2_t, int32x4_t, s32)
+TEST_MLXL_N (vmlsl_n, uint64x2_t, uint32x4_t, u32)
+
+TEST_MLXL_N (vqdmlal_n, int32x4_t, int16x8_t, s16)
+TEST_MLXL_N (vqdmlal_n, int64x2_t, int32x4_t, s32)
+
+TEST_MLXL_N (vqdmlsl_n, int32x4_t, int16x8_t, s16)
+TEST_MLXL_N (vqdmlsl_n, int64x2_t, int32x4_t, s32)
+
+/* { dg-final { scan-assembler-not "dup\\t" } } */
author	Jonathan Wright <jonathan.wright@arm.com>
	Mon, 19 Jul 2021 09:19:30 +0000 (10:19 +0100)
committer	Jonathan Wright <jonathan.wright@arm.com>
	Wed, 4 Aug 2021 15:58:26 +0000 (16:58 +0100)
gcc/config/aarch64/aarch64.c		patch \| blob \| history
gcc/rtlanal.c		patch \| blob \| history
gcc/rtlanal.h		patch \| blob \| history
gcc/testsuite/gcc.target/aarch64/vmul_high_cost.c	[new file with mode: 0644]	patch \| blob