aarch64: Don't include vec_select element in SIMD multiply cost

author Jonathan Wright <jonathan.wright@arm.com>

Mon, 19 Jul 2021 13:01:52 +0000 (14:01 +0100)

committer Jonathan Wright <jonathan.wright@arm.com>

Wed, 4 Aug 2021 15:57:38 +0000 (16:57 +0100)
author Jonathan Wright <jonathan.wright@arm.com>
Mon, 19 Jul 2021 13:01:52 +0000 (14:01 +0100)
committer Jonathan Wright <jonathan.wright@arm.com>
Wed, 4 Aug 2021 15:57:38 +0000 (16:57 +0100)
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c

index 81c002b..23829bb 100644 (file)
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -12046,6 +12046,26 @@ aarch64_strip_extend (rtx x, bool strip_shift)
    return x;
  }
  
+
+/* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as
+   any subsequent extend and VEC_SELECT from X. Returns the inner scalar
+   operand if successful, or the original expression on failure.  */
+static rtx
+aarch64_strip_duplicate_vec_elt (rtx x)
+{
+  if (GET_CODE (x) == VEC_DUPLICATE
+      && is_a<scalar_mode> (GET_MODE (XEXP (x, 0))))
+    {
+      x = XEXP (x, 0);
+      if (GET_CODE (x) == VEC_SELECT)
+       x = XEXP (x, 0);
+      else if ((GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
+              && GET_CODE (XEXP (x, 0)) == VEC_SELECT)
+       x = XEXP (XEXP (x, 0), 0);
+    }
+  return x;
+}
+
  /* Return true iff CODE is a shift supported in combination
     with arithmetic instructions.  */
  
@@ -12114,15 +12134,11 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
        if (vec_flags & VEC_ADVSIMD)
         {
           /* The by-element versions of the instruction have the same costs as
-            the normal 3-vector version.  So don't add the costs of the
-            duplicate into the costs of the multiply.  We make an assumption
-            that the input to the VEC_DUPLICATE is already on the FP & SIMD
-            side.  This means costing of a MUL by element pre RA is a bit
-            optimistic.  */
-         if (GET_CODE (op0) == VEC_DUPLICATE)
-           op0 = XEXP (op0, 0);
-         else if (GET_CODE (op1) == VEC_DUPLICATE)
-           op1 = XEXP (op1, 0);
+            the normal 3-vector version.  We make an assumption that the input
+            to the VEC_DUPLICATE is already on the FP & SIMD side.  This means
+            costing of a MUL by element pre RA is a bit optimistic.  */
+         op0 = aarch64_strip_duplicate_vec_elt (op0);
+         op1 = aarch64_strip_duplicate_vec_elt (op1);
         }
        cost += rtx_cost (op0, mode, MULT, 0, speed);
        cost += rtx_cost (op1, mode, MULT, 1, speed);
diff --git a/gcc/testsuite/gcc.target/aarch64/vmul_element_cost.c b/gcc/testsuite/gcc.target/aarch64/vmul_element_cost.c

new file mode 100644 (file)

index 0000000..c153775
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vmul_element_cost.c
@@ -0,0 +1,94 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+#define TEST_MUL_UNIFORM(name, q, vectype, ts) \
+  vectype test_ ## name ## q ## _ ## ts (vectype a, vectype b, vectype c) \
+       { \
+               vectype t0 = name ## q ## _n_ ## ts (a, c[1]); \
+               vectype t1 = name ## q ## _n_ ## ts (b, c[1]); \
+               return vmul ## q ## _ ## ts (t0, t1); \
+       }
+
+TEST_MUL_UNIFORM (vmul, , int16x4_t, s16)
+TEST_MUL_UNIFORM (vmul, , uint16x4_t, u16)
+TEST_MUL_UNIFORM (vmul, , int32x2_t, s32)
+TEST_MUL_UNIFORM (vmul, , uint32x2_t, u32)
+TEST_MUL_UNIFORM (vmul, , float32x2_t, f32)
+TEST_MUL_UNIFORM (vmul, q, int16x8_t, s16)
+TEST_MUL_UNIFORM (vmul, q, uint16x8_t, u16)
+TEST_MUL_UNIFORM (vmul, q, int32x4_t, s32)
+TEST_MUL_UNIFORM (vmul, q, uint32x4_t, u32)
+TEST_MUL_UNIFORM (vmul, q, float32x4_t, f32)
+TEST_MUL_UNIFORM (vmul, q, float64x2_t, f64)
+
+#define TEST_MLX_UNIFORM(name, q, vectype, ts) \
+  vectype test_ ## name ## q ## _ ## ts (vectype acc, vectype a, vectype b) \
+       { \
+               acc = name ## q ## _n_ ## ts (acc, a, b[1]); \
+               return name ## q ## _n_ ## ts (acc, a, b[1]); \
+       }
+
+TEST_MLX_UNIFORM (vmla, , int16x4_t, s16)
+TEST_MLX_UNIFORM (vmla, , uint16x4_t, u16)
+TEST_MLX_UNIFORM (vmla, , int32x2_t, s32)
+TEST_MLX_UNIFORM (vmla, , uint32x2_t, u32)
+TEST_MLX_UNIFORM (vmla, , float32x2_t, f32)
+TEST_MLX_UNIFORM (vmla, q, int16x8_t, s16)
+TEST_MLX_UNIFORM (vmla, q, uint16x8_t, u16)
+TEST_MLX_UNIFORM (vmla, q, int32x4_t, s32)
+TEST_MLX_UNIFORM (vmla, q, uint32x4_t, u32)
+TEST_MLX_UNIFORM (vmla, q, float32x4_t, f32)
+
+TEST_MLX_UNIFORM (vmls, , int16x4_t, s16)
+TEST_MLX_UNIFORM (vmls, , uint16x4_t, u16)
+TEST_MLX_UNIFORM (vmls, , int32x2_t, s32)
+TEST_MLX_UNIFORM (vmls, , uint32x2_t, u32)
+TEST_MLX_UNIFORM (vmls, , float32x2_t, f32)
+TEST_MLX_UNIFORM (vmls, q, int16x8_t, s16)
+TEST_MLX_UNIFORM (vmls, q, uint16x8_t, u16)
+TEST_MLX_UNIFORM (vmls, q, int32x4_t, s32)
+TEST_MLX_UNIFORM (vmls, q, uint32x4_t, u32)
+TEST_MLX_UNIFORM (vmls, q, float32x4_t, f32)
+
+#define TEST_MUL_LONG(name, rettype, intype, ts, rs) \
+  rettype test_ ## name ## ts (intype a, intype b, intype c) \
+       { \
+               rettype t0 = name ## ts (a, c[1]); \
+               rettype t1 = name ## ts (b, c[1]); \
+               return vqaddq ## _ ## rs (t0, t1); \
+       }
+
+TEST_MUL_LONG (vmull_n_, int32x4_t, int16x4_t, s16, s32)
+TEST_MUL_LONG (vmull_n_, uint32x4_t, uint16x4_t, u16, u32)
+TEST_MUL_LONG (vmull_n_, int64x2_t, int32x2_t, s32, s64)
+TEST_MUL_LONG (vmull_n_, uint64x2_t, uint32x2_t, u32, u64)
+
+TEST_MUL_LONG (vqdmull_n_, int32x4_t, int16x4_t, s16, s32)
+TEST_MUL_LONG (vqdmull_n_, int64x2_t, int32x2_t, s32, s64)
+
+#define TEST_MLX_LONG(name, rettype, intype, ts, rs) \
+  rettype test_ ## name ## _ ## ts (rettype acc, intype a, intype b) \
+       { \
+               acc = name ## ts (acc, a, b[1]); \
+               return name ## ts (acc, a, b[1]); \
+       }
+
+TEST_MLX_LONG (vmlal_n_, int32x4_t, int16x4_t, s16, s32)
+TEST_MLX_LONG (vmlal_n_, uint32x4_t, uint16x4_t, u16, u32)
+TEST_MLX_LONG (vmlal_n_, int64x2_t, int32x2_t, s32, s64)
+TEST_MLX_LONG (vmlal_n_, uint64x2_t, uint32x2_t, u32, u64)
+
+TEST_MLX_LONG (vmlsl_n_, int32x4_t, int16x4_t, s16, s32)
+TEST_MLX_LONG (vmlsl_n_, uint32x4_t, uint16x4_t, u16, u32)
+TEST_MLX_LONG (vmlsl_n_, int64x2_t, int32x2_t, s32, s64)
+TEST_MLX_LONG (vmlsl_n_, uint64x2_t, uint32x2_t, u32, u64)
+
+TEST_MLX_LONG (vqdmlal_n_, int32x4_t, int16x4_t, s16, s32)
+TEST_MLX_LONG (vqdmlal_n_, int64x2_t, int32x2_t, s32, s64)
+
+TEST_MLX_LONG (vqdmlsl_n_, int32x4_t, int16x4_t, s16, s32)
+TEST_MLX_LONG (vqdmlsl_n_, int64x2_t, int32x2_t, s32, s64)
+
+/* { dg-final { scan-assembler-not "dup\\t" } } */
author	Jonathan Wright <jonathan.wright@arm.com>
	Mon, 19 Jul 2021 13:01:52 +0000 (14:01 +0100)
committer	Jonathan Wright <jonathan.wright@arm.com>
	Wed, 4 Aug 2021 15:57:38 +0000 (16:57 +0100)
gcc/config/aarch64/aarch64.c		patch \| blob \| history
gcc/testsuite/gcc.target/aarch64/vmul_element_cost.c	[new file with mode: 0644]	patch \| blob