#include "gimple-pretty-print.h"
#include "tree-ssa-loop-niter.h"
#include "fractional-cost.h"
+#include "rtlanal.h"
/* This file should be included last. */
#include "target-def.h"
return x;
}
+/* Helper function for rtx cost calculation. Strip extension as well as any
+ inner VEC_SELECT high-half from X. Returns the inner vector operand if
+ successful, or the original expression on failure. */
+static rtx
+aarch64_strip_extend_vec_half (rtx x)
+{
+ if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
+ {
+ x = XEXP (x, 0);
+ if (GET_CODE (x) == VEC_SELECT
+ && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)),
+ XEXP (x, 1)))
+ x = XEXP (x, 0);
+ }
+ return x;
+}
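+
+/* For example, on a little-endian target the helper reduces an RTX of
+   the shape
+
+     (zero_extend:V8HI (vec_select:V8QI (reg:V16QI x)
+                                        (parallel [8 9 10 11 12 13 14 15])))
+
+   to (reg:V16QI x); expressions that do not match this shape are
+   returned unchanged.  */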
/* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as
any subsequent extend and VEC_SELECT from X. Returns the inner scalar
unsigned int vec_flags = aarch64_classify_vector_mode (mode);
if (vec_flags & VEC_ADVSIMD)
{
+ /* The select-operand-high-half versions of the instruction have the
+ same cost as the normal 3-vector version, so don't add the costs of
+ the extension or selection into the cost of the multiply. */
+ op0 = aarch64_strip_extend_vec_half (op0);
+ op1 = aarch64_strip_extend_vec_half (op1);
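+ /* For example, SMULL2 consumes the high halves of its source
+ registers directly, so a widening multiply of two high-half
+ selections is a single instruction costing the same as SMULL. */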
/* The by-element versions of the instruction have the same costs as
the normal 3-vector version. We make an assumption that the input
to the VEC_DUPLICATE is already on the FP & SIMD side. This means
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
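+
+/* The widening multiplies and multiply-accumulates below operate on
+   vget_high_* halves of their inputs, so they should map to the
+   high-half ("2") forms of the instructions; no "dup" moving the high
+   half into a fresh register should appear (checked by the
+   scan-assembler-not directive at the end of the file).  */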
+
+#define TEST_MULL_VEC(name, rettype, intype, ts, rs) \
+ rettype test_ ## name ## _ ## ts (intype a, intype b, intype c) \
+ { \
+ rettype t0 = name ## _ ## ts (vget_high_ ## ts (a), \
+ vget_high_ ## ts (c)); \
+ rettype t1 = name ## _ ## ts (vget_high_ ## ts (b), \
+ vget_high_ ## ts (c)); \
+ return vqaddq ## _ ## rs (t0, t1); \
+ }
+
+TEST_MULL_VEC (vmull, int16x8_t, int8x16_t, s8, s16)
+TEST_MULL_VEC (vmull, uint16x8_t, uint8x16_t, u8, u16)
+TEST_MULL_VEC (vmull, int32x4_t, int16x8_t, s16, s32)
+TEST_MULL_VEC (vmull, uint32x4_t, uint16x8_t, u16, u32)
+TEST_MULL_VEC (vmull, int64x2_t, int32x4_t, s32, s64)
+TEST_MULL_VEC (vmull, uint64x2_t, uint32x4_t, u32, u64)
+
+TEST_MULL_VEC (vqdmull, int32x4_t, int16x8_t, s16, s32)
+TEST_MULL_VEC (vqdmull, int64x2_t, int32x4_t, s32, s64)
+
+#define TEST_MULL_N(name, rettype, intype, ts, rs) \
+ rettype test_ ## name ## _ ## ts (intype a, intype b, intype c) \
+ { \
+ rettype t0 = name ## _ ## ts (vget_high_ ## ts (a), b[1]); \
+ rettype t1 = name ## _ ## ts (vget_high_ ## ts (a), c[1]); \
+ return vqaddq ## _ ## rs (t0, t1); \
+ }
+
+TEST_MULL_N (vmull_n, int32x4_t, int16x8_t, s16, s32)
+TEST_MULL_N (vmull_n, uint32x4_t, uint16x8_t, u16, u32)
+TEST_MULL_N (vmull_n, int64x2_t, int32x4_t, s32, s64)
+TEST_MULL_N (vmull_n, uint64x2_t, uint32x4_t, u32, u64)
+
+TEST_MULL_N (vqdmull_n, int32x4_t, int16x8_t, s16, s32)
+TEST_MULL_N (vqdmull_n, int64x2_t, int32x4_t, s32, s64)
+
+#define TEST_MLXL_VEC(name, rettype, intype, ts) \
+ rettype test_ ## name ## _ ## ts (rettype acc, intype a, intype b, \
+ intype c) \
+ { \
+ acc = name ## _ ## ts (acc, vget_high_ ## ts (a), \
+ vget_high_ ## ts (b)); \
+ return name ## _ ## ts (acc, vget_high_ ## ts (a), \
+ vget_high_ ## ts (c)); \
+ }
+
+TEST_MLXL_VEC (vmlal, int16x8_t, int8x16_t, s8)
+TEST_MLXL_VEC (vmlal, uint16x8_t, uint8x16_t, u8)
+TEST_MLXL_VEC (vmlal, int32x4_t, int16x8_t, s16)
+TEST_MLXL_VEC (vmlal, uint32x4_t, uint16x8_t, u16)
+
+TEST_MLXL_VEC (vmlsl, int16x8_t, int8x16_t, s8)
+TEST_MLXL_VEC (vmlsl, uint16x8_t, uint8x16_t, u8)
+TEST_MLXL_VEC (vmlsl, int32x4_t, int16x8_t, s16)
+TEST_MLXL_VEC (vmlsl, uint32x4_t, uint16x8_t, u16)
+
+#define TEST_MLXL_N(name, rettype, intype, ts) \
+ rettype test_ ## name ## _ ## ts (rettype acc, intype a, intype b) \
+ { \
+ acc = name ## _ ## ts (acc, vget_high_ ## ts (a), b[1]); \
+ return name ## _ ## ts (acc, vget_high_ ## ts (a), b[1]); \
+ }
+
+TEST_MLXL_N (vmlal_n, int32x4_t, int16x8_t, s16)
+TEST_MLXL_N (vmlal_n, uint32x4_t, uint16x8_t, u16)
+TEST_MLXL_N (vmlal_n, int64x2_t, int32x4_t, s32)
+TEST_MLXL_N (vmlal_n, uint64x2_t, uint32x4_t, u32)
+
+TEST_MLXL_N (vmlsl_n, int32x4_t, int16x8_t, s16)
+TEST_MLXL_N (vmlsl_n, uint32x4_t, uint16x8_t, u16)
+TEST_MLXL_N (vmlsl_n, int64x2_t, int32x4_t, s32)
+TEST_MLXL_N (vmlsl_n, uint64x2_t, uint32x4_t, u32)
+
+TEST_MLXL_N (vqdmlal_n, int32x4_t, int16x8_t, s16)
+TEST_MLXL_N (vqdmlal_n, int64x2_t, int32x4_t, s32)
+
+TEST_MLXL_N (vqdmlsl_n, int32x4_t, int16x8_t, s16)
+TEST_MLXL_N (vqdmlsl_n, int64x2_t, int32x4_t, s32)
+
+/* { dg-final { scan-assembler-not "dup\\t" } } */