From 7313381d2ce44b72b4c9f70bd5670e5d78d1f631 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Mon, 18 Jul 2022 12:57:10 +0100 Subject: [PATCH] arm: Replace arm_builtin_vectorized_function [PR106253] MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This patch extends the fix for PR106253 to AArch32. As with AArch64, we were using ACLE intrinsics to vectorise scalar built-ins, even though the two sometimes have different ECF_* flags. (That in turn is because the ACLE intrinsics should follow the instruction semantics as closely as possible, whereas the scalar built-ins follow language specs.) The patch also removes the copysignf built-in, which only existed for this purpose and wasn't a “real” arm_neon.h built-in. Doing this also has the side-effect of enabling vectorisation of rint and roundeven. Logically that should be a separate patch, but making it one would have meant adding a new int iterator for the original set of instructions and then removing it again when including new functions. I've restricted the bswap tests to little-endian because we end up with excessive spilling on big-endian. E.g.: sub sp, sp, #8 vstr d1, [sp] vldr d16, [sp] vrev16.8 d16, d16 vstr d16, [sp] vldr d0, [sp] add sp, sp, #8 @ sp needed bx lr Similarly, the copysign tests require little-endian because on big-endian we unnecessarily load the constant from the constant pool: vldr.32 s15, .L3 vdup.32 d0, d7[1] vbsl d0, d2, d1 bx lr .L3: .word -2147483648 gcc/ PR target/106253 * config/arm/arm-builtins.cc (arm_builtin_vectorized_function): Delete. * config/arm/arm-protos.h (arm_builtin_vectorized_function): Delete. * config/arm/arm.cc (TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION): Delete. * config/arm/arm_neon_builtins.def (copysignf): Delete. * config/arm/iterators.md (nvrint_pattern): New attribute. * config/arm/neon.md (2): New pattern. (l2): Likewise. (neon_copysignf): Rename to... (copysign3): ...this. gcc/testsuite/ PR target/106253 * gcc.target/arm/vect_unary_1.c: New test. * gcc.target/arm/vect_binary_1.c: Likewise. --- gcc/config/arm/arm-builtins.cc | 123 --------------- gcc/config/arm/arm-protos.h | 1 - gcc/config/arm/arm.cc | 4 - gcc/config/arm/arm_neon_builtins.def | 1 - gcc/config/arm/iterators.md | 7 + gcc/config/arm/neon.md | 17 +- gcc/testsuite/gcc.target/arm/vect_binary_1.c | 50 ++++++ gcc/testsuite/gcc.target/arm/vect_unary_1.c | 224 +++++++++++++++++++++++++++ 8 files changed, 297 insertions(+), 130 deletions(-) create mode 100644 gcc/testsuite/gcc.target/arm/vect_binary_1.c create mode 100644 gcc/testsuite/gcc.target/arm/vect_unary_1.c diff --git a/gcc/config/arm/arm-builtins.cc b/gcc/config/arm/arm-builtins.cc index d917137..8f8155c 100644 --- a/gcc/config/arm/arm-builtins.cc +++ b/gcc/config/arm/arm-builtins.cc @@ -4026,129 +4026,6 @@ arm_expand_builtin (tree exp, return NULL_RTX; } -tree -arm_builtin_vectorized_function (unsigned int fn, tree type_out, tree type_in) -{ - machine_mode in_mode, out_mode; - int in_n, out_n; - bool out_unsigned_p = TYPE_UNSIGNED (type_out); - - /* Can't provide any vectorized builtins when we can't use NEON. */ - if (!TARGET_NEON) - return NULL_TREE; - - if (TREE_CODE (type_out) != VECTOR_TYPE - || TREE_CODE (type_in) != VECTOR_TYPE) - return NULL_TREE; - - out_mode = TYPE_MODE (TREE_TYPE (type_out)); - out_n = TYPE_VECTOR_SUBPARTS (type_out); - in_mode = TYPE_MODE (TREE_TYPE (type_in)); - in_n = TYPE_VECTOR_SUBPARTS (type_in); - -/* ARM_CHECK_BUILTIN_MODE and ARM_FIND_VRINT_VARIANT are used to find the - decl of the vectorized builtin for the appropriate vector mode. - NULL_TREE is returned if no such builtin is available. */ -#undef ARM_CHECK_BUILTIN_MODE -#define ARM_CHECK_BUILTIN_MODE(C) \ - (TARGET_VFP5 \ - && flag_unsafe_math_optimizations \ - && ARM_CHECK_BUILTIN_MODE_1 (C)) - -#undef ARM_CHECK_BUILTIN_MODE_1 -#define ARM_CHECK_BUILTIN_MODE_1(C) \ - (out_mode == SFmode && out_n == C \ - && in_mode == SFmode && in_n == C) - -#undef ARM_FIND_VRINT_VARIANT -#define ARM_FIND_VRINT_VARIANT(N) \ - (ARM_CHECK_BUILTIN_MODE (2) \ - ? arm_builtin_decl(ARM_BUILTIN_NEON_##N##v2sf, false) \ - : (ARM_CHECK_BUILTIN_MODE (4) \ - ? arm_builtin_decl(ARM_BUILTIN_NEON_##N##v4sf, false) \ - : NULL_TREE)) - - switch (fn) - { - CASE_CFN_FLOOR: - return ARM_FIND_VRINT_VARIANT (vrintm); - CASE_CFN_CEIL: - return ARM_FIND_VRINT_VARIANT (vrintp); - CASE_CFN_TRUNC: - return ARM_FIND_VRINT_VARIANT (vrintz); - CASE_CFN_ROUND: - return ARM_FIND_VRINT_VARIANT (vrinta); -#undef ARM_CHECK_BUILTIN_MODE_1 -#define ARM_CHECK_BUILTIN_MODE_1(C) \ - (out_mode == SImode && out_n == C \ - && in_mode == SFmode && in_n == C) - -#define ARM_FIND_VCVT_VARIANT(N) \ - (ARM_CHECK_BUILTIN_MODE (2) \ - ? arm_builtin_decl(ARM_BUILTIN_NEON_##N##v2sfv2si, false) \ - : (ARM_CHECK_BUILTIN_MODE (4) \ - ? arm_builtin_decl(ARM_BUILTIN_NEON_##N##v4sfv4si, false) \ - : NULL_TREE)) - -#define ARM_FIND_VCVTU_VARIANT(N) \ - (ARM_CHECK_BUILTIN_MODE (2) \ - ? arm_builtin_decl(ARM_BUILTIN_NEON_##N##uv2sfv2si, false) \ - : (ARM_CHECK_BUILTIN_MODE (4) \ - ? arm_builtin_decl(ARM_BUILTIN_NEON_##N##uv4sfv4si, false) \ - : NULL_TREE)) - CASE_CFN_LROUND: - return (out_unsigned_p - ? ARM_FIND_VCVTU_VARIANT (vcvta) - : ARM_FIND_VCVT_VARIANT (vcvta)); - CASE_CFN_LCEIL: - return (out_unsigned_p - ? ARM_FIND_VCVTU_VARIANT (vcvtp) - : ARM_FIND_VCVT_VARIANT (vcvtp)); - CASE_CFN_LFLOOR: - return (out_unsigned_p - ? ARM_FIND_VCVTU_VARIANT (vcvtm) - : ARM_FIND_VCVT_VARIANT (vcvtm)); -#undef ARM_CHECK_BUILTIN_MODE -#define ARM_CHECK_BUILTIN_MODE(C, N) \ - (out_mode == N##mode && out_n == C \ - && in_mode == N##mode && in_n == C) - case CFN_BUILT_IN_BSWAP16: - if (ARM_CHECK_BUILTIN_MODE (4, HI)) - return arm_builtin_decl (ARM_BUILTIN_NEON_bswapv4hi, false); - else if (ARM_CHECK_BUILTIN_MODE (8, HI)) - return arm_builtin_decl (ARM_BUILTIN_NEON_bswapv8hi, false); - else - return NULL_TREE; - case CFN_BUILT_IN_BSWAP32: - if (ARM_CHECK_BUILTIN_MODE (2, SI)) - return arm_builtin_decl (ARM_BUILTIN_NEON_bswapv2si, false); - else if (ARM_CHECK_BUILTIN_MODE (4, SI)) - return arm_builtin_decl (ARM_BUILTIN_NEON_bswapv4si, false); - else - return NULL_TREE; - case CFN_BUILT_IN_BSWAP64: - if (ARM_CHECK_BUILTIN_MODE (2, DI)) - return arm_builtin_decl (ARM_BUILTIN_NEON_bswapv2di, false); - else - return NULL_TREE; - CASE_CFN_COPYSIGN: - if (ARM_CHECK_BUILTIN_MODE (2, SF)) - return arm_builtin_decl (ARM_BUILTIN_NEON_copysignfv2sf, false); - else if (ARM_CHECK_BUILTIN_MODE (4, SF)) - return arm_builtin_decl (ARM_BUILTIN_NEON_copysignfv4sf, false); - else - return NULL_TREE; - - default: - return NULL_TREE; - } - return NULL_TREE; -} -#undef ARM_FIND_VCVT_VARIANT -#undef ARM_FIND_VCVTU_VARIANT -#undef ARM_CHECK_BUILTIN_MODE -#undef ARM_FIND_VRINT_VARIANT - void arm_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update) { diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index 9d14209..f8aabbd 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -103,7 +103,6 @@ extern void neon_pairwise_reduce (rtx, rtx, machine_mode, rtx (*) (rtx, rtx, rtx)); extern rtx mve_bool_vec_to_const (rtx const_vec); extern rtx neon_make_constant (rtx, bool generate = true); -extern tree arm_builtin_vectorized_function (unsigned int, tree, tree); extern void neon_expand_vector_init (rtx, rtx); extern void neon_lane_bounds (rtx, HOST_WIDE_INT, HOST_WIDE_INT, const_tree); extern void arm_const_bounds (rtx, HOST_WIDE_INT, HOST_WIDE_INT); diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc index 33fb98d..eca99c9 100644 --- a/gcc/config/arm/arm.cc +++ b/gcc/config/arm/arm.cc @@ -739,10 +739,6 @@ static const struct attribute_spec arm_attribute_table[] = #undef TARGET_VECTORIZE_BUILTINS #define TARGET_VECTORIZE_BUILTINS -#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION -#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \ - arm_builtin_vectorized_function - #undef TARGET_VECTOR_ALIGNMENT #define TARGET_VECTOR_ALIGNMENT arm_vector_alignment diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index 445b2bf..2e642cc 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -264,7 +264,6 @@ VAR1 (UNOP, vcvtv4hf, v4sf) VAR10 (TERNOP, vbsl, v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) VAR2 (TERNOP, vbsl, v8hf, v4hf) -VAR2 (UNOP, copysignf, v2sf, v4sf) VAR2 (UNOP, vrintn, v2sf, v4sf) VAR2 (UNOP, vrinta, v2sf, v4sf) VAR2 (UNOP, vrintp, v2sf, v4sf) diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 37cf797..29062cd 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -1150,6 +1150,13 @@ (UNSPEC_VRINTA "unconditional") (UNSPEC_VRINTM "unconditional") (UNSPEC_VRINTR "nocond") (UNSPEC_VRINTX "nocond")]) +(define_int_attr nvrint_pattern [(UNSPEC_NVRINTZ "btrunc") + (UNSPEC_NVRINTP "ceil") + (UNSPEC_NVRINTA "round") + (UNSPEC_NVRINTM "floor") + (UNSPEC_NVRINTX "rint") + (UNSPEC_NVRINTN "roundeven")]) + (define_int_attr nvrint_variant [(UNSPEC_NVRINTZ "z") (UNSPEC_NVRINTP "p") (UNSPEC_NVRINTA "a") (UNSPEC_NVRINTM "m") (UNSPEC_NVRINTX "x") (UNSPEC_NVRINTN "n")]) diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 275bcc1..e1dae28 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -635,6 +635,13 @@ [(set_attr "type" "neon_fp_mla_s")] ) +(define_expand "2" + [(set (match_operand:VCVTF 0 "s_register_operand") + (unspec:VCVTF [(match_operand:VCVTF 1 "s_register_operand")] + NEON_VRINT))] + "TARGET_NEON && TARGET_VFP5 && flag_unsafe_math_optimizations" +) + (define_insn "neon_vrint" [(set (match_operand:VCVTF 0 "s_register_operand" "=w") (unspec:VCVTF [(match_operand:VCVTF 1 @@ -645,6 +652,14 @@ [(set_attr "type" "neon_fp_round_")] ) +(define_expand "l2" + [(set (match_operand: 0 "register_operand") + (FIXUORS: + (unspec:VCVTF [(match_operand:VCVTF 1 "register_operand")] + NEON_VCVT)))] + "TARGET_NEON && TARGET_VFP5 && flag_unsafe_math_optimizations" +) + (define_insn "neon_vcvt" [(set (match_operand: 0 "register_operand" "=w") (FIXUORS: (unspec:VCVTF @@ -3059,7 +3074,7 @@ "TARGET_I8MM" ) -(define_expand "neon_copysignf" +(define_expand "copysign3" [(match_operand:VCVTF 0 "register_operand") (match_operand:VCVTF 1 "register_operand") (match_operand:VCVTF 2 "register_operand")] diff --git a/gcc/testsuite/gcc.target/arm/vect_binary_1.c b/gcc/testsuite/gcc.target/arm/vect_binary_1.c new file mode 100644 index 0000000..c1fc905 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/vect_binary_1.c @@ -0,0 +1,50 @@ +/* { dg-do compile { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_hard_ok } */ +/* { dg-require-effective-target arm_v8_neon_ok } */ +/* { dg-add-options arm_v8_neon } */ +/* { dg-additional-options "-O3 -mfloat-abi=hard" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +#include + +#define TEST2(OUT, NAME, IN) \ +OUT __attribute__((vector_size(sizeof(OUT) * 2))) \ +test2_##OUT##_##NAME##_##IN (float dummy, \ + IN __attribute__((vector_size(sizeof(IN) * 2))) y, \ + IN __attribute__((vector_size(sizeof(IN) * 2))) z) \ +{ \ + OUT __attribute__((vector_size(sizeof(OUT) * 2))) x; \ + x[0] = __builtin_##NAME (y[0], z[0]); \ + x[1] = __builtin_##NAME (y[1], z[1]); \ + return x; \ +} + +#define TEST4(OUT, NAME, IN) \ +OUT __attribute__((vector_size(sizeof(OUT) * 4))) \ +test4_##OUT##_##NAME##_##IN (float dummy, \ + IN __attribute__((vector_size(sizeof(OUT) * 4))) y, \ + IN __attribute__((vector_size(sizeof(OUT) * 4))) z) \ +{ \ + OUT __attribute__((vector_size(sizeof(OUT) * 4))) x; \ + x[0] = __builtin_##NAME (y[0], z[0]); \ + x[1] = __builtin_##NAME (y[1], z[1]); \ + x[2] = __builtin_##NAME (y[2], z[2]); \ + x[3] = __builtin_##NAME (y[3], z[3]); \ + return x; \ +} + +/* +** test2_float_copysignf_float: { target arm_little_endian } +** vmov.i32 d0, #(0x80000000|2147483648)(\s+.*) +** vbsl d0, d2, d1 +** bx lr +*/ +TEST2 (float, copysignf, float) + +/* +** test4_float_copysignf_float: { target arm_little_endian } +** vmov.i32 q0, #(0x80000000|2147483648)(\s+.*) +** vbsl q0, q2, q1 +** bx lr +*/ +TEST4 (float, copysignf, float) diff --git a/gcc/testsuite/gcc.target/arm/vect_unary_1.c b/gcc/testsuite/gcc.target/arm/vect_unary_1.c new file mode 100644 index 0000000..4677180 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/vect_unary_1.c @@ -0,0 +1,224 @@ +/* { dg-do compile { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_hard_ok } */ +/* { dg-require-effective-target arm_v8_neon_ok } */ +/* { dg-add-options arm_v8_neon } */ +/* { dg-additional-options "-Ofast -mfloat-abi=hard" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +#include + +#define TEST2(OUT, NAME, IN) \ +OUT __attribute__((vector_size(sizeof(OUT) * 2))) \ +test2_##OUT##_##NAME##_##IN (float dummy, \ + IN __attribute__((vector_size(sizeof(IN) * 2))) y) \ +{ \ + OUT __attribute__((vector_size(sizeof(OUT) * 2))) x; \ + x[0] = __builtin_##NAME (y[0]); \ + x[1] = __builtin_##NAME (y[1]); \ + return x; \ +} + +#define TEST4(OUT, NAME, IN) \ +OUT __attribute__((vector_size(sizeof(OUT) * 4))) \ +test4_##OUT##_##NAME##_##IN (float dummy, \ + IN __attribute__((vector_size(sizeof(OUT) * 4))) y) \ +{ \ + OUT __attribute__((vector_size(sizeof(OUT) * 4))) x; \ + x[0] = __builtin_##NAME (y[0]); \ + x[1] = __builtin_##NAME (y[1]); \ + x[2] = __builtin_##NAME (y[2]); \ + x[3] = __builtin_##NAME (y[3]); \ + return x; \ +} + +#define TEST8(OUT, NAME, IN) \ +OUT __attribute__((vector_size(sizeof(OUT) * 8))) \ +test8_##OUT##_##NAME##_##IN (float dummy, \ + IN __attribute__((vector_size(sizeof(OUT) * 8))) y) \ +{ \ + OUT __attribute__((vector_size(sizeof(OUT) * 8))) x; \ + x[0] = __builtin_##NAME (y[0]); \ + x[1] = __builtin_##NAME (y[1]); \ + x[2] = __builtin_##NAME (y[2]); \ + x[3] = __builtin_##NAME (y[3]); \ + x[4] = __builtin_##NAME (y[4]); \ + x[5] = __builtin_##NAME (y[5]); \ + x[6] = __builtin_##NAME (y[6]); \ + x[7] = __builtin_##NAME (y[7]); \ + return x; \ +} + +/* +** test2_float_truncf_float: +** vrintz.f32 d0, d1 +** bx lr +*/ +TEST2 (float, truncf, float) + +/* +** test4_float_truncf_float: +** vrintz.f32 q0, q1 +** bx lr +*/ +TEST4 (float, truncf, float) + +/* +** test2_float_roundf_float: +** vrinta.f32 d0, d1 +** bx lr +*/ +TEST2 (float, roundf, float) + +/* +** test4_float_roundf_float: +** vrinta.f32 q0, q1 +** bx lr +*/ +TEST4 (float, roundf, float) + +/* +** test2_float_floorf_float: +** vrintm.f32 d0, d1 +** bx lr +*/ +TEST2 (float, floorf, float) + +/* +** test4_float_floorf_float: +** vrintm.f32 q0, q1 +** bx lr +*/ +TEST4 (float, floorf, float) + +/* +** test2_float_ceilf_float: +** vrintp.f32 d0, d1 +** bx lr +*/ +TEST2 (float, ceilf, float) + +/* +** test4_float_ceilf_float: +** vrintp.f32 q0, q1 +** bx lr +*/ +TEST4 (float, ceilf, float) + +/* +** test2_float_rintf_float: +** vrintx.f32 d0, d1 +** bx lr +*/ +TEST2 (float, rintf, float) + +/* +** test4_float_rintf_float: +** vrintx.f32 q0, q1 +** bx lr +*/ +TEST4 (float, rintf, float) + +/* +** test2_float_roundevenf_float: +** vrintn.f32 d0, d1 +** bx lr +*/ +TEST2 (float, roundevenf, float) + +/* +** test4_float_roundevenf_float: +** vrintn.f32 q0, q1 +** bx lr +*/ +TEST4 (float, roundevenf, float) + +/* +** test2_int_roundf_float: +** vcvta.s32.f32 d0, d1 +** bx lr +*/ +TEST2 (int, roundf, float) + +/* +** test4_int_roundf_float: +** vcvta.s32.f32 q0, q1 +** bx lr +*/ +TEST4 (int, roundf, float) + +/* +** test2_int_floorf_float: +** vcvtm.s32.f32 d0, d1 +** bx lr +*/ +TEST2 (int, floorf, float) + +/* +** test4_int_floorf_float: +** vcvtm.s32.f32 q0, q1 +** bx lr +*/ +TEST4 (int, floorf, float) + +/* +** test2_int_ceilf_float: +** vcvtp.s32.f32 d0, d1 +** bx lr +*/ +TEST2 (int, ceilf, float) + +/* +** test4_int_ceilf_float: +** vcvtp.s32.f32 q0, q1 +** bx lr +*/ +TEST4 (int, ceilf, float) + +/* +** test2_int_clz_int: +** vclz.i32 d0, d1 +** bx lr +*/ +TEST2 (int, clz, int) + +/* +** test4_int_clz_int: +** vclz.i32 q0, q1 +** bx lr +*/ +TEST4 (int, clz, int) + +/* +** test4_int16_t_bswap16_int16_t: { target arm_little_endian } +** vrev16.8 d0, d1 +** bx lr +*/ +TEST4 (int16_t, bswap16, int16_t) + +/* +** test8_int16_t_bswap16_int16_t: { target arm_little_endian } +** vrev16.8 q0, q1 +** bx lr +*/ +TEST8 (int16_t, bswap16, int16_t) + +/* +** test2_int_bswap32_int: { target arm_little_endian } +** vrev32.8 d0, d1 +** bx lr +*/ +TEST2 (int, bswap32, int) + +/* +** test4_int_bswap32_int: { target arm_little_endian } +** vrev32.8 q0, q1 +** bx lr +*/ +TEST4 (int, bswap32, int) + +/* +** test2_int64_t_bswap64_int64_t: { target arm_little_endian } +** vrev64.8 q0, q1 +** bx lr +*/ +TEST2 (int64_t, bswap64, int64_t) -- 2.7.4