From eccf4d702029b48512d573382ef5534a5df23893 Mon Sep 17 00:00:00 2001
From: Kyrylo Tkachov <kyrylo.tkachov@arm.com>
Date: Thu, 11 Jan 2018 15:24:26 +0000
Subject: [PATCH] [arm][3/3] Implement fp16fml lane intrinsics

This patch implements the lane-wise fp16fml intrinsics. There are quite a
few of them, so I've split them off from the other, simpler fp16fml
intrinsics.

These intrinsics expose instructions such as:

  vfmal.f16 Dd, Sn, Sm[<index>]    0 <= index <= 1
  vfmal.f16 Qd, Dn, Dm[<index>]    0 <= index <= 3
  vfmsl.f16 Dd, Sn, Sm[<index>]    0 <= index <= 1
  vfmsl.f16 Qd, Dn, Dm[<index>]    0 <= index <= 3

These instructions extract a single half-precision floating-point value
from one of the source registers and perform a vfmal/vfmsl operation, as
per the normal variant, with that value.

The nuance here is that some of the intrinsics want to do things like:

  float32x2_t vfmlal_laneq_low_u32 (float32x2_t __r, float16x4_t __a,
                                    float16x8_t __b, const int __index)

where the float16x8_t value of '__b' is held in a Q register, so we need
to be a bit smart about finding the right D or S sub-register and
translating the lane number to a lane within that sub-register, rather
than just passing the language-level const-int down to the assembly
instruction. That's where most of the complexity of this patch comes
from, but hopefully it's orthogonal enough to make sense.

Bootstrapped and tested on arm-none-linux-gnueabihf as well as
armeb-none-eabi.

	* config/arm/arm_neon.h (vfmlal_lane_low_u32, vfmlal_lane_high_u32,
	vfmlalq_laneq_low_u32, vfmlalq_lane_low_u32, vfmlal_laneq_low_u32,
	vfmlalq_laneq_high_u32, vfmlalq_lane_high_u32, vfmlal_laneq_high_u32,
	vfmlsl_lane_low_u32, vfmlsl_lane_high_u32, vfmlslq_laneq_low_u32,
	vfmlslq_lane_low_u32, vfmlsl_laneq_low_u32, vfmlslq_laneq_high_u32,
	vfmlslq_lane_high_u32, vfmlsl_laneq_high_u32): Define.
	* config/arm/arm_neon_builtins.def (vfmal_lane_low,
	vfmal_lane_lowv4hf, vfmal_lane_lowv8hf, vfmal_lane_high,
	vfmal_lane_highv4hf, vfmal_lane_highv8hf, vfmsl_lane_low,
	vfmsl_lane_lowv4hf, vfmsl_lane_lowv8hf, vfmsl_lane_high,
	vfmsl_lane_highv4hf, vfmsl_lane_highv8hf): New sets of builtins.
	* config/arm/iterators.md (VFMLSEL2, vfmlsel2): New mode attributes.
	(V_lane_reg): Likewise.
	* config/arm/neon.md (neon_vfm<vfml_op>l_lane_<vfml_half><VCVTF:mode>):
	New define_expand.
	(neon_vfm<vfml_op>l_lane_<vfml_half><vfmlsel2><VCVTF:mode>): Likewise.
	(vfmal_lane_low<mode>_intrinsic,
	vfmal_lane_low<vfmlsel2><mode>_intrinsic,
	vfmal_lane_high<mode>_intrinsic,
	vfmal_lane_high<vfmlsel2><mode>_intrinsic, vfmsl_lane_low<mode>_intrinsic,
	vfmsl_lane_low<vfmlsel2><mode>_intrinsic,
	vfmsl_lane_high<mode>_intrinsic,
	vfmsl_lane_high<vfmlsel2><mode>_intrinsic): New define_insns.

	* gcc.target/arm/simd/fp16fml_lane_high.c: New test.
	* gcc.target/arm/simd/fp16fml_lane_low.c: New test.
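As a usage illustration of the trickier "laneq" forms described above (this
snippet and its function name are mine, not part of the patch; the option
spelling and the registers the compiler picks are assumptions on my part):

    #include <arm_neon.h>

    /* 'b' is a float16x8_t, i.e. it lives in a Q register, but the
       instruction form is vfmal.f16 Dd, Sn, Sm[<idx>] with idx in [0, 1].
       Lane 6 of the Q register therefore becomes lane 6 % 2 == 0 of
       S sub-register 6 / 2 == 3 within that Q register.  Semantically,
       for i in {0, 1}:
           r[i] += (float) a[i] * (float) b[6]    (low half of 'a').  */
    float32x2_t
    use_vfmlal_laneq_low (float32x2_t r, float16x4_t a, float16x8_t b)
    {
      return vfmlal_laneq_low_u32 (r, a, b, 6);
    }

Built with something like -O2 -march=armv8.2-a+fp16fml (plus a suitable
float ABI), this should assemble to a single "vfmal.f16 d<N>, s<even>,
s<odd>[0]", modulo register allocation; the new tests below check exactly
these shapes.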
From-SVN: r256540
---
 gcc/ChangeLog                                      |  26 ++
 gcc/config/arm/arm_neon.h                          | 144 ++++++++++
 gcc/config/arm/arm_neon_builtins.def               |  12 +
 gcc/config/arm/iterators.md                        |  10 +
 gcc/config/arm/neon.md                             | 308 +++++++++++++++++++++
 gcc/testsuite/ChangeLog                            |   5 +
 .../gcc.target/arm/simd/fp16fml_lane_high.c        |  63 +++++
 .../gcc.target/arm/simd/fp16fml_lane_low.c         |  63 +++++
 8 files changed, 631 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/fp16fml_lane_high.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/fp16fml_lane_low.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index f8767cc..a23405a 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,31 @@
 2018-01-11  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
 
+	* config/arm/arm_neon.h (vfmlal_lane_low_u32, vfmlal_lane_high_u32,
+	vfmlalq_laneq_low_u32, vfmlalq_lane_low_u32, vfmlal_laneq_low_u32,
+	vfmlalq_laneq_high_u32, vfmlalq_lane_high_u32, vfmlal_laneq_high_u32,
+	vfmlsl_lane_low_u32, vfmlsl_lane_high_u32, vfmlslq_laneq_low_u32,
+	vfmlslq_lane_low_u32, vfmlsl_laneq_low_u32, vfmlslq_laneq_high_u32,
+	vfmlslq_lane_high_u32, vfmlsl_laneq_high_u32): Define.
+	* config/arm/arm_neon_builtins.def (vfmal_lane_low,
+	vfmal_lane_lowv4hf, vfmal_lane_lowv8hf, vfmal_lane_high,
+	vfmal_lane_highv4hf, vfmal_lane_highv8hf, vfmsl_lane_low,
+	vfmsl_lane_lowv4hf, vfmsl_lane_lowv8hf, vfmsl_lane_high,
+	vfmsl_lane_highv4hf, vfmsl_lane_highv8hf): New sets of builtins.
+	* config/arm/iterators.md (VFMLSEL2, vfmlsel2): New mode attributes.
+	(V_lane_reg): Likewise.
+	* config/arm/neon.md (neon_vfm<vfml_op>l_lane_<vfml_half><VCVTF:mode>):
+	New define_expand.
+	(neon_vfm<vfml_op>l_lane_<vfml_half><vfmlsel2><VCVTF:mode>): Likewise.
+	(vfmal_lane_low<mode>_intrinsic,
+	vfmal_lane_low<vfmlsel2><mode>_intrinsic,
+	vfmal_lane_high<mode>_intrinsic,
+	vfmal_lane_high<vfmlsel2><mode>_intrinsic, vfmsl_lane_low<mode>_intrinsic,
+	vfmsl_lane_low<vfmlsel2><mode>_intrinsic,
+	vfmsl_lane_high<mode>_intrinsic,
+	vfmsl_lane_high<vfmlsel2><mode>_intrinsic): New define_insns.
+
+2018-01-11  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
+
 	* config/arm/arm-cpus.in (fp16fml): New feature.
 	(ALL_SIMD): Add fp16fml.
 	(armv8.2-a): Add fp16fml as an option.
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 954193c..6213a4a 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -18160,6 +18160,150 @@ vfmlslq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b)
   return __builtin_neon_vfmsl_highv4sf (__r, __a, __b);
 }
 
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlal_lane_low_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b,
+		     const int __index)
+{
+  __builtin_arm_lane_check (4, __index);
+  return __builtin_neon_vfmal_lane_lowv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlal_lane_high_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b,
+		      const int __index)
+{
+  __builtin_arm_lane_check (4, __index);
+  return __builtin_neon_vfmal_lane_highv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlalq_laneq_low_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b,
+		       const int __index)
+{
+  __builtin_arm_lane_check (8, __index);
+  return __builtin_neon_vfmal_lane_lowv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlalq_lane_low_u32 (float32x4_t __r, float16x8_t __a, float16x4_t __b,
+		      const int __index)
+{
+  __builtin_arm_lane_check (4, __index);
+  return __builtin_neon_vfmal_lane_lowv4hfv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlal_laneq_low_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b,
+		      const int __index)
+{
+  __builtin_arm_lane_check (8, __index);
+  return __builtin_neon_vfmal_lane_lowv8hfv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlalq_laneq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b,
+			const int __index)
+{
+  __builtin_arm_lane_check (8, __index);
+  return __builtin_neon_vfmal_lane_highv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlalq_lane_high_u32 (float32x4_t __r, float16x8_t __a, float16x4_t __b,
+		       const int __index)
+{
+  __builtin_arm_lane_check (4, __index);
+  return __builtin_neon_vfmal_lane_highv4hfv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlal_laneq_high_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b,
+		       const int __index)
+{
+  __builtin_arm_lane_check (8, __index);
+  return __builtin_neon_vfmal_lane_highv8hfv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlsl_lane_low_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b,
+		     const int __index)
+{
+  __builtin_arm_lane_check (4, __index);
+  return __builtin_neon_vfmsl_lane_lowv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlsl_lane_high_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b,
+		      const int __index)
+{
+  __builtin_arm_lane_check (4, __index);
+  return __builtin_neon_vfmsl_lane_highv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlslq_laneq_low_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b,
+		       const int __index)
+{
+  __builtin_arm_lane_check (8, __index);
+  return __builtin_neon_vfmsl_lane_lowv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlslq_lane_low_u32 (float32x4_t __r, float16x8_t __a, float16x4_t __b,
+		      const int __index)
+{
+  __builtin_arm_lane_check (4, __index);
+  return __builtin_neon_vfmsl_lane_lowv4hfv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlsl_laneq_low_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b,
+		      const int __index)
+{
+  __builtin_arm_lane_check (8, __index);
+  return __builtin_neon_vfmsl_lane_lowv8hfv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlslq_laneq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b,
+			const int __index)
+{
+  __builtin_arm_lane_check (8, __index);
+  return __builtin_neon_vfmsl_lane_highv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlslq_lane_high_u32 (float32x4_t __r, float16x8_t __a, float16x4_t __b,
+		       const int __index)
+{
+  __builtin_arm_lane_check (4, __index);
+  return __builtin_neon_vfmsl_lane_highv4hfv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlsl_laneq_high_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b,
+		       const int __index)
+{
+  __builtin_arm_lane_check (8, __index);
+  return __builtin_neon_vfmsl_lane_highv8hfv2sf (__r, __a, __b, __index);
+}
+
 #pragma GCC pop_options
 
 #endif
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def
index 2a165c6..6ec2933 100644
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -55,6 +55,18 @@ VAR2 (TERNOP, vfmal_low, v2sf, v4sf)
 VAR2 (TERNOP, vfmal_high, v2sf, v4sf)
 VAR2 (TERNOP, vfmsl_low, v2sf, v4sf)
 VAR2 (TERNOP, vfmsl_high, v2sf, v4sf)
+VAR2 (MAC_LANE, vfmal_lane_low, v2sf, v4sf)
+VAR1 (MAC_LANE, vfmal_lane_lowv4hf, v4sf)
+VAR1 (MAC_LANE, vfmal_lane_lowv8hf, v2sf)
+VAR2 (MAC_LANE, vfmal_lane_high, v2sf, v4sf)
+VAR1 (MAC_LANE, vfmal_lane_highv4hf, v4sf)
+VAR1 (MAC_LANE, vfmal_lane_highv8hf, v2sf)
+VAR2 (MAC_LANE, vfmsl_lane_low, v2sf, v4sf)
+VAR1 (MAC_LANE, vfmsl_lane_lowv4hf, v4sf)
+VAR1 (MAC_LANE, vfmsl_lane_lowv8hf, v2sf)
+VAR2 (MAC_LANE, vfmsl_lane_high, v2sf, v4sf)
+VAR1 (MAC_LANE, vfmsl_lane_highv4hf, v4sf)
+VAR1 (MAC_LANE, vfmsl_lane_highv8hf, v2sf)
 VAR3 (BINOP, vmullp, v8qi, v4hi, v2si)
 VAR3 (BINOP, vmulls, v8qi, v4hi, v2si)
 VAR3 (BINOP, vmullu, v8qi, v4hi, v2si)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index ea0836b..5772aa9 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -484,6 +484,12 @@
 ;; Mode mapping for VFM[A,S]L instructions for the vec_select result.
 (define_mode_attr VFMLSEL [(V2SF "V2HF") (V4SF "V4HF")])
 
+;; Mode mapping for VFM[A,S]L instructions for some awkward lane-wise forms.
+(define_mode_attr VFMLSEL2 [(V2SF "V8HF") (V4SF "V4HF")])
+
+;; Same as the above, but lowercase.
+(define_mode_attr vfmlsel2 [(V2SF "v8hf") (V4SF "v4hf")])
+
 ;; Similar, for three elements.
 (define_mode_attr V_three_elem [(V8QI "BLK") (V16QI "BLK")
                                 (V4HI "BLK") (V8HI "BLK")
@@ -516,6 +522,10 @@
 ;; Output template to select the low VFP register of a mult-register value.
 (define_mode_attr V_lo [(V2SF "") (V4SF "e")])
 
+;; Helper attribute for printing output templates for awkward forms of
+;; vfmlal/vfmlsl intrinsics.
+(define_mode_attr V_lane_reg [(V2SF "") (V4SF "P")])
+
 ;; Wider modes with the same number of elements.
 (define_mode_attr V_widen [(V8QI "V8HI") (V4HI "V4SI") (V2SI "V2DI")])
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 23679357..59fb643 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -2382,6 +2382,314 @@
   [(set_attr "type" "neon_fp_mla_s<q>")]
 )
 
+(define_expand "neon_vfm<vfml_op>l_lane_<vfml_half><VCVTF:mode>"
+  [(set:VCVTF (match_operand:VCVTF 0 "s_register_operand")
+     (unspec:VCVTF
+	[(match_operand:VCVTF 1 "s_register_operand")
+	   (PLUSMINUS:<VFML>
+	     (match_operand:<VFML> 2 "s_register_operand")
+	     (match_operand:<VFML> 3 "s_register_operand"))
+	   (match_operand:SI 4 "const_int_operand")] VFMLHALVES))]
+  "TARGET_FP16FML"
+{
+  rtx lane = GEN_INT (NEON_ENDIAN_LANE_N (<VFML>mode, INTVAL (operands[4])));
+  rtx half = arm_simd_vect_par_cnst_half (<VFML>mode, <vfml_half_selector>);
+  emit_insn (gen_vfm<vfml_op>l_lane_<vfml_half><VCVTF:mode>_intrinsic
+	       (operands[0], operands[1],
+		operands[2], operands[3],
+		half, lane));
+  DONE;
+})
+
+(define_insn "vfmal_lane_low<mode>_intrinsic"
+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+	(fma:VCVTF
+	 (float_extend:VCVTF
+	  (vec_select:<VFMLSEL>
+	   (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
+	   (match_operand:<VFML> 4 "vect_par_constant_low" "")))
+	 (float_extend:VCVTF
+	   (vec_duplicate:<VFMLSEL>
+	     (vec_select:HF
+	       (match_operand:<VFML> 3 "s_register_operand" "x")
+	       (parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
+	 (match_operand:VCVTF 1 "s_register_operand" "0")))]
+ "TARGET_FP16FML"
+  {
+    int lane = NEON_ENDIAN_LANE_N (<VFML>mode, INTVAL (operands[5]));
+    if (lane > GET_MODE_NUNITS (<VFMLSEL>mode) - 1)
+      {
+	operands[5] = GEN_INT (lane - GET_MODE_NUNITS (<VFMLSEL>mode));
+	return "vfmal.f16\\t%<V_reg>0, %<V_lo>2, %<V_hi>3[%c5]";
+      }
+    else
+      {
+	operands[5] = GEN_INT (lane);
+	return "vfmal.f16\\t%<V_reg>0, %<V_lo>2, %<V_lo>3[%c5]";
+      }
+  }
+ [(set_attr "type" "neon_fp_mla_s<q>")]
+)
+
+(define_expand "neon_vfm<vfml_op>l_lane_<vfml_half><vfmlsel2><VCVTF:mode>"
+  [(set:VCVTF (match_operand:VCVTF 0 "s_register_operand")
+     (unspec:VCVTF
+	[(match_operand:VCVTF 1 "s_register_operand")
+	   (PLUSMINUS:<VFML>
+	     (match_operand:<VFML> 2 "s_register_operand")
+	     (match_operand:<VFMLSEL2> 3 "s_register_operand"))
+	   (match_operand:SI 4 "const_int_operand")] VFMLHALVES))]
+  "TARGET_FP16FML"
+{
+  rtx lane
+    = GEN_INT (NEON_ENDIAN_LANE_N (<VFMLSEL2>mode, INTVAL (operands[4])));
+  rtx half = arm_simd_vect_par_cnst_half (<VFML>mode, <vfml_half_selector>);
+  emit_insn (gen_vfm<vfml_op>l_lane_<vfml_half><vfmlsel2><VCVTF:mode>_intrinsic
+	       (operands[0], operands[1], operands[2], operands[3],
+		half, lane));
+  DONE;
+})
+
+;; Used to implement the intrinsics:
+;; float32x4_t vfmlalq_lane_low_u32 (float32x4_t r, float16x8_t a, float16x4_t b, const int lane)
+;; float32x2_t vfmlal_laneq_low_u32 (float32x2_t r, float16x4_t a, float16x8_t b, const int lane)
+;; Needs a bit of care to get the modes of the different sub-expressions right
+;; due to 'a' and 'b' having different sizes and make sure we use the right
+;; S or D subregister to select the appropriate lane from.
+
+(define_insn "vfmal_lane_low<vfmlsel2><mode>_intrinsic"
+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+	(fma:VCVTF
+	 (float_extend:VCVTF
+	  (vec_select:<VFMLSEL>
+	   (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
+	   (match_operand:<VFML> 4 "vect_par_constant_low" "")))
+	 (float_extend:VCVTF
+	   (vec_duplicate:<VFMLSEL>
+	     (vec_select:HF
+	       (match_operand:<VFMLSEL2> 3 "s_register_operand" "x")
+	       (parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
+	 (match_operand:VCVTF 1 "s_register_operand" "0")))]
+ "TARGET_FP16FML"
+  {
+    int lane = NEON_ENDIAN_LANE_N (<VFMLSEL2>mode, INTVAL (operands[5]));
+    int elts_per_reg = GET_MODE_NUNITS (<VFMLSEL>mode);
+    int new_lane = lane % elts_per_reg;
+    int regdiff = lane / elts_per_reg;
+    operands[5] = GEN_INT (new_lane);
+    /* We re-create operands[2] and operands[3] in the halved VFMLSEL modes
+       because we want the print_operand code to print the appropriate
+       S or D register prefix.  */
+    operands[3] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[3]) + regdiff);
+    operands[2] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[2]));
+    return "vfmal.f16\\t%<V_reg>0, %<V_lane_reg>2, %<V_lane_reg>3[%c5]";
+  }
+ [(set_attr "type" "neon_fp_mla_s<q>")]
+)
+
+;; Used to implement the intrinsics:
+;; float32x4_t vfmlalq_lane_high_u32 (float32x4_t r, float16x8_t a, float16x4_t b, const int lane)
+;; float32x2_t vfmlal_laneq_high_u32 (float32x2_t r, float16x4_t a, float16x8_t b, const int lane)
+;; Needs a bit of care to get the modes of the different sub-expressions right
+;; due to 'a' and 'b' having different sizes and make sure we use the right
+;; S or D subregister to select the appropriate lane from.
+
+(define_insn "vfmal_lane_high<vfmlsel2><mode>_intrinsic"
+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+	(fma:VCVTF
+	 (float_extend:VCVTF
+	  (vec_select:<VFMLSEL>
+	   (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
+	   (match_operand:<VFML> 4 "vect_par_constant_high" "")))
+	 (float_extend:VCVTF
+	   (vec_duplicate:<VFMLSEL>
+	     (vec_select:HF
+	       (match_operand:<VFMLSEL2> 3 "s_register_operand" "x")
+	       (parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
+	 (match_operand:VCVTF 1 "s_register_operand" "0")))]
+ "TARGET_FP16FML"
+  {
+    int lane = NEON_ENDIAN_LANE_N (<VFMLSEL2>mode, INTVAL (operands[5]));
+    int elts_per_reg = GET_MODE_NUNITS (<VFMLSEL>mode);
+    int new_lane = lane % elts_per_reg;
+    int regdiff = lane / elts_per_reg;
+    operands[5] = GEN_INT (new_lane);
+    /* We re-create operands[3] in the halved VFMLSEL mode
+       because we've calculated the correct half-width subreg to extract
+       the lane from and we want to print *that* subreg instead.  */
+    operands[3] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[3]) + regdiff);
+    return "vfmal.f16\\t%<V_reg>0, %<V_hi>2, %<V_lane_reg>3[%c5]";
+  }
+ [(set_attr "type" "neon_fp_mla_s<q>")]
+)
+
+(define_insn "vfmal_lane_high<mode>_intrinsic"
+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+	(fma:VCVTF
+	 (float_extend:VCVTF
+	  (vec_select:<VFMLSEL>
+	   (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
+	   (match_operand:<VFML> 4 "vect_par_constant_high" "")))
+	 (float_extend:VCVTF
+	   (vec_duplicate:<VFMLSEL>
+	     (vec_select:HF
+	       (match_operand:<VFML> 3 "s_register_operand" "x")
+	       (parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
+	 (match_operand:VCVTF 1 "s_register_operand" "0")))]
+ "TARGET_FP16FML"
+  {
+    int lane = NEON_ENDIAN_LANE_N (<VFML>mode, INTVAL (operands[5]));
+    if (lane > GET_MODE_NUNITS (<VFMLSEL>mode) - 1)
+      {
+	operands[5] = GEN_INT (lane - GET_MODE_NUNITS (<VFMLSEL>mode));
+	return "vfmal.f16\\t%<V_reg>0, %<V_hi>2, %<V_hi>3[%c5]";
+      }
+    else
+      {
+	operands[5] = GEN_INT (lane);
+	return "vfmal.f16\\t%<V_reg>0, %<V_hi>2, %<V_lo>3[%c5]";
+      }
+  }
+ [(set_attr "type" "neon_fp_mla_s<q>")]
+)
+
+(define_insn "vfmsl_lane_low<mode>_intrinsic"
+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+	(fma:VCVTF
+	 (float_extend:VCVTF
+	  (neg:<VFMLSEL>
+	   (vec_select:<VFMLSEL>
+	    (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
+	    (match_operand:<VFML> 4 "vect_par_constant_low" ""))))
+	 (float_extend:VCVTF
+	   (vec_duplicate:<VFMLSEL>
+	     (vec_select:HF
+	       (match_operand:<VFML> 3 "s_register_operand" "x")
+	       (parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
+	 (match_operand:VCVTF 1 "s_register_operand" "0")))]
+ "TARGET_FP16FML"
+  {
+    int lane = NEON_ENDIAN_LANE_N (<VFML>mode, INTVAL (operands[5]));
+    if (lane > GET_MODE_NUNITS (<VFMLSEL>mode) - 1)
+      {
+	operands[5] = GEN_INT (lane - GET_MODE_NUNITS (<VFMLSEL>mode));
+	return "vfmsl.f16\\t%<V_reg>0, %<V_lo>2, %<V_hi>3[%c5]";
+      }
+    else
+      {
+	operands[5] = GEN_INT (lane);
+	return "vfmsl.f16\\t%<V_reg>0, %<V_lo>2, %<V_lo>3[%c5]";
+      }
+  }
+ [(set_attr "type" "neon_fp_mla_s<q>")]
+)
+
+;; Used to implement the intrinsics:
+;; float32x4_t vfmlslq_lane_low_u32 (float32x4_t r, float16x8_t a, float16x4_t b, const int lane)
+;; float32x2_t vfmlsl_laneq_low_u32 (float32x2_t r, float16x4_t a, float16x8_t b, const int lane)
+;; Needs a bit of care to get the modes of the different sub-expressions right
+;; due to 'a' and 'b' having different sizes and make sure we use the right
+;; S or D subregister to select the appropriate lane from.
+
+(define_insn "vfmsl_lane_low<vfmlsel2><mode>_intrinsic"
+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+	(fma:VCVTF
+	 (float_extend:VCVTF
+	  (neg:<VFMLSEL>
+	   (vec_select:<VFMLSEL>
+	    (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
+	    (match_operand:<VFML> 4 "vect_par_constant_low" ""))))
+	 (float_extend:VCVTF
+	   (vec_duplicate:<VFMLSEL>
+	     (vec_select:HF
+	       (match_operand:<VFMLSEL2> 3 "s_register_operand" "x")
+	       (parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
+	 (match_operand:VCVTF 1 "s_register_operand" "0")))]
+ "TARGET_FP16FML"
+  {
+    int lane = NEON_ENDIAN_LANE_N (<VFMLSEL2>mode, INTVAL (operands[5]));
+    int elts_per_reg = GET_MODE_NUNITS (<VFMLSEL>mode);
+    int new_lane = lane % elts_per_reg;
+    int regdiff = lane / elts_per_reg;
+    operands[5] = GEN_INT (new_lane);
+    /* We re-create operands[2] and operands[3] in the halved VFMLSEL modes
+       because we want the print_operand code to print the appropriate
+       S or D register prefix.  */
+    operands[3] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[3]) + regdiff);
+    operands[2] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[2]));
+    return "vfmsl.f16\\t%<V_reg>0, %<V_lane_reg>2, %<V_lane_reg>3[%c5]";
+  }
+ [(set_attr "type" "neon_fp_mla_s<q>")]
+)
+
+;; Used to implement the intrinsics:
+;; float32x4_t vfmlslq_lane_high_u32 (float32x4_t r, float16x8_t a, float16x4_t b, const int lane)
+;; float32x2_t vfmlsl_laneq_high_u32 (float32x2_t r, float16x4_t a, float16x8_t b, const int lane)
+;; Needs a bit of care to get the modes of the different sub-expressions right
+;; due to 'a' and 'b' having different sizes and make sure we use the right
+;; S or D subregister to select the appropriate lane from.
+
+(define_insn "vfmsl_lane_high<vfmlsel2><mode>_intrinsic"
+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+	(fma:VCVTF
+	 (float_extend:VCVTF
+	  (neg:<VFMLSEL>
+	   (vec_select:<VFMLSEL>
+	    (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
+	    (match_operand:<VFML> 4 "vect_par_constant_high" ""))))
+	 (float_extend:VCVTF
+	   (vec_duplicate:<VFMLSEL>
+	     (vec_select:HF
+	       (match_operand:<VFMLSEL2> 3 "s_register_operand" "x")
+	       (parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
+	 (match_operand:VCVTF 1 "s_register_operand" "0")))]
+ "TARGET_FP16FML"
+  {
+    int lane = NEON_ENDIAN_LANE_N (<VFMLSEL2>mode, INTVAL (operands[5]));
+    int elts_per_reg = GET_MODE_NUNITS (<VFMLSEL>mode);
+    int new_lane = lane % elts_per_reg;
+    int regdiff = lane / elts_per_reg;
+    operands[5] = GEN_INT (new_lane);
+    /* We re-create operands[3] in the halved VFMLSEL mode
+       because we've calculated the correct half-width subreg to extract
+       the lane from and we want to print *that* subreg instead.  */
+    operands[3] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[3]) + regdiff);
+    return "vfmsl.f16\\t%<V_reg>0, %<V_hi>2, %<V_lane_reg>3[%c5]";
+  }
+ [(set_attr "type" "neon_fp_mla_s<q>")]
+)
+
+(define_insn "vfmsl_lane_high<mode>_intrinsic"
+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+	(fma:VCVTF
+	 (float_extend:VCVTF
+	  (neg:<VFMLSEL>
+	   (vec_select:<VFMLSEL>
+	    (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
+	    (match_operand:<VFML> 4 "vect_par_constant_high" ""))))
+	 (float_extend:VCVTF
+	   (vec_duplicate:<VFMLSEL>
+	     (vec_select:HF
+	       (match_operand:<VFML> 3 "s_register_operand" "x")
+	       (parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
+	 (match_operand:VCVTF 1 "s_register_operand" "0")))]
+ "TARGET_FP16FML"
+  {
+    int lane = NEON_ENDIAN_LANE_N (<VFML>mode, INTVAL (operands[5]));
+    if (lane > GET_MODE_NUNITS (<VFMLSEL>mode) - 1)
+      {
+	operands[5] = GEN_INT (lane - GET_MODE_NUNITS (<VFMLSEL>mode));
+	return "vfmsl.f16\\t%<V_reg>0, %<V_hi>2, %<V_hi>3[%c5]";
+      }
+    else
+      {
+	operands[5] = GEN_INT (lane);
+	return "vfmsl.f16\\t%<V_reg>0, %<V_hi>2, %<V_lo>3[%c5]";
+      }
+  }
+ [(set_attr "type" "neon_fp_mla_s<q>")]
+)
+
 ; Used for intrinsics when flag_unsafe_math_optimizations is false.
 
 (define_insn "neon_vmla<mode>_unspec"
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index f49b068..b3d2fcb 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,5 +1,10 @@
 2018-01-11  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
 
+	* gcc.target/arm/simd/fp16fml_lane_high.c: New test.
+	* gcc.target/arm/simd/fp16fml_lane_low.c: New test.
+
+2018-01-11  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
+
 	* gcc.target/arm/multilib.exp: Add combination tests for fp16fml.
 	* gcc.target/arm/simd/fp16fml_high.c: New test.
 	* gcc.target/arm/simd/fp16fml_low.c: Likewise.
diff --git a/gcc/testsuite/gcc.target/arm/simd/fp16fml_lane_high.c b/gcc/testsuite/gcc.target/arm/simd/fp16fml_lane_high.c
new file mode 100644
index 0000000..67f5fa5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/fp16fml_lane_high.c
@@ -0,0 +1,63 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_fp16fml_neon_ok } */
+/* { dg-add-options arm_fp16fml_neon } */
+
+#include "arm_neon.h"
+
+float32x2_t
+test_vfmlal_lane_high_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+  return vfmlal_lane_high_u32 (r, a, b, 0);
+}
+
+float32x2_t
+test_vfmlsl_lane_high_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+  return vfmlsl_lane_high_u32 (r, a, b, 0);
+}
+
+float32x2_t
+test_vfmlal_laneq_high_u32 (float32x2_t r, float16x4_t a, float16x8_t b)
+{
+  return vfmlal_laneq_high_u32 (r, a, b, 6);
+}
+
+float32x2_t
+test_vfmlsl_laneq_high_u32 (float32x2_t r, float16x4_t a, float16x8_t b)
+{
+  return vfmlsl_laneq_high_u32 (r, a, b, 6);
+}
+
+float32x4_t
+test_vfmlalq_lane_high_u32 (float32x4_t r, float16x8_t a, float16x4_t b)
+{
+  return vfmlalq_lane_high_u32 (r, a, b, 1);
+}
+
+float32x4_t
+test_vfmlslq_lane_high_u32 (float32x4_t r, float16x8_t a, float16x4_t b)
+{
+  return vfmlslq_lane_high_u32 (r, a, b, 1);
+}
+
+float32x4_t
+test_vfmlalq_laneq_high_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+  return vfmlalq_laneq_high_u32 (r, a, b, 7);
+}
+
+float32x4_t
+test_vfmlslq_laneq_high_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+  return vfmlslq_laneq_high_u32 (r, a, b, 7);
+}
+
+/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[13579], s[123]?[02468]\[0\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[13579], s[123]?[13579]\[0\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[13579], d[0-9]+\[1\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[13579], d[123]?[13579]\[3\]} 1 } } */
+
+/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[13579], s[123]?[02468]\[0\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[13579], s[123]?[13579]\[0\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[13579], d[0-9]+\[1\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[13579], d[123]?[13579]\[3\]} 1 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/fp16fml_lane_low.c b/gcc/testsuite/gcc.target/arm/simd/fp16fml_lane_low.c
new file mode 100644
index 0000000..585f775
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/fp16fml_lane_low.c
@@ -0,0 +1,63 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_fp16fml_neon_ok } */
+/* { dg-add-options arm_fp16fml_neon } */
+
+#include "arm_neon.h"
+
+float32x2_t
+test_vfmlal_lane_low_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+  return vfmlal_lane_low_u32 (r, a, b, 0);
+}
+
+float32x2_t
+test_vfmlsl_lane_low_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+  return vfmlsl_lane_low_u32 (r, a, b, 0);
+}
+
+float32x2_t
+test_vfmlal_laneq_low_u32 (float32x2_t r, float16x4_t a, float16x8_t b)
+{
+  return vfmlal_laneq_low_u32 (r, a, b, 6);
+}
+
+float32x2_t
+test_vfmlsl_laneq_low_u32 (float32x2_t r, float16x4_t a, float16x8_t b)
+{
+  return vfmlsl_laneq_low_u32 (r, a, b, 6);
+}
+
+float32x4_t
+test_vfmlalq_lane_low_u32 (float32x4_t r, float16x8_t a, float16x4_t b)
+{
+  return vfmlalq_lane_low_u32 (r, a, b, 1);
+}
+
+float32x4_t
+test_vfmlslq_lane_low_u32 (float32x4_t r, float16x8_t a, float16x4_t b)
+{
+  return vfmlslq_lane_low_u32 (r, a, b, 1);
+}
+
+float32x4_t
+test_vfmlalq_laneq_low_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+  return vfmlalq_laneq_low_u32 (r, a, b, 7);
+}
+
+float32x4_t
+test_vfmlslq_laneq_low_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+  return vfmlslq_laneq_low_u32 (r, a, b, 7);
+}
+
+/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[02468], s[123]?[02468]\[0\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[02468], s[123]?[13579]\[0\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[02468], d[0-9]+\[1\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[02468], d[123]?[13579]\[3\]} 1 } } */
+
+/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[02468], s[123]?[02468]\[0\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[02468], s[123]?[13579]\[0\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[02468], d[0-9]+\[1\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[02468], d[123]?[13579]\[3\]} 1 } } */
-- 
2.7.4
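
For reviewers, here is a small standalone model (mine, not part of the patch)
of the lane arithmetic the lane-translating define_insns above perform; it
deliberately ignores the big-endian renumbering done by NEON_ENDIAN_LANE_N:

    #include <stdio.h>

    /* Split a lane index into a wide (D or Q) register into a sub-register
       offset and a residual lane.  'elts_per_reg' is the number of f16
       elements the instruction form can index: 2 for the S-register form,
       4 for the D-register form.  */
    static void
    split_lane (int lane, int elts_per_reg, int *regdiff, int *new_lane)
    {
      *regdiff = lane / elts_per_reg;   /* Which S/D sub-register.  */
      *new_lane = lane % elts_per_reg;  /* Lane within that sub-register.  */
    }

    int
    main (void)
    {
      int regdiff, new_lane;

      /* vfmlal_laneq_low_u32 (r, a, b, 6): 'b' in a Q register, say q1
         (s4-s7).  Lane 6 -> sub-register s4 + 3 = s7, lane 0.  */
      split_lane (6, 2, &regdiff, &new_lane);
      printf ("laneq lane 6 -> subreg +%d, lane %d\n", regdiff, new_lane);

      /* vfmlalq_lane_high_u32 (r, a, b, 1): 'b' in a D register.
         Lane 1 -> the same D register, lane 1.  */
      split_lane (1, 4, &regdiff, &new_lane);
      printf ("lane 1 -> subreg +%d, lane %d\n", regdiff, new_lane);
      return 0;
    }

This is exactly the new_lane/regdiff computation in the
vfm{a,s}l_lane_{low,high} "awkward form" insns, and the even/odd register
classes in the scan-assembler patterns above follow from it.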