From 6abd428605e3a279e533fde1cecbc9735ce03b66 Mon Sep 17 00:00:00 2001
From: Richard Sandiford <richard.sandiford@arm.com>
Date: Fri, 25 Sep 2020 12:45:25 +0100
Subject: [PATCH] arm: Fix fp16 move patterns for base MVE

This patch fixes ICEs in gcc.dg/torture/float16-basic.c for
-march=armv8.1-m.main+mve -mfloat-abi=hard.  The problem was that an
fp16 argument was (rightly) being passed in FPRs, but the fp16 move
patterns only handled GPRs.  LRA then cycled trying to find a way of
handling the FPR.

It looks like there are three related problems here:

(1) We're using the wrong fp16 move pattern for base MVE.
    *mov_vfp_16 (the pattern we use for +mve.fp) works for base
    MVE too.

(2) The fp16 MVE load and store patterns are separate from the main
    move patterns.  The loads and stores should instead be
    alternatives of the main move patterns, so that LRA knows what
    to do with pseudo registers that become stack slots.

(3) The range restrictions for the loads and stores were wrong for
    fp16: we were enforcing a multiple of 4 in [-255*4, 255*4]
    instead of a multiple of 2 in [-255*2, 255*2].

(2) came from a patch to prevent writeback being used for MVE.  That
patch also added a Uj constraint to enforce the correct memory types
for MVE.  I think the simplest fix is therefore to merge the loads
and stores back into the main pattern and extend the Uj constraint
so that it acts like Um for non-MVE.

The testcase for that patch was mve-vldstr16-no-writeback.c, whose
main function is:

void
fn1 (__fp16 *pSrc)
{
  __fp16 high;
  __fp16 *pDst = 0;
  unsigned i;
  for (i = 0;; i++)
    if (pSrc[i])
      pDst[i] = high;
}

Fixing (2) causes the store part to fail, not because we're using
writeback, but because we decide to use GPRs to store "high" (which
is uninitialised, and so gets replaced with zero).  This patch
therefore adds some scan-assembler-nots instead.  (I wondered about
changing the testcase to initialise "high", but that seemed like a
bad idea for a regression test.)

For (3): MVE seems to be the only thing to use
arm_coproc_mem_operand_wb (and its various interfaces) for 16-bit
scalars: the Neon patterns only use it for 32-bit scalars.

I've added new tests to try the various FPR alternatives of the move
patterns.  The range of offsets that GCC uses for FPR loads and
stores is the intersection of the range allowed for GPRs and FPRs,
so the tests include GPR<->memory tests as well.  The fp32 and fp64
tests already pass; they're just there for completeness.

gcc/
	* config/arm/arm-protos.h (arm_mve_mode_and_operands_type_check):
	Delete.
	* config/arm/arm.c (arm_coproc_mem_operand_wb): Use a scale factor
	of 2 rather than 4 for 16-bit modes.
	(arm_mve_mode_and_operands_type_check): Delete.
	* config/arm/constraints.md (Uj): Allow writeback for Neon,
	but continue to disallow it for MVE.
	* config/arm/arm.md (*arm32_mov): Add !TARGET_HAVE_MVE.
	* config/arm/vfp.md (*mov_load_vfp_hf16, *mov_store_vfp_hf16): Fold
	back into...
	(*mov_vfp_16): ...here but use Uj for the FPR memory constraints.
	Use for base MVE too.

gcc/testsuite/
	* gcc.target/arm/mve/intrinsics/mve-vldstr16-no-writeback.c: Allow
	the store to use GPRs instead of FPRs.  Add scan-assembler-nots
	for writeback.
	* gcc.target/arm/armv8_1m-fp16-move-1.c: New test.
	* gcc.target/arm/armv8_1m-fp32-move-1.c: Likewise.
	* gcc.target/arm/armv8_1m-fp64-move-1.c: Likewise.
---
 gcc/config/arm/arm-protos.h                        |   1 -
 gcc/config/arm/arm.c                               |  25 +-
 gcc/config/arm/arm.md                              |   4 +-
 gcc/config/arm/constraints.md                      |   9 +-
 gcc/config/arm/vfp.md                              |  32 +-
 .../gcc.target/arm/armv8_1m-fp16-move-1.c          | 418 ++++++++++++++++++++
 .../gcc.target/arm/armv8_1m-fp32-move-1.c          | 420 ++++++++++++++++++++
 .../gcc.target/arm/armv8_1m-fp64-move-1.c          | 426 +++++++++++++++++++++
 .../arm/mve/intrinsics/mve-vldstr16-no-writeback.c |   5 +-
 9 files changed, 1295 insertions(+), 45 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/arm/armv8_1m-fp16-move-1.c
 create mode 100644 gcc/testsuite/gcc.target/arm/armv8_1m-fp32-move-1.c
 create mode 100644 gcc/testsuite/gcc.target/arm/armv8_1m-fp64-move-1.c

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 0cc0ae7..9bb9c61 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -120,7 +120,6 @@
 extern int arm_coproc_mem_operand_no_writeback (rtx);
 extern int arm_coproc_mem_operand_wb (rtx, int);
 extern int neon_vector_mem_operand (rtx, int, bool);
 extern int mve_vector_mem_operand (machine_mode, rtx, bool);
-bool arm_mve_mode_and_operands_type_check (machine_mode, rtx, rtx);
 extern int neon_struct_mem_operand (rtx);
 extern rtx *neon_vcmla_lane_prepare_operands (rtx *);
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 022ef6c..8105b39 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -13277,14 +13277,18 @@ arm_coproc_mem_operand_wb (rtx op, int wb_level)
 
   /* Match:
      (plus (reg)
-	   (const)).  */
+	   (const))
+
+     The encoded immediate for 16-bit modes is multiplied by 2,
+     while the encoded immediate for 32-bit and 64-bit modes is
+     multiplied by 4.  */
+  int factor = MIN (GET_MODE_SIZE (GET_MODE (op)), 4);
   if (GET_CODE (ind) == PLUS
       && REG_P (XEXP (ind, 0))
      && REG_MODE_OK_FOR_BASE_P (XEXP (ind, 0), VOIDmode)
      && CONST_INT_P (XEXP (ind, 1))
-      && INTVAL (XEXP (ind, 1)) > -1024
-      && INTVAL (XEXP (ind, 1)) < 1024
-      && (INTVAL (XEXP (ind, 1)) & 3) == 0)
+      && IN_RANGE (INTVAL (XEXP (ind, 1)), -255 * factor, 255 * factor)
+      && (INTVAL (XEXP (ind, 1)) & (factor - 1)) == 0)
     return TRUE;
 
   return FALSE;
@@ -33578,17 +33582,4 @@ arm_mode_base_reg_class (machine_mode mode)
 
 struct gcc_target targetm = TARGET_INITIALIZER;
 
-bool
-arm_mve_mode_and_operands_type_check (machine_mode mode, rtx op0, rtx op1)
-{
-  if (!(TARGET_HAVE_MVE || TARGET_HAVE_MVE_FLOAT))
-    return true;
-  else if (mode == E_BFmode)
-    return false;
-  else if ((s_register_operand (op0, mode) && MEM_P (op1))
-	   || (s_register_operand (op1, mode) && MEM_P (op0)))
-    return false;
-  return true;
-}
-
 #include "gt-arm.h"
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index c4fa116..147c4a5 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -7289,7 +7289,9 @@
 (define_insn "*arm32_mov"
   [(set (match_operand:HFBF 0 "nonimmediate_operand" "=r,m,r,r")
	 (match_operand:HFBF 1 "general_operand"	 " m,r,r,F"))]
-  "TARGET_32BIT && !TARGET_HARD_FLOAT
+  "TARGET_32BIT
+   && !TARGET_HARD_FLOAT
+   && !TARGET_HAVE_MVE
    && (	  s_register_operand (operands[0], <MODE>mode)
        || s_register_operand (operands[1], <MODE>mode))"
   "*
diff --git a/gcc/config/arm/constraints.md b/gcc/config/arm/constraints.md
index ff229aa..789e333 100644
--- a/gcc/config/arm/constraints.md
+++ b/gcc/config/arm/constraints.md
@@ -454,10 +454,13 @@
 
 (define_memory_constraint "Uj"
  "@internal
-  In ARM/Thumb-2 state an VFP load/store address which does not support
-  writeback at all (eg vldr.16)."
+  In ARM/Thumb-2 state a VFP load/store address that supports writeback
+  for Neon but not for MVE"
 (and (match_code "mem")
-      (match_test "TARGET_32BIT && arm_coproc_mem_operand_no_writeback (op)")))
+      (match_test "TARGET_32BIT")
+      (match_test "TARGET_HAVE_MVE
+		   ? arm_coproc_mem_operand_no_writeback (op)
+		   : neon_vector_mem_operand (op, 2, true)")))
 
 (define_memory_constraint "Uy"
  "@internal
diff --git a/gcc/config/arm/vfp.md b/gcc/config/arm/vfp.md
index 6a2bc5a..72707c1 100644
--- a/gcc/config/arm/vfp.md
+++ b/gcc/config/arm/vfp.md
@@ -387,31 +387,15 @@
    (set_attr "arch" "t2,any,any,any,a,t2,any,any,any,any,any,any")]
 )
 
-(define_insn "*mov_load_vfp_hf16"
-  [(set (match_operand:HF 0 "s_register_operand" "=t")
-	(match_operand:HF 1 "memory_operand" "Uj"))]
-  "TARGET_HAVE_MVE_FLOAT"
-  "vldr.16\\t%0, %E1"
-)
-
-(define_insn "*mov_store_vfp_hf16"
-  [(set (match_operand:HF 0 "memory_operand" "=Uj")
-	(match_operand:HF 1 "s_register_operand" "t"))]
-  "TARGET_HAVE_MVE_FLOAT"
-  "vstr.16\\t%1, %E0"
-)
-
 ;; HFmode and BFmode moves
 (define_insn "*mov_vfp_16"
   [(set (match_operand:HFBF 0 "nonimmediate_operand"
-			  "= ?r,?m,t,r,t,r,t, t, Um,r")
+			  "= ?r,?m,t,r,t,r,t, t, Uj,r")
	(match_operand:HFBF 1 "general_operand"
-			  "  m,r,t,r,r,t,Dv,Um,t, F"))]
+			  "  m,r,t,r,r,t,Dv,Uj,t, F"))]
   "TARGET_32BIT
-   && TARGET_VFP_FP16INST
-   && arm_mve_mode_and_operands_type_check (<MODE>mode, operands[0],
-					    operands[1])
+   && (TARGET_VFP_FP16INST || TARGET_HAVE_MVE)
    && (s_register_operand (operands[0], <MODE>mode)
        || s_register_operand (operands[1], <MODE>mode))"
 {
@@ -430,9 +414,15 @@
     case 6: /* S register from immediate.  */
       return \"vmov.f16\\t%0, %1\t%@ __\";
     case 7: /* S register from memory.  */
-      return \"vld1.16\\t{%z0}, %A1\";
+      if (TARGET_HAVE_MVE)
+	return \"vldr.16\\t%0, %1\";
+      else
+	return \"vld1.16\\t{%z0}, %A1\";
     case 8: /* Memory from S register.  */
-      return \"vst1.16\\t{%z1}, %A0\";
+      if (TARGET_HAVE_MVE)
+	return \"vstr.16\\t%1, %0\";
+      else
+	return \"vst1.16\\t{%z1}, %A0\";
     case 9: /* ARM register from constant.
*/ { long bits; diff --git a/gcc/testsuite/gcc.target/arm/armv8_1m-fp16-move-1.c b/gcc/testsuite/gcc.target/arm/armv8_1m-fp16-move-1.c new file mode 100644 index 0000000..67a9f41 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/armv8_1m-fp16-move-1.c @@ -0,0 +1,418 @@ +/* { dg-do compile } */ +/* { dg-options "-O -mfloat-abi=hard -mfp16-format=ieee" } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-add-options arm_v8_1m_mve } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +/* +** r_w: +** vmov.f16 r0, s0 @ __fp16 +** bx lr +*/ +void +r_w (_Float16 s0) +{ + register _Float16 r0 asm ("r0"); + r0 = s0; + asm volatile ("" :: "r" (r0)); +} + +/* +** w_r: +** vmov.f16 s0, r0 @ __fp16 +** bx lr +*/ +_Float16 +w_r () +{ + register _Float16 r0 asm ("r0"); + asm volatile ("" : "=r" (r0)); + return r0; +} + +/* +** w_w: +** vmov s1, s0 @ __fp16 +** bx lr +*/ +void +w_w (_Float16 s0) +{ + register _Float16 s1 asm ("s1"); + s1 = s0; + asm volatile ("" :: "w" (s1)); +} + +/* +** r_m_m128: +** sub (r[0-9]+), r0, #256 +** ldrh r1, \[\1\] @ __fp16 +** bx lr +*/ +void +r_m_m128 (_Float16 *r0) +{ + register _Float16 r1 asm ("r1"); + r1 = r0[-128]; + asm volatile ("" :: "r" (r1)); +} + +/* +** r_m_m127: +** ldrh r1, \[r0, #-254\] @ __fp16 +** bx lr +*/ +void +r_m_m127 (_Float16 *r0) +{ + register _Float16 r1 asm ("r1"); + r1 = r0[-127]; + asm volatile ("" :: "r" (r1)); +} + +/* +** r_m_m1: +** ldrh r1, \[r0, #-2\] @ __fp16 +** bx lr +*/ +void +r_m_m1 (_Float16 *r0) +{ + register _Float16 r1 asm ("r1"); + r1 = r0[-1]; + asm volatile ("" :: "r" (r1)); +} + +/* +** r_m_0: +** ldrh r1, \[r0\] @ __fp16 +** bx lr +*/ +void +r_m_0 (_Float16 *r0) +{ + register _Float16 r1 asm ("r1"); + r1 = r0[0]; + asm volatile ("" :: "r" (r1)); +} + +/* +** r_m_1: +** ldrh r1, \[r0, #2\] @ __fp16 +** bx lr +*/ +void +r_m_1 (_Float16 *r0) +{ + register _Float16 r1 asm ("r1"); + r1 = r0[1]; + asm volatile ("" :: "r" (r1)); +} + +/* +** r_m_255: +** ldrh r1, \[r0, #510\] @ __fp16 +** bx lr +*/ +void +r_m_255 (_Float16 *r0) +{ + register _Float16 r1 asm ("r1"); + r1 = r0[255]; + asm volatile ("" :: "r" (r1)); +} + +/* +** r_m_256: +** ldrh r1, \[r0, #512\] @ __fp16 +** bx lr +*/ +void +r_m_256 (_Float16 *r0) +{ + register _Float16 r1 asm ("r1"); + r1 = r0[256]; + asm volatile ("" :: "r" (r1)); +} + +/* ??? This could be done in one instruction, but without mve.fp, + it makes more sense for memory_operand to enforce the GPR range. 
*/ +/* +** w_m_m128: +** sub (r[0-9]+), r0, #256 +** vldr.16 s0, \[\1\] +** bx lr +*/ +void +w_m_m128 (_Float16 *r0) +{ + register _Float16 s0 asm ("s0"); + s0 = r0[-128]; + asm volatile ("" :: "w" (s0)); +} + +/* +** w_m_m127: +** vldr.16 s0, \[r0, #-254\] +** bx lr +*/ +void +w_m_m127 (_Float16 *r0) +{ + register _Float16 s0 asm ("s0"); + s0 = r0[-127]; + asm volatile ("" :: "w" (s0)); +} + +/* +** w_m_m1: +** vldr.16 s0, \[r0, #-2\] +** bx lr +*/ +void +w_m_m1 (_Float16 *r0) +{ + register _Float16 s0 asm ("s0"); + s0 = r0[-1]; + asm volatile ("" :: "w" (s0)); +} + +/* +** w_m_0: +** vldr.16 s0, \[r0\] +** bx lr +*/ +void +w_m_0 (_Float16 *r0) +{ + register _Float16 s0 asm ("s0"); + s0 = r0[0]; + asm volatile ("" :: "w" (s0)); +} + +/* +** w_m_1: +** vldr.16 s0, \[r0, #2\] +** bx lr +*/ +void +w_m_1 (_Float16 *r0) +{ + register _Float16 s0 asm ("s0"); + s0 = r0[1]; + asm volatile ("" :: "w" (s0)); +} + +/* +** w_m_255: +** vldr.16 s0, \[r0, #510\] +** bx lr +*/ +void +w_m_255 (_Float16 *r0) +{ + register _Float16 s0 asm ("s0"); + s0 = r0[255]; + asm volatile ("" :: "w" (s0)); +} + +/* +** w_m_256: +** add (r[0-9]+), r0, #512 +** vldr.16 s0, \[\1\] +** bx lr +*/ +void +w_m_256 (_Float16 *r0) +{ + register _Float16 s0 asm ("s0"); + s0 = r0[256]; + asm volatile ("" :: "w" (s0)); +} + +/* +** m_m128_r: +** sub (r[0-9]+), r0, #256 +** strh r1, \[\1\] @ __fp16 +** bx lr +*/ +void +m_m128_r (_Float16 *r0) +{ + register _Float16 r1 asm ("r1"); + asm volatile ("" : "=r" (r1)); + r0[-128] = r1; +} + +/* +** m_m127_r: +** strh r1, \[r0, #-254\] @ __fp16 +** bx lr +*/ +void +m_m127_r (_Float16 *r0) +{ + register _Float16 r1 asm ("r1"); + asm volatile ("" : "=r" (r1)); + r0[-127] = r1; +} + +/* +** m_m1_r: +** strh r1, \[r0, #-2\] @ __fp16 +** bx lr +*/ +void +m_m1_r (_Float16 *r0) +{ + register _Float16 r1 asm ("r1"); + asm volatile ("" : "=r" (r1)); + r0[-1] = r1; +} + +/* +** m_0_r: +** strh r1, \[r0\] @ __fp16 +** bx lr +*/ +void +m_0_r (_Float16 *r0) +{ + register _Float16 r1 asm ("r1"); + asm volatile ("" : "=r" (r1)); + r0[0] = r1; +} + +/* +** m_1_r: +** strh r1, \[r0, #2\] @ __fp16 +** bx lr +*/ +void +m_1_r (_Float16 *r0) +{ + register _Float16 r1 asm ("r1"); + asm volatile ("" : "=r" (r1)); + r0[1] = r1; +} + +/* +** m_255_r: +** strh r1, \[r0, #510\] @ __fp16 +** bx lr +*/ +void +m_255_r (_Float16 *r0) +{ + register _Float16 r1 asm ("r1"); + asm volatile ("" : "=r" (r1)); + r0[255] = r1; +} + +/* +** m_256_r: +** strh r1, \[r0, #512\] @ __fp16 +** bx lr +*/ +void +m_256_r (_Float16 *r0) +{ + register _Float16 r1 asm ("r1"); + asm volatile ("" : "=r" (r1)); + r0[256] = r1; +} + +/* ??? This could be done in one instruction, but without mve.fp, + it makes more sense for memory_operand to enforce the GPR range. 
*/ +/* +** m_m128_w: +** sub (r[0-9]+), r0, #256 +** vstr.16 s0, \[\1\] +** bx lr +*/ +void +m_m128_w (_Float16 *r0) +{ + register _Float16 s0 asm ("s0"); + asm volatile ("" : "=w" (s0)); + r0[-128] = s0; +} + +/* +** m_m127_w: +** vstr.16 s0, \[r0, #-254\] +** bx lr +*/ +void +m_m127_w (_Float16 *r0) +{ + register _Float16 s0 asm ("s0"); + asm volatile ("" : "=w" (s0)); + r0[-127] = s0; +} + +/* +** m_m1_w: +** vstr.16 s0, \[r0, #-2\] +** bx lr +*/ +void +m_m1_w (_Float16 *r0) +{ + register _Float16 s0 asm ("s0"); + asm volatile ("" : "=w" (s0)); + r0[-1] = s0; +} + +/* +** m_0_w: +** vstr.16 s0, \[r0\] +** bx lr +*/ +void +m_0_w (_Float16 *r0) +{ + register _Float16 s0 asm ("s0"); + asm volatile ("" : "=w" (s0)); + r0[0] = s0; +} + +/* +** m_1_w: +** vstr.16 s0, \[r0, #2\] +** bx lr +*/ +void +m_1_w (_Float16 *r0) +{ + register _Float16 s0 asm ("s0"); + asm volatile ("" : "=w" (s0)); + r0[1] = s0; +} + +/* +** m_255_w: +** vstr.16 s0, \[r0, #510\] +** bx lr +*/ +void +m_255_w (_Float16 *r0) +{ + register _Float16 s0 asm ("s0"); + asm volatile ("" : "=w" (s0)); + r0[255] = s0; +} + +/* +** m_256_w: +** add (r[0-9]+), r0, #512 +** vstr.16 s0, \[\1\] +** bx lr +*/ +void +m_256_w (_Float16 *r0) +{ + register _Float16 s0 asm ("s0"); + asm volatile ("" : "=w" (s0)); + r0[256] = s0; +} diff --git a/gcc/testsuite/gcc.target/arm/armv8_1m-fp32-move-1.c b/gcc/testsuite/gcc.target/arm/armv8_1m-fp32-move-1.c new file mode 100644 index 0000000..1ecb839 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/armv8_1m-fp32-move-1.c @@ -0,0 +1,420 @@ +/* { dg-do compile } */ +/* { dg-options "-O -mfloat-abi=hard" } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-add-options arm_v8_1m_mve } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +/* +** r_w: +** vmov r0, s0 +** bx lr +*/ +void +r_w (float s0) +{ + register float r0 asm ("r0"); + r0 = s0; + asm volatile ("" :: "r" (r0)); +} + +/* +** w_r: +** vmov s0, r0 +** bx lr +*/ +float +w_r () +{ + register float r0 asm ("r0"); + asm volatile ("" : "=r" (r0)); + return r0; +} + +/* +** w_w: +** vmov.f32 s1, s0 +** bx lr +*/ +void +w_w (float s0) +{ + register float s1 asm ("s1"); + s1 = s0; + asm volatile ("" :: "w" (s1)); +} + +/* +** r_m_m64: +** sub (r[0-9]+), r0, #256 +** ldr r1, \[\1\] @ float +** bx lr +*/ +void +r_m_m64 (float *r0) +{ + register float r1 asm ("r1"); + r1 = r0[-64]; + asm volatile ("" :: "r" (r1)); +} + +/* +** r_m_m63: +** ldr r1, \[r0, #-252\] @ float +** bx lr +*/ +void +r_m_m63 (float *r0) +{ + register float r1 asm ("r1"); + r1 = r0[-63]; + asm volatile ("" :: "r" (r1)); +} + +/* +** r_m_m1: +** ldr r1, \[r0, #-4\] @ float +** bx lr +*/ +void +r_m_m1 (float *r0) +{ + register float r1 asm ("r1"); + r1 = r0[-1]; + asm volatile ("" :: "r" (r1)); +} + +/* +** r_m_0: +** ldr r1, \[r0\] @ float +** bx lr +*/ +void +r_m_0 (float *r0) +{ + register float r1 asm ("r1"); + r1 = r0[0]; + asm volatile ("" :: "r" (r1)); +} + +/* +** r_m_1: +** ldr r1, \[r0, #4\] @ float +** bx lr +*/ +void +r_m_1 (float *r0) +{ + register float r1 asm ("r1"); + r1 = r0[1]; + asm volatile ("" :: "r" (r1)); +} + +/* +** r_m_255: +** ldr r1, \[r0, #1020\] @ float +** bx lr +*/ +void +r_m_255 (float *r0) +{ + register float r1 asm ("r1"); + r1 = r0[255]; + asm volatile ("" :: "r" (r1)); +} + +/* +** r_m_256: +** add (r[0-9]+), r0, #1024 +** ldr r1, \[r0\] @ float +** bx lr +*/ +void +r_m_256 (float *r0) +{ + register float r1 asm ("r1"); + r1 = r0[256]; + asm volatile ("" :: "r" (r1)); +} + +/* ??? 
This could be done in one instruction, but without mve.fp, + it makes more sense for memory_operand to enforce the GPR range. */ +/* +** w_m_m64: +** sub (r[0-9]+), r0, #256 +** vldr.32 s0, \[\1\] +** bx lr +*/ +void +w_m_m64 (float *r0) +{ + register float s0 asm ("s0"); + s0 = r0[-64]; + asm volatile ("" :: "w" (s0)); +} + +/* +** w_m_m63: +** vldr.32 s0, \[r0, #-252\] +** bx lr +*/ +void +w_m_m63 (float *r0) +{ + register float s0 asm ("s0"); + s0 = r0[-63]; + asm volatile ("" :: "w" (s0)); +} + +/* +** w_m_m1: +** vldr.32 s0, \[r0, #-4\] +** bx lr +*/ +void +w_m_m1 (float *r0) +{ + register float s0 asm ("s0"); + s0 = r0[-1]; + asm volatile ("" :: "w" (s0)); +} + +/* +** w_m_0: +** vldr.32 s0, \[r0\] +** bx lr +*/ +void +w_m_0 (float *r0) +{ + register float s0 asm ("s0"); + s0 = r0[0]; + asm volatile ("" :: "w" (s0)); +} + +/* +** w_m_1: +** vldr.32 s0, \[r0, #4\] +** bx lr +*/ +void +w_m_1 (float *r0) +{ + register float s0 asm ("s0"); + s0 = r0[1]; + asm volatile ("" :: "w" (s0)); +} + +/* +** w_m_255: +** vldr.32 s0, \[r0, #1020\] +** bx lr +*/ +void +w_m_255 (float *r0) +{ + register float s0 asm ("s0"); + s0 = r0[255]; + asm volatile ("" :: "w" (s0)); +} + +/* +** w_m_256: +** add (r[0-9]+), r0, #1024 +** vldr.32 s0, \[\1\] +** bx lr +*/ +void +w_m_256 (float *r0) +{ + register float s0 asm ("s0"); + s0 = r0[256]; + asm volatile ("" :: "w" (s0)); +} + +/* +** m_m64_r: +** sub (r[0-9]+), r0, #256 +** str r1, \[\1\] @ float +** bx lr +*/ +void +m_m64_r (float *r0) +{ + register float r1 asm ("r1"); + asm volatile ("" : "=r" (r1)); + r0[-64] = r1; +} + +/* +** m_m63_r: +** str r1, \[r0, #-252\] @ float +** bx lr +*/ +void +m_m63_r (float *r0) +{ + register float r1 asm ("r1"); + asm volatile ("" : "=r" (r1)); + r0[-63] = r1; +} + +/* +** m_m1_r: +** str r1, \[r0, #-4\] @ float +** bx lr +*/ +void +m_m1_r (float *r0) +{ + register float r1 asm ("r1"); + asm volatile ("" : "=r" (r1)); + r0[-1] = r1; +} + +/* +** m_0_r: +** str r1, \[r0\] @ float +** bx lr +*/ +void +m_0_r (float *r0) +{ + register float r1 asm ("r1"); + asm volatile ("" : "=r" (r1)); + r0[0] = r1; +} + +/* +** m_1_r: +** str r1, \[r0, #4\] @ float +** bx lr +*/ +void +m_1_r (float *r0) +{ + register float r1 asm ("r1"); + asm volatile ("" : "=r" (r1)); + r0[1] = r1; +} + +/* +** m_255_r: +** str r1, \[r0, #1020\] @ float +** bx lr +*/ +void +m_255_r (float *r0) +{ + register float r1 asm ("r1"); + asm volatile ("" : "=r" (r1)); + r0[255] = r1; +} + +/* +** m_256_r: +** add (r[0-9]+), r0, #1024 +** str r1, \[r0\] @ float +** bx lr +*/ +void +m_256_r (float *r0) +{ + register float r1 asm ("r1"); + asm volatile ("" : "=r" (r1)); + r0[256] = r1; +} + +/* ??? This could be done in one instruction, but without mve.fp, + it makes more sense for memory_operand to enforce the GPR range. 
*/ +/* +** m_m64_w: +** sub (r[0-9]+), r0, #256 +** vstr.32 s0, \[\1\] +** bx lr +*/ +void +m_m64_w (float *r0) +{ + register float s0 asm ("s0"); + asm volatile ("" : "=w" (s0)); + r0[-64] = s0; +} + +/* +** m_m63_w: +** vstr.32 s0, \[r0, #-252\] +** bx lr +*/ +void +m_m63_w (float *r0) +{ + register float s0 asm ("s0"); + asm volatile ("" : "=w" (s0)); + r0[-63] = s0; +} + +/* +** m_m1_w: +** vstr.32 s0, \[r0, #-4\] +** bx lr +*/ +void +m_m1_w (float *r0) +{ + register float s0 asm ("s0"); + asm volatile ("" : "=w" (s0)); + r0[-1] = s0; +} + +/* +** m_0_w: +** vstr.32 s0, \[r0\] +** bx lr +*/ +void +m_0_w (float *r0) +{ + register float s0 asm ("s0"); + asm volatile ("" : "=w" (s0)); + r0[0] = s0; +} + +/* +** m_1_w: +** vstr.32 s0, \[r0, #4\] +** bx lr +*/ +void +m_1_w (float *r0) +{ + register float s0 asm ("s0"); + asm volatile ("" : "=w" (s0)); + r0[1] = s0; +} + +/* +** m_255_w: +** vstr.32 s0, \[r0, #1020\] +** bx lr +*/ +void +m_255_w (float *r0) +{ + register float s0 asm ("s0"); + asm volatile ("" : "=w" (s0)); + r0[255] = s0; +} + +/* +** m_256_w: +** add (r[0-9]+), r0, #1024 +** vstr.32 s0, \[\1\] +** bx lr +*/ +void +m_256_w (float *r0) +{ + register float s0 asm ("s0"); + asm volatile ("" : "=w" (s0)); + r0[256] = s0; +} diff --git a/gcc/testsuite/gcc.target/arm/armv8_1m-fp64-move-1.c b/gcc/testsuite/gcc.target/arm/armv8_1m-fp64-move-1.c new file mode 100644 index 0000000..3f81350 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/armv8_1m-fp64-move-1.c @@ -0,0 +1,426 @@ +/* { dg-do compile } */ +/* { dg-options "-O -mfloat-abi=hard" } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-add-options arm_v8_1m_mve } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +/* +** r_w: +** vmov r0, r1, d0 +** bx lr +*/ +void +r_w (double d0) +{ + register double r0 asm ("r0"); + r0 = d0; + asm volatile ("" :: "r" (r0)); +} + +/* +** w_r: +** vmov d0, r0, r1 +** bx lr +*/ +double +w_r () +{ + register double r0 asm ("r0"); + asm volatile ("" : "=r" (r0)); + return r0; +} + +/* +** w_w: +** ( +** vmov.f32 s2, s0 +** vmov.f32 s3, s1 +** | +** vmov.f32 s3, s1 +** vmov.f32 s2, s0 +** ) +** bx lr +*/ +void +w_w (double d0) +{ + register double d1 asm ("d1"); + d1 = d0; + asm volatile ("" :: "w" (d1)); +} + +/* +** r_m_m32: +** sub (r[0-9]+), r0, #256 +** ldrd r2, \[\1\] +** bx lr +*/ +void +r_m_m32 (double *r0) +{ + register double r2 asm ("r2"); + r2 = r0[-32]; + asm volatile ("" :: "r" (r2)); +} + +/* +** r_m_m31: +** ldrd r2, \[r0, #-248\] +** bx lr +*/ +void +r_m_m31 (double *r0) +{ + register double r2 asm ("r2"); + r2 = r0[-31]; + asm volatile ("" :: "r" (r2)); +} + +/* +** r_m_m1: +** ldrd r2, \[r0, #-8\] +** bx lr +*/ +void +r_m_m1 (double *r0) +{ + register double r2 asm ("r2"); + r2 = r0[-1]; + asm volatile ("" :: "r" (r2)); +} + +/* +** r_m_0: +** ldrd r2, \[r0\] +** bx lr +*/ +void +r_m_0 (double *r0) +{ + register double r2 asm ("r2"); + r2 = r0[0]; + asm volatile ("" :: "r" (r2)); +} + +/* +** r_m_1: +** ldrd r2, \[r0, #8\] +** bx lr +*/ +void +r_m_1 (double *r0) +{ + register double r2 asm ("r2"); + r2 = r0[1]; + asm volatile ("" :: "r" (r2)); +} + +/* +** r_m_127: +** ldrd r2, \[r0, #1016\] +** bx lr +*/ +void +r_m_127 (double *r0) +{ + register double r2 asm ("r2"); + r2 = r0[127]; + asm volatile ("" :: "r" (r2)); +} + +/* +** r_m_128: +** add (r[0-9]+), r0, #1024 +** ldrd r2, \[r0\] +** bx lr +*/ +void +r_m_128 (double *r0) +{ + register double r2 asm ("r2"); + r2 = r0[128]; + asm volatile ("" :: "r" (r2)); +} + +/* ??? 
This could be done in one instruction, but without mve.fp, + it makes more sense for memory_operand to enforce the GPR range. */ +/* +** w_m_m32: +** sub (r[0-9]+), r0, #256 +** vldr.64 d0, \[\1\] +** bx lr +*/ +void +w_m_m32 (double *r0) +{ + register double d0 asm ("d0"); + d0 = r0[-32]; + asm volatile ("" :: "w" (d0)); +} + +/* +** w_m_m31: +** vldr.64 d0, \[r0, #-248\] +** bx lr +*/ +void +w_m_m31 (double *r0) +{ + register double d0 asm ("d0"); + d0 = r0[-31]; + asm volatile ("" :: "w" (d0)); +} + +/* +** w_m_m1: +** vldr.64 d0, \[r0, #-8\] +** bx lr +*/ +void +w_m_m1 (double *r0) +{ + register double d0 asm ("d0"); + d0 = r0[-1]; + asm volatile ("" :: "w" (d0)); +} + +/* +** w_m_0: +** vldr.64 d0, \[r0\] +** bx lr +*/ +void +w_m_0 (double *r0) +{ + register double d0 asm ("d0"); + d0 = r0[0]; + asm volatile ("" :: "w" (d0)); +} + +/* +** w_m_1: +** vldr.64 d0, \[r0, #8\] +** bx lr +*/ +void +w_m_1 (double *r0) +{ + register double d0 asm ("d0"); + d0 = r0[1]; + asm volatile ("" :: "w" (d0)); +} + +/* +** w_m_127: +** vldr.64 d0, \[r0, #1016\] +** bx lr +*/ +void +w_m_127 (double *r0) +{ + register double d0 asm ("d0"); + d0 = r0[127]; + asm volatile ("" :: "w" (d0)); +} + +/* +** w_m_128: +** add (r[0-9]+), r0, #1024 +** vldr.64 d0, \[\1\] +** bx lr +*/ +void +w_m_128 (double *r0) +{ + register double d0 asm ("d0"); + d0 = r0[128]; + asm volatile ("" :: "w" (d0)); +} + +/* +** m_m32_r: +** sub (r[0-9]+), r0, #256 +** strd r2, \[\1\] +** bx lr +*/ +void +m_m32_r (double *r0) +{ + register double r2 asm ("r2"); + asm volatile ("" : "=r" (r2)); + r0[-32] = r2; +} + +/* +** m_m31_r: +** strd r2, \[r0, #-248\] +** bx lr +*/ +void +m_m31_r (double *r0) +{ + register double r2 asm ("r2"); + asm volatile ("" : "=r" (r2)); + r0[-31] = r2; +} + +/* +** m_m1_r: +** strd r2, \[r0, #-8\] +** bx lr +*/ +void +m_m1_r (double *r0) +{ + register double r2 asm ("r2"); + asm volatile ("" : "=r" (r2)); + r0[-1] = r2; +} + +/* +** m_0_r: +** strd r2, \[r0\] +** bx lr +*/ +void +m_0_r (double *r0) +{ + register double r2 asm ("r2"); + asm volatile ("" : "=r" (r2)); + r0[0] = r2; +} + +/* +** m_1_r: +** strd r2, \[r0, #8\] +** bx lr +*/ +void +m_1_r (double *r0) +{ + register double r2 asm ("r2"); + asm volatile ("" : "=r" (r2)); + r0[1] = r2; +} + +/* +** m_127_r: +** strd r2, \[r0, #1016\] +** bx lr +*/ +void +m_127_r (double *r0) +{ + register double r2 asm ("r2"); + asm volatile ("" : "=r" (r2)); + r0[127] = r2; +} + +/* +** m_128_r: +** add (r[0-9]+), r0, #1024 +** strd r2, \[r0\] +** bx lr +*/ +void +m_128_r (double *r0) +{ + register double r2 asm ("r2"); + asm volatile ("" : "=r" (r2)); + r0[128] = r2; +} + +/* ??? This could be done in one instruction, but without mve.fp, + it makes more sense for memory_operand to enforce the GPR range. 
*/ +/* +** m_m32_w: +** sub (r[0-9]+), r0, #256 +** vstr.64 d0, \[\1\] +** bx lr +*/ +void +m_m32_w (double *r0) +{ + register double d0 asm ("d0"); + asm volatile ("" : "=w" (d0)); + r0[-32] = d0; +} + +/* +** m_m31_w: +** vstr.64 d0, \[r0, #-248\] +** bx lr +*/ +void +m_m31_w (double *r0) +{ + register double d0 asm ("d0"); + asm volatile ("" : "=w" (d0)); + r0[-31] = d0; +} + +/* +** m_m1_w: +** vstr.64 d0, \[r0, #-8\] +** bx lr +*/ +void +m_m1_w (double *r0) +{ + register double d0 asm ("d0"); + asm volatile ("" : "=w" (d0)); + r0[-1] = d0; +} + +/* +** m_0_w: +** vstr.64 d0, \[r0\] +** bx lr +*/ +void +m_0_w (double *r0) +{ + register double d0 asm ("d0"); + asm volatile ("" : "=w" (d0)); + r0[0] = d0; +} + +/* +** m_1_w: +** vstr.64 d0, \[r0, #8\] +** bx lr +*/ +void +m_1_w (double *r0) +{ + register double d0 asm ("d0"); + asm volatile ("" : "=w" (d0)); + r0[1] = d0; +} + +/* +** m_127_w: +** vstr.64 d0, \[r0, #1016\] +** bx lr +*/ +void +m_127_w (double *r0) +{ + register double d0 asm ("d0"); + asm volatile ("" : "=w" (d0)); + r0[127] = d0; +} + +/* +** m_128_w: +** add (r[0-9]+), r0, #1024 +** vstr.64 d0, \[\1\] +** bx lr +*/ +void +m_128_w (double *r0) +{ + register double d0 asm ("d0"); + asm volatile ("" : "=w" (d0)); + r0[128] = d0; +} diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve-vldstr16-no-writeback.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve-vldstr16-no-writeback.c index 0a69ace..50b1953 100644 --- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve-vldstr16-no-writeback.c +++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve-vldstr16-no-writeback.c @@ -13,5 +13,6 @@ fn1 (__fp16 *pSrc) pDst[i] = high; } -/* { dg-final { scan-assembler {vldr\.16\ts[0-9]+, \[r[0-9]+\]\n} } } */ -/* { dg-final { scan-assembler {vstr\.16\ts[0-9]+, \[r[0-9]+\]\n} } } */ +/* { dg-final { scan-assembler {vldr\.16\ts[0-9]+, \[r[0-9]+(, #-?[0-9]+)?\]\n} } } */ +/* { dg-final { scan-assembler-not {vldr\.16\t[^\n]*\]!} } } */ +/* { dg-final { scan-assembler-not {vstr\.16\t[^\n]*\]!} } } */ -- 2.7.4