if (!CONST_INT_P (operands[1]))
return false;
- bool speed_p = !optimize_function_for_size_p (cfun);
+ bool size_p = optimize_function_for_size_p (cfun);
/* Default the maximum to 256-bytes. */
unsigned max_set_size = 256;
- /* In case we are optimizing for size or if the core does not
- want to use STP Q regs, lower the max_set_size. */
- max_set_size = (!speed_p
- || (aarch64_tune_params.extra_tuning_flags
- & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
- ? max_set_size / 2 : max_set_size;
-
len = INTVAL (operands[1]);
/* Upper bound check. */
if (len > max_set_size)
return false;
+ /* Attempt a sequence with a vector broadcast followed by stores.
+ Count the number of operations involved to see if it's worth it for
+ code size. */
+ start_sequence ();
+ unsigned nops = 0;
base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
dst = adjust_automodify_address (dst, VOIDmode, base, 0);
/* Prepare the val using a DUP/MOVI v0.16B, val. */
src = expand_vector_broadcast (V16QImode, val);
src = force_reg (V16QImode, src);
-
+ nops++;
/* Convert len to bits to make the rest of the code simpler. */
n = len * BITS_PER_UNIT;
/* Maximum amount to copy in one go. We allow 256-bit chunks based on the
AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter. setmem expand
pattern is only turned on for TARGET_SIMD. */
- const int copy_limit = (speed_p
- && (aarch64_tune_params.extra_tuning_flags
- & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
+ const int copy_limit = (aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
? GET_MODE_BITSIZE (TImode) : 256;
while (n > 0)
mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
-
+ nops++;
n -= mode_bits;
/* Do certain trailing copies as overlapping if it's going to be
n = n_bits;
}
}
+ rtx_insn *seq = get_insns ();
+ end_sequence ();
+ /* A call to memset in the worst case requires 3 instructions to prepare
+    the arguments + 1 for the call.  When optimizing for size, prefer the
+    inline sequence only if it is no longer than that (at most 4 insns).  */
+ if (size_p && nops > 4)
+ return false;
+ emit_insn (seq);
return true;
}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-Os" } */
+/* { dg-require-effective-target lp64 } */
+
+#include <stdint.h>
+
+/* A 127-byte memset should use a libcall when optimizing for size.
+**set127byte:
+** mov x2, 127
+** b memset
+*/
+void __attribute__((__noinline__))
+set127byte (int64_t *src, int c)
+{
+ __builtin_memset (src, c, 127);
+}
+
+/* A 128-byte memset should use a libcall when optimizing for size.
+**set128byte:
+** mov x2, 128
+** b memset
+*/
+void __attribute__((__noinline__))
+set128byte (int64_t *src, int c)
+{
+ __builtin_memset (src, c, 128);
+}
+
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
/* { dg-do compile } */
-/* { dg-options "-Os -mstrict-align" } */
+/* { dg-options "-O2 -mstrict-align" } */
-struct s { char x[95]; };
+struct s { char x[255]; };
void foo (struct s *);
void bar (void) { struct s s1 = {}; foo (&s1); }
-/* memset (s1 = {}, sizeof = 95) should be expanded out
+/* memset (s1 = {}, sizeof = 255) should be expanded out
such that there are no overlap stores when -mstrict-align
is in use.
- so 2 pair 16 bytes stores (64 bytes).
+   so 7 pairs of 16-byte stores (224 bytes).
1 16 byte stores
1 8 byte store
1 4 byte store
1 1 byte store
*/
-/* { dg-final { scan-assembler-times "stp\tq" 2 } } */
+/* { dg-final { scan-assembler-times "stp\tq" 7 } } */
/* { dg-final { scan-assembler-times "str\tq" 1 } } */
/* { dg-final { scan-assembler-times "str\txzr" 1 } } */
/* { dg-final { scan-assembler-times "str\twzr" 1 } } */