aarch64: Add internal tune flag to minimise VL-based scalar ops
author Kyrylo Tkachov <kyrylo.tkachov@arm.com>
Mon, 22 Feb 2021 21:24:41 +0000 (21:24 +0000)
committer Kyrylo Tkachov <kyrylo.tkachov@arm.com>
Mon, 22 Feb 2021 21:24:41 +0000 (21:24 +0000)
This patch introduces an internal tune flag to break up VL-based scalar ops
into a GP-reg scalar op with the VL read kept separate. This can be preferable on some CPUs.

I went for a tune param rather than extending the rtx costs as our RTX costs tables aren't set up to track
this intricacy.

I've confirmed that on the simple loop:
/* Element-wise addition: for each i in [0, count), store
   op1[i] + op2[i] into dst[i].  */
void vadd (int *dst, int *op1, int *op2, int count)
{
  for (int i = 0; i < count; ++i)
    dst[i] = op1[i] + op2[i];
}

we now split the incw into a cntw outside the loop and the add inside.

+       cntw    x5
...
loop:
-       incw    x4
+       add     x4, x4, x5

gcc/ChangeLog:

* config/aarch64/aarch64-tuning-flags.def (cse_sve_vl_constants):
Define.
* config/aarch64/aarch64.md (add<mode>3): Force CONST_POLY_INT immediates
into a register when the above is enabled.
* config/aarch64/aarch64.c (neoversev1_tunings): Set tune_flags to
AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS.
(aarch64_rtx_costs): Use AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/sve/cse_sve_vl_constants_1.c: New test.

gcc/config/aarch64/aarch64-tuning-flags.def
gcc/config/aarch64/aarch64.c
gcc/config/aarch64/aarch64.md
gcc/testsuite/gcc.target/aarch64/sve/cse_sve_vl_constants_1.c [new file with mode: 0644]

index aae9952..588edf4 100644 (file)
@@ -46,4 +46,6 @@ AARCH64_EXTRA_TUNING_OPTION ("no_ldp_stp_qregs", NO_LDP_STP_QREGS)
 
 AARCH64_EXTRA_TUNING_OPTION ("rename_load_regs", RENAME_LOAD_REGS)
 
+AARCH64_EXTRA_TUNING_OPTION ("cse_sve_vl_constants", CSE_SVE_VL_CONSTANTS)
+
 #undef AARCH64_EXTRA_TUNING_OPTION
index 6fda6bc..6997669 100644 (file)
@@ -1492,7 +1492,7 @@ static const struct tune_params neoversev1_tunings =
   2,   /* min_div_recip_mul_df.  */
   0,   /* max_case_values.  */
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),   /* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS),   /* tune_flags.  */
   &generic_prefetch_tune
 };
 
@@ -12589,8 +12589,18 @@ cost_plus:
            *cost += rtx_cost (op0, mode, PLUS, 0, speed);
 
            if (speed)
-             /* ADD (immediate).  */
-             *cost += extra_cost->alu.arith;
+             {
+               /* ADD (immediate).  */
+               *cost += extra_cost->alu.arith;
+
+               /* Some tunings prefer to not use the VL-based scalar ops.
+                  Increase the cost of the poly immediate to prevent their
+                  formation.  */
+               if (GET_CODE (op1) == CONST_POLY_INT
+                   && (aarch64_tune_params.extra_tuning_flags
+                       & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
+                 *cost += COSTS_N_INSNS (1);
+             }
            return true;
          }
 
index a482419..65d00c4 100644 (file)
       && (!REG_P (op1)
         || !REGNO_PTR_FRAME_P (REGNO (op1))))
     operands[2] = force_reg (<MODE>mode, operands[2]);
+  /* Some tunings prefer to avoid VL-based operations.
+     Split off the poly immediate here.  The rtx costs hook will reject attempts
+     to combine them back.  */
+  else if (GET_CODE (operands[2]) == CONST_POLY_INT
+          && can_create_pseudo_p ()
+          && (aarch64_tune_params.extra_tuning_flags
+              & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
+    operands[2] = force_reg (<MODE>mode, operands[2]);
   /* Expand polynomial additions now if the destination is the stack
      pointer, since we don't want to use that as a temporary.  */
   else if (operands[0] == stack_pointer_rtx
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cse_sve_vl_constants_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cse_sve_vl_constants_1.c
new file mode 100644 (file)
index 0000000..dd04b66
--- /dev/null
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -moverride=tune=cse_sve_vl_constants" } */
+
+void __attribute__((noinline, noclone))
+vadd (int *dst, int *op1, int *op2, int count)
+{
+  for (int i = 0; i < count; ++i)
+    dst[i] = op1[i] + op2[i];
+}
+
+/* { dg-final { scan-assembler-not {\tincw\tx[0-9]+} } } */
+