Account for the cost of generating loop masks

author Richard Sandiford <richard.sandiford@arm.com>

Wed, 13 Nov 2019 09:12:17 +0000 (09:12 +0000)

committer Richard Sandiford <rsandifo@gcc.gnu.org>

Wed, 13 Nov 2019 09:12:17 +0000 (09:12 +0000)
author Richard Sandiford <richard.sandiford@arm.com>
Wed, 13 Nov 2019 09:12:17 +0000 (09:12 +0000)
committer Richard Sandiford <rsandifo@gcc.gnu.org>
Wed, 13 Nov 2019 09:12:17 +0000 (09:12 +0000)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index e7b0433..0470528 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,10 @@
  2019-11-13  Richard Sandiford  <richard.sandiford@arm.com>
  
+       * tree-vect-loop.c (vect_estimate_min_profitable_iters): Include
+       the cost of generating loop masks.
+
+2019-11-13  Richard Sandiford  <richard.sandiford@arm.com>
+
         * tree-vectorizer.h (vect_apply_runtime_profitability_check_p):
         New function.
         * tree-vect-loop-manip.c (vect_loop_versioning): Use it.
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog

index a253a53..834c17a 100644 (file)
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,5 +1,15 @@
  2019-11-13  Richard Sandiford  <richard.sandiford@arm.com>
  
+       * gcc.target/aarch64/sve/mask_struct_store_3.c: Add
+       -fno-vect-cost-model.
+       * gcc.target/aarch64/sve/mask_struct_store_3_run.c: Likewise.
+       * gcc.target/aarch64/sve/peel_ind_2.c: Likewise.
+       * gcc.target/aarch64/sve/peel_ind_2_run.c: Likewise.
+       * gcc.target/aarch64/sve/peel_ind_3.c: Likewise.
+       * gcc.target/aarch64/sve/peel_ind_3_run.c: Likewise.
+
+2019-11-13  Richard Sandiford  <richard.sandiford@arm.com>
+
         PR c++/92206
         * g++.dg/cpp0x/alias-decl-pr92206-1.C: New test.
         * g++.dg/cpp0x/alias-decl-pr92206-2.C: Likewise.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3.c

index 001f5be..1765d54 100644 (file)
--- a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3.c
@@ -1,5 +1,5 @@
  /* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math -fno-vect-cost-model" } */
  
  #include <stdint.h>
  
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3_run.c

index 31d661b..4dbe033 100644 (file)
--- a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3_run.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3_run.c
@@ -1,5 +1,5 @@
  /* { dg-do run { target aarch64_sve_hw } } */
-/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math -fno-vect-cost-model" } */
  
  #include "mask_struct_store_3.c"
  
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2.c

index e792cdf..df82d58 100644 (file)
--- a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2.c
@@ -1,7 +1,7 @@
  /* { dg-do compile } */
  /* Pick an arbitrary target for which unaligned accesses are more
     expensive.  */
-/* { dg-options "-O3 -msve-vector-bits=256 -mtune=thunderx" } */
+/* { dg-options "-O3 -msve-vector-bits=256 -mtune=thunderx -fno-vect-cost-model" } */
  
  #define N 512
  #define START 7
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2_run.c

index 9c5ae1b..b978535 100644 (file)
--- a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2_run.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2_run.c
@@ -1,6 +1,6 @@
  /* { dg-do run { target aarch64_sve_hw } } */
  /* { dg-options "-O3 -mtune=thunderx" } */
-/* { dg-options "-O3 -mtune=thunderx -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -mtune=thunderx -msve-vector-bits=256 -fno-vect-cost-model" { target aarch64_sve256_hw } } */
  
  #include "peel_ind_2.c"
  
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3.c

index 441589e..1707f02 100644 (file)
--- a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3.c
@@ -1,7 +1,7 @@
  /* { dg-do compile } */
  /* Pick an arbitrary target for which unaligned accesses are more
     expensive.  */
-/* { dg-options "-O3 -msve-vector-bits=256 -mtune=thunderx" } */
+/* { dg-options "-O3 -msve-vector-bits=256 -mtune=thunderx -fno-vect-cost-model" } */
  
  #define N 32
  #define MAX_START 8
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3_run.c

index 384a38e..9838967 100644 (file)
--- a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3_run.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3_run.c
@@ -1,6 +1,6 @@
  /* { dg-do run { target aarch64_sve_hw } } */
-/* { dg-options "-O3 -mtune=thunderx" } */
-/* { dg-options "-O3 -mtune=thunderx -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -mtune=thunderx -fno-vect-cost-model" } */
+/* { dg-options "-O3 -mtune=thunderx -msve-vector-bits=256 -fno-vect-cost-model" { target aarch64_sve256_hw } } */
  
  #include "peel_ind_3.c"
  
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c

index 83fb848..005fa30 100644 (file)
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -3291,6 +3291,32 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
                                   si->kind, si->stmt_info, si->misalign,
                                   vect_epilogue);
         }
+
+      /* Calculate how many masks we need to generate.  */
+      unsigned int num_masks = 0;
+      rgroup_masks *rgm;
+      unsigned int num_vectors_m1;
+      FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
+       if (rgm->mask_type)
+         num_masks += num_vectors_m1 + 1;
+      gcc_assert (num_masks > 0);
+
+      /* In the worst case, we need to generate each mask in the prologue
+        and in the loop body.  One of the loop body mask instructions
+        replaces the comparison in the scalar loop, and since we don't
+        count the scalar comparison against the scalar body, we shouldn't
+        count that vector instruction against the vector body either.
+
+        Sometimes we can use unpacks instead of generating prologue
+        masks and sometimes the prologue mask will fold to a constant,
+        so the actual prologue cost might be smaller.  However, it's
+        simpler and safer to use the worst-case cost; if this ends up
+        being the tie-breaker between vectorizing or not, then it's
+        probably better not to vectorize.  */
+      (void) add_stmt_cost (target_cost_data, num_masks, vector_stmt,
+                           NULL, 0, vect_prologue);
+      (void) add_stmt_cost (target_cost_data, num_masks - 1, vector_stmt,
+                           NULL, 0, vect_body);
      }
    else if (npeel < 0)
      {
author	Richard Sandiford <richard.sandiford@arm.com>
	Wed, 13 Nov 2019 09:12:17 +0000 (09:12 +0000)
committer	Richard Sandiford <rsandifo@gcc.gnu.org>
	Wed, 13 Nov 2019 09:12:17 +0000 (09:12 +0000)
gcc/ChangeLog		patch | blob | history
gcc/testsuite/ChangeLog		patch | blob | history
gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3.c		patch | blob | history
gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3_run.c		patch | blob | history
gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2.c		patch | blob | history
gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2_run.c		patch | blob | history
gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3.c		patch | blob | history
gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3_run.c		patch | blob | history
gcc/tree-vect-loop.c		patch | blob | history