Use single-iteration epilogues when peeling for gaps

author Richard Sandiford <richard.sandiford@linaro.org>
Sat, 13 Jan 2018 18:00:41 +0000 (18:00 +0000)
committer Richard Sandiford <rsandifo@gcc.gnu.org>
Sat, 13 Jan 2018 18:00:41 +0000 (18:00 +0000)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index 73bfb41..e5e7bf7 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -2,6 +2,21 @@
             Alan Hayward  <alan.hayward@arm.com>
             David Sherwood  <david.sherwood@arm.com>
  
+       * tree-vect-loop-manip.c (vect_gen_scalar_loop_niters): Replace
+       vfm1 with a bound_epilog parameter.
+       (vect_do_peeling): Update calls accordingly, and move the prologue
+       call earlier in the function.  Treat the base bound_epilog as 0 for
+       fully-masked loops and retain vf - 1 for other loops.  Add 1 to
+       this base when peeling for gaps.
+       * tree-vect-loop.c (vect_analyze_loop_2): Allow peeling for gaps
+       with fully-masked loops.
+       (vect_estimate_min_profitable_iters): Handle the single peeled
+       iteration in that case.
+
+2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
+           Alan Hayward  <alan.hayward@arm.com>
+           David Sherwood  <david.sherwood@arm.com>
+
         * tree-vect-data-refs.c (vect_analyze_group_access_1): Allow
         single-element interleaving even if the size is not a power of 2.
         * tree-vect-stmts.c (get_load_store_type): Disallow elementwise
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog

index 72da419..bedb8e3 100644 (file)
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -2,6 +2,22 @@
             Alan Hayward  <alan.hayward@arm.com>
             David Sherwood  <david.sherwood@arm.com>
  
+       * gcc.target/aarch64/sve/struct_vect_18.c: Check the number
+       of branches.
+       * gcc.target/aarch64/sve/struct_vect_19.c: Likewise.
+       * gcc.target/aarch64/sve/struct_vect_20.c: New test.
+       * gcc.target/aarch64/sve/struct_vect_20_run.c: Likewise.
+       * gcc.target/aarch64/sve/struct_vect_21.c: Likewise.
+       * gcc.target/aarch64/sve/struct_vect_21_run.c: Likewise.
+       * gcc.target/aarch64/sve/struct_vect_22.c: Likewise.
+       * gcc.target/aarch64/sve/struct_vect_22_run.c: Likewise.
+       * gcc.target/aarch64/sve/struct_vect_23.c: Likewise.
+       * gcc.target/aarch64/sve/struct_vect_23_run.c: Likewise.
+
+2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
+           Alan Hayward  <alan.hayward@arm.com>
+           David Sherwood  <david.sherwood@arm.com>
+
         * gcc.target/aarch64/sve/struct_vect_18.c: New test.
         * gcc.target/aarch64/sve/struct_vect_18_run.c: Likewise.
         * gcc.target/aarch64/sve/struct_vect_19.c: Likewise.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18.c

index 67b08d1..8b93b4c 100644 (file)
--- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18.c
@@ -42,3 +42,6 @@ TEST (test)
  /* { dg-final { scan-assembler-times {\tstr\ts} 1 } } */
  /* { dg-final { scan-assembler-times {\tldr\td} 2 } } */
  /* { dg-final { scan-assembler-times {\tstr\td} 1 } } */
+
+/* The only branches should be in the vectorized loop.  */
+/* { dg-final { scan-assembler-times {\tb[a-z]+\t} 4 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19.c

index 3754190..6a67e18 100644 (file)
--- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19.c
@@ -40,3 +40,8 @@ TEST (test)
  /* { dg-final { scan-assembler-times {\tstr\ts} 1 } } */
  /* { dg-final { scan-assembler-times {\tldr\td} 2 } } */
  /* { dg-final { scan-assembler-times {\tstr\td} 1 } } */
+
+/* Each function should have three branches: one directly to the exit
+   (n <= 0), one to the single scalar epilogue iteration (n == 1),
+   and one branch-back for the vectorized loop.  */
+/* { dg-final { scan-assembler-times {\tb[a-z]+\t} 12 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_20.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_20.c

new file mode 100644 (file)

index 0000000..6d616eb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_20.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#define N 2000
+
+#define TEST_LOOP(NAME, TYPE)                                  \
+  void __attribute__ ((noinline, noclone))                     \
+  NAME (TYPE *restrict dest, TYPE *restrict src)               \
+  {                                                            \
+    for (int i = 0; i < N; ++i)                                        \
+      dest[i] += src[i * 2];                                   \
+  }
+
+#define TEST(NAME) \
+  TEST_LOOP (NAME##_i8, signed char) \
+  TEST_LOOP (NAME##_i16, unsigned short) \
+  TEST_LOOP (NAME##_f32, float) \
+  TEST_LOOP (NAME##_f64, double)
+
+TEST (test)
+
+/* Check the vectorized loop.  */
+/* { dg-final { scan-assembler-times {\tld1b\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld2b\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1b\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1h\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld2h\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1h\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld2w\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1w\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1d\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld2d\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1d\t} 1 } } */
+
+/* Check the scalar tail.  */
+/* { dg-final { scan-assembler-times {\tldrb\tw} 2 } } */
+/* { dg-final { scan-assembler-times {\tstrb\tw} 1 } } */
+/* { dg-final { scan-assembler-times {\tldrh\tw} 2 } } */
+/* { dg-final { scan-assembler-times {\tstrh\tw} 1 } } */
+/* { dg-final { scan-assembler-times {\tldr\ts} 2 } } */
+/* { dg-final { scan-assembler-times {\tstr\ts} 1 } } */
+/* { dg-final { scan-assembler-times {\tldr\td} 2 } } */
+/* { dg-final { scan-assembler-times {\tstr\td} 1 } } */
+
+/* The only branches should be in the vectorized loop.  */
+/* { dg-final { scan-assembler-times {\tb[a-z]+\t} 4 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_20_run.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_20_run.c

new file mode 100644 (file)

index 0000000..978f02b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_20_run.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "struct_vect_20.c"
+
+#undef TEST_LOOP
+#define TEST_LOOP(NAME, TYPE)                          \
+  {                                                    \
+    TYPE out[N];                                       \
+    TYPE in[N * 2];                                    \
+    for (int i = 0; i < N; ++i)                                \
+      {                                                        \
+       out[i] = i * 7 / 2;                             \
+       asm volatile ("" ::: "memory");                 \
+      }                                                        \
+    for (int i = 0; i < N * 2; ++i)                    \
+      {                                                        \
+       in[i] = i * 9 / 2;                              \
+       asm volatile ("" ::: "memory");                 \
+      }                                                        \
+    NAME (out, in);                                    \
+    for (int i = 0; i < N; ++i)                                \
+      {                                                        \
+       TYPE expected = i * 7 / 2 + in[i * 2];          \
+       if (out[i] != expected)                         \
+         __builtin_abort ();                           \
+       asm volatile ("" ::: "memory");                 \
+      }                                                        \
+  }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  TEST (test);
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_21.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_21.c

new file mode 100644 (file)

index 0000000..4758c9d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_21.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#define TEST_LOOP(NAME, TYPE)                                  \
+  void __attribute__ ((noinline, noclone))                     \
+  NAME (TYPE *restrict dest, TYPE *restrict src, int n)                \
+  {                                                            \
+    for (int i = 0; i < n; ++i)                                        \
+      dest[i] += src[i * 2];                                   \
+  }
+
+#define TEST(NAME) \
+  TEST_LOOP (NAME##_i8, signed char) \
+  TEST_LOOP (NAME##_i16, unsigned short) \
+  TEST_LOOP (NAME##_f32, float) \
+  TEST_LOOP (NAME##_f64, double)
+
+TEST (test)
+
+/* Check the vectorized loop.  */
+/* { dg-final { scan-assembler-times {\tld1b\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld2b\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1b\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1h\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld2h\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1h\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld2w\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1w\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1d\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld2d\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1d\t} 1 } } */
+
+/* Check the scalar tail.  */
+/* { dg-final { scan-assembler-times {\tldrb\tw} 2 } } */
+/* { dg-final { scan-assembler-times {\tstrb\tw} 1 } } */
+/* { dg-final { scan-assembler-times {\tldrh\tw} 2 } } */
+/* { dg-final { scan-assembler-times {\tstrh\tw} 1 } } */
+/* { dg-final { scan-assembler-times {\tldr\ts} 2 } } */
+/* { dg-final { scan-assembler-times {\tstr\ts} 1 } } */
+/* { dg-final { scan-assembler-times {\tldr\td} 2 } } */
+/* { dg-final { scan-assembler-times {\tstr\td} 1 } } */
+
+/* Each function should have three branches: one directly to the exit
+   (n <= 0), one to the single scalar epilogue iteration (n == 1),
+   and one branch-back for the vectorized loop.  */
+/* { dg-final { scan-assembler-times {\tb[a-z]+\t} 12 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_21_run.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_21_run.c

new file mode 100644 (file)

index 0000000..1f21929
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_21_run.c
@@ -0,0 +1,45 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "struct_vect_21.c"
+
+#define N 1000
+
+#undef TEST_LOOP
+#define TEST_LOOP(NAME, TYPE)                  \
+  {                                            \
+    TYPE out[N];                               \
+    TYPE in[N * 2];                            \
+    int counts[] = { 0, 1, N - 1 };            \
+    for (int j = 0; j < 3; ++j)                        \
+      {                                                \
+       int count = counts[j];                  \
+       for (int i = 0; i < N; ++i)             \
+         {                                     \
+           out[i] = i * 7 / 2;                 \
+           asm volatile ("" ::: "memory");     \
+         }                                     \
+       for (int i = 0; i < N * 2; ++i)         \
+         {                                     \
+           in[i] = i * 9 / 2;                  \
+           asm volatile ("" ::: "memory");     \
+         }                                     \
+       NAME (out, in, count);                  \
+       for (int i = 0; i < N; ++i)             \
+         {                                     \
+           TYPE expected = i * 7 / 2;          \
+           if (i < count)                      \
+             expected += in[i * 2];            \
+           if (out[i] != expected)             \
+             __builtin_abort ();               \
+           asm volatile ("" ::: "memory");     \
+         }                                     \
+      }                                                \
+  }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  TEST (test);
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_22.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_22.c

new file mode 100644 (file)

index 0000000..322b561
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_22.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#define N 2000
+
+#define TEST_LOOP(NAME, TYPE)                                  \
+  void __attribute__ ((noinline, noclone))                     \
+  NAME (TYPE *restrict dest, TYPE *restrict src)               \
+  {                                                            \
+    for (int i = 0; i < N; ++i)                                        \
+      dest[i] += src[i * 4];                                   \
+  }
+
+#define TEST(NAME) \
+  TEST_LOOP (NAME##_i8, signed char) \
+  TEST_LOOP (NAME##_i16, unsigned short) \
+  TEST_LOOP (NAME##_f32, float) \
+  TEST_LOOP (NAME##_f64, double)
+
+TEST (test)
+
+/* Check the vectorized loop.  */
+/* { dg-final { scan-assembler-times {\tld1b\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld4b\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1b\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1h\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld4h\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1h\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld4w\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1w\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1d\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld4d\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1d\t} 1 } } */
+
+/* Check the scalar tail.  */
+/* { dg-final { scan-assembler-times {\tldrb\tw} 2 } } */
+/* { dg-final { scan-assembler-times {\tstrb\tw} 1 } } */
+/* { dg-final { scan-assembler-times {\tldrh\tw} 2 } } */
+/* { dg-final { scan-assembler-times {\tstrh\tw} 1 } } */
+/* { dg-final { scan-assembler-times {\tldr\ts} 2 } } */
+/* { dg-final { scan-assembler-times {\tstr\ts} 1 } } */
+/* { dg-final { scan-assembler-times {\tldr\td} 2 } } */
+/* { dg-final { scan-assembler-times {\tstr\td} 1 } } */
+
+/* The only branches should be in the vectorized loop.  */
+/* { dg-final { scan-assembler-times {\tb[a-z]+\t} 4 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_22_run.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_22_run.c

new file mode 100644 (file)

index 0000000..e9386f5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_22_run.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "struct_vect_22.c"
+
+#undef TEST_LOOP
+#define TEST_LOOP(NAME, TYPE)                          \
+  {                                                    \
+    TYPE out[N];                                       \
+    TYPE in[N * 4];                                    \
+    for (int i = 0; i < N; ++i)                                \
+      {                                                        \
+       out[i] = i * 7 / 2;                             \
+       asm volatile ("" ::: "memory");                 \
+      }                                                        \
+    for (int i = 0; i < N * 4; ++i)                    \
+      {                                                        \
+       in[i] = i * 9 / 2;                              \
+       asm volatile ("" ::: "memory");                 \
+      }                                                        \
+    NAME (out, in);                                    \
+    for (int i = 0; i < N; ++i)                                \
+      {                                                        \
+       TYPE expected = i * 7 / 2 + in[i * 4];          \
+       if (out[i] != expected)                         \
+         __builtin_abort ();                           \
+       asm volatile ("" ::: "memory");                 \
+      }                                                        \
+  }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  TEST (test);
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_23.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_23.c

new file mode 100644 (file)

index 0000000..1698a2e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_23.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#define TEST_LOOP(NAME, TYPE)                                  \
+  void __attribute__ ((noinline, noclone))                     \
+  NAME (TYPE *restrict dest, TYPE *restrict src, int n)                \
+  {                                                            \
+    for (int i = 0; i < n; ++i)                                        \
+      dest[i] += src[i * 4];                                   \
+  }
+
+#define TEST(NAME) \
+  TEST_LOOP (NAME##_i8, signed char) \
+  TEST_LOOP (NAME##_i16, unsigned short) \
+  TEST_LOOP (NAME##_f32, float) \
+  TEST_LOOP (NAME##_f64, double)
+
+TEST (test)
+
+/* Check the vectorized loop.  */
+/* { dg-final { scan-assembler-times {\tld1b\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld4b\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1b\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1h\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld4h\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1h\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld4w\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1w\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1d\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld4d\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1d\t} 1 } } */
+
+/* Check the scalar tail.  */
+/* { dg-final { scan-assembler-times {\tldrb\tw} 2 } } */
+/* { dg-final { scan-assembler-times {\tstrb\tw} 1 } } */
+/* { dg-final { scan-assembler-times {\tldrh\tw} 2 } } */
+/* { dg-final { scan-assembler-times {\tstrh\tw} 1 } } */
+/* { dg-final { scan-assembler-times {\tldr\ts} 2 } } */
+/* { dg-final { scan-assembler-times {\tstr\ts} 1 } } */
+/* { dg-final { scan-assembler-times {\tldr\td} 2 } } */
+/* { dg-final { scan-assembler-times {\tstr\td} 1 } } */
+
+/* Each function should have three branches: one directly to the exit
+   (n <= 0), one to the single scalar epilogue iteration (n == 1),
+   and one branch-back for the vectorized loop.  */
+/* { dg-final { scan-assembler-times {\tb[a-z]+\t} 12 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_23_run.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_23_run.c

new file mode 100644 (file)

index 0000000..55906d7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_23_run.c
@@ -0,0 +1,45 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "struct_vect_23.c"
+
+#define N 1000
+
+#undef TEST_LOOP
+#define TEST_LOOP(NAME, TYPE)                  \
+  {                                            \
+    TYPE out[N];                               \
+    TYPE in[N * 4];                            \
+    int counts[] = { 0, 1, N - 1 };            \
+    for (int j = 0; j < 3; ++j)                        \
+      {                                                \
+       int count = counts[j];                  \
+       for (int i = 0; i < N; ++i)             \
+         {                                     \
+           out[i] = i * 7 / 2;                 \
+           asm volatile ("" ::: "memory");     \
+         }                                     \
+       for (int i = 0; i < N * 4; ++i)         \
+         {                                     \
+           in[i] = i * 9 / 2;                  \
+           asm volatile ("" ::: "memory");     \
+         }                                     \
+       NAME (out, in, count);                  \
+       for (int i = 0; i < N; ++i)             \
+         {                                     \
+           TYPE expected = i * 7 / 2;          \
+           if (i < count)                      \
+             expected += in[i * 4];            \
+           if (out[i] != expected)             \
+             __builtin_abort ();               \
+           asm volatile ("" ::: "memory");     \
+         }                                     \
+      }                                                \
+  }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  TEST (test);
+  return 0;
+}
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c

index b9bb047..a2b4989 100644 (file)
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -618,8 +618,9 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
  
  /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
     LOOP_VINFO describes the vectorization of LOOP.  NITERS is the
-   number of iterations of the original scalar loop.  NITERS_MAYBE_ZERO
-   and FINAL_IV are as for vect_set_loop_condition.
+   number of iterations of the original scalar loop that should be
+   handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are
+   as for vect_set_loop_condition.
  
     Insert the branch-back condition before LOOP_COND_GSI and return the
     final gcond.  */
@@ -1836,23 +1837,24 @@ vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
  /* Calculate the number of iterations above which vectorized loop will be
     preferred than scalar loop.  NITERS_PROLOG is the number of iterations
     of prolog loop.  If it's integer const, the integer number is also passed
-   in INT_NITERS_PROLOG.  BOUND_PROLOG is the upper bound (included) of
-   number of iterations of prolog loop.  VFM1 is vector factor minus one.
-   If CHECK_PROFITABILITY is true, TH is the threshold below which scalar
-   (rather than vectorized) loop will be executed.  This function stores
-   upper bound (included) of the result in BOUND_SCALAR.  */
+   in INT_NITERS_PROLOG.  BOUND_PROLOG is the upper bound (inclusive) of the
+   number of iterations of the prolog loop.  BOUND_EPILOG is the corresponding
+   value for the epilog loop.  If CHECK_PROFITABILITY is true, TH is the
+   threshold below which the scalar (rather than vectorized) loop will be
+   executed.  This function stores the upper bound (inclusive) of the result
+   in BOUND_SCALAR.  */
  
  static tree
  vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
-                            int bound_prolog, poly_int64 vfm1, int th,
+                            int bound_prolog, poly_int64 bound_epilog, int th,
                              poly_uint64 *bound_scalar,
                              bool check_profitability)
  {
    tree type = TREE_TYPE (niters_prolog);
    tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
-                            build_int_cst (type, vfm1));
+                            build_int_cst (type, bound_epilog));
  
-  *bound_scalar = vfm1 + bound_prolog;
+  *bound_scalar = bound_prolog + bound_epilog;
    if (check_profitability)
      {
        /* TH indicates the minimum niters of vectorized loop, while we
@@ -1861,18 +1863,18 @@ vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
        /* Peeling for constant times.  */
        if (int_niters_prolog >= 0)
         {
-         *bound_scalar = upper_bound (int_niters_prolog + vfm1, th);
+         *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
           return build_int_cst (type, *bound_scalar);
         }
-      /* Peeling for unknown times.  Note BOUND_PROLOG is the upper
-        bound (inlcuded) of niters of prolog loop.  */
-      if (known_ge (th, vfm1 + bound_prolog))
+      /* Peeling an unknown number of times.  Note that both BOUND_PROLOG
+        and BOUND_EPILOG are inclusive upper bounds.  */
+      if (known_ge (th, bound_prolog + bound_epilog))
         {
           *bound_scalar = th;
           return build_int_cst (type, th);
         }
        /* Need to do runtime comparison.  */
-      else if (maybe_gt (th, vfm1))
+      else if (maybe_gt (th, bound_epilog))
         {
           *bound_scalar = upper_bound (*bound_scalar, th);
           return fold_build2 (MAX_EXPR, type,
@@ -2405,14 +2407,20 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
    tree type = TREE_TYPE (niters), guard_cond;
    basic_block guard_bb, guard_to;
    profile_probability prob_prolog, prob_vector, prob_epilog;
-  int bound_prolog = 0;
-  poly_uint64 bound_scalar = 0;
    int estimated_vf;
    int prolog_peeling = 0;
    if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
      prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
-  bool epilog_peeling = (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
-                        || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
+
+  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+  poly_uint64 bound_epilog = 0;
+  if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+      && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
+    bound_epilog += vf - 1;
+  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+    bound_epilog += 1;
+  bool epilog_peeling = maybe_ne (bound_epilog, 0U);
+  poly_uint64 bound_scalar = bound_epilog;
  
    if (!prolog_peeling && !epilog_peeling)
      return NULL;
@@ -2423,7 +2431,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
      estimated_vf = 3;
    prob_prolog = prob_epilog = profile_probability::guessed_always ()
                         .apply_scale (estimated_vf - 1, estimated_vf);
-  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  
    struct loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
    struct loop *first_loop = loop;
@@ -2438,14 +2445,29 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
      }
    initialize_original_copy_tables ();
  
+  /* Record the anchor bb at which the guard should be placed if the scalar
+     loop might be preferred.  */
+  basic_block anchor = loop_preheader_edge (loop)->src;
+
+  /* Generate the number of iterations for the prolog loop.  We do this here
+     so that we can also get the upper bound on the number of iterations.  */
+  tree niters_prolog;
+  int bound_prolog = 0;
+  if (prolog_peeling)
+    niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
+                                                &bound_prolog);
+  else
+    niters_prolog = build_int_cst (type, 0);
+
    /* Prolog loop may be skipped.  */
    bool skip_prolog = (prolog_peeling != 0);
    /* Skip to epilog if scalar loop may be preferred.  It's only needed
       when we peel for epilog loop and when it hasn't been checked with
       loop versioning.  */
-  bool skip_vector = ((!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-                      && !LOOP_REQUIRES_VERSIONING (loop_vinfo))
-                     || !vf.is_constant ());
+  bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+                     ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
+                                 bound_prolog + bound_epilog)
+                     : !LOOP_REQUIRES_VERSIONING (loop_vinfo));
    /* Epilog loop must be executed if the number of iterations for epilog
       loop is known at compile time, otherwise we need to add a check at
       the end of vector loop and skip to the end of epilog loop.  */
@@ -2456,9 +2478,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
    if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
      skip_epilog = false;
  
-  /* Record the anchor bb at which guard should be placed if scalar loop
-     may be preferred.  */
-  basic_block anchor = loop_preheader_edge (loop)->src;
    if (skip_vector)
      {
        split_edge (loop_preheader_edge (loop));
@@ -2476,7 +2495,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
         }
      }
  
-  tree niters_prolog = build_int_cst (type, 0);
    source_location loop_loc = find_loop_location (loop);
    struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
    if (prolog_peeling)
@@ -2500,9 +2518,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
        first_loop = prolog;
        reset_original_copy_tables ();
  
-      /* Generate and update the number of iterations for prolog loop.  */
-      niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
-                                                  &bound_prolog);
+      /* Update the number of iterations for prolog loop.  */
        tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
        vect_set_loop_condition (prolog, NULL, niters_prolog,
                                step_prolog, NULL_TREE, false);
@@ -2577,10 +2593,8 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
        if (skip_vector)
         {
           /* Additional epilogue iteration is peeled if gap exists.  */
-         bool peel_for_gaps = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
           tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
-                                               bound_prolog,
-                                               peel_for_gaps ? vf : vf - 1,
+                                               bound_prolog, bound_epilog,
                                                 th, &bound_scalar,
                                                 check_profitability);
           /* Build guard against NITERSM1 since NITERS may overflow.  */
@@ -2664,14 +2678,12 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
        else
         slpeel_update_phi_nodes_for_lcssa (epilog);
  
-      unsigned HOST_WIDE_INT bound1, bound2;
-      if (vf.is_constant (&bound1) && bound_scalar.is_constant (&bound2))
+      unsigned HOST_WIDE_INT bound;
+      if (bound_scalar.is_constant (&bound))
         {
-         bound1 -= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 2;
-         if (bound2)
-           /* We share epilog loop with scalar version loop.  */
-           bound1 = MAX (bound1, bound2 - 1);
-         record_niter_bound (epilog, bound1, false, true);
+         gcc_assert (bound != 0);
+         /* -1 to convert loop iterations to latch iterations.  */
+         record_niter_bound (epilog, bound - 1, false, true);
         }
  
        delete_update_ssa ();
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c

index b3779e7..4b9226f 100644 (file)
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -2260,16 +2260,6 @@ start_over:
        return false;
      }
  
-  if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
-      && LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
-    {
-      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "can't use a fully-masked loop because peeling for"
-                        " gaps is required.\n");
-    }
-
    /* Decide whether to use a fully-masked loop for this vectorization
       factor.  */
    LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
@@ -3714,6 +3704,23 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
      {
        peel_iters_prologue = 0;
        peel_iters_epilogue = 0;
+
+      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+       {
+         /* We need to peel exactly one iteration.  */
+         peel_iters_epilogue += 1;
+         stmt_info_for_cost *si;
+         int j;
+         FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
+                           j, si)
+           {
+             struct _stmt_vec_info *stmt_info
+               = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
+             (void) add_stmt_cost (target_cost_data, si->count,
+                                   si->kind, stmt_info, si->misalign,
+                                   vect_epilogue);
+           }
+       }
      }
    else if (npeel < 0)
      {
author	Richard Sandiford <richard.sandiford@linaro.org>
	Sat, 13 Jan 2018 18:00:41 +0000 (18:00 +0000)
committer	Richard Sandiford <rsandifo@gcc.gnu.org>
	Sat, 13 Jan 2018 18:00:41 +0000 (18:00 +0000)
gcc/ChangeLog		patch | blob | history
gcc/testsuite/ChangeLog		patch | blob | history
gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18.c		patch | blob | history
gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19.c		patch | blob | history
gcc/testsuite/gcc.target/aarch64/sve/struct_vect_20.c	[new file with mode: 0644]	patch | blob
gcc/testsuite/gcc.target/aarch64/sve/struct_vect_20_run.c	[new file with mode: 0644]	patch | blob
gcc/testsuite/gcc.target/aarch64/sve/struct_vect_21.c	[new file with mode: 0644]	patch | blob
gcc/testsuite/gcc.target/aarch64/sve/struct_vect_21_run.c	[new file with mode: 0644]	patch | blob
gcc/testsuite/gcc.target/aarch64/sve/struct_vect_22.c	[new file with mode: 0644]	patch | blob
gcc/testsuite/gcc.target/aarch64/sve/struct_vect_22_run.c	[new file with mode: 0644]	patch | blob
gcc/testsuite/gcc.target/aarch64/sve/struct_vect_23.c	[new file with mode: 0644]	patch | blob
gcc/testsuite/gcc.target/aarch64/sve/struct_vect_23_run.c	[new file with mode: 0644]	patch | blob
gcc/tree-vect-loop-manip.c		patch | blob | history
gcc/tree-vect-loop.c		patch | blob | history