Handle peeling for alignment with masking
author: Richard Sandiford <richard.sandiford@linaro.org>
Sat, 13 Jan 2018 17:59:32 +0000 (17:59 +0000)
committer: Richard Sandiford <rsandifo@gcc.gnu.org>
Sat, 13 Jan 2018 17:59:32 +0000 (17:59 +0000)
This patch adds support for aligning vectors by using a partial
first iteration.  E.g. if the start pointer is 3 elements beyond
an aligned address, the first iteration will have a mask in which
the first three elements are false.

On SVE, the optimisation is only useful for vector-length-specific
code.  Vector-length-agnostic code doesn't try to align vectors
since the vector length might not be a power of 2.

2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
    Alan Hayward  <alan.hayward@arm.com>
    David Sherwood  <david.sherwood@arm.com>

gcc/
* tree-vectorizer.h (_loop_vec_info::mask_skip_niters): New field.
(LOOP_VINFO_MASK_SKIP_NITERS): New macro.
(vect_use_loop_mask_for_alignment_p): New function.
(vect_prepare_for_masked_peels, vect_gen_while_not): Declare.
* tree-vect-loop-manip.c (vect_set_loop_masks_directly): Add an
niters_skip argument.  Make sure that the first niters_skip elements
of the first iteration are inactive.
(vect_set_loop_condition_masked): Handle LOOP_VINFO_MASK_SKIP_NITERS.
Update call to vect_set_loop_masks_directly.
(get_misalign_in_elems): New function, split out from...
(vect_gen_prolog_loop_niters): ...here.
(vect_update_init_of_dr): Take a code argument that specifies whether
the adjustment should be added or subtracted.
(vect_update_inits_of_drs): Likewise.
(vect_prepare_for_masked_peels): New function.
(vect_do_peeling): Skip prologue peeling if we're using a mask
instead.  Update call to vect_update_inits_of_drs.
* tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize
mask_skip_niters.
(vect_analyze_loop_2): Allow fully-masked loops with peeling for
alignment.  Do not include the number of peeled iterations in
the minimum threshold in that case.
(vectorizable_induction): Adjust the start value down by
LOOP_VINFO_MASK_SKIP_NITERS iterations.
(vect_transform_loop): Call vect_prepare_for_masked_peels.
Take the number of skipped iterations into account when calculating
the loop bounds.
* tree-vect-stmts.c (vect_gen_while_not): New function.

gcc/testsuite/
* gcc.target/aarch64/sve/nopeel_1.c: New test.
* gcc.target/aarch64/sve/peel_ind_1.c: Likewise.
* gcc.target/aarch64/sve/peel_ind_1_run.c: Likewise.
* gcc.target/aarch64/sve/peel_ind_2.c: Likewise.
* gcc.target/aarch64/sve/peel_ind_2_run.c: Likewise.
* gcc.target/aarch64/sve/peel_ind_3.c: Likewise.
* gcc.target/aarch64/sve/peel_ind_3_run.c: Likewise.
* gcc.target/aarch64/sve/peel_ind_4.c: Likewise.
* gcc.target/aarch64/sve/peel_ind_4_run.c: Likewise.

Co-Authored-By: Alan Hayward <alan.hayward@arm.com>
Co-Authored-By: David Sherwood <david.sherwood@arm.com>
From-SVN: r256630

15 files changed:
gcc/ChangeLog
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.target/aarch64/sve/nopeel_1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/peel_ind_1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/peel_ind_1_run.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2_run.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3_run.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/peel_ind_4.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/peel_ind_4_run.c [new file with mode: 0644]
gcc/tree-vect-loop-manip.c
gcc/tree-vect-loop.c
gcc/tree-vect-stmts.c
gcc/tree-vectorizer.h

index 01fa70b..c4cbe9d 100644 (file)
@@ -2,6 +2,39 @@
            Alan Hayward  <alan.hayward@arm.com>
            David Sherwood  <david.sherwood@arm.com>
 
+       * tree-vectorizer.h (_loop_vec_info::mask_skip_niters): New field.
+       (LOOP_VINFO_MASK_SKIP_NITERS): New macro.
+       (vect_use_loop_mask_for_alignment_p): New function.
+       (vect_prepare_for_masked_peels, vect_gen_while_not): Declare.
+       * tree-vect-loop-manip.c (vect_set_loop_masks_directly): Add an
+       niters_skip argument.  Make sure that the first niters_skip elements
+       of the first iteration are inactive.
+       (vect_set_loop_condition_masked): Handle LOOP_VINFO_MASK_SKIP_NITERS.
+       Update call to vect_set_loop_masks_directly.
+       (get_misalign_in_elems): New function, split out from...
+       (vect_gen_prolog_loop_niters): ...here.
+       (vect_update_init_of_dr): Take a code argument that specifies whether
+       the adjustment should be added or subtracted.
+       (vect_update_inits_of_drs): Likewise.
+       (vect_prepare_for_masked_peels): New function.
+       (vect_do_peeling): Skip prologue peeling if we're using a mask
+       instead.  Update call to vect_update_inits_of_drs.
+       * tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize
+       mask_skip_niters.
+       (vect_analyze_loop_2): Allow fully-masked loops with peeling for
+       alignment.  Do not include the number of peeled iterations in
+       the minimum threshold in that case.
+       (vectorizable_induction): Adjust the start value down by
+       LOOP_VINFO_MASK_SKIP_NITERS iterations.
+       (vect_transform_loop): Call vect_prepare_for_masked_peels.
+       Take the number of skipped iterations into account when calculating
+       the loop bounds.
+       * tree-vect-stmts.c (vect_gen_while_not): New function.
+
+2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
+           Alan Hayward  <alan.hayward@arm.com>
+           David Sherwood  <david.sherwood@arm.com>
+
        * doc/sourcebuild.texi (vect_fully_masked): Document.
        * params.def (PARAM_MIN_VECT_LOOP_BOUND): Change minimum and
        default value to 0.
index 3c83b5d..55e58c2 100644 (file)
@@ -2,6 +2,20 @@
            Alan Hayward  <alan.hayward@arm.com>
            David Sherwood  <david.sherwood@arm.com>
 
+       * gcc.target/aarch64/sve/nopeel_1.c: New test.
+       * gcc.target/aarch64/sve/peel_ind_1.c: Likewise.
+       * gcc.target/aarch64/sve/peel_ind_1_run.c: Likewise.
+       * gcc.target/aarch64/sve/peel_ind_2.c: Likewise.
+       * gcc.target/aarch64/sve/peel_ind_2_run.c: Likewise.
+       * gcc.target/aarch64/sve/peel_ind_3.c: Likewise.
+       * gcc.target/aarch64/sve/peel_ind_3_run.c: Likewise.
+       * gcc.target/aarch64/sve/peel_ind_4.c: Likewise.
+       * gcc.target/aarch64/sve/peel_ind_4_run.c: Likewise.
+
+2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
+           Alan Hayward  <alan.hayward@arm.com>
+           David Sherwood  <david.sherwood@arm.com>
+
        * lib/target-supports.exp (check_effective_target_vect_fully_masked):
        New proc.
        * gcc.dg/vect/slp-3.c: Expect all loops to be vectorized if
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/nopeel_1.c b/gcc/testsuite/gcc.target/aarch64/sve/nopeel_1.c
new file mode 100644 (file)
index 0000000..d77c32c
--- /dev/null
@@ -0,0 +1,39 @@
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=256" } */
+
+#include <stdint.h>
+
+#define TEST(NAME, TYPE)                       \
+ void                                          \
+ NAME##1 (TYPE *x, int n)                      \
+ {                                             \
+   for (int i = 0; i < n; ++i)                 \
+     x[i] += 1;                                        \
+ }                                             \
+ TYPE NAME##_array[1024];                      \
+ void                                          \
+ NAME##2 (void)                                        \
+ {                                             \
+   for (int i = 1; i < 200; ++i)               \
+     NAME##_array[i] += 1;                     \
+ }
+
+TEST (s8, int8_t)
+TEST (u8, uint8_t)
+TEST (s16, int16_t)
+TEST (u16, uint16_t)
+TEST (s32, int32_t)
+TEST (u32, uint32_t)
+TEST (s64, int64_t)
+TEST (u64, uint64_t)
+TEST (f16, _Float16)
+TEST (f32, float)
+TEST (f64, double)
+
+/* No scalar memory accesses.  */
+/* { dg-final { scan-assembler-not {[wx][0-9]*, \[} } } */
+/* 2 for each NAME##1 test, one in the header and one in the main loop
+   and 1 for each NAME##2 test, in the main loop only.  */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b,} 6 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h,} 9 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s,} 9 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d,} 9 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_1.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_1.c
new file mode 100644 (file)
index 0000000..8640264
--- /dev/null
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* Pick an arbitrary target for which unaligned accesses are more
+   expensive.  */
+/* { dg-options "-O3 -msve-vector-bits=256 -mtune=thunderx" } */
+
+#define N 512
+#define START 1
+#define END 505
+
+int x[N] __attribute__((aligned(32)));
+
+void __attribute__((noinline, noclone))
+foo (void)
+{
+  unsigned int v = 0;
+  for (unsigned int i = START; i < END; ++i)
+    {
+      x[i] = v;
+      v += 5;
+    }
+}
+
+/* We should operate on aligned vectors.  */
+/* { dg-final { scan-assembler {\tadrp\tx[0-9]+, x\n} } } */
+/* We should use an induction that starts at -5, with only the last
+   7 elements of the first iteration being active.  */
+/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #-5, #5\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_1_run.c
new file mode 100644 (file)
index 0000000..3fa0e46
--- /dev/null
@@ -0,0 +1,18 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O3 -mtune=thunderx" } */
+/* { dg-options "-O3 -mtune=thunderx -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#include "peel_ind_1.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  foo ();
+  for (int i = 0; i < N; ++i)
+    {
+      if (x[i] != (i < START || i >= END ? 0 : (i - START) * 5))
+       __builtin_abort ();
+      asm volatile ("" ::: "memory");
+    }
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2.c
new file mode 100644 (file)
index 0000000..2bfc09a
--- /dev/null
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* Pick an arbitrary target for which unaligned accesses are more
+   expensive.  */
+/* { dg-options "-O3 -msve-vector-bits=256 -mtune=thunderx" } */
+
+#define N 512
+#define START 7
+#define END 22
+
+int x[N] __attribute__((aligned(32)));
+
+void __attribute__((noinline, noclone))
+foo (void)
+{
+  for (unsigned int i = START; i < END; ++i)
+    x[i] = i;
+}
+
+/* We should operate on aligned vectors.  */
+/* { dg-final { scan-assembler {\tadrp\tx[0-9]+, x\n} } } */
+/* We should unroll the loop three times.  */
+/* { dg-final { scan-assembler-times "\tst1w\t" 3 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2_run.c
new file mode 100644 (file)
index 0000000..9c5ae1b
--- /dev/null
@@ -0,0 +1,18 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O3 -mtune=thunderx" } */
+/* { dg-options "-O3 -mtune=thunderx -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#include "peel_ind_2.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  foo ();
+  for (int i = 0; i < N; ++i)
+    {
+      if (x[i] != (i < START || i >= END ? 0 : i))
+       __builtin_abort ();
+      asm volatile ("" ::: "memory");
+    }
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3.c
new file mode 100644 (file)
index 0000000..8364dc6
--- /dev/null
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* Pick an arbitrary target for which unaligned accesses are more
+   expensive.  */
+/* { dg-options "-O3 -msve-vector-bits=256 -mtune=thunderx" } */
+
+#define N 32
+#define MAX_START 8
+#define COUNT 16
+
+int x[MAX_START][N] __attribute__((aligned(32)));
+
+void __attribute__((noinline, noclone))
+foo (int start)
+{
+  for (int i = start; i < start + COUNT; ++i)
+    x[start][i] = i;
+}
+
+/* We should operate on aligned vectors.  */
+/* { dg-final { scan-assembler {\tadrp\tx[0-9]+, x\n} } } */
+/* { dg-final { scan-assembler {\tubfx\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3_run.c
new file mode 100644 (file)
index 0000000..384a38e
--- /dev/null
@@ -0,0 +1,21 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O3 -mtune=thunderx" } */
+/* { dg-options "-O3 -mtune=thunderx -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#include "peel_ind_3.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  for (int start = 0; start < MAX_START; ++start)
+    {
+      foo (start);
+      for (int i = 0; i < N; ++i)
+       {
+         if (x[start][i] != (i < start || i >= start + COUNT ? 0 : i))
+           __builtin_abort ();
+         asm volatile ("" ::: "memory");
+       }
+    }
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_4.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_4.c
new file mode 100644 (file)
index 0000000..5b5d88a
--- /dev/null
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* Pick an arbitrary target for which unaligned accesses are more
+   expensive.  */
+/* { dg-options "-Ofast -msve-vector-bits=256 -mtune=thunderx -fno-vect-cost-model" } */
+
+#define START 1
+#define END 505
+
+void __attribute__((noinline, noclone))
+foo (double *x)
+{
+  double v = 10.0;
+  for (unsigned int i = START; i < END; ++i)
+    {
+      x[i] = v;
+      v += 5.0;
+    }
+}
+
+/* We should operate on aligned vectors.  */
+/* { dg-final { scan-assembler {\tubfx\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_4_run.c
new file mode 100644 (file)
index 0000000..7834a62
--- /dev/null
@@ -0,0 +1,29 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast -mtune=thunderx" } */
+/* { dg-options "-Ofast -mtune=thunderx -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#include "peel_ind_4.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  double x[END + 1];
+  for (int i = 0; i < END + 1; ++i)
+    {
+      x[i] = i;
+      asm volatile ("" ::: "memory");
+    }
+  foo (x);
+  for (int i = 0; i < END + 1; ++i)
+    {
+      double expected;
+      if (i < START || i >= END)
+       expected = i;
+      else
+       expected = 10 + (i - START) * 5;
+      if (x[i] != expected)
+       __builtin_abort ();
+      asm volatile ("" ::: "memory");
+    }
+  return 0;
+}
index 496df38..b9bb047 100644 (file)
@@ -384,6 +384,11 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
    times and has been vectorized according to LOOP_VINFO.  Each iteration
    of the vectorized loop handles VF iterations of the scalar loop.
 
+   If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
+   starts with NITERS_SKIP dummy iterations of the scalar loop before
+   the real work starts.  The mask elements for these dummy iterations
+   must be 0, to ensure that the extra iterations do not have an effect.
+
    It is known that:
 
      NITERS * RGM->max_nscalars_per_iter
@@ -395,7 +400,7 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
 
    might overflow before hitting a value above:
 
-     NITERS * RGM->max_nscalars_per_iter
+     (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter
 
    This means that we cannot guarantee that such an induction variable
    would ever hit a value that produces a set of all-false masks for RGM.  */
@@ -405,7 +410,8 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
                              gimple_seq *preheader_seq,
                              gimple_stmt_iterator loop_cond_gsi,
                              rgroup_masks *rgm, tree vf,
-                             tree niters, bool might_wrap_p)
+                             tree niters, tree niters_skip,
+                             bool might_wrap_p)
 {
   tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
   tree mask_type = rgm->mask_type;
@@ -413,10 +419,12 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
   poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
 
   /* Calculate the maximum number of scalar values that the rgroup
-     handles in total and the number that it handles for each iteration
-     of the vector loop.  */
+     handles in total, the number that it handles for each iteration
+     of the vector loop, and the number that it should skip during the
+     first iteration of the vector loop.  */
   tree nscalars_total = niters;
   tree nscalars_step = vf;
+  tree nscalars_skip = niters_skip;
   if (nscalars_per_iter != 1)
     {
       /* We checked before choosing to use a fully-masked loop that these
@@ -426,6 +434,9 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
                                     nscalars_total, factor);
       nscalars_step = gimple_build (preheader_seq, MULT_EXPR, compare_type,
                                    nscalars_step, factor);
+      if (nscalars_skip)
+       nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
+                                     nscalars_skip, factor);
     }
 
   /* Create an induction variable that counts the number of scalars
@@ -438,29 +449,66 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
   create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
             insert_after, &index_before_incr, &index_after_incr);
 
-  tree test_index, test_limit;
+  tree test_index, test_limit, first_limit;
   gimple_stmt_iterator *test_gsi;
   if (might_wrap_p)
     {
       /* In principle the loop should stop iterating once the incremented
-        IV reaches a value greater than or equal to NSCALAR_TOTAL.
-        However, there's no guarantee that the IV hits a value above
-        this value before wrapping around.  We therefore adjust the
-        limit down by one IV step:
+        IV reaches a value greater than or equal to:
+
+          NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP
+
+        However, there's no guarantee that this addition doesn't overflow
+        the comparison type, or that the IV hits a value above it before
+        wrapping around.  We therefore adjust the limit down by one
+        IV step:
 
-          NSCALARS_TOTAL -[infinite-prec] NSCALARS_STEP
+          (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
+          -[infinite-prec] NSCALARS_STEP
 
         and compare the IV against this limit _before_ incrementing it.
         Since the comparison type is unsigned, we actually want the
         subtraction to saturate at zero:
 
-          NSCALARS_TOTAL -[sat] NSCALARS_STEP.  */
+          (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
+          -[sat] NSCALARS_STEP
+
+        And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:
+
+          NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)
+
+        where the rightmost subtraction can be done directly in
+        COMPARE_TYPE.  */
       test_index = index_before_incr;
+      tree adjust = nscalars_step;
+      if (nscalars_skip)
+       adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
+                              adjust, nscalars_skip);
       test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
-                                nscalars_total, nscalars_step);
+                                nscalars_total, adjust);
       test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
-                                test_limit, nscalars_step);
+                                test_limit, adjust);
       test_gsi = &incr_gsi;
+
+      /* Get a safe limit for the first iteration.  */
+      if (nscalars_skip)
+       {
+         /* The first vector iteration can handle at most NSCALARS_STEP
+            scalars.  NSCALARS_STEP <= CONST_LIMIT, and adding
+            NSCALARS_SKIP to that cannot overflow.  */
+         tree const_limit = build_int_cst (compare_type,
+                                           LOOP_VINFO_VECT_FACTOR (loop_vinfo)
+                                           * nscalars_per_iter);
+         first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
+                                     nscalars_total, const_limit);
+         first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
+                                     first_limit, nscalars_skip);
+       }
+      else
+       /* For the first iteration it doesn't matter whether the IV hits
+          a value above NSCALARS_TOTAL.  That only matters for the latch
+          condition.  */
+       first_limit = nscalars_total;
     }
   else
     {
@@ -468,7 +516,12 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
         the bound before wrapping.  */
       test_index = index_after_incr;
       test_limit = nscalars_total;
+      if (nscalars_skip)
+       test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
+                                  test_limit, nscalars_skip);
       test_gsi = &loop_cond_gsi;
+
+      first_limit = test_limit;
     }
 
   /* Provide a definition of each mask in the group.  */
@@ -487,7 +540,7 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
         to have a full mask.  */
       poly_uint64 const_limit;
       bool first_iteration_full
-       = (poly_int_tree_p (nscalars_total, &const_limit)
+       = (poly_int_tree_p (first_limit, &const_limit)
           && known_ge (const_limit, (i + 1) * nscalars_per_mask));
 
       /* Rather than have a new IV that starts at BIAS and goes up to
@@ -504,12 +557,13 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
                                          bias_tree);
        }
 
-      /* Create the initial mask.  */
+      /* Create the initial mask.  First include all scalars that
+        are within the loop limit.  */
       tree init_mask = NULL_TREE;
       if (!first_iteration_full)
        {
          tree start, end;
-         if (nscalars_total == test_limit)
+         if (first_limit == test_limit)
            {
              /* Use a natural test between zero (the initial IV value)
                 and the loop limit.  The "else" block would be valid too,
@@ -520,8 +574,11 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
            }
          else
            {
+             /* FIRST_LIMIT is the maximum number of scalars handled by the
+                first iteration of the vector loop.  Test the portion
+                associated with this mask.  */
              start = bias_tree;
-             end = nscalars_total;
+             end = first_limit;
            }
 
          init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
@@ -529,6 +586,22 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
          gimple_seq_add_stmt (preheader_seq, tmp_stmt);
        }
 
+      /* Now AND out the bits that are within the number of skipped
+        scalars.  */
+      poly_uint64 const_skip;
+      if (nscalars_skip
+         && !(poly_int_tree_p (nscalars_skip, &const_skip)
+              && known_le (const_skip, bias)))
+       {
+         tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
+                                                   bias_tree, nscalars_skip);
+         if (init_mask)
+           init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
+                                     init_mask, unskipped_mask);
+         else
+           init_mask = unskipped_mask;
+       }
+
       if (!init_mask)
        /* First iteration is full.  */
        init_mask = build_minus_one_cst (mask_type);
@@ -586,6 +659,9 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
   else
     niters = gimple_convert (&preheader_seq, compare_type, niters);
 
+  /* Get the number of leading scalar iterations to skip (null if none).  */
+  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+
   /* Now calculate the value that the induction variable must be able
      to hit in order to ensure that we end the loop with an all-false mask.
      This involves adding the maximum number of inactive trailing scalar
@@ -594,6 +670,15 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
   bool known_max_iters = max_loop_iterations (loop, &iv_limit);
   if (known_max_iters)
     {
+      if (niters_skip)
+       {
+         /* Add the maximum number of skipped iterations to the
+            maximum iteration count.  */
+         if (TREE_CODE (niters_skip) == INTEGER_CST)
+           iv_limit += wi::to_widest (niters_skip);
+         else
+           iv_limit += max_vf - 1;
+       }
       /* IV_LIMIT is the maximum number of latch iterations, which is also
         the maximum in-range IV value.  Round this value down to the previous
         vector alignment boundary and then add an extra full iteration.  */
@@ -639,7 +724,8 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
        test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
                                                  &preheader_seq,
                                                  loop_cond_gsi, rgm, vf,
-                                                 niters, might_wrap_p);
+                                                 niters, niters_skip,
+                                                 might_wrap_p);
       }
 
   /* Emit all accumulated statements.  */
@@ -1463,6 +1549,46 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
     }
 }
 
+/* Return a gimple value containing the misalignment (measured in vector
+   elements) for the loop described by LOOP_VINFO, i.e. how many elements
+   it is away from a perfectly aligned address.  Add any new statements
+   to SEQ.  */
+
+static tree
+get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
+{
+  struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
+  gimple *dr_stmt = DR_STMT (dr);
+  stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
+  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+
+  unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
+  gcc_assert (target_align != 0);
+
+  bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
+  tree offset = (negative
+                ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
+                : size_zero_node);
+  tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt, seq,
+                                                         offset);
+  tree type = unsigned_type_for (TREE_TYPE (start_addr));
+  tree target_align_minus_1 = build_int_cst (type, target_align - 1);
+  HOST_WIDE_INT elem_size
+    = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
+  tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
+
+  /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
+  tree int_start_addr = fold_convert (type, start_addr);
+  tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
+                                       target_align_minus_1);
+
+  /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
+  tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
+                                       elem_size_log);
+
+  return misalign_in_elems;
+}
+
 /* Function vect_gen_prolog_loop_niters
 
    Generate the number of iterations which should be peeled as prolog for the
@@ -1474,7 +1600,7 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
    If the misalignment of DR is known at compile time:
      addr_mis = int mis = DR_MISALIGNMENT (dr);
    Else, compute address misalignment in bytes:
-     addr_mis = addr & (vectype_align - 1)
+     addr_mis = addr & (target_align - 1)
 
    prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
 
@@ -1521,33 +1647,17 @@ vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
     }
   else
     {
-      bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
-      tree offset = negative
-         ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
-      tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
-                                                             &stmts, offset);
-      tree type = unsigned_type_for (TREE_TYPE (start_addr));
-      tree target_align_minus_1 = build_int_cst (type, target_align - 1);
+      tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
+      tree type = TREE_TYPE (misalign_in_elems);
       HOST_WIDE_INT elem_size
        = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
-      tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
       HOST_WIDE_INT align_in_elems = target_align / elem_size;
       tree align_in_elems_minus_1 = build_int_cst (type, align_in_elems - 1);
       tree align_in_elems_tree = build_int_cst (type, align_in_elems);
-      tree misalign_in_bytes;
-      tree misalign_in_elems;
-
-      /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
-      misalign_in_bytes
-       = fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr),
-                      target_align_minus_1);
-
-      /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
-      misalign_in_elems
-       = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes, elem_size_log);
 
       /* Create:  (niters_type) ((align_in_elems - misalign_in_elems)
                                 & (align_in_elems - 1)).  */
+      bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
       if (negative)
        iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
                             align_in_elems_tree);
@@ -1587,20 +1697,22 @@ vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
 
 /* Function vect_update_init_of_dr
 
-   NITERS iterations were peeled from LOOP.  DR represents a data reference
-   in LOOP.  This function updates the information recorded in DR to
-   account for the fact that the first NITERS iterations had already been
-   executed.  Specifically, it updates the OFFSET field of DR.  */
+   If CODE is PLUS, the vector loop starts NITERS iterations after the
+   scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
+   iterations before the scalar one (using masking to skip inactive
+   elements).  This function updates the information recorded in DR to
+   account for the difference.  Specifically, it updates the OFFSET
+   field of DR.  */
 
 static void
-vect_update_init_of_dr (struct data_reference *dr, tree niters)
+vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
 {
   tree offset = DR_OFFSET (dr);
 
   niters = fold_build2 (MULT_EXPR, sizetype,
                        fold_convert (sizetype, niters),
                        fold_convert (sizetype, DR_STEP (dr)));
-  offset = fold_build2 (PLUS_EXPR, sizetype,
+  offset = fold_build2 (code, sizetype,
                        fold_convert (sizetype, offset), niters);
   DR_OFFSET (dr) = offset;
 }
@@ -1608,14 +1720,12 @@ vect_update_init_of_dr (struct data_reference *dr, tree niters)
 
 /* Function vect_update_inits_of_drs
 
-   NITERS iterations were peeled from the loop represented by LOOP_VINFO.
-   This function updates the information recorded for the data references in
-   the loop to account for the fact that the first NITERS iterations had
-   already been executed.  Specifically, it updates the initial_condition of
-   the access_function of all the data_references in the loop.  */
+   Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
+   CODE and NITERS are as for vect_update_init_of_dr.  */
 
 static void
-vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
+vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
+                         tree_code code)
 {
   unsigned int i;
   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
@@ -1642,9 +1752,57 @@ vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
     }
 
   FOR_EACH_VEC_ELT (datarefs, i, dr)
-    vect_update_init_of_dr (dr, niters);
+    vect_update_init_of_dr (dr, niters, code);
 }
 
+/* Use the information recorded in LOOP_VINFO to prepare the loop for
+   peeling by masking.  This involves calculating the number of iterations
+   to be peeled and then aligning all memory references appropriately.  */
+
+void
+vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
+{
+  tree misalign_in_elems;
+  tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
+
+  gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
+
+  /* From the information recorded in LOOP_VINFO get the number of iterations
+     that need to be skipped via masking.  */
+  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
+    {
+      poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
+                            - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
+      misalign_in_elems = build_int_cst (type, misalign);
+    }
+  else
+    {
+      gimple_seq seq1 = NULL, seq2 = NULL;
+      misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
+      misalign_in_elems = fold_convert (type, misalign_in_elems);
+      misalign_in_elems = force_gimple_operand (misalign_in_elems,
+                                               &seq2, true, NULL_TREE);
+      gimple_seq_add_seq (&seq1, seq2);
+      if (seq1)
+       {
+         edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
+         basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
+         gcc_assert (!new_bb);
+       }
+    }
+
+  if (dump_enabled_p ())
+    {
+      dump_printf_loc (MSG_NOTE, vect_location,
+                      "misalignment for fully-masked loop: ");
+      dump_generic_expr (MSG_NOTE, TDF_SLIM, misalign_in_elems);
+      dump_printf (MSG_NOTE, "\n");
+    }
+
+  LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
+
+  vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
+}
 
 /* This function builds ni_name = number of iterations.  Statements
    are emitted on the loop preheader edge.  If NEW_VAR_P is not NULL, set
@@ -2250,7 +2408,9 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
   int bound_prolog = 0;
   poly_uint64 bound_scalar = 0;
   int estimated_vf;
-  int prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
+  int prolog_peeling = 0;
+  if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
+    prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
   bool epilog_peeling = (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
                         || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
 
@@ -2367,7 +2527,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
          scale_loop_profile (prolog, prob_prolog, bound_prolog);
        }
       /* Update init address of DRs.  */
-      vect_update_inits_of_drs (loop_vinfo, niters_prolog);
+      vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
       /* Update niters for vector loop.  */
       LOOP_VINFO_NITERS (loop_vinfo)
        = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
index 1666332..d7cc12f 100644 (file)
@@ -1121,6 +1121,7 @@ _loop_vec_info::_loop_vec_info (struct loop *loop_in)
     versioning_threshold (0),
     vectorization_factor (0),
     max_vectorization_factor (0),
+    mask_skip_niters (NULL_TREE),
     mask_compare_type (NULL_TREE),
     unaligned_dr (NULL),
     peeling_for_alignment (0),
@@ -2269,16 +2270,6 @@ start_over:
                         " gaps is required.\n");
     }
 
-  if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
-      && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
-    {
-      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "can't use a fully-masked loop because peeling for"
-                        " alignment is required.\n");
-    }
-
   /* Decide whether to use a fully-masked loop for this vectorization
      factor.  */
   LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
@@ -2379,18 +2370,21 @@ start_over:
      increase threshold for this case if necessary.  */
   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
     {
-      poly_uint64 niters_th;
+      poly_uint64 niters_th = 0;
 
-      /* Niters for peeled prolog loop.  */
-      if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
+      if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
        {
-         struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
-         tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
-
-         niters_th = TYPE_VECTOR_SUBPARTS (vectype) - 1;
+         /* Niters for peeled prolog loop.  */
+         if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
+           {
+             struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
+             tree vectype
+               = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
+             niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
+           }
+         else
+           niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
        }
-      else
-       niters_th = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
 
       /* Niters for at least one iteration of vectorized loop.  */
       if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
@@ -7336,9 +7330,28 @@ vectorizable_induction (gimple *phi,
   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
                                     loop_preheader_edge (iv_loop));
 
-  /* Convert the step to the desired type.  */
+  /* Convert the initial value and step to the desired type.  */
   stmts = NULL;
+  init_expr = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
+
+  /* If we are using the loop mask to "peel" for alignment then we need
+     to adjust the start value here.  */
+  tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+  if (skip_niters != NULL_TREE)
+    {
+      if (FLOAT_TYPE_P (vectype))
+       skip_niters = gimple_build (&stmts, FLOAT_EXPR, TREE_TYPE (vectype),
+                                   skip_niters);
+      else
+       skip_niters = gimple_convert (&stmts, TREE_TYPE (vectype),
+                                     skip_niters);
+      tree skip_step = gimple_build (&stmts, MULT_EXPR, TREE_TYPE (vectype),
+                                    skip_niters, step_expr);
+      init_expr = gimple_build (&stmts, MINUS_EXPR, TREE_TYPE (vectype),
+                               init_expr, skip_step);
+    }
+
   if (stmts)
     {
       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
@@ -8209,6 +8222,11 @@ vect_transform_loop (loop_vec_info loop_vinfo)
 
   split_edge (loop_preheader_edge (loop));
 
+  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+      && vect_use_loop_mask_for_alignment_p (loop_vinfo))
+    /* This will deal with any possible peeling.  */
+    vect_prepare_for_masked_peels (loop_vinfo);
+
   /* FORNOW: the vectorizer supports only loops which body consist
      of one basic block (header + empty latch). When the vectorizer will
      support more involved loop forms, the order by which the BBs are
@@ -8488,29 +8506,40 @@ vect_transform_loop (loop_vec_info loop_vinfo)
   /* +1 to convert latch counts to loop iteration counts,
      -min_epilogue_iters to remove iterations that cannot be performed
        by the vector code.  */
-  int bias = 1 - min_epilogue_iters;
+  int bias_for_lowest = 1 - min_epilogue_iters;
+  int bias_for_assumed = bias_for_lowest;
+  int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
+  if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+    {
+      /* When the amount of peeling is known at compile time, the first
+        iteration will have exactly alignment_npeels active elements.
+        In the worst case it will have at least one.  */
+      int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
+      bias_for_lowest += lowest_vf - min_first_active;
+      bias_for_assumed += assumed_vf - min_first_active;
+    }
   /* In these calculations the "- 1" converts loop iteration counts
      back to latch counts.  */
   if (loop->any_upper_bound)
     loop->nb_iterations_upper_bound
       = (final_iter_may_be_partial
-        ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias,
+        ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
                          lowest_vf) - 1
-        : wi::udiv_floor (loop->nb_iterations_upper_bound + bias,
+        : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
                           lowest_vf) - 1);
   if (loop->any_likely_upper_bound)
     loop->nb_iterations_likely_upper_bound
       = (final_iter_may_be_partial
-        ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound + bias,
-                         lowest_vf) - 1
-        : wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias,
-                          lowest_vf) - 1);
+        ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
+                         + bias_for_lowest, lowest_vf) - 1
+        : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
+                          + bias_for_lowest, lowest_vf) - 1);
   if (loop->any_estimate)
     loop->nb_iterations_estimate
       = (final_iter_may_be_partial
-        ? wi::udiv_ceil (loop->nb_iterations_estimate + bias,
+        ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
                          assumed_vf) - 1
-        : wi::udiv_floor (loop->nb_iterations_estimate + bias,
+        : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
                           assumed_vf) - 1);
 
   if (dump_enabled_p ())
index 1381b5f..c8850d4 100644 (file)
@@ -9991,3 +9991,16 @@ vect_gen_while (tree mask, tree start_index, tree end_index)
   gimple_call_set_lhs (call, mask);
   return call;
 }
+
+/* Generate a vector mask of type MASK_TYPE for which index I is false iff
+   J + START_INDEX < END_INDEX for all J <= I.  Add the statements to SEQ.  */
+
+tree
+vect_gen_while_not (gimple_seq *seq, tree mask_type, tree start_index,
+                   tree end_index)
+{
+  tree tmp = make_ssa_name (mask_type);
+  gcall *call = vect_gen_while (tmp, start_index, end_index);
+  gimple_seq_add_stmt (seq, call);
+  return gimple_build (seq, BIT_NOT_EXPR, mask_type, tmp);
+}
index 580c21e..c01bd9b 100644 (file)
@@ -351,6 +351,12 @@ typedef struct _loop_vec_info : public vec_info {
      on inactive scalars.  */
   vec_loop_masks masks;
 
+  /* If we are using a loop mask to align memory addresses, this variable
+     contains the number of vector elements that we should skip in the
+     first iteration of the vector loop (i.e. the number of leading
+     elements that should be false in the first mask).  */
+  tree mask_skip_niters;
+
   /* Type of the variables to use in the WHILE_ULT call for fully-masked
      loops.  */
   tree mask_compare_type;
@@ -480,6 +486,7 @@ typedef struct _loop_vec_info : public vec_info {
 #define LOOP_VINFO_VECT_FACTOR(L)          (L)->vectorization_factor
 #define LOOP_VINFO_MAX_VECT_FACTOR(L)      (L)->max_vectorization_factor
 #define LOOP_VINFO_MASKS(L)                (L)->masks
+#define LOOP_VINFO_MASK_SKIP_NITERS(L)     (L)->mask_skip_niters
 #define LOOP_VINFO_MASK_COMPARE_TYPE(L)    (L)->mask_compare_type
 #define LOOP_VINFO_PTR_MASK(L)             (L)->ptr_mask
 #define LOOP_VINFO_LOOP_NEST(L)            (L)->loop_nest
@@ -1230,6 +1237,17 @@ unlimited_cost_model (loop_p loop)
   return (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED);
 }
 
+/* Return true if the loop described by LOOP_VINFO is fully-masked and
+   if the first iteration should use a partial mask in order to achieve
+   alignment.  */
+
+static inline bool
+vect_use_loop_mask_for_alignment_p (loop_vec_info loop_vinfo)
+{
+  return (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+         && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
+}
+
 /* Return the number of vectors of type VECTYPE that are needed to get
    NUNITS elements.  NUNITS should be based on the vectorization factor,
    so it is always a known multiple of the number of elements in VECTYPE.  */
@@ -1328,6 +1346,7 @@ extern void vect_loop_versioning (loop_vec_info, unsigned int, bool,
                                  poly_uint64);
 extern struct loop *vect_do_peeling (loop_vec_info, tree, tree,
                                     tree *, tree *, tree *, int, bool, bool);
+extern void vect_prepare_for_masked_peels (loop_vec_info);
 extern source_location find_loop_location (struct loop *);
 extern bool vect_can_advance_ivs_p (loop_vec_info);
 
@@ -1393,6 +1412,7 @@ extern tree vect_gen_perm_mask_any (tree, const vec_perm_indices &);
 extern tree vect_gen_perm_mask_checked (tree, const vec_perm_indices &);
 extern void optimize_mask_stores (struct loop*);
 extern gcall *vect_gen_while (tree, tree, tree);
+extern tree vect_gen_while_not (gimple_seq *, tree, tree, tree);
 
 /* In tree-vect-data-refs.c.  */
 extern bool vect_can_force_dr_alignment_p (const_tree, unsigned int);