--- /dev/null
+2011-09-22 Revital Eres <revital.eres@linaro.org>
+
+ gcc/
+ Backport from trunk -r178804:
+	* modulo-sched.c (remove_node_from_ps): Return void
+ instead of bool.
+ (optimize_sc): Adjust call to remove_node_from_ps.
+ (sms_schedule): Add print info.
+
+=== modified file 'gcc/modulo-sched.c'
+--- old/gcc/modulo-sched.c 2011-08-09 04:51:48 +0000
++++ new/gcc/modulo-sched.c 2011-09-14 11:06:06 +0000
+@@ -211,7 +211,7 @@
+ static bool try_scheduling_node_in_cycle (partial_schedule_ptr, ddg_node_ptr,
+ int, int, sbitmap, int *, sbitmap,
+ sbitmap);
+-static bool remove_node_from_ps (partial_schedule_ptr, ps_insn_ptr);
++static void remove_node_from_ps (partial_schedule_ptr, ps_insn_ptr);
+
+ #define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap)
+ #define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time)
+@@ -834,8 +834,7 @@
+ if (next_ps_i->node->cuid == g->closing_branch->cuid)
+ break;
+
+- gcc_assert (next_ps_i);
+- gcc_assert (remove_node_from_ps (ps, next_ps_i));
++ remove_node_from_ps (ps, next_ps_i);
+ success =
+ try_scheduling_node_in_cycle (ps, g->closing_branch,
+ g->closing_branch->cuid, c,
+@@ -1485,8 +1484,8 @@
+ if (dump_file)
+ {
+ fprintf (dump_file,
+- "SMS succeeded %d %d (with ii, sc)\n", ps->ii,
+- stage_count);
++ "%s:%d SMS succeeded %d %d (with ii, sc)\n",
++ insn_file (tail), insn_line (tail), ps->ii, stage_count);
+ print_partial_schedule (ps, dump_file);
+ }
+
+@@ -2810,22 +2809,18 @@
+ }
+
+
+-/* Removes the given PS_INSN from the partial schedule. Returns false if the
+- node is not found in the partial schedule, else returns true. */
+-static bool
++/* Removes the given PS_INSN from the partial schedule. */
++static void
+ remove_node_from_ps (partial_schedule_ptr ps, ps_insn_ptr ps_i)
+ {
+ int row;
+
+- if (!ps || !ps_i)
+- return false;
+-
++ gcc_assert (ps && ps_i);
++
+ row = SMODULO (ps_i->cycle, ps->ii);
+ if (! ps_i->prev_in_row)
+ {
+- if (ps_i != ps->rows[row])
+- return false;
+-
++ gcc_assert (ps_i == ps->rows[row]);
+ ps->rows[row] = ps_i->next_in_row;
+ if (ps->rows[row])
+ ps->rows[row]->prev_in_row = NULL;
+@@ -2839,7 +2834,7 @@
+
+ ps->rows_length[row] -= 1;
+ free (ps_i);
+- return true;
++ return;
+ }
+
+ /* Unlike what literature describes for modulo scheduling (which focuses
+
--- /dev/null
+2011-09-25 Ira Rosen <ira.rosen@linaro.org>
+
+ gcc/testsuite/
+ * lib/target-supports.exp (check_effective_target_vect_multiple_sizes):
+ Replace check_effective_target_arm_neon with
+ check_effective_target_arm_neon_ok.
+
+ Backport from mainline:
+
+ 2011-09-06 Ira Rosen <ira.rosen@linaro.org>
+
+ gcc/
+ * config/arm/arm.c (arm_preferred_simd_mode): Check
+ TARGET_NEON_VECTORIZE_DOUBLE instead of
+ TARGET_NEON_VECTORIZE_QUAD.
+ (arm_autovectorize_vector_sizes): Likewise.
+ * config/arm/arm.opt (mvectorize-with-neon-quad): Make inverse
+ mask of mvectorize-with-neon-double. Add RejectNegative.
+ (mvectorize-with-neon-double): New.
+
+ gcc/testsuite/
+ * lib/target-supports.exp (check_effective_target_vect_multiple_sizes):
+ New procedure.
+ (add_options_for_quad_vectors): Replace with ...
+ (add_options_for_double_vectors): ... this.
+ * gfortran.dg/vect/pr19049.f90: Expect more printings on targets that
+ support multiple vector sizes since the vectorizer attempts to
+ vectorize with both vector sizes.
+ * gcc.dg/vect/no-vfa-vect-79.c,
+ gcc.dg/vect/no-vfa-vect-102a.c, gcc.dg/vect/vect-outer-1a.c,
+ gcc.dg/vect/vect-outer-1b.c, gcc.dg/vect/vect-outer-2b.c,
+ gcc.dg/vect/vect-outer-3a.c, gcc.dg/vect/no-vfa-vect-37.c,
+ gcc.dg/vect/vect-outer-3b.c, gcc.dg/vect/no-vfa-vect-101.c,
+ gcc.dg/vect/no-vfa-vect-102.c, gcc.dg/vect/vect-reduc-dot-s8b.c,
+ gcc.dg/vect/vect-outer-1.c, gcc.dg/vect/vect-104.c: Likewise.
+ * gcc.dg/vect/vect-42.c: Run with 64 bit vectors if applicable.
+ * gcc.dg/vect/vect-multitypes-6.c, gcc.dg/vect/vect-52.c,
+ gcc.dg/vect/vect-54.c, gcc.dg/vect/vect-46.c, gcc.dg/vect/vect-48.c,
+ gcc.dg/vect/vect-96.c, gcc.dg/vect/vect-multitypes-3.c,
+ gcc.dg/vect/vect-40.c: Likewise.
+ * gcc.dg/vect/vect-outer-5.c: Remove quad-vectors option as
+ redundant.
+ * gcc.dg/vect/vect-109.c, gcc.dg/vect/vect-peel-1.c,
+ gcc.dg/vect/vect-peel-2.c, gcc.dg/vect/slp-25.c,
+ gcc.dg/vect/vect-multitypes-1.c, gcc.dg/vect/slp-3.c,
+ gcc.dg/vect/no-vfa-pr29145.c, gcc.dg/vect/vect-multitypes-4.c:
+ Likewise.
+ * gcc.dg/vect/vect-peel-4.c: Make ia global.
+
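+
Illustrative example (editor's sketch, not part of the patch): with quad-word
vectors now the default and -mvectorize-with-neon-double selecting 64-bit
vectors, a simple loop such as the one below may be tried with both vector
sizes on Neon targets, which is why several of the scan counts adjusted below
gain a vect_multiple_sizes variant.  The double_vectors helper adds
-mvectorize-with-neon-double only when Neon is available.

/* { dg-require-effective-target vect_int } */
/* { dg-add-options double_vectors } */

#define N 64
int a[N], b[N];

void
foo (void)
{
  int i;
  for (i = 0; i < N; i++)
    a[i] = b[i] + 1;
}

/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
/* { dg-final { cleanup-tree-dump "vect" } } */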
+=== modified file 'gcc/config/arm/arm.c'
+--- old/gcc/config/arm/arm.c 2011-09-15 09:45:31 +0000
++++ new/gcc/config/arm/arm.c 2011-09-19 07:44:24 +0000
+@@ -22974,7 +22974,7 @@
+ return false;
+ }
+
+-/* Use the option -mvectorize-with-neon-quad to override the use of doubleword
++/* Use the option -mvectorize-with-neon-double to override the use of quadword
+ registers when autovectorizing for Neon, at least until multiple vector
+ widths are supported properly by the middle-end. */
+
+@@ -22985,15 +22985,15 @@
+ switch (mode)
+ {
+ case SFmode:
+- return TARGET_NEON_VECTORIZE_QUAD ? V4SFmode : V2SFmode;
++ return TARGET_NEON_VECTORIZE_DOUBLE ? V2SFmode : V4SFmode;
+ case SImode:
+- return TARGET_NEON_VECTORIZE_QUAD ? V4SImode : V2SImode;
++ return TARGET_NEON_VECTORIZE_DOUBLE ? V2SImode : V4SImode;
+ case HImode:
+- return TARGET_NEON_VECTORIZE_QUAD ? V8HImode : V4HImode;
++ return TARGET_NEON_VECTORIZE_DOUBLE ? V4HImode : V8HImode;
+ case QImode:
+- return TARGET_NEON_VECTORIZE_QUAD ? V16QImode : V8QImode;
++ return TARGET_NEON_VECTORIZE_DOUBLE ? V8QImode : V16QImode;
+ case DImode:
+- if (TARGET_NEON_VECTORIZE_QUAD)
++ if (!TARGET_NEON_VECTORIZE_DOUBLE)
+ return V2DImode;
+ break;
+
+@@ -24226,7 +24226,7 @@
+ static unsigned int
+ arm_autovectorize_vector_sizes (void)
+ {
+- return TARGET_NEON_VECTORIZE_QUAD ? 16 | 8 : 0;
++ return TARGET_NEON_VECTORIZE_DOUBLE ? 0 : (16 | 8);
+ }
+
+ static bool
+
+=== modified file 'gcc/config/arm/arm.opt'
+--- old/gcc/config/arm/arm.opt 2009-06-18 11:24:10 +0000
++++ new/gcc/config/arm/arm.opt 2011-09-19 07:44:24 +0000
+@@ -158,9 +158,13 @@
+ Assume big endian bytes, little endian words
+
+ mvectorize-with-neon-quad
+-Target Report Mask(NEON_VECTORIZE_QUAD)
++Target Report RejectNegative InverseMask(NEON_VECTORIZE_DOUBLE)
+ Use Neon quad-word (rather than double-word) registers for vectorization
+
++mvectorize-with-neon-double
++Target Report RejectNegative Mask(NEON_VECTORIZE_DOUBLE)
++Use Neon double-word (rather than quad-word) registers for vectorization
++
+ mword-relocations
+ Target Report Var(target_word_relocations) Init(TARGET_DEFAULT_WORD_RELOCATIONS)
+ Only generate absolute relocations on word sized values.
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/no-vfa-pr29145.c'
+--- old/gcc/testsuite/gcc.dg/vect/no-vfa-pr29145.c 2011-04-28 11:46:58 +0000
++++ new/gcc/testsuite/gcc.dg/vect/no-vfa-pr29145.c 2011-09-19 07:44:24 +0000
+@@ -1,5 +1,4 @@
+ /* { dg-require-effective-target vect_int } */
+-/* { dg-add-options quad_vectors } */
+
+ #include <stdarg.h>
+ #include "tree-vect.h"
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/no-vfa-vect-101.c'
+--- old/gcc/testsuite/gcc.dg/vect/no-vfa-vect-101.c 2007-09-04 12:05:19 +0000
++++ new/gcc/testsuite/gcc.dg/vect/no-vfa-vect-101.c 2011-09-19 07:44:24 +0000
+@@ -45,6 +45,7 @@
+ }
+
+ /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 0 "vect" } } */
+-/* { dg-final { scan-tree-dump-times "can't determine dependence" 1 "vect" } } */
++/* { dg-final { scan-tree-dump-times "can't determine dependence" 1 "vect" { xfail vect_multiple_sizes } } } */
++/* { dg-final { scan-tree-dump-times "can't determine dependence" 2 "vect" { target vect_multiple_sizes } } } */
+ /* { dg-final { cleanup-tree-dump "vect" } } */
+
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/no-vfa-vect-102.c'
+--- old/gcc/testsuite/gcc.dg/vect/no-vfa-vect-102.c 2007-09-12 07:48:44 +0000
++++ new/gcc/testsuite/gcc.dg/vect/no-vfa-vect-102.c 2011-09-19 07:44:24 +0000
+@@ -53,6 +53,7 @@
+ }
+
+ /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 0 "vect" } } */
+-/* { dg-final { scan-tree-dump-times "possible dependence between data-refs" 1 "vect" } } */
++/* { dg-final { scan-tree-dump-times "possible dependence between data-refs" 1 "vect" { xfail vect_multiple_sizes } } } */
++/* { dg-final { scan-tree-dump-times "possible dependence between data-refs" 2 "vect" { target vect_multiple_sizes } } } */
+ /* { dg-final { cleanup-tree-dump "vect" } } */
+
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/no-vfa-vect-102a.c'
+--- old/gcc/testsuite/gcc.dg/vect/no-vfa-vect-102a.c 2007-09-12 07:48:44 +0000
++++ new/gcc/testsuite/gcc.dg/vect/no-vfa-vect-102a.c 2011-09-19 07:44:24 +0000
+@@ -53,6 +53,7 @@
+ }
+
+ /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 0 "vect" } } */
+-/* { dg-final { scan-tree-dump-times "possible dependence between data-refs" 1 "vect" } } */
++/* { dg-final { scan-tree-dump-times "possible dependence between data-refs" 1 "vect" { xfail vect_multiple_sizes } } } */
++/* { dg-final { scan-tree-dump-times "possible dependence between data-refs" 2 "vect" { target vect_multiple_sizes } } } */
+ /* { dg-final { cleanup-tree-dump "vect" } } */
+
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/no-vfa-vect-37.c'
+--- old/gcc/testsuite/gcc.dg/vect/no-vfa-vect-37.c 2009-05-08 12:39:01 +0000
++++ new/gcc/testsuite/gcc.dg/vect/no-vfa-vect-37.c 2011-09-19 07:44:24 +0000
+@@ -58,5 +58,6 @@
+ If/when the aliasing problems are resolved, unalignment may
+ prevent vectorization on some targets. */
+ /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { xfail *-*-* } } } */
+-/* { dg-final { scan-tree-dump-times "can't determine dependence between" 2 "vect" } } */
++/* { dg-final { scan-tree-dump-times "can't determine dependence" 2 "vect" { xfail vect_multiple_sizes } } } */
++/* { dg-final { scan-tree-dump-times "can't determine dependence" 4 "vect" { target vect_multiple_sizes } } } */
+ /* { dg-final { cleanup-tree-dump "vect" } } */
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/no-vfa-vect-79.c'
+--- old/gcc/testsuite/gcc.dg/vect/no-vfa-vect-79.c 2009-05-08 12:39:01 +0000
++++ new/gcc/testsuite/gcc.dg/vect/no-vfa-vect-79.c 2011-09-19 07:44:24 +0000
+@@ -46,5 +46,6 @@
+ If/when the aliasing problems are resolved, unalignment may
+ prevent vectorization on some targets. */
+ /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail *-*-* } } } */
+-/* { dg-final { scan-tree-dump-times "can't determine dependence between" 1 "vect" } } */
++/* { dg-final { scan-tree-dump-times "can't determine dependence" 1 "vect" { xfail vect_multiple_sizes } } } */
++/* { dg-final { scan-tree-dump-times "can't determine dependence" 2 "vect" { target vect_multiple_sizes } } } */
+ /* { dg-final { cleanup-tree-dump "vect" } } */
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/slp-25.c'
+--- old/gcc/testsuite/gcc.dg/vect/slp-25.c 2010-10-04 14:59:30 +0000
++++ new/gcc/testsuite/gcc.dg/vect/slp-25.c 2011-09-19 07:44:24 +0000
+@@ -1,5 +1,4 @@
+ /* { dg-require-effective-target vect_int } */
+-/* { dg-add-options quad_vectors } */
+
+ #include <stdarg.h>
+ #include "tree-vect.h"
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/slp-3.c'
+--- old/gcc/testsuite/gcc.dg/vect/slp-3.c 2011-04-28 11:46:58 +0000
++++ new/gcc/testsuite/gcc.dg/vect/slp-3.c 2011-09-19 07:44:24 +0000
+@@ -1,5 +1,4 @@
+ /* { dg-require-effective-target vect_int } */
+-/* { dg-add-options quad_vectors } */
+
+ #include <stdarg.h>
+ #include "tree-vect.h"
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/vect-104.c'
+--- old/gcc/testsuite/gcc.dg/vect/vect-104.c 2007-09-12 07:48:44 +0000
++++ new/gcc/testsuite/gcc.dg/vect/vect-104.c 2011-09-19 07:44:24 +0000
+@@ -64,6 +64,7 @@
+ }
+
+ /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 0 "vect" } } */
+-/* { dg-final { scan-tree-dump-times "possible dependence between data-refs" 1 "vect" } } */
++/* { dg-final { scan-tree-dump-times "possible dependence between data-refs" 1 "vect" { xfail vect_multiple_sizes } } } */
++/* { dg-final { scan-tree-dump-times "possible dependence between data-refs" 2 "vect" { target vect_multiple_sizes } } } */
+ /* { dg-final { cleanup-tree-dump "vect" } } */
+
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/vect-109.c'
+--- old/gcc/testsuite/gcc.dg/vect/vect-109.c 2010-10-04 14:59:30 +0000
++++ new/gcc/testsuite/gcc.dg/vect/vect-109.c 2011-09-19 07:44:24 +0000
+@@ -1,5 +1,4 @@
+ /* { dg-require-effective-target vect_int } */
+-/* { dg-add-options quad_vectors } */
+
+ #include <stdarg.h>
+ #include "tree-vect.h"
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/vect-40.c'
+--- old/gcc/testsuite/gcc.dg/vect/vect-40.c 2009-05-25 14:18:21 +0000
++++ new/gcc/testsuite/gcc.dg/vect/vect-40.c 2011-09-19 07:44:24 +0000
+@@ -1,4 +1,5 @@
+ /* { dg-require-effective-target vect_float } */
++/* { dg-add-options double_vectors } */
+
+ #include <stdarg.h>
+ #include "tree-vect.h"
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/vect-42.c'
+--- old/gcc/testsuite/gcc.dg/vect/vect-42.c 2010-10-04 14:59:30 +0000
++++ new/gcc/testsuite/gcc.dg/vect/vect-42.c 2011-09-19 07:44:24 +0000
+@@ -1,4 +1,5 @@
+ /* { dg-require-effective-target vect_float } */
++/* { dg-add-options double_vectors } */
+
+ #include <stdarg.h>
+ #include "tree-vect.h"
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/vect-46.c'
+--- old/gcc/testsuite/gcc.dg/vect/vect-46.c 2009-05-25 14:18:21 +0000
++++ new/gcc/testsuite/gcc.dg/vect/vect-46.c 2011-09-19 07:44:24 +0000
+@@ -1,4 +1,5 @@
+ /* { dg-require-effective-target vect_float } */
++/* { dg-add-options double_vectors } */
+
+ #include <stdarg.h>
+ #include "tree-vect.h"
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/vect-48.c'
+--- old/gcc/testsuite/gcc.dg/vect/vect-48.c 2009-11-04 10:22:22 +0000
++++ new/gcc/testsuite/gcc.dg/vect/vect-48.c 2011-09-19 07:44:24 +0000
+@@ -1,4 +1,5 @@
+ /* { dg-require-effective-target vect_float } */
++/* { dg-add-options double_vectors } */
+
+ #include <stdarg.h>
+ #include "tree-vect.h"
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/vect-52.c'
+--- old/gcc/testsuite/gcc.dg/vect/vect-52.c 2009-11-04 10:22:22 +0000
++++ new/gcc/testsuite/gcc.dg/vect/vect-52.c 2011-09-19 07:44:24 +0000
+@@ -1,4 +1,5 @@
+ /* { dg-require-effective-target vect_float } */
++/* { dg-add-options double_vectors } */
+
+ #include <stdarg.h>
+ #include "tree-vect.h"
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/vect-54.c'
+--- old/gcc/testsuite/gcc.dg/vect/vect-54.c 2009-10-27 11:46:07 +0000
++++ new/gcc/testsuite/gcc.dg/vect/vect-54.c 2011-09-19 07:44:24 +0000
+@@ -1,4 +1,5 @@
+ /* { dg-require-effective-target vect_float } */
++/* { dg-add-options double_vectors } */
+
+ #include <stdarg.h>
+ #include "tree-vect.h"
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/vect-96.c'
+--- old/gcc/testsuite/gcc.dg/vect/vect-96.c 2010-10-04 14:59:30 +0000
++++ new/gcc/testsuite/gcc.dg/vect/vect-96.c 2011-09-19 07:44:24 +0000
+@@ -1,4 +1,5 @@
+ /* { dg-require-effective-target vect_int } */
++/* { dg-add-options double_vectors } */
+
+ #include <stdarg.h>
+ #include "tree-vect.h"
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/vect-multitypes-1.c'
+--- old/gcc/testsuite/gcc.dg/vect/vect-multitypes-1.c 2010-10-04 14:59:30 +0000
++++ new/gcc/testsuite/gcc.dg/vect/vect-multitypes-1.c 2011-09-19 07:44:24 +0000
+@@ -1,5 +1,4 @@
+ /* { dg-require-effective-target vect_int } */
+-/* { dg-add-options quad_vectors } */
+
+ #include <stdarg.h>
+ #include "tree-vect.h"
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/vect-multitypes-3.c'
+--- old/gcc/testsuite/gcc.dg/vect/vect-multitypes-3.c 2009-11-04 10:22:22 +0000
++++ new/gcc/testsuite/gcc.dg/vect/vect-multitypes-3.c 2011-09-19 07:44:24 +0000
+@@ -1,4 +1,5 @@
+ /* { dg-require-effective-target vect_int } */
++/* { dg-add-options double_vectors } */
+
+ #include <stdarg.h>
+ #include "tree-vect.h"
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/vect-multitypes-4.c'
+--- old/gcc/testsuite/gcc.dg/vect/vect-multitypes-4.c 2010-10-04 14:59:30 +0000
++++ new/gcc/testsuite/gcc.dg/vect/vect-multitypes-4.c 2011-09-19 07:44:24 +0000
+@@ -1,5 +1,4 @@
+ /* { dg-require-effective-target vect_int } */
+-/* { dg-add-options quad_vectors } */
+
+ #include <stdarg.h>
+ #include "tree-vect.h"
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/vect-multitypes-6.c'
+--- old/gcc/testsuite/gcc.dg/vect/vect-multitypes-6.c 2009-11-10 18:01:22 +0000
++++ new/gcc/testsuite/gcc.dg/vect/vect-multitypes-6.c 2011-09-19 07:44:24 +0000
+@@ -1,4 +1,5 @@
+ /* { dg-require-effective-target vect_int } */
++/* { dg-add-options double_vectors } */
+
+ #include <stdarg.h>
+ #include "tree-vect.h"
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/vect-outer-1.c'
+--- old/gcc/testsuite/gcc.dg/vect/vect-outer-1.c 2009-05-08 12:39:01 +0000
++++ new/gcc/testsuite/gcc.dg/vect/vect-outer-1.c 2011-09-19 07:44:24 +0000
+@@ -22,5 +22,6 @@
+ }
+
+ /* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */
+-/* { dg-final { scan-tree-dump-times "strided access in outer loop" 1 "vect" } } */
++/* { dg-final { scan-tree-dump-times "strided access in outer loop" 1 "vect" { xfail vect_multiple_sizes } } } */
++/* { dg-final { scan-tree-dump-times "strided access in outer loop" 2 "vect" { target vect_multiple_sizes } } } */
+ /* { dg-final { cleanup-tree-dump "vect" } } */
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/vect-outer-1a.c'
+--- old/gcc/testsuite/gcc.dg/vect/vect-outer-1a.c 2009-06-16 06:21:12 +0000
++++ new/gcc/testsuite/gcc.dg/vect/vect-outer-1a.c 2011-09-19 07:44:24 +0000
+@@ -20,5 +20,6 @@
+ }
+
+ /* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */
+-/* { dg-final { scan-tree-dump-times "strided access in outer loop" 1 "vect" } } */
++/* { dg-final { scan-tree-dump-times "strided access in outer loop" 1 "vect" { xfail vect_multiple_sizes } } } */
++/* { dg-final { scan-tree-dump-times "strided access in outer loop" 2 "vect" { target vect_multiple_sizes } } } */
+ /* { dg-final { cleanup-tree-dump "vect" } } */
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/vect-outer-1b.c'
+--- old/gcc/testsuite/gcc.dg/vect/vect-outer-1b.c 2007-08-19 11:02:48 +0000
++++ new/gcc/testsuite/gcc.dg/vect/vect-outer-1b.c 2011-09-19 07:44:24 +0000
+@@ -22,5 +22,6 @@
+ }
+
+ /* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */
+-/* { dg-final { scan-tree-dump-times "strided access in outer loop" 1 "vect" } } */
++/* { dg-final { scan-tree-dump-times "strided access in outer loop" 1 "vect" { xfail vect_multiple_sizes } } } */
++/* { dg-final { scan-tree-dump-times "strided access in outer loop" 2 "vect" { target vect_multiple_sizes } } } */
+ /* { dg-final { cleanup-tree-dump "vect" } } */
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/vect-outer-2b.c'
+--- old/gcc/testsuite/gcc.dg/vect/vect-outer-2b.c 2009-05-08 12:39:01 +0000
++++ new/gcc/testsuite/gcc.dg/vect/vect-outer-2b.c 2011-09-19 07:44:24 +0000
+@@ -37,5 +37,6 @@
+ return 0;
+ }
+
+-/* { dg-final { scan-tree-dump-times "strided access in outer loop." 1 "vect" } } */
++/* { dg-final { scan-tree-dump-times "strided access in outer loop" 1 "vect" { xfail vect_multiple_sizes } } } */
++/* { dg-final { scan-tree-dump-times "strided access in outer loop" 2 "vect" { target vect_multiple_sizes } } } */
+ /* { dg-final { cleanup-tree-dump "vect" } } */
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/vect-outer-3a.c'
+--- old/gcc/testsuite/gcc.dg/vect/vect-outer-3a.c 2009-05-08 12:39:01 +0000
++++ new/gcc/testsuite/gcc.dg/vect/vect-outer-3a.c 2011-09-19 07:44:24 +0000
+@@ -49,5 +49,6 @@
+ }
+
+ /* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail vect_no_align } } } */
+-/* { dg-final { scan-tree-dump-times "step doesn't divide the vector-size" 2 "vect" } } */
++/* { dg-final { scan-tree-dump-times "step doesn't divide the vector-size" 2 "vect" { xfail vect_multiple_sizes } } } */
++/* { dg-final { scan-tree-dump-times "step doesn't divide the vector-size" 3 "vect" { target vect_multiple_sizes } } } */
+ /* { dg-final { cleanup-tree-dump "vect" } } */
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/vect-outer-3b.c'
+--- old/gcc/testsuite/gcc.dg/vect/vect-outer-3b.c 2009-05-08 12:39:01 +0000
++++ new/gcc/testsuite/gcc.dg/vect/vect-outer-3b.c 2011-09-19 07:44:24 +0000
+@@ -49,5 +49,6 @@
+ }
+
+ /* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */
+-/* { dg-final { scan-tree-dump-times "strided access in outer loop" 2 "vect" } } */
++/* { dg-final { scan-tree-dump-times "strided access in outer loop" 2 "vect" { xfail vect_multiple_sizes } } } */
++/* { dg-final { scan-tree-dump-times "strided access in outer loop" 4 "vect" { target vect_multiple_sizes } } } */
+ /* { dg-final { cleanup-tree-dump "vect" } } */
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/vect-outer-5.c'
+--- old/gcc/testsuite/gcc.dg/vect/vect-outer-5.c 2011-04-28 11:46:58 +0000
++++ new/gcc/testsuite/gcc.dg/vect/vect-outer-5.c 2011-09-19 07:44:24 +0000
+@@ -1,5 +1,4 @@
+ /* { dg-require-effective-target vect_float } */
+-/* { dg-add-options quad_vectors } */
+
+ #include <stdarg.h>
+ #include <signal.h>
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/vect-peel-1.c'
+--- old/gcc/testsuite/gcc.dg/vect/vect-peel-1.c 2011-01-10 12:41:40 +0000
++++ new/gcc/testsuite/gcc.dg/vect/vect-peel-1.c 2011-09-19 07:44:24 +0000
+@@ -1,5 +1,4 @@
+ /* { dg-require-effective-target vect_int } */
+-/* { dg-add-options quad_vectors } */
+
+ #include <stdarg.h>
+ #include "tree-vect.h"
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/vect-peel-2.c'
+--- old/gcc/testsuite/gcc.dg/vect/vect-peel-2.c 2011-01-10 12:41:40 +0000
++++ new/gcc/testsuite/gcc.dg/vect/vect-peel-2.c 2011-09-19 07:44:24 +0000
+@@ -1,5 +1,4 @@
+ /* { dg-require-effective-target vect_int } */
+-/* { dg-add-options quad_vectors } */
+
+ #include <stdarg.h>
+ #include "tree-vect.h"
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/vect-peel-4.c'
+--- old/gcc/testsuite/gcc.dg/vect/vect-peel-4.c 2011-01-10 12:41:40 +0000
++++ new/gcc/testsuite/gcc.dg/vect/vect-peel-4.c 2011-09-19 07:44:24 +0000
+@@ -6,12 +6,12 @@
+ #define N 128
+
+ int ib[N+7];
++int ia[N+1];
+
+ __attribute__ ((noinline))
+ int main1 ()
+ {
+ int i;
+- int ia[N+1];
+
+ /* Don't peel keeping one load and the store aligned. */
+ for (i = 0; i <= N; i++)
+
+=== modified file 'gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8b.c'
+--- old/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8b.c 2010-05-27 12:23:45 +0000
++++ new/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8b.c 2011-09-19 07:44:24 +0000
+@@ -58,7 +58,8 @@
+ }
+
+ /* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 1 "vect" { xfail *-*-* } } } */
+-/* { dg-final { scan-tree-dump-times "vect_recog_widen_mult_pattern: detected" 1 "vect" } } */
++/* { dg-final { scan-tree-dump-times "vect_recog_widen_mult_pattern: detected" 1 "vect" { xfail vect_multiple_sizes } } } */
++/* { dg-final { scan-tree-dump-times "vect_recog_widen_mult_pattern: detected" 2 "vect" { target vect_multiple_sizes } } } */
+
+ /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail *-*-* } } } */
+
+
+=== modified file 'gcc/testsuite/gfortran.dg/vect/pr19049.f90'
+--- old/gcc/testsuite/gfortran.dg/vect/pr19049.f90 2005-07-25 11:05:07 +0000
++++ new/gcc/testsuite/gfortran.dg/vect/pr19049.f90 2011-09-19 07:44:24 +0000
+@@ -19,6 +19,7 @@
+ end
+
+ ! { dg-final { scan-tree-dump-times "vectorized 1 loops" 0 "vect" } }
+-! { dg-final { scan-tree-dump-times "complicated access pattern" 1 "vect" } }
++! { dg-final { scan-tree-dump-times "complicated access pattern" 1 "vect" { xfail vect_multiple_sizes } } }
++! { dg-final { scan-tree-dump-times "complicated access pattern" 2 "vect" { target vect_multiple_sizes } } }
+ ! { dg-final { cleanup-tree-dump "vect" } }
+
+
+=== modified file 'gcc/testsuite/lib/target-supports.exp'
+--- old/gcc/testsuite/lib/target-supports.exp 2011-08-13 08:32:32 +0000
++++ new/gcc/testsuite/lib/target-supports.exp 2011-09-20 07:54:28 +0000
+@@ -3265,6 +3265,24 @@
+ }]
+ }
+
++# Return 1 if the target supports multiple vector sizes
++
++proc check_effective_target_vect_multiple_sizes { } {
++    global et_vect_multiple_sizes_saved
++
++ if [info exists et_vect_multiple_sizes_saved] {
++ verbose "check_effective_target_vect_multiple_sizes: using cached result" 2
++ } else {
++ set et_vect_multiple_sizes_saved 0
++ if { ([istarget arm*-*-*] && [check_effective_target_arm_neon_ok]) } {
++ set et_vect_multiple_sizes_saved 1
++ }
++ }
++
++ verbose "check_effective_target_vect_multiple_sizes: returning $et_vect_multiple_sizes_saved" 2
++ return $et_vect_multiple_sizes_saved
++}
++
+ # Return 1 if the target supports section-anchors
+
+ proc check_effective_target_section_anchors { } {
+@@ -3648,11 +3666,11 @@
+ return $flags
+ }
+
+-# Add to FLAGS the flags needed to enable 128-bit vectors.
++# Add to FLAGS the flags needed to enable 64-bit vectors.
+
+-proc add_options_for_quad_vectors { flags } {
++proc add_options_for_double_vectors { flags } {
+ if [is-effective-target arm_neon_ok] {
+- return "$flags -mvectorize-with-neon-quad"
++ return "$flags -mvectorize-with-neon-double"
+ }
+
+ return $flags
+
--- /dev/null
+2011-09-28 Richard Sandiford <richard.sandiford@linaro.org>
+
+ gcc/
+ Backport from mainline:
+
+ 2011-09-28 Richard Sandiford <richard.sandiford@linaro.org>
+
+ * config/arm/neon.md (neon_move_lo_quad_<mode>): Delete.
+ (neon_move_hi_quad_<mode>): Likewise.
+ (move_hi_quad_<mode>, move_lo_quad_<mode>): Use subreg moves.
+
+2011-09-28 Richard Sandiford <richard.sandiford@linaro.org>
+
+ gcc/
+ Backport from mainline:
+
+ 2011-09-27 Richard Sandiford <richard.sandiford@linaro.org>
+
+ * config/arm/neon.md (neon_vget_highv16qi, neon_vget_highv8hi)
+ (neon_vget_highv4si, neon_vget_highv4sf, neon_vget_highv2di)
+ (neon_vget_lowv16qi, neon_vget_lowv8hi, neon_vget_lowv4si)
+ (neon_vget_lowv4sf, neon_vget_lowv2di): Turn into define_expands
+ that produce subreg moves. Define using VQX iterators.
+
+2011-09-28 Richard Sandiford <richard.sandiford@linaro.org>
+
+ gcc/
+ Backport from mainline:
+
+ 2011-09-14 Richard Sandiford <richard.sandiford@linaro.org>
+
+ * simplify-rtx.c (simplify_subreg): Check that the inner mode is
+ a scalar integer before applying integer-only optimisations to
+ inner arithmetic.
+
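+
Illustrative example (editor's sketch, not part of the patch): after the
neon.md change below, the vget_high/vget_low intrinsics expand to plain
subreg moves of the existing quad-word register, so code like the following
needs no vmov when the register allocator assigns overlapping D/Q registers:

#include <arm_neon.h>

int32x2_t
take_high (int32x4_t x)
{
  return vget_high_s32 (x);   /* high half: subreg at offset 8 */
}

int32x2_t
take_low (int32x4_t x)
{
  return vget_low_s32 (x);    /* low half: subreg at offset 0 */
}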
+=== modified file 'gcc/config/arm/neon.md'
+--- old/gcc/config/arm/neon.md 2011-07-04 14:03:49 +0000
++++ new/gcc/config/arm/neon.md 2011-09-28 15:14:59 +0000
+@@ -1235,66 +1235,14 @@
+ (const_string "neon_int_1") (const_string "neon_int_5")))]
+ )
+
+-; FIXME: We wouldn't need the following insns if we could write subregs of
+-; vector registers. Make an attempt at removing unnecessary moves, though
+-; we're really at the mercy of the register allocator.
+-
+-(define_insn "neon_move_lo_quad_<mode>"
+- [(set (match_operand:ANY128 0 "s_register_operand" "+w")
+- (vec_concat:ANY128
+- (match_operand:<V_HALF> 1 "s_register_operand" "w")
+- (vec_select:<V_HALF>
+- (match_dup 0)
+- (match_operand:ANY128 2 "vect_par_constant_high" ""))))]
+- "TARGET_NEON"
+-{
+- int dest = REGNO (operands[0]);
+- int src = REGNO (operands[1]);
+-
+- if (dest != src)
+- return "vmov\t%e0, %P1";
+- else
+- return "";
+-}
+- [(set_attr "neon_type" "neon_bp_simple")]
+-)
+-
+-(define_insn "neon_move_hi_quad_<mode>"
+- [(set (match_operand:ANY128 0 "s_register_operand" "+w")
+- (vec_concat:ANY128
+- (vec_select:<V_HALF>
+- (match_dup 0)
+- (match_operand:ANY128 2 "vect_par_constant_low" ""))
+- (match_operand:<V_HALF> 1 "s_register_operand" "w")))]
+-
+- "TARGET_NEON"
+-{
+- int dest = REGNO (operands[0]);
+- int src = REGNO (operands[1]);
+-
+- if (dest != src)
+- return "vmov\t%f0, %P1";
+- else
+- return "";
+-}
+- [(set_attr "neon_type" "neon_bp_simple")]
+-)
+-
+ (define_expand "move_hi_quad_<mode>"
+ [(match_operand:ANY128 0 "s_register_operand" "")
+ (match_operand:<V_HALF> 1 "s_register_operand" "")]
+ "TARGET_NEON"
+ {
+- rtvec v = rtvec_alloc (<V_mode_nunits>/2);
+- rtx t1;
+- int i;
+-
+- for (i=0; i < (<V_mode_nunits>/2); i++)
+- RTVEC_ELT (v, i) = GEN_INT (i);
+-
+- t1 = gen_rtx_PARALLEL (<MODE>mode, v);
+- emit_insn (gen_neon_move_hi_quad_<mode> (operands[0], operands[1], t1));
+-
++ emit_move_insn (simplify_gen_subreg (<V_HALF>mode, operands[0], <MODE>mode,
++ GET_MODE_SIZE (<V_HALF>mode)),
++ operands[1]);
+ DONE;
+ })
+
+@@ -1303,16 +1251,9 @@
+ (match_operand:<V_HALF> 1 "s_register_operand" "")]
+ "TARGET_NEON"
+ {
+- rtvec v = rtvec_alloc (<V_mode_nunits>/2);
+- rtx t1;
+- int i;
+-
+- for (i=0; i < (<V_mode_nunits>/2); i++)
+- RTVEC_ELT (v, i) = GEN_INT ((<V_mode_nunits>/2) + i);
+-
+- t1 = gen_rtx_PARALLEL (<MODE>mode, v);
+- emit_insn (gen_neon_move_lo_quad_<mode> (operands[0], operands[1], t1));
+-
++ emit_move_insn (simplify_gen_subreg (<V_HALF>mode, operands[0],
++ <MODE>mode, 0),
++ operands[1]);
+ DONE;
+ })
+
+@@ -2950,183 +2891,27 @@
+ (set_attr "neon_type" "neon_bp_simple")]
+ )
+
+-(define_insn "neon_vget_highv16qi"
+- [(set (match_operand:V8QI 0 "s_register_operand" "=w")
+- (vec_select:V8QI (match_operand:V16QI 1 "s_register_operand" "w")
+- (parallel [(const_int 8) (const_int 9)
+- (const_int 10) (const_int 11)
+- (const_int 12) (const_int 13)
+- (const_int 14) (const_int 15)])))]
+- "TARGET_NEON"
+-{
+- int dest = REGNO (operands[0]);
+- int src = REGNO (operands[1]);
+-
+- if (dest != src + 2)
+- return "vmov\t%P0, %f1";
+- else
+- return "";
+-}
+- [(set_attr "neon_type" "neon_bp_simple")]
+-)
+-
+-(define_insn "neon_vget_highv8hi"
+- [(set (match_operand:V4HI 0 "s_register_operand" "=w")
+- (vec_select:V4HI (match_operand:V8HI 1 "s_register_operand" "w")
+- (parallel [(const_int 4) (const_int 5)
+- (const_int 6) (const_int 7)])))]
+- "TARGET_NEON"
+-{
+- int dest = REGNO (operands[0]);
+- int src = REGNO (operands[1]);
+-
+- if (dest != src + 2)
+- return "vmov\t%P0, %f1";
+- else
+- return "";
+-}
+- [(set_attr "neon_type" "neon_bp_simple")]
+-)
+-
+-(define_insn "neon_vget_highv4si"
+- [(set (match_operand:V2SI 0 "s_register_operand" "=w")
+- (vec_select:V2SI (match_operand:V4SI 1 "s_register_operand" "w")
+- (parallel [(const_int 2) (const_int 3)])))]
+- "TARGET_NEON"
+-{
+- int dest = REGNO (operands[0]);
+- int src = REGNO (operands[1]);
+-
+- if (dest != src + 2)
+- return "vmov\t%P0, %f1";
+- else
+- return "";
+-}
+- [(set_attr "neon_type" "neon_bp_simple")]
+-)
+-
+-(define_insn "neon_vget_highv4sf"
+- [(set (match_operand:V2SF 0 "s_register_operand" "=w")
+- (vec_select:V2SF (match_operand:V4SF 1 "s_register_operand" "w")
+- (parallel [(const_int 2) (const_int 3)])))]
+- "TARGET_NEON"
+-{
+- int dest = REGNO (operands[0]);
+- int src = REGNO (operands[1]);
+-
+- if (dest != src + 2)
+- return "vmov\t%P0, %f1";
+- else
+- return "";
+-}
+- [(set_attr "neon_type" "neon_bp_simple")]
+-)
+-
+-(define_insn "neon_vget_highv2di"
+- [(set (match_operand:DI 0 "s_register_operand" "=w")
+- (vec_select:DI (match_operand:V2DI 1 "s_register_operand" "w")
+- (parallel [(const_int 1)])))]
+- "TARGET_NEON"
+-{
+- int dest = REGNO (operands[0]);
+- int src = REGNO (operands[1]);
+-
+- if (dest != src + 2)
+- return "vmov\t%P0, %f1";
+- else
+- return "";
+-}
+- [(set_attr "neon_type" "neon_bp_simple")]
+-)
+-
+-(define_insn "neon_vget_lowv16qi"
+- [(set (match_operand:V8QI 0 "s_register_operand" "=w")
+- (vec_select:V8QI (match_operand:V16QI 1 "s_register_operand" "w")
+- (parallel [(const_int 0) (const_int 1)
+- (const_int 2) (const_int 3)
+- (const_int 4) (const_int 5)
+- (const_int 6) (const_int 7)])))]
+- "TARGET_NEON"
+-{
+- int dest = REGNO (operands[0]);
+- int src = REGNO (operands[1]);
+-
+- if (dest != src)
+- return "vmov\t%P0, %e1";
+- else
+- return "";
+-}
+- [(set_attr "neon_type" "neon_bp_simple")]
+-)
+-
+-(define_insn "neon_vget_lowv8hi"
+- [(set (match_operand:V4HI 0 "s_register_operand" "=w")
+- (vec_select:V4HI (match_operand:V8HI 1 "s_register_operand" "w")
+- (parallel [(const_int 0) (const_int 1)
+- (const_int 2) (const_int 3)])))]
+- "TARGET_NEON"
+-{
+- int dest = REGNO (operands[0]);
+- int src = REGNO (operands[1]);
+-
+- if (dest != src)
+- return "vmov\t%P0, %e1";
+- else
+- return "";
+-}
+- [(set_attr "neon_type" "neon_bp_simple")]
+-)
+-
+-(define_insn "neon_vget_lowv4si"
+- [(set (match_operand:V2SI 0 "s_register_operand" "=w")
+- (vec_select:V2SI (match_operand:V4SI 1 "s_register_operand" "w")
+- (parallel [(const_int 0) (const_int 1)])))]
+- "TARGET_NEON"
+-{
+- int dest = REGNO (operands[0]);
+- int src = REGNO (operands[1]);
+-
+- if (dest != src)
+- return "vmov\t%P0, %e1";
+- else
+- return "";
+-}
+- [(set_attr "neon_type" "neon_bp_simple")]
+-)
+-
+-(define_insn "neon_vget_lowv4sf"
+- [(set (match_operand:V2SF 0 "s_register_operand" "=w")
+- (vec_select:V2SF (match_operand:V4SF 1 "s_register_operand" "w")
+- (parallel [(const_int 0) (const_int 1)])))]
+- "TARGET_NEON"
+-{
+- int dest = REGNO (operands[0]);
+- int src = REGNO (operands[1]);
+-
+- if (dest != src)
+- return "vmov\t%P0, %e1";
+- else
+- return "";
+-}
+- [(set_attr "neon_type" "neon_bp_simple")]
+-)
+-
+-(define_insn "neon_vget_lowv2di"
+- [(set (match_operand:DI 0 "s_register_operand" "=w")
+- (vec_select:DI (match_operand:V2DI 1 "s_register_operand" "w")
+- (parallel [(const_int 0)])))]
+- "TARGET_NEON"
+-{
+- int dest = REGNO (operands[0]);
+- int src = REGNO (operands[1]);
+-
+- if (dest != src)
+- return "vmov\t%P0, %e1";
+- else
+- return "";
+-}
+- [(set_attr "neon_type" "neon_bp_simple")]
+-)
++(define_expand "neon_vget_high<mode>"
++ [(match_operand:<V_HALF> 0 "s_register_operand")
++ (match_operand:VQX 1 "s_register_operand")]
++ "TARGET_NEON"
++{
++ emit_move_insn (operands[0],
++ simplify_gen_subreg (<V_HALF>mode, operands[1], <MODE>mode,
++ GET_MODE_SIZE (<V_HALF>mode)));
++ DONE;
++})
++
++(define_expand "neon_vget_low<mode>"
++ [(match_operand:<V_HALF> 0 "s_register_operand")
++ (match_operand:VQX 1 "s_register_operand")]
++ "TARGET_NEON"
++{
++ emit_move_insn (operands[0],
++ simplify_gen_subreg (<V_HALF>mode, operands[1],
++ <MODE>mode, 0));
++ DONE;
++})
+
+ (define_insn "neon_vcvt<mode>"
+ [(set (match_operand:<V_CVTTO> 0 "s_register_operand" "=w")
+
+=== modified file 'gcc/simplify-rtx.c'
+--- old/gcc/simplify-rtx.c 2011-08-13 08:32:32 +0000
++++ new/gcc/simplify-rtx.c 2011-09-28 15:11:59 +0000
+@@ -5567,6 +5567,7 @@
+ /* Optimize SUBREG truncations of zero and sign extended values. */
+ if ((GET_CODE (op) == ZERO_EXTEND
+ || GET_CODE (op) == SIGN_EXTEND)
++ && SCALAR_INT_MODE_P (innermode)
+ && GET_MODE_BITSIZE (outermode) < GET_MODE_BITSIZE (innermode))
+ {
+ unsigned int bitpos = subreg_lsb_1 (outermode, innermode, byte);
+@@ -5605,6 +5606,7 @@
+ if ((GET_CODE (op) == LSHIFTRT
+ || GET_CODE (op) == ASHIFTRT)
+ && SCALAR_INT_MODE_P (outermode)
++ && SCALAR_INT_MODE_P (innermode)
+ /* Ensure that OUTERMODE is at least twice as wide as the INNERMODE
+ to avoid the possibility that an outer LSHIFTRT shifts by more
+ than the sign extension's sign_bit_copies and introduces zeros
+@@ -5624,6 +5626,7 @@
+ if ((GET_CODE (op) == LSHIFTRT
+ || GET_CODE (op) == ASHIFTRT)
+ && SCALAR_INT_MODE_P (outermode)
++ && SCALAR_INT_MODE_P (innermode)
+ && GET_MODE_BITSIZE (outermode) < GET_MODE_BITSIZE (innermode)
+ && CONST_INT_P (XEXP (op, 1))
+ && GET_CODE (XEXP (op, 0)) == ZERO_EXTEND
+@@ -5638,6 +5641,7 @@
+ the outer subreg is effectively a truncation to the original mode. */
+ if (GET_CODE (op) == ASHIFT
+ && SCALAR_INT_MODE_P (outermode)
++ && SCALAR_INT_MODE_P (innermode)
+ && GET_MODE_BITSIZE (outermode) < GET_MODE_BITSIZE (innermode)
+ && CONST_INT_P (XEXP (op, 1))
+ && (GET_CODE (XEXP (op, 0)) == ZERO_EXTEND
+@@ -5651,7 +5655,7 @@
+ /* Recognize a word extraction from a multi-word subreg. */
+ if ((GET_CODE (op) == LSHIFTRT
+ || GET_CODE (op) == ASHIFTRT)
+- && SCALAR_INT_MODE_P (outermode)
++ && SCALAR_INT_MODE_P (innermode)
+ && GET_MODE_BITSIZE (outermode) >= BITS_PER_WORD
+ && GET_MODE_BITSIZE (innermode) >= (2 * GET_MODE_BITSIZE (outermode))
+ && CONST_INT_P (XEXP (op, 1))
+@@ -5673,6 +5677,7 @@
+
+ if ((GET_CODE (op) == LSHIFTRT
+ || GET_CODE (op) == ASHIFTRT)
++ && SCALAR_INT_MODE_P (innermode)
+ && MEM_P (XEXP (op, 0))
+ && CONST_INT_P (XEXP (op, 1))
+ && GET_MODE_SIZE (outermode) < GET_MODE_SIZE (GET_MODE (op))
+
--- /dev/null
+2011-10-01 Revital Eres <revital.eres@linaro.org>
+
+ gcc/
+ Backport from mainline -r179380 and -r179381
+
+ * ddg.c (autoinc_var_is_used_p): New function.
+ (create_ddg_dep_from_intra_loop_link,
+ add_cross_iteration_register_deps): Call it.
+ * ddg.h (autoinc_var_is_used_p): Declare.
+ * modulo-sched.c (sms_schedule): Handle instructions with REG_INC.
+ (generate_reg_moves): Call autoinc_var_is_used_p. Skip
+ instructions that do not set a register and verify no regmoves
+ are created for !single_set instructions.
+
+ gcc/testsuite/
+
+	* gcc.dg/sms-10.c: New file.
+
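+
Illustrative example (editor's sketch, not part of the patch): a loop like the
one below, compiled for a target with auto-increment addressing, contains
insns carrying REG_INC notes.  Previously sms_schedule rejected such loops
outright; with this change they are scheduled, and autoinc_var_is_used_p keeps
the anti-dependence edges that would otherwise allow reg-moves to be created
for the auto-incremented address register.

void
scale (int *dst, const int *src, int n)
{
  int i;
  for (i = 0; i < n; i++)
    *dst++ = *src++ * 3;
}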
+=== modified file 'gcc/ddg.c'
+--- old/gcc/ddg.c 2011-07-31 11:29:10 +0000
++++ new/gcc/ddg.c 2011-10-02 06:56:53 +0000
+@@ -145,6 +145,27 @@
+ return rtx_mem_access_p (PATTERN (insn));
+ }
+
++/* Return true if DEF_INSN contains address being auto-inc or auto-dec
++ which is used in USE_INSN. Otherwise return false. The result is
++ being used to decide whether to remove the edge between def_insn and
++ use_insn when -fmodulo-sched-allow-regmoves is set. This function
++ doesn't need to consider the specific address register; no reg_moves
++ will be allowed for any life range defined by def_insn and used
++ by use_insn, if use_insn uses an address register auto-inc'ed by
++ def_insn. */
++bool
++autoinc_var_is_used_p (rtx def_insn, rtx use_insn)
++{
++ rtx note;
++
++ for (note = REG_NOTES (def_insn); note; note = XEXP (note, 1))
++ if (REG_NOTE_KIND (note) == REG_INC
++ && reg_referenced_p (XEXP (note, 0), PATTERN (use_insn)))
++ return true;
++
++ return false;
++}
++
+ /* Computes the dependence parameters (latency, distance etc.), creates
+ a ddg_edge and adds it to the given DDG. */
+ static void
+@@ -173,10 +194,15 @@
+ compensate for that by generating reg-moves based on the life-range
+ analysis. The anti-deps that will be deleted are the ones which
+ have true-deps edges in the opposite direction (in other words
+- the kernel has only one def of the relevant register). TODO:
+- support the removal of all anti-deps edges, i.e. including those
++ the kernel has only one def of the relevant register).
++ If the address that is being auto-inc or auto-dec in DEST_NODE
++ is used in SRC_NODE then do not remove the edge to make sure
++ reg-moves will not be created for this address.
++ TODO: support the removal of all anti-deps edges, i.e. including those
+ whose register has multiple defs in the loop. */
+- if (flag_modulo_sched_allow_regmoves && (t == ANTI_DEP && dt == REG_DEP))
++ if (flag_modulo_sched_allow_regmoves
++ && (t == ANTI_DEP && dt == REG_DEP)
++ && !autoinc_var_is_used_p (dest_node->insn, src_node->insn))
+ {
+ rtx set;
+
+@@ -302,10 +328,14 @@
+ gcc_assert (first_def_node);
+
+ /* Always create the edge if the use node is a branch in
+- order to prevent the creation of reg-moves. */
++ order to prevent the creation of reg-moves.
++ If the address that is being auto-inc or auto-dec in LAST_DEF
++ is used in USE_INSN then do not remove the edge to make sure
++ reg-moves will not be created for that address. */
+ if (DF_REF_ID (last_def) != DF_REF_ID (first_def)
+ || !flag_modulo_sched_allow_regmoves
+- || JUMP_P (use_node->insn))
++ || JUMP_P (use_node->insn)
++ || autoinc_var_is_used_p (DF_REF_INSN (last_def), use_insn))
+ create_ddg_dep_no_link (g, use_node, first_def_node, ANTI_DEP,
+ REG_DEP, 1);
+
+
+=== modified file 'gcc/ddg.h'
+--- old/gcc/ddg.h 2009-11-25 10:55:54 +0000
++++ new/gcc/ddg.h 2011-10-02 06:56:53 +0000
+@@ -186,4 +186,6 @@
+ int find_nodes_on_paths (sbitmap result, ddg_ptr, sbitmap from, sbitmap to);
+ int longest_simple_path (ddg_ptr, int from, int to, sbitmap via);
+
++bool autoinc_var_is_used_p (rtx, rtx);
++
+ #endif /* GCC_DDG_H */
+
+=== modified file 'gcc/modulo-sched.c'
+--- old/gcc/modulo-sched.c 2011-09-14 11:06:06 +0000
++++ new/gcc/modulo-sched.c 2011-10-02 06:56:53 +0000
+@@ -477,7 +477,12 @@
+ sbitmap *uses_of_defs;
+ rtx last_reg_move;
+ rtx prev_reg, old_reg;
+-
++ rtx set = single_set (u->insn);
++
++ /* Skip instructions that do not set a register. */
++ if ((set && !REG_P (SET_DEST (set))))
++ continue;
++
+ /* Compute the number of reg_moves needed for u, by looking at life
+ ranges started at u (excluding self-loops). */
+ for (e = u->out; e; e = e->next_out)
+@@ -494,6 +499,20 @@
+ && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
+ nreg_moves4e--;
+
++ if (nreg_moves4e >= 1)
++ {
++ /* !single_set instructions are not supported yet and
++         thus we do not expect to encounter them in the loop
++ except from the doloop part. For the latter case
++ we assume no regmoves are generated as the doloop
++ instructions are tied to the branch with an edge. */
++ gcc_assert (set);
++      /* If the instruction contains an auto-inc register then
++         validate that the regmove is being generated for the
++         target register rather than the inc'ed register.  */
++ gcc_assert (!autoinc_var_is_used_p (u->insn, e->dest->insn));
++ }
++
+ nreg_moves = MAX (nreg_moves, nreg_moves4e);
+ }
+
+@@ -1266,12 +1285,10 @@
+ continue;
+ }
+
+- /* Don't handle BBs with calls or barriers or auto-increment insns
+- (to avoid creating invalid reg-moves for the auto-increment insns),
++ /* Don't handle BBs with calls or barriers
+ or !single_set with the exception of instructions that include
+ count_reg---these instructions are part of the control part
+ that do-loop recognizes.
+- ??? Should handle auto-increment insns.
+ ??? Should handle insns defining subregs. */
+ for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
+ {
+@@ -1282,7 +1299,6 @@
+ || (NONDEBUG_INSN_P (insn) && !JUMP_P (insn)
+ && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE
+ && !reg_mentioned_p (count_reg, insn))
+- || (FIND_REG_INC_NOTE (insn, NULL_RTX) != 0)
+ || (INSN_P (insn) && (set = single_set (insn))
+ && GET_CODE (SET_DEST (set)) == SUBREG))
+ break;
+@@ -1296,8 +1312,6 @@
+ fprintf (dump_file, "SMS loop-with-call\n");
+ else if (BARRIER_P (insn))
+ fprintf (dump_file, "SMS loop-with-barrier\n");
+- else if (FIND_REG_INC_NOTE (insn, NULL_RTX) != 0)
+- fprintf (dump_file, "SMS reg inc\n");
+ else if ((NONDEBUG_INSN_P (insn) && !JUMP_P (insn)
+ && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE))
+ fprintf (dump_file, "SMS loop-with-not-single-set\n");
+
+=== added file 'gcc/testsuite/gcc.dg/sms-10.c'
+--- old/gcc/testsuite/gcc.dg/sms-10.c 1970-01-01 00:00:00 +0000
++++ new/gcc/testsuite/gcc.dg/sms-10.c 2011-10-02 06:56:53 +0000
+@@ -0,0 +1,118 @@
++ /* { dg-do run } */
++ /* { dg-options "-O2 -fmodulo-sched -fmodulo-sched-allow-regmoves -fdump-rtl-sms" } */
++
++
++typedef __SIZE_TYPE__ size_t;
++extern void *malloc (size_t);
++extern void free (void *);
++extern void abort (void);
++
++struct regstat_n_sets_and_refs_t
++{
++ int sets;
++ int refs;
++};
++
++struct regstat_n_sets_and_refs_t *regstat_n_sets_and_refs;
++
++struct df_reg_info
++{
++ unsigned int n_refs;
++};
++
++struct df_d
++{
++ struct df_reg_info **def_regs;
++ struct df_reg_info **use_regs;
++};
++struct df_d *df;
++
++static inline int
++REG_N_SETS (int regno)
++{
++ return regstat_n_sets_and_refs[regno].sets;
++}
++
++__attribute__ ((noinline))
++ int max_reg_num (void)
++{
++ return 100;
++}
++
++__attribute__ ((noinline))
++ void regstat_init_n_sets_and_refs (void)
++{
++ unsigned int i;
++ unsigned int max_regno = max_reg_num ();
++
++ for (i = 0; i < max_regno; i++)
++ {
++ (regstat_n_sets_and_refs[i].sets = (df->def_regs[(i)]->n_refs));
++ (regstat_n_sets_and_refs[i].refs =
++ (df->use_regs[(i)]->n_refs) + REG_N_SETS (i));
++ }
++}
++
++int a_sets[100] =
++ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
++ 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
++ 40, 41, 42,
++ 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
++ 62, 63, 64,
++ 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
++ 84, 85, 86,
++ 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99
++};
++
++int a_refs[100] =
++ { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38,
++ 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76,
++ 78, 80, 82,
++ 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116,
++ 118, 120,
++ 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150,
++ 152, 154, 156,
++ 158, 160, 162, 164, 166, 168, 170, 172, 174, 176, 178, 180, 182, 184, 186,
++ 188, 190, 192,
++ 194, 196, 198
++};
++
++int
++main ()
++{
++ struct df_reg_info *b[100], *c[100];
++ struct df_d df1;
++ size_t s = sizeof (struct df_reg_info);
++ struct regstat_n_sets_and_refs_t a[100];
++
++ df = &df1;
++ regstat_n_sets_and_refs = a;
++ int i;
++
++ for (i = 0; i < 100; i++)
++ {
++ b[i] = (struct df_reg_info *) malloc (s);
++ b[i]->n_refs = i;
++ c[i] = (struct df_reg_info *) malloc (s);
++ c[i]->n_refs = i;
++ }
++
++ df1.def_regs = b;
++ df1.use_regs = c;
++ regstat_init_n_sets_and_refs ();
++
++ for (i = 0; i < 100; i++)
++ if ((a[i].sets != a_sets[i]) || (a[i].refs != a_refs[i]))
++ abort ();
++
++ for (i = 0; i < 100; i++)
++ {
++ free (b[i]);
++ free (c[i]);
++ }
++
++ return 0;
++}
++
++/* { dg-final { scan-rtl-dump-times "SMS succeeded" 1 "sms" { target powerpc*-*-* } } } */
++/* { dg-final { cleanup-rtl-dump "sms" } } */
+
--- /dev/null
+2011-10-03 Michael Hope <michael.hope@linaro.org>
+
+ Backport from mainline:
+
+ 2011-09-13 Sevak Sargsyan <sevak.sargsyan@ispras.ru>
+
+ gcc/
+ * config/arm/neon.md (neon_vabd<mode>_2, neon_vabd<mode>_3): New
+ define_insn patterns for combine.
+
+ gcc/testsuite/
+ * gcc.target/arm/neon-combine-sub-abs-into-vabd.c: New test.
+
+=== modified file 'gcc/config/arm/neon.md'
+--- old/gcc/config/arm/neon.md 2011-09-28 15:14:59 +0000
++++ new/gcc/config/arm/neon.md 2011-10-03 01:32:17 +0000
+@@ -5428,3 +5428,32 @@
+ emit_insn (gen_neon_vec_pack_trunc_<V_double> (operands[0], tempreg));
+ DONE;
+ })
++
++(define_insn "neon_vabd<mode>_2"
++ [(set (match_operand:VDQ 0 "s_register_operand" "=w")
++ (abs:VDQ (minus:VDQ (match_operand:VDQ 1 "s_register_operand" "w")
++ (match_operand:VDQ 2 "s_register_operand" "w"))))]
++ "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
++ "vabd.<V_s_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2"
++ [(set (attr "neon_type")
++ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
++ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
++ (const_string "neon_fp_vadd_ddd_vabs_dd")
++ (const_string "neon_fp_vadd_qqq_vabs_qq"))
++ (const_string "neon_int_5")))]
++)
++
++(define_insn "neon_vabd<mode>_3"
++ [(set (match_operand:VDQ 0 "s_register_operand" "=w")
++ (abs:VDQ (unspec:VDQ [(match_operand:VDQ 1 "s_register_operand" "w")
++ (match_operand:VDQ 2 "s_register_operand" "w")]
++ UNSPEC_VSUB)))]
++ "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
++ "vabd.<V_if_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2"
++ [(set (attr "neon_type")
++ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
++ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
++ (const_string "neon_fp_vadd_ddd_vabs_dd")
++ (const_string "neon_fp_vadd_qqq_vabs_qq"))
++ (const_string "neon_int_5")))]
++)
+
+=== added file 'gcc/testsuite/gcc.target/arm/neon-combine-sub-abs-into-vabd.c'
+--- old/gcc/testsuite/gcc.target/arm/neon-combine-sub-abs-into-vabd.c 1970-01-01 00:00:00 +0000
++++ new/gcc/testsuite/gcc.target/arm/neon-combine-sub-abs-into-vabd.c 2011-10-03 01:32:17 +0000
+@@ -0,0 +1,50 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_neon_ok } */
++/* { dg-options "-O2 -funsafe-math-optimizations" } */
++/* { dg-add-options arm_neon } */
++
++#include <arm_neon.h>
++float32x2_t f_sub_abs_to_vabd_32()
++{
++ float32x2_t val1 = vdup_n_f32 (10);
++ float32x2_t val2 = vdup_n_f32 (30);
++ float32x2_t sres = vsub_f32(val1, val2);
++ float32x2_t res = vabs_f32 (sres);
++
++ return res;
++}
++/* { dg-final { scan-assembler "vabd\.f32" } }*/
++
++#include <arm_neon.h>
++int8x8_t sub_abs_to_vabd_8()
++{
++ int8x8_t val1 = vdup_n_s8 (10);
++ int8x8_t val2 = vdup_n_s8 (30);
++ int8x8_t sres = vsub_s8(val1, val2);
++ int8x8_t res = vabs_s8 (sres);
++
++ return res;
++}
++/* { dg-final { scan-assembler "vabd\.s8" } }*/
++
++int16x4_t sub_abs_to_vabd_16()
++{
++ int16x4_t val1 = vdup_n_s16 (10);
++ int16x4_t val2 = vdup_n_s16 (30);
++ int16x4_t sres = vsub_s16(val1, val2);
++ int16x4_t res = vabs_s16 (sres);
++
++ return res;
++}
++/* { dg-final { scan-assembler "vabd\.s16" } }*/
++
++int32x2_t sub_abs_to_vabd_32()
++{
++ int32x2_t val1 = vdup_n_s32 (10);
++ int32x2_t val2 = vdup_n_s32 (30);
++ int32x2_t sres = vsub_s32(val1, val2);
++ int32x2_t res = vabs_s32 (sres);
++
++ return res;
++}
++/* { dg-final { scan-assembler "vabd\.s32" } }*/
+
--- /dev/null
+2011-10-03 Richard Sandiford <richard.sandiford@linaro.org>
+
+ gcc/
+ Backport from mainline:
+
+ 2011-09-22 Richard Sandiford <richard.sandiford@linaro.org>
+
+ * config/arm/predicates.md (expandable_comparison_operator): New
+ predicate, extracted from...
+ (arm_comparison_operator): ...here.
+ * config/arm/arm.md (cbranchsi4, cbranchsf4, cbranchdf4, cbranchdi4)
+ (cstoresi4, cstoresf4, cstoredf4, cstoredi4, movsicc, movsfcc)
+ (movdfcc): Use expandable_comparison_operator.
+
+ gcc/testsuite/
+ Backport from mainline:
+
+ 2011-09-22 Richard Sandiford <richard.sandiford@linaro.org>
+
+ * gcc.target/arm/cmp-1.c: New test.
+ * gcc.target/arm/cmp-2.c: Likewise.
+
+2011-10-03 Richard Sandiford <richard.sandiford@linaro.org>
+
+ gcc/
+ Backport from mainline:
+
+ 2011-09-07 Richard Sandiford <richard.sandiford@linaro.org>
+
+ PR target/49030
+ * config/arm/arm-protos.h (maybe_get_arm_condition_code): Declare.
+ * config/arm/arm.c (maybe_get_arm_condition_code): New function,
+ reusing the old code from get_arm_condition_code. Return ARM_NV
+ for invalid comparison codes.
+ (get_arm_condition_code): Redefine in terms of
+ maybe_get_arm_condition_code.
+ * config/arm/predicates.md (arm_comparison_operator): Use
+ maybe_get_arm_condition_code.
+
+ gcc/testsuite/
+ Backport from mainline:
+
+ 2011-09-07 Richard Sandiford <richard.sandiford@linaro.org>
+
+ PR target/49030
+ * gcc.dg/torture/pr49030.c: New test.
+
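+
Illustrative example (editor's sketch, not part of the patch): an LTGT
comparison, such as the one produced for __builtin_islessgreater below, has no
single ARM condition code, so maybe_get_arm_condition_code returns ARM_NV for
it instead of asserting.  The new expandable_comparison_operator predicate
still accepts it in the cbranch/cstore/mov*cc expanders, while
arm_comparison_operator now rejects it via the ARM_NV check.

int
less_greater (float a, float b)
{
  return __builtin_islessgreater (a, b);
}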
+=== modified file 'gcc/config/arm/arm-protos.h'
+--- old/gcc/config/arm/arm-protos.h 2011-09-15 09:45:31 +0000
++++ new/gcc/config/arm/arm-protos.h 2011-10-03 09:46:40 +0000
+@@ -180,6 +180,7 @@
+ #endif
+ extern int thumb_shiftable_const (unsigned HOST_WIDE_INT);
+ #ifdef RTX_CODE
++extern enum arm_cond_code maybe_get_arm_condition_code (rtx);
+ extern void thumb1_final_prescan_insn (rtx);
+ extern void thumb2_final_prescan_insn (rtx);
+ extern const char *thumb_load_double_from_address (rtx *);
+
+=== modified file 'gcc/config/arm/arm.c'
+--- old/gcc/config/arm/arm.c 2011-09-19 07:44:24 +0000
++++ new/gcc/config/arm/arm.c 2011-10-03 09:46:40 +0000
+@@ -17494,10 +17494,10 @@
+ decremented/zeroed by arm_asm_output_opcode as the insns are output. */
+
+ /* Returns the index of the ARM condition code string in
+- `arm_condition_codes'. COMPARISON should be an rtx like
+- `(eq (...) (...))'. */
+-static enum arm_cond_code
+-get_arm_condition_code (rtx comparison)
++ `arm_condition_codes', or ARM_NV if the comparison is invalid.
++ COMPARISON should be an rtx like `(eq (...) (...))'. */
++enum arm_cond_code
++maybe_get_arm_condition_code (rtx comparison)
+ {
+ enum machine_mode mode = GET_MODE (XEXP (comparison, 0));
+ enum arm_cond_code code;
+@@ -17521,11 +17521,11 @@
+ case CC_DLTUmode: code = ARM_CC;
+
+ dominance:
+- gcc_assert (comp_code == EQ || comp_code == NE);
+-
+ if (comp_code == EQ)
+ return ARM_INVERSE_CONDITION_CODE (code);
+- return code;
++ if (comp_code == NE)
++ return code;
++ return ARM_NV;
+
+ case CC_NOOVmode:
+ switch (comp_code)
+@@ -17534,7 +17534,7 @@
+ case EQ: return ARM_EQ;
+ case GE: return ARM_PL;
+ case LT: return ARM_MI;
+- default: gcc_unreachable ();
++ default: return ARM_NV;
+ }
+
+ case CC_Zmode:
+@@ -17542,7 +17542,7 @@
+ {
+ case NE: return ARM_NE;
+ case EQ: return ARM_EQ;
+- default: gcc_unreachable ();
++ default: return ARM_NV;
+ }
+
+ case CC_Nmode:
+@@ -17550,7 +17550,7 @@
+ {
+ case NE: return ARM_MI;
+ case EQ: return ARM_PL;
+- default: gcc_unreachable ();
++ default: return ARM_NV;
+ }
+
+ case CCFPEmode:
+@@ -17575,7 +17575,7 @@
+ /* UNEQ and LTGT do not have a representation. */
+ case UNEQ: /* Fall through. */
+ case LTGT: /* Fall through. */
+- default: gcc_unreachable ();
++ default: return ARM_NV;
+ }
+
+ case CC_SWPmode:
+@@ -17591,7 +17591,7 @@
+ case GTU: return ARM_CC;
+ case LEU: return ARM_CS;
+ case LTU: return ARM_HI;
+- default: gcc_unreachable ();
++ default: return ARM_NV;
+ }
+
+ case CC_Cmode:
+@@ -17599,7 +17599,7 @@
+ {
+ case LTU: return ARM_CS;
+ case GEU: return ARM_CC;
+- default: gcc_unreachable ();
++ default: return ARM_NV;
+ }
+
+ case CC_CZmode:
+@@ -17611,7 +17611,7 @@
+ case GTU: return ARM_HI;
+ case LEU: return ARM_LS;
+ case LTU: return ARM_CC;
+- default: gcc_unreachable ();
++ default: return ARM_NV;
+ }
+
+ case CC_NCVmode:
+@@ -17621,7 +17621,7 @@
+ case LT: return ARM_LT;
+ case GEU: return ARM_CS;
+ case LTU: return ARM_CC;
+- default: gcc_unreachable ();
++ default: return ARM_NV;
+ }
+
+ case CCmode:
+@@ -17637,13 +17637,22 @@
+ case GTU: return ARM_HI;
+ case LEU: return ARM_LS;
+ case LTU: return ARM_CC;
+- default: gcc_unreachable ();
++ default: return ARM_NV;
+ }
+
+ default: gcc_unreachable ();
+ }
+ }
+
++/* Like maybe_get_arm_condition_code, but never return ARM_NV. */
++static enum arm_cond_code
++get_arm_condition_code (rtx comparison)
++{
++ enum arm_cond_code code = maybe_get_arm_condition_code (comparison);
++ gcc_assert (code != ARM_NV);
++ return code;
++}
++
+ /* Tell arm_asm_output_opcode to output IT blocks for conditionally executed
+ instructions. */
+ void
+
+=== modified file 'gcc/config/arm/arm.md'
+--- old/gcc/config/arm/arm.md 2011-09-12 14:14:00 +0000
++++ new/gcc/config/arm/arm.md 2011-10-03 09:47:33 +0000
+@@ -6543,7 +6543,7 @@
+
+ (define_expand "cbranchsi4"
+ [(set (pc) (if_then_else
+- (match_operator 0 "arm_comparison_operator"
++ (match_operator 0 "expandable_comparison_operator"
+ [(match_operand:SI 1 "s_register_operand" "")
+ (match_operand:SI 2 "nonmemory_operand" "")])
+ (label_ref (match_operand 3 "" ""))
+@@ -6594,7 +6594,7 @@
+
+ (define_expand "cbranchsf4"
+ [(set (pc) (if_then_else
+- (match_operator 0 "arm_comparison_operator"
++ (match_operator 0 "expandable_comparison_operator"
+ [(match_operand:SF 1 "s_register_operand" "")
+ (match_operand:SF 2 "arm_float_compare_operand" "")])
+ (label_ref (match_operand 3 "" ""))
+@@ -6606,7 +6606,7 @@
+
+ (define_expand "cbranchdf4"
+ [(set (pc) (if_then_else
+- (match_operator 0 "arm_comparison_operator"
++ (match_operator 0 "expandable_comparison_operator"
+ [(match_operand:DF 1 "s_register_operand" "")
+ (match_operand:DF 2 "arm_float_compare_operand" "")])
+ (label_ref (match_operand 3 "" ""))
+@@ -6618,7 +6618,7 @@
+
+ (define_expand "cbranchdi4"
+ [(set (pc) (if_then_else
+- (match_operator 0 "arm_comparison_operator"
++ (match_operator 0 "expandable_comparison_operator"
+ [(match_operand:DI 1 "cmpdi_operand" "")
+ (match_operand:DI 2 "cmpdi_operand" "")])
+ (label_ref (match_operand 3 "" ""))
+@@ -7473,7 +7473,7 @@
+
+ (define_expand "cstoresi4"
+ [(set (match_operand:SI 0 "s_register_operand" "")
+- (match_operator:SI 1 "arm_comparison_operator"
++ (match_operator:SI 1 "expandable_comparison_operator"
+ [(match_operand:SI 2 "s_register_operand" "")
+ (match_operand:SI 3 "reg_or_int_operand" "")]))]
+ "TARGET_32BIT || TARGET_THUMB1"
+@@ -7609,7 +7609,7 @@
+
+ (define_expand "cstoresf4"
+ [(set (match_operand:SI 0 "s_register_operand" "")
+- (match_operator:SI 1 "arm_comparison_operator"
++ (match_operator:SI 1 "expandable_comparison_operator"
+ [(match_operand:SF 2 "s_register_operand" "")
+ (match_operand:SF 3 "arm_float_compare_operand" "")]))]
+ "TARGET_32BIT && TARGET_HARD_FLOAT"
+@@ -7619,7 +7619,7 @@
+
+ (define_expand "cstoredf4"
+ [(set (match_operand:SI 0 "s_register_operand" "")
+- (match_operator:SI 1 "arm_comparison_operator"
++ (match_operator:SI 1 "expandable_comparison_operator"
+ [(match_operand:DF 2 "s_register_operand" "")
+ (match_operand:DF 3 "arm_float_compare_operand" "")]))]
+ "TARGET_32BIT && TARGET_HARD_FLOAT && !TARGET_VFP_SINGLE"
+@@ -7629,7 +7629,7 @@
+
+ (define_expand "cstoredi4"
+ [(set (match_operand:SI 0 "s_register_operand" "")
+- (match_operator:SI 1 "arm_comparison_operator"
++ (match_operator:SI 1 "expandable_comparison_operator"
+ [(match_operand:DI 2 "cmpdi_operand" "")
+ (match_operand:DI 3 "cmpdi_operand" "")]))]
+ "TARGET_32BIT"
+@@ -7749,7 +7749,7 @@
+
+ (define_expand "movsicc"
+ [(set (match_operand:SI 0 "s_register_operand" "")
+- (if_then_else:SI (match_operand 1 "arm_comparison_operator" "")
++ (if_then_else:SI (match_operand 1 "expandable_comparison_operator" "")
+ (match_operand:SI 2 "arm_not_operand" "")
+ (match_operand:SI 3 "arm_not_operand" "")))]
+ "TARGET_32BIT"
+@@ -7769,7 +7769,7 @@
+
+ (define_expand "movsfcc"
+ [(set (match_operand:SF 0 "s_register_operand" "")
+- (if_then_else:SF (match_operand 1 "arm_comparison_operator" "")
++ (if_then_else:SF (match_operand 1 "expandable_comparison_operator" "")
+ (match_operand:SF 2 "s_register_operand" "")
+ (match_operand:SF 3 "nonmemory_operand" "")))]
+ "TARGET_32BIT && TARGET_HARD_FLOAT"
+@@ -7795,7 +7795,7 @@
+
+ (define_expand "movdfcc"
+ [(set (match_operand:DF 0 "s_register_operand" "")
+- (if_then_else:DF (match_operand 1 "arm_comparison_operator" "")
++ (if_then_else:DF (match_operand 1 "expandable_comparison_operator" "")
+ (match_operand:DF 2 "s_register_operand" "")
+ (match_operand:DF 3 "arm_float_add_operand" "")))]
+ "TARGET_32BIT && TARGET_HARD_FLOAT && (TARGET_FPA || TARGET_VFP_DOUBLE)"
+
+=== modified file 'gcc/config/arm/predicates.md'
+--- old/gcc/config/arm/predicates.md 2011-09-15 09:45:31 +0000
++++ new/gcc/config/arm/predicates.md 2011-10-03 09:47:33 +0000
+@@ -242,11 +242,15 @@
+
+ ;; True for integer comparisons and, if FP is active, for comparisons
+ ;; other than LTGT or UNEQ.
++(define_special_predicate "expandable_comparison_operator"
++ (match_code "eq,ne,le,lt,ge,gt,geu,gtu,leu,ltu,
++ unordered,ordered,unlt,unle,unge,ungt"))
++
++;; Likewise, but only accept comparisons that are directly supported
++;; by ARM condition codes.
+ (define_special_predicate "arm_comparison_operator"
+- (ior (match_code "eq,ne,le,lt,ge,gt,geu,gtu,leu,ltu")
+- (and (match_test "TARGET_32BIT && TARGET_HARD_FLOAT
+- && (TARGET_FPA || TARGET_VFP)")
+- (match_code "unordered,ordered,unlt,unle,unge,ungt"))))
++ (and (match_operand 0 "expandable_comparison_operator")
++ (match_test "maybe_get_arm_condition_code (op) != ARM_NV")))
+
+ (define_special_predicate "lt_ge_comparison_operator"
+ (match_code "lt,ge"))
+
+=== added file 'gcc/testsuite/gcc.dg/torture/pr49030.c'
+--- old/gcc/testsuite/gcc.dg/torture/pr49030.c 1970-01-01 00:00:00 +0000
++++ new/gcc/testsuite/gcc.dg/torture/pr49030.c 2011-10-03 09:46:40 +0000
+@@ -0,0 +1,19 @@
++void
++sample_move_d32u24_sS (char *dst, float *src, unsigned long nsamples,
++ unsigned long dst_skip)
++{
++ long long y;
++ while (nsamples--)
++ {
++ y = (long long) (*src * 8388608.0f) << 8;
++ if (y > 2147483647) {
++ *(int *) dst = 2147483647;
++ } else if (y < -2147483647 - 1) {
++ *(int *) dst = -2147483647 - 1;
++ } else {
++ *(int *) dst = (int) y;
++ }
++ dst += dst_skip;
++ src++;
++ }
++}
+
+=== added file 'gcc/testsuite/gcc.target/arm/cmp-1.c'
+--- old/gcc/testsuite/gcc.target/arm/cmp-1.c 1970-01-01 00:00:00 +0000
++++ new/gcc/testsuite/gcc.target/arm/cmp-1.c 2011-10-03 09:47:33 +0000
+@@ -0,0 +1,37 @@
++/* { dg-do compile } */
++/* { dg-options "-O" } */
++/* { dg-final { scan-assembler-not "\tbl\t" } } */
++/* { dg-final { scan-assembler-not "__aeabi" } } */
++int x, y;
++
++#define TEST_EXPR(NAME, ARGS, EXPR) \
++ int NAME##1 ARGS { return (EXPR); } \
++ int NAME##2 ARGS { return !(EXPR); } \
++ int NAME##3 ARGS { return (EXPR) ? x : y; } \
++ void NAME##4 ARGS { if (EXPR) x++; } \
++ void NAME##5 ARGS { if (!(EXPR)) x++; }
++
++#define TEST(NAME, TYPE, OPERATOR) \
++ TEST_EXPR (NAME##_rr, (TYPE a1, TYPE a2), a1 OPERATOR a2) \
++ TEST_EXPR (NAME##_rm, (TYPE a1, TYPE *a2), a1 OPERATOR *a2) \
++ TEST_EXPR (NAME##_mr, (TYPE *a1, TYPE a2), *a1 OPERATOR a2) \
++ TEST_EXPR (NAME##_mm, (TYPE *a1, TYPE *a2), *a1 OPERATOR *a2) \
++ TEST_EXPR (NAME##_rc, (TYPE a1), a1 OPERATOR 100) \
++ TEST_EXPR (NAME##_cr, (TYPE a1), 100 OPERATOR a1)
++
++#define TEST_OP(NAME, OPERATOR) \
++ TEST (sc_##NAME, signed char, OPERATOR) \
++ TEST (uc_##NAME, unsigned char, OPERATOR) \
++ TEST (ss_##NAME, short, OPERATOR) \
++ TEST (us_##NAME, unsigned short, OPERATOR) \
++ TEST (si_##NAME, int, OPERATOR) \
++ TEST (ui_##NAME, unsigned int, OPERATOR) \
++ TEST (sll_##NAME, long long, OPERATOR) \
++ TEST (ull_##NAME, unsigned long long, OPERATOR)
++
++TEST_OP (eq, ==)
++TEST_OP (ne, !=)
++TEST_OP (lt, <)
++TEST_OP (gt, >)
++TEST_OP (le, <=)
++TEST_OP (ge, >=)
+
+=== added file 'gcc/testsuite/gcc.target/arm/cmp-2.c'
+--- old/gcc/testsuite/gcc.target/arm/cmp-2.c 1970-01-01 00:00:00 +0000
++++ new/gcc/testsuite/gcc.target/arm/cmp-2.c 2011-10-03 09:47:33 +0000
+@@ -0,0 +1,49 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_vfp_ok } */
++/* { dg-skip-if "need fp instructions" { *-*-* } { "-mfloat-abi=soft" } { "" } } */
++/* { dg-options "-O -mfpu=vfp -mfloat-abi=softfp" } */
++/* { dg-final { scan-assembler-not "\tbl\t" } } */
++/* { dg-final { scan-assembler-not "__aeabi" } } */
++int x, y;
++
++#define EQ(X, Y) ((X) == (Y))
++#define NE(X, Y) ((X) != (Y))
++#define LT(X, Y) ((X) < (Y))
++#define GT(X, Y) ((X) > (Y))
++#define LE(X, Y) ((X) <= (Y))
++#define GE(X, Y) ((X) >= (Y))
++
++#define TEST_EXPR(NAME, ARGS, EXPR) \
++ int NAME##1 ARGS { return (EXPR); } \
++ int NAME##2 ARGS { return !(EXPR); } \
++ int NAME##3 ARGS { return (EXPR) ? x : y; } \
++ void NAME##4 ARGS { if (EXPR) x++; } \
++ void NAME##5 ARGS { if (!(EXPR)) x++; }
++
++#define TEST(NAME, TYPE, OPERATOR) \
++ TEST_EXPR (NAME##_rr, (TYPE a1, TYPE a2), OPERATOR (a1, a2)) \
++ TEST_EXPR (NAME##_rm, (TYPE a1, TYPE *a2), OPERATOR (a1, *a2)) \
++ TEST_EXPR (NAME##_mr, (TYPE *a1, TYPE a2), OPERATOR (*a1, a2)) \
++ TEST_EXPR (NAME##_mm, (TYPE *a1, TYPE *a2), OPERATOR (*a1, *a2)) \
++ TEST_EXPR (NAME##_rc, (TYPE a1), OPERATOR (a1, 100)) \
++ TEST_EXPR (NAME##_cr, (TYPE a1), OPERATOR (100, a1))
++
++#define TEST_OP(NAME, OPERATOR) \
++ TEST (f_##NAME, float, OPERATOR) \
++ TEST (d_##NAME, double, OPERATOR) \
++ TEST (ld_##NAME, long double, OPERATOR)
++
++TEST_OP (eq, EQ)
++TEST_OP (ne, NE)
++TEST_OP (lt, LT)
++TEST_OP (gt, GT)
++TEST_OP (le, LE)
++TEST_OP (ge, GE)
++TEST_OP (blt, __builtin_isless)
++TEST_OP (bgt, __builtin_isgreater)
++TEST_OP (ble, __builtin_islessequal)
++TEST_OP (bge, __builtin_isgreaterequal)
++/* This one should be expanded into separate ordered and equality
++ comparisons. */
++TEST_OP (blg, __builtin_islessgreater)
++TEST_OP (bun, __builtin_isunordered)
+
--- /dev/null
+2011-10-06 Ira Rosen <ira.rosen@linaro.org>
+
+ Backport from mainline:
+
+ 2011-09-25 Ira Rosen <ira.rosen@linaro.org>
+
+ gcc/
+ * tree-data-ref.c (dr_analyze_innermost): Add new argument.
+ Allow not simple iv if analyzing basic block.
+ (create_data_ref): Update call to dr_analyze_innermost.
+ (stmt_with_adjacent_zero_store_dr_p, ref_base_address): Likewise.
+ * tree-loop-distribution.c (generate_memset_zero): Likewise.
+ * tree-predcom.c (find_looparound_phi): Likewise.
+ * tree-data-ref.h (dr_analyze_innermost): Add new argument.
+
+ gcc/testsuite/
+ * gcc.dg/vect/bb-slp-24.c: New.
+
+
+ 2011-09-15 Ira Rosen <ira.rosen@linaro.org>
+
+ gcc/
+ * tree-vect-data-refs.c (vect_analyze_data_ref_dependence): Allow
+ read-after-read dependencies in basic block SLP.
+
+ gcc/testsuite/
+ * gcc.dg/vect/bb-slp-25.c: New.
+
+
+ 2011-04-21 Richard Sandiford <richard.sandiford@linaro.org>
+
+ gcc/
+ * tree-vect-data-refs.c (vect_drs_dependent_in_basic_block): Use
+ operand_equal_p to compare DR_BASE_ADDRESSes.
+ (vect_check_interleaving): Likewise.
+
+ gcc/testsuite/
+ * gcc.dg/vect/vect-119.c: New test.
+
+=== added file 'gcc/testsuite/gcc.dg/vect/bb-slp-24.c'
+--- old/gcc/testsuite/gcc.dg/vect/bb-slp-24.c 1970-01-01 00:00:00 +0000
++++ new/gcc/testsuite/gcc.dg/vect/bb-slp-24.c 2011-10-02 08:43:10 +0000
+@@ -0,0 +1,59 @@
++/* { dg-require-effective-target vect_int } */
++
++#include <stdarg.h>
++#include "tree-vect.h"
++
++#define A 3
++#define N 256
++
++short src[N], dst[N];
++
++void foo (short * __restrict__ dst, short * __restrict__ src, int h,
++ int stride, int dummy)
++{
++ int i;
++ h /= 8;
++ for (i = 0; i < h; i++)
++ {
++ dst[0] += A*src[0];
++ dst[1] += A*src[1];
++ dst[2] += A*src[2];
++ dst[3] += A*src[3];
++ dst[4] += A*src[4];
++ dst[5] += A*src[5];
++ dst[6] += A*src[6];
++ dst[7] += A*src[7];
++ dst += stride;
++ src += stride;
++ if (dummy == 32)
++ abort ();
++ }
++}
++
++
++int main (void)
++{
++ int i;
++
++ check_vect ();
++
++ for (i = 0; i < N; i++)
++ {
++ dst[i] = 0;
++ src[i] = i;
++ }
++
++ foo (dst, src, N, 8, 0);
++
++ for (i = 0; i < N; i++)
++ {
++ if (dst[i] != A * i)
++ abort ();
++ }
++
++ return 0;
++}
++
++/* { dg-final { scan-tree-dump-times "basic block vectorized using SLP" 1 "slp" { target vect_element_align } } } */
++/* { dg-final { cleanup-tree-dump "slp" } } */
++
+
+=== added file 'gcc/testsuite/gcc.dg/vect/bb-slp-25.c'
+--- old/gcc/testsuite/gcc.dg/vect/bb-slp-25.c 1970-01-01 00:00:00 +0000
++++ new/gcc/testsuite/gcc.dg/vect/bb-slp-25.c 2011-10-02 08:43:10 +0000
+@@ -0,0 +1,59 @@
++/* { dg-require-effective-target vect_int } */
++
++#include <stdarg.h>
++#include "tree-vect.h"
++
++#define A 3
++#define B 4
++#define N 256
++
++short src[N], dst[N];
++
++void foo (short * __restrict__ dst, short * __restrict__ src, int h, int stride, int dummy)
++{
++ int i;
++ h /= 16;
++ for (i = 0; i < h; i++)
++ {
++ dst[0] += A*src[0] + src[stride];
++ dst[1] += A*src[1] + src[1+stride];
++ dst[2] += A*src[2] + src[2+stride];
++ dst[3] += A*src[3] + src[3+stride];
++ dst[4] += A*src[4] + src[4+stride];
++ dst[5] += A*src[5] + src[5+stride];
++ dst[6] += A*src[6] + src[6+stride];
++ dst[7] += A*src[7] + src[7+stride];
++ dst += 8;
++ src += 8;
++ if (dummy == 32)
++ abort ();
++ }
++}
++
++
++int main (void)
++{
++ int i;
++
++ check_vect ();
++
++ for (i = 0; i < N; i++)
++ {
++ dst[i] = 0;
++ src[i] = i;
++ }
++
++ foo (dst, src, N, 8, 0);
++
++ for (i = 0; i < N/2; i++)
++ {
++ if (dst[i] != A * i + i + 8)
++ abort ();
++ }
++
++ return 0;
++}
++
++/* { dg-final { scan-tree-dump-times "basic block vectorized using SLP" 1 "slp" { target vect_element_align } } } */
++/* { dg-final { cleanup-tree-dump "slp" } } */
++
+
+=== added file 'gcc/testsuite/gcc.dg/vect/vect-119.c'
+--- old/gcc/testsuite/gcc.dg/vect/vect-119.c 1970-01-01 00:00:00 +0000
++++ new/gcc/testsuite/gcc.dg/vect/vect-119.c 2011-10-02 08:43:10 +0000
+@@ -0,0 +1,28 @@
++/* { dg-do compile } */
++
++#define OUTER 32
++#define INNER 40
++
++static unsigned int
++bar (const unsigned int x[INNER][2], unsigned int sum)
++{
++ int i;
++
++ for (i = 0; i < INNER; i++)
++ sum += x[i][0] * x[i][0] + x[i][1] * x[i][1];
++ return sum;
++}
++
++unsigned int foo (const unsigned int x[OUTER][INNER][2])
++{
++ int i;
++ unsigned int sum;
++
++ sum = 0.0f;
++ for (i = 0; i < OUTER; i++)
++ sum = bar (x[i], sum);
++ return sum;
++}
++
++/* { dg-final { scan-tree-dump-times "Detected interleaving of size 2" 1 "vect" } } */
++/* { dg-final { cleanup-tree-dump "vect" } } */
+
+=== modified file 'gcc/tree-data-ref.c'
+--- old/gcc/tree-data-ref.c 2011-05-26 14:27:33 +0000
++++ new/gcc/tree-data-ref.c 2011-10-02 08:43:10 +0000
+@@ -721,11 +721,11 @@
+ }
+
+ /* Analyzes the behavior of the memory reference DR in the innermost loop or
+- basic block that contains it. Returns true if analysis succeed or false
++ basic block that contains it. Returns true if analysis succeed or false
+ otherwise. */
+
+ bool
+-dr_analyze_innermost (struct data_reference *dr)
++dr_analyze_innermost (struct data_reference *dr, struct loop *nest)
+ {
+ gimple stmt = DR_STMT (dr);
+ struct loop *loop = loop_containing_stmt (stmt);
+@@ -768,14 +768,25 @@
+ }
+ else
+ base = build_fold_addr_expr (base);
++
+ if (in_loop)
+ {
+ if (!simple_iv (loop, loop_containing_stmt (stmt), base, &base_iv,
+ false))
+ {
+- if (dump_file && (dump_flags & TDF_DETAILS))
+- fprintf (dump_file, "failed: evolution of base is not affine.\n");
+- return false;
++ if (nest)
++ {
++ if (dump_file && (dump_flags & TDF_DETAILS))
++ fprintf (dump_file, "failed: evolution of base is not"
++ " affine.\n");
++ return false;
++ }
++ else
++ {
++ base_iv.base = base;
++ base_iv.step = ssize_int (0);
++ base_iv.no_overflow = true;
++ }
+ }
+ }
+ else
+@@ -800,10 +811,18 @@
+ else if (!simple_iv (loop, loop_containing_stmt (stmt),
+ poffset, &offset_iv, false))
+ {
+- if (dump_file && (dump_flags & TDF_DETAILS))
+- fprintf (dump_file, "failed: evolution of offset is not"
+- " affine.\n");
+- return false;
++ if (nest)
++ {
++ if (dump_file && (dump_flags & TDF_DETAILS))
++ fprintf (dump_file, "failed: evolution of offset is not"
++ " affine.\n");
++ return false;
++ }
++ else
++ {
++ offset_iv.base = poffset;
++ offset_iv.step = ssize_int (0);
++ }
+ }
+ }
+
+@@ -967,7 +986,7 @@
+ DR_REF (dr) = memref;
+ DR_IS_READ (dr) = is_read;
+
+- dr_analyze_innermost (dr);
++ dr_analyze_innermost (dr, nest);
+ dr_analyze_indices (dr, nest, loop);
+ dr_analyze_alias (dr);
+
+@@ -5185,7 +5204,7 @@
+ DR_STMT (dr) = stmt;
+ DR_REF (dr) = op0;
+
+- res = dr_analyze_innermost (dr)
++ res = dr_analyze_innermost (dr, loop_containing_stmt (stmt))
+ && stride_of_unit_type_p (DR_STEP (dr), TREE_TYPE (op0));
+
+ free_data_ref (dr);
+@@ -5225,7 +5244,7 @@
+
+ DR_STMT (dr) = stmt;
+ DR_REF (dr) = *ref->pos;
+- dr_analyze_innermost (dr);
++ dr_analyze_innermost (dr, loop_containing_stmt (stmt));
+ base_address = DR_BASE_ADDRESS (dr);
+
+ if (!base_address)
+
+=== modified file 'gcc/tree-data-ref.h'
+--- old/gcc/tree-data-ref.h 2011-03-27 09:38:18 +0000
++++ new/gcc/tree-data-ref.h 2011-10-02 08:43:10 +0000
+@@ -386,7 +386,7 @@
+ DEF_VEC_ALLOC_O (data_ref_loc, heap);
+
+ bool get_references_in_stmt (gimple, VEC (data_ref_loc, heap) **);
+-bool dr_analyze_innermost (struct data_reference *);
++bool dr_analyze_innermost (struct data_reference *, struct loop *);
+ extern bool compute_data_dependences_for_loop (struct loop *, bool,
+ VEC (loop_p, heap) **,
+ VEC (data_reference_p, heap) **,
+
+=== modified file 'gcc/tree-loop-distribution.c'
+--- old/gcc/tree-loop-distribution.c 2011-05-11 13:07:54 +0000
++++ new/gcc/tree-loop-distribution.c 2011-10-02 08:43:10 +0000
+@@ -267,7 +267,7 @@
+
+ DR_STMT (dr) = stmt;
+ DR_REF (dr) = op0;
+- res = dr_analyze_innermost (dr);
++ res = dr_analyze_innermost (dr, loop_containing_stmt (stmt));
+ gcc_assert (res && stride_of_unit_type_p (DR_STEP (dr), TREE_TYPE (op0)));
+
+ nb_bytes = build_size_arg_loc (loc, nb_iter, op0, &stmt_list);
+
+=== modified file 'gcc/tree-predcom.c'
+--- old/gcc/tree-predcom.c 2011-02-11 14:19:44 +0000
++++ new/gcc/tree-predcom.c 2011-10-02 08:43:10 +0000
+@@ -1114,7 +1114,7 @@
+ memset (&init_dr, 0, sizeof (struct data_reference));
+ DR_REF (&init_dr) = init_ref;
+ DR_STMT (&init_dr) = phi;
+- if (!dr_analyze_innermost (&init_dr))
++ if (!dr_analyze_innermost (&init_dr, loop))
+ return NULL;
+
+ if (!valid_initializer_p (&init_dr, ref->distance + 1, root->ref))
+
+=== modified file 'gcc/tree-vect-data-refs.c'
+--- old/gcc/tree-vect-data-refs.c 2011-07-04 11:13:51 +0000
++++ new/gcc/tree-vect-data-refs.c 2011-10-02 08:43:10 +0000
+@@ -353,11 +353,7 @@
+
+ /* Check that the data-refs have same bases and offsets. If not, we can't
+ determine if they are dependent. */
+- if ((DR_BASE_ADDRESS (dra) != DR_BASE_ADDRESS (drb)
+- && (TREE_CODE (DR_BASE_ADDRESS (dra)) != ADDR_EXPR
+- || TREE_CODE (DR_BASE_ADDRESS (drb)) != ADDR_EXPR
+- || TREE_OPERAND (DR_BASE_ADDRESS (dra), 0)
+- != TREE_OPERAND (DR_BASE_ADDRESS (drb),0)))
++ if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
+ || !dr_equal_offsets_p (dra, drb))
+ return true;
+
+@@ -403,11 +399,7 @@
+
+ /* Check that the data-refs have same first location (except init) and they
+ are both either store or load (not load and store). */
+- if ((DR_BASE_ADDRESS (dra) != DR_BASE_ADDRESS (drb)
+- && (TREE_CODE (DR_BASE_ADDRESS (dra)) != ADDR_EXPR
+- || TREE_CODE (DR_BASE_ADDRESS (drb)) != ADDR_EXPR
+- || TREE_OPERAND (DR_BASE_ADDRESS (dra), 0)
+- != TREE_OPERAND (DR_BASE_ADDRESS (drb),0)))
++ if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
+ || !dr_equal_offsets_p (dra, drb)
+ || !tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb))
+ || DR_IS_READ (dra) != DR_IS_READ (drb))
+@@ -615,6 +607,11 @@
+ if (vect_check_interleaving (dra, drb))
+ return false;
+
++ /* Read-read is OK (we need this check here, after checking for
++ interleaving). */
++ if (DR_IS_READ (dra) && DR_IS_READ (drb))
++ return false;
++
+ if (vect_print_dump_info (REPORT_DR_DETAILS))
+ {
+ fprintf (vect_dump, "can't determine dependence between ");
+
--- /dev/null
+2011-10-06 Ira Rosen <ira.rosen@linaro.org>
+
+ gcc/testsuite/
+ * gcc.dg/vect/bb-slp-26.c: Simplify to make the basic block
+ vectorizable.
+
+ Backport from mainline:
+
+ 2011-09-25 Ira Rosen <ira.rosen@linaro.org>
+
+ gcc/
+ * tree-vect-slp.c (vect_slp_analyze_bb_1): Split out core part
+ of vect_analyze_bb here.
+ (vect_analyze_bb): Loop over vector sizes calling vect_analyze_bb_1.
+
+ gcc/testsuite/
+ * lib/target-supports.exp (check_effective_target_vect64): New.
+ * gcc.dg/vect/bb-slp-11.c: Expect the error message twice in case
+ of multiple vector sizes.
+ * gcc.dg/vect/bb-slp-26.c: New.
+
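The new driver described above retries the analysis with progressively smaller vector sizes. The following is a minimal standalone sketch of that retry logic only (plain C with invented names, not GCC internals): the target supplies a bitmask of supported sizes, each failed attempt clears the size just tried, and the next attempt uses the largest remaining power of two, which is what the 1 << floor_log2 (vector_sizes) step in the tree-vect-slp.c hunk further below does.

#include <stdio.h>

/* Drop the size that was just tried from *SIZES and return the largest
   remaining power of two, or 0 once every candidate is exhausted.  */
static unsigned int
next_vector_size (unsigned int *sizes, unsigned int current)
{
  unsigned int size;

  *sizes &= ~current;
  size = *sizes;
  while (size & (size - 1))
    size &= size - 1;            /* keep only the highest set bit */
  return size;
}

int
main (void)
{
  unsigned int sizes = 16 | 8;   /* say, 16-byte and 8-byte vectors */
  unsigned int current = 16;     /* first size, autodetected by the target */

  while (current)
    {
      printf ("trying analysis with %u-byte vectors\n", current);
      /* vect_slp_analyze_bb_1 would run here; assume it fails.  */
      current = next_vector_size (&sizes, current);
    }
  return 0;
}
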
+=== modified file 'gcc/testsuite/gcc.dg/vect/bb-slp-11.c'
+--- old/gcc/testsuite/gcc.dg/vect/bb-slp-11.c 2010-11-22 12:16:52 +0000
++++ new/gcc/testsuite/gcc.dg/vect/bb-slp-11.c 2011-10-02 10:40:34 +0000
+@@ -49,6 +49,7 @@
+ }
+
+ /* { dg-final { scan-tree-dump-times "basic block vectorized using SLP" 0 "slp" } } */
+-/* { dg-final { scan-tree-dump-times "SLP with multiple types" 1 "slp" } } */
++/* { dg-final { scan-tree-dump-times "SLP with multiple types" 1 "slp" { xfail vect_multiple_sizes } } } */
++/* { dg-final { scan-tree-dump-times "SLP with multiple types" 2 "slp" { target vect_multiple_sizes } } } */
+ /* { dg-final { cleanup-tree-dump "slp" } } */
+
+
+=== added file 'gcc/testsuite/gcc.dg/vect/bb-slp-26.c'
+--- old/gcc/testsuite/gcc.dg/vect/bb-slp-26.c 1970-01-01 00:00:00 +0000
++++ new/gcc/testsuite/gcc.dg/vect/bb-slp-26.c 2011-10-02 10:40:34 +0000
+@@ -0,0 +1,60 @@
++/* { dg-require-effective-target vect_int } */
++
++#include <stdarg.h>
++#include "tree-vect.h"
++
++#define A 3
++#define B 4
++#define N 256
++
++char src[N], dst[N];
++
++void foo (char * __restrict__ dst, char * __restrict__ src, int h,
++ int stride, int dummy)
++{
++ int i;
++ h /= 16;
++ for (i = 0; i < h; i++)
++ {
++ dst[0] += A*src[0];
++ dst[1] += A*src[1];
++ dst[2] += A*src[2];
++ dst[3] += A*src[3];
++ dst[4] += A*src[4];
++ dst[5] += A*src[5];
++ dst[6] += A*src[6];
++ dst[7] += A*src[7];
++ dst += 8;
++ src += 8;
++ if (dummy == 32)
++ abort ();
++ }
++}
++
++
++int main (void)
++{
++ int i;
++
++ check_vect ();
++
++ for (i = 0; i < N; i++)
++ {
++ dst[i] = 0;
++ src[i] = i/8;
++ }
++
++ foo (dst, src, N, 8, 0);
++
++ for (i = 0; i < N/2; i++)
++ {
++ if (dst[i] != A * src[i])
++ abort ();
++ }
++
++ return 0;
++}
++
++/* { dg-final { scan-tree-dump-times "basic block vectorized using SLP" 1 "slp" { target vect64 } } } */
++/* { dg-final { cleanup-tree-dump "slp" } } */
++
+
+=== modified file 'gcc/testsuite/lib/target-supports.exp'
+--- old/gcc/testsuite/lib/target-supports.exp 2011-09-20 07:54:28 +0000
++++ new/gcc/testsuite/lib/target-supports.exp 2011-10-02 10:40:34 +0000
+@@ -3283,6 +3283,24 @@
+ return $et_vect_multiple_sizes_saved
+ }
+
++# Return 1 if the target supports vectors of 64 bits.
++
++proc check_effective_target_vect64 { } {
++ global et_vect64
++
++ if [info exists et_vect64_saved] {
++ verbose "check_effective_target_vect64: using cached result" 2
++ } else {
++ set et_vect64_saved 0
++ if { ([istarget arm*-*-*] && [check_effective_target_arm_neon_ok]) } {
++ set et_vect64_saved 1
++ }
++ }
++
++ verbose "check_effective_target_vect64: returning $et_vect64_saved" 2
++ return $et_vect64_saved
++}
++
+ # Return 1 if the target supports section-anchors
+
+ proc check_effective_target_section_anchors { } {
+
+=== modified file 'gcc/tree-vect-slp.c'
+--- old/gcc/tree-vect-slp.c 2011-07-06 12:04:10 +0000
++++ new/gcc/tree-vect-slp.c 2011-10-02 10:40:34 +0000
+@@ -1664,42 +1664,18 @@
+
+ /* Check if the basic block can be vectorized. */
+
+-bb_vec_info
+-vect_slp_analyze_bb (basic_block bb)
++static bb_vec_info
++vect_slp_analyze_bb_1 (basic_block bb)
+ {
+ bb_vec_info bb_vinfo;
+ VEC (ddr_p, heap) *ddrs;
+ VEC (slp_instance, heap) *slp_instances;
+ slp_instance instance;
+- int i, insns = 0;
+- gimple_stmt_iterator gsi;
++ int i;
+ int min_vf = 2;
+ int max_vf = MAX_VECTORIZATION_FACTOR;
+ bool data_dependence_in_bb = false;
+
+- current_vector_size = 0;
+-
+- if (vect_print_dump_info (REPORT_DETAILS))
+- fprintf (vect_dump, "===vect_slp_analyze_bb===\n");
+-
+- for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+- {
+- gimple stmt = gsi_stmt (gsi);
+- if (!is_gimple_debug (stmt)
+- && !gimple_nop_p (stmt)
+- && gimple_code (stmt) != GIMPLE_LABEL)
+- insns++;
+- }
+-
+- if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
+- {
+- if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
+- fprintf (vect_dump, "not vectorized: too many instructions in basic "
+- "block.\n");
+-
+- return NULL;
+- }
+-
+ bb_vinfo = new_bb_vec_info (bb);
+ if (!bb_vinfo)
+ return NULL;
+@@ -1819,6 +1795,61 @@
+ }
+
+
++bb_vec_info
++vect_slp_analyze_bb (basic_block bb)
++{
++ bb_vec_info bb_vinfo;
++ int insns = 0;
++ gimple_stmt_iterator gsi;
++ unsigned int vector_sizes;
++
++ if (vect_print_dump_info (REPORT_DETAILS))
++ fprintf (vect_dump, "===vect_slp_analyze_bb===\n");
++
++ for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
++ {
++ gimple stmt = gsi_stmt (gsi);
++ if (!is_gimple_debug (stmt)
++ && !gimple_nop_p (stmt)
++ && gimple_code (stmt) != GIMPLE_LABEL)
++ insns++;
++ }
++
++ if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
++ {
++ if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
++ fprintf (vect_dump, "not vectorized: too many instructions in basic "
++ "block.\n");
++
++ return NULL;
++ }
++
++ /* Autodetect first vector size we try. */
++ current_vector_size = 0;
++ vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
++
++ while (1)
++ {
++ bb_vinfo = vect_slp_analyze_bb_1 (bb);
++ if (bb_vinfo)
++ return bb_vinfo;
++
++ destroy_bb_vec_info (bb_vinfo);
++
++ vector_sizes &= ~current_vector_size;
++ if (vector_sizes == 0
++ || current_vector_size == 0)
++ return NULL;
++
++ /* Try the next biggest vector size. */
++ current_vector_size = 1 << floor_log2 (vector_sizes);
++ if (vect_print_dump_info (REPORT_DETAILS))
++ fprintf (vect_dump, "***** Re-trying analysis with "
++ "vector size %d\n", current_vector_size);
++ }
++}
++
++
+ /* SLP costs are calculated according to SLP instance unrolling factor (i.e.,
+ the number of created vector stmts depends on the unrolling factor).
+ However, the actual number of vector stmts for every SLP node depends on
+
--- /dev/null
+2011-10-13 Andrew Stubbs <ams@codesourcery.com>
+
+ Backport from mainline:
+
+ 2011-10-07 Andrew Stubbs <ams@codesourcery.com>
+
+ gcc/
+ * config/arm/predicates.md (shift_amount_operand): Remove constant
+ range check.
+ (shift_operator): Check range of constants for all shift operators.
+
+ gcc/testsuite/
+ * gcc.dg/pr50193-1.c: New file.
+ * gcc.target/arm/shiftable.c: New file.
+
+=== modified file 'gcc/config/arm/predicates.md'
+--- old/gcc/config/arm/predicates.md 2011-10-03 09:47:33 +0000
++++ new/gcc/config/arm/predicates.md 2011-10-10 11:43:28 +0000
+@@ -129,11 +129,12 @@
+ (ior (match_operand 0 "arm_rhs_operand")
+ (match_operand 0 "memory_operand")))
+
++;; This doesn't have to do much because the constant is already checked
++;; in the shift_operator predicate.
+ (define_predicate "shift_amount_operand"
+ (ior (and (match_test "TARGET_ARM")
+ (match_operand 0 "s_register_operand"))
+- (and (match_operand 0 "const_int_operand")
+- (match_test "INTVAL (op) > 0"))))
++ (match_operand 0 "const_int_operand")))
+
+ (define_predicate "arm_add_operand"
+ (ior (match_operand 0 "arm_rhs_operand")
+@@ -219,13 +220,20 @@
+ (match_test "mode == GET_MODE (op)")))
+
+ ;; True for shift operators.
++;; Notes:
++;; * mult is only permitted with a constant shift amount
++;; * patterns that permit register shift amounts only in ARM mode use
++;; shift_amount_operand, patterns that always allow registers do not,
++;; so we don't have to worry about that sort of thing here.
+ (define_special_predicate "shift_operator"
+ (and (ior (ior (and (match_code "mult")
+ (match_test "power_of_two_operand (XEXP (op, 1), mode)"))
+ (and (match_code "rotate")
+ (match_test "GET_CODE (XEXP (op, 1)) == CONST_INT
+ && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) < 32")))
+- (match_code "ashift,ashiftrt,lshiftrt,rotatert"))
++ (and (match_code "ashift,ashiftrt,lshiftrt,rotatert")
++ (match_test "GET_CODE (XEXP (op, 1)) != CONST_INT
++ || ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) < 32")))
+ (match_test "mode == GET_MODE (op)")))
+
+ ;; True for MULT, to identify which variant of shift_operator is in use.
+
+=== added file 'gcc/testsuite/gcc.target/arm/shiftable.c'
+--- old/gcc/testsuite/gcc.target/arm/shiftable.c 1970-01-01 00:00:00 +0000
++++ new/gcc/testsuite/gcc.target/arm/shiftable.c 2011-10-10 11:43:28 +0000
+@@ -0,0 +1,63 @@
++/* { dg-do compile } */
++/* { dg-options "-O2" } */
++/* { dg-require-effective-target arm32 } */
++
++/* ARM has shift-and-alu insns. Depending on the ALU op GCC represents some
++ of these as a left shift, others as a multiply. Check that we match the
++ right one. */
++
++int
++plus (int a, int b)
++{
++ return (a * 64) + b;
++}
++
++/* { dg-final { scan-assembler "add.*\[al]sl #6" } } */
++
++int
++minus (int a, int b)
++{
++ return a - (b * 64);
++}
++
++/* { dg-final { scan-assembler "sub.*\[al]sl #6" } } */
++
++int
++ior (int a, int b)
++{
++ return (a * 64) | b;
++}
++
++/* { dg-final { scan-assembler "orr.*\[al]sl #6" } } */
++
++int
++xor (int a, int b)
++{
++ return (a * 64) ^ b;
++}
++
++/* { dg-final { scan-assembler "eor.*\[al]sl #6" } } */
++
++int
++and (int a, int b)
++{
++ return (a * 64) & b;
++}
++
++/* { dg-final { scan-assembler "and.*\[al]sl #6" } } */
++
++int
++rsb (int a, int b)
++{
++ return (a * 64) - b;
++}
++
++/* { dg-final { scan-assembler "rsb.*\[al]sl #6" } } */
++
++int
++mvn (int a, int b)
++{
++ return ~(a * 64);
++}
++
++/* { dg-final { scan-assembler "mvn.*\[al]sl #6" } } */
+
--- /dev/null
+2011-10-16 Ira Rosen <ira.rosen@linaro.org>
+
+ Backport from mainline:
+
+ 2011-09-27 Ira Rosen <ira.rosen@linaro.org>
+
+ gcc/
+ * tree-vect-stmts.c (vectorizable_type_demotion): Handle basic block
+ vectorization.
+ (vectorizable_type_promotion): Likewise.
+ (vect_analyze_stmt): Call vectorizable_type_demotion and
+ vectorizable_type_promotion for basic blocks.
+ (supportable_widening_operation): Don't assume loop vectorization.
+ * tree-vect-slp.c (vect_build_slp_tree): Allow multiple types for
+ basic blocks. Update vectorization factor for basic block
+ vectorization.
+ (vect_analyze_slp_instance): Allow multiple types for basic block
+ vectorization. Recheck unrolling factor after construction of SLP
+ instance.
+
+ gcc/testsuite/
+ * gcc.dg/vect/bb-slp-11.c: Expect to get vectorized with 64-bit
+ vectors.
+ * gcc.dg/vect/bb-slp-27.c: New.
+ * gcc.dg/vect/bb-slp-28.c: New.
+
+
+ 2011-10-04 Ira Rosen <ira.rosen@linaro.org>
+
+ gcc/testsuite/
+ * lib/target-supports.exp (check_effective_target_vect_multiple_sizes):
+ Make et_vect_multiple_sizes_saved global.
+ (check_effective_target_vect64): Make et_vect64_saved global.
+
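One detail worth spelling out from the entry above is the unrolling-factor recheck. Below is a hedged, standalone sketch of the arithmetic (invented helper names, not GCC code): the factor is least_common_multiple (max_nunits, group_size) / group_size, and any value other than 1 means the group would have to be unrolled, which straight-line basic block SLP cannot do, hence the "unrolling required in basic block SLP" bail-out in the tree-vect-slp.c hunk further below.

#include <stdio.h>

static unsigned int
gcd (unsigned int a, unsigned int b)
{
  while (b)
    {
      unsigned int t = a % b;
      a = b;
      b = t;
    }
  return a;
}

/* Mirrors least_common_multiple (max_nunits, group_size) / group_size.  */
static unsigned int
unrolling_factor (unsigned int max_nunits, unsigned int group_size)
{
  return max_nunits / gcd (max_nunits, group_size);
}

int
main (void)
{
  /* Eight shorts per group with 8 lanes per vector: factor 1, fine.  */
  printf ("%u\n", unrolling_factor (8, 8));
  /* If the smallest type in the group needs 16 lanes, the factor becomes 2
     and basic block SLP gives up.  */
  printf ("%u\n", unrolling_factor (16, 8));
  return 0;
}
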
+=== modified file 'gcc/testsuite/gcc.dg/vect/bb-slp-11.c'
+--- old/gcc/testsuite/gcc.dg/vect/bb-slp-11.c 2011-10-02 10:40:34 +0000
++++ new/gcc/testsuite/gcc.dg/vect/bb-slp-11.c 2011-10-06 11:08:08 +0000
+@@ -48,8 +48,6 @@
+ return 0;
+ }
+
+-/* { dg-final { scan-tree-dump-times "basic block vectorized using SLP" 0 "slp" } } */
+-/* { dg-final { scan-tree-dump-times "SLP with multiple types" 1 "slp" { xfail vect_multiple_sizes } } } */
+-/* { dg-final { scan-tree-dump-times "SLP with multiple types" 2 "slp" { target vect_multiple_sizes } } } */
++/* { dg-final { scan-tree-dump-times "basic block vectorized using SLP" 1 "slp" { target vect64 } } } */
+ /* { dg-final { cleanup-tree-dump "slp" } } */
+
+
+=== added file 'gcc/testsuite/gcc.dg/vect/bb-slp-27.c'
+--- old/gcc/testsuite/gcc.dg/vect/bb-slp-27.c 1970-01-01 00:00:00 +0000
++++ new/gcc/testsuite/gcc.dg/vect/bb-slp-27.c 2011-10-06 11:08:08 +0000
+@@ -0,0 +1,49 @@
++/* { dg-require-effective-target vect_int } */
++
++#include <stdarg.h>
++#include "tree-vect.h"
++
++#define A 3
++#define N 16
++
++short src[N], dst[N];
++
++void foo (int a)
++{
++ dst[0] += a*src[0];
++ dst[1] += a*src[1];
++ dst[2] += a*src[2];
++ dst[3] += a*src[3];
++ dst[4] += a*src[4];
++ dst[5] += a*src[5];
++ dst[6] += a*src[6];
++ dst[7] += a*src[7];
++}
++
++
++int main (void)
++{
++ int i;
++
++ check_vect ();
++
++ for (i = 0; i < N; i++)
++ {
++ dst[i] = 0;
++ src[i] = i;
++ }
++
++ foo (A);
++
++ for (i = 0; i < 8; i++)
++ {
++ if (dst[i] != A * i)
++ abort ();
++ }
++
++ return 0;
++}
++
++/* { dg-final { scan-tree-dump-times "basic block vectorized using SLP" 1 "slp" { target { vect_int_mult && { vect_unpack && vect_pack_trunc } } } } } */
++/* { dg-final { cleanup-tree-dump "slp" } } */
++
+
+=== added file 'gcc/testsuite/gcc.dg/vect/bb-slp-28.c'
+--- old/gcc/testsuite/gcc.dg/vect/bb-slp-28.c 1970-01-01 00:00:00 +0000
++++ new/gcc/testsuite/gcc.dg/vect/bb-slp-28.c 2011-10-06 11:08:08 +0000
+@@ -0,0 +1,71 @@
++/* { dg-require-effective-target vect_int } */
++
++#include <stdarg.h>
++#include "tree-vect.h"
++
++#define A 300
++#define N 16
++
++char src[N];
++short dst[N];
++short src1[N], dst1[N];
++
++void foo (int a)
++{
++ dst[0] = (short) (a * (int) src[0]);
++ dst[1] = (short) (a * (int) src[1]);
++ dst[2] = (short) (a * (int) src[2]);
++ dst[3] = (short) (a * (int) src[3]);
++ dst[4] = (short) (a * (int) src[4]);
++ dst[5] = (short) (a * (int) src[5]);
++ dst[6] = (short) (a * (int) src[6]);
++ dst[7] = (short) (a * (int) src[7]);
++ dst[8] = (short) (a * (int) src[8]);
++ dst[9] = (short) (a * (int) src[9]);
++ dst[10] = (short) (a * (int) src[10]);
++ dst[11] = (short) (a * (int) src[11]);
++ dst[12] = (short) (a * (int) src[12]);
++ dst[13] = (short) (a * (int) src[13]);
++ dst[14] = (short) (a * (int) src[14]);
++ dst[15] = (short) (a * (int) src[15]);
++
++ dst1[0] += src1[0];
++ dst1[1] += src1[1];
++ dst1[2] += src1[2];
++ dst1[3] += src1[3];
++ dst1[4] += src1[4];
++ dst1[5] += src1[5];
++ dst1[6] += src1[6];
++ dst1[7] += src1[7];
++}
++
++
++int main (void)
++{
++ int i;
++
++ check_vect ();
++
++ for (i = 0; i < N; i++)
++ {
++ dst[i] = 2;
++ dst1[i] = 0;
++ src[i] = i;
++ src1[i] = i+2;
++ }
++
++ foo (A);
++
++ for (i = 0; i < N; i++)
++ {
++ if (dst[i] != A * i
++ || (i < N/2 && dst1[i] != i + 2))
++ abort ();
++ }
++
++ return 0;
++}
++
++/* { dg-final { scan-tree-dump-times "basic block vectorized using SLP" 1 "slp" { target { vect_int_mult && { vect_pack_trunc && vect_unpack } } } } } */
++/* { dg-final { cleanup-tree-dump "slp" } } */
++
+
+=== modified file 'gcc/testsuite/lib/target-supports.exp'
+--- old/gcc/testsuite/lib/target-supports.exp 2011-10-02 10:40:34 +0000
++++ new/gcc/testsuite/lib/target-supports.exp 2011-10-06 11:08:08 +0000
+@@ -3268,7 +3268,7 @@
+ # Return 1 if the target supports multiple vector sizes
+
+ proc check_effective_target_vect_multiple_sizes { } {
+- global et_vect_multiple_sizes
++ global et_vect_multiple_sizes_saved
+
+ if [info exists et_vect_multiple_sizes_saved] {
+ verbose "check_effective_target_vect_multiple_sizes: using cached result" 2
+@@ -3286,7 +3286,7 @@
+ # Return 1 if the target supports vectors of 64 bits.
+
+ proc check_effective_target_vect64 { } {
+- global et_vect64
++ global et_vect64_saved
+
+ if [info exists et_vect64_saved] {
+ verbose "check_effective_target_vect64: using cached result" 2
+
+=== modified file 'gcc/tree-vect-slp.c'
+--- old/gcc/tree-vect-slp.c 2011-10-02 10:40:34 +0000
++++ new/gcc/tree-vect-slp.c 2011-10-06 11:08:08 +0000
+@@ -386,20 +386,15 @@
+ return false;
+ }
+
+- ncopies = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
+- if (ncopies != 1)
+- {
+- if (vect_print_dump_info (REPORT_SLP))
+- fprintf (vect_dump, "SLP with multiple types ");
+-
+- /* FORNOW: multiple types are unsupported in BB SLP. */
+- if (bb_vinfo)
+- return false;
+- }
+-
+ /* In case of multiple types we need to detect the smallest type. */
+ if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
+- *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
++ {
++ *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
++ if (bb_vinfo)
++ vectorization_factor = *max_nunits;
++ }
++
++ ncopies = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
+
+ if (is_gimple_call (stmt))
+ rhs_code = CALL_EXPR;
+@@ -1183,7 +1178,6 @@
+ if (loop_vinfo)
+ vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+ else
+- /* No multitypes in BB SLP. */
+ vectorization_factor = nunits;
+
+ /* Calculate the unrolling factor. */
+@@ -1246,16 +1240,23 @@
+ &max_nunits, &load_permutation, &loads,
+ vectorization_factor))
+ {
++ /* Calculate the unrolling factor based on the smallest type. */
++ if (max_nunits > nunits)
++ unrolling_factor = least_common_multiple (max_nunits, group_size)
++ / group_size;
++
++ if (unrolling_factor != 1 && !loop_vinfo)
++ {
++ if (vect_print_dump_info (REPORT_SLP))
++ fprintf (vect_dump, "Build SLP failed: unrolling required in basic"
++ " block SLP");
++ return false;
++ }
++
+ /* Create a new SLP instance. */
+ new_instance = XNEW (struct _slp_instance);
+ SLP_INSTANCE_TREE (new_instance) = node;
+ SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
+- /* Calculate the unrolling factor based on the smallest type in the
+- loop. */
+- if (max_nunits > nunits)
+- unrolling_factor = least_common_multiple (max_nunits, group_size)
+- / group_size;
+-
+ SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
+ SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (new_instance) = outside_cost;
+ SLP_INSTANCE_INSIDE_OF_LOOP_COST (new_instance) = inside_cost;
+
+=== modified file 'gcc/tree-vect-stmts.c'
+--- old/gcc/tree-vect-stmts.c 2011-10-04 08:57:25 +0000
++++ new/gcc/tree-vect-stmts.c 2011-10-16 12:16:07 +0000
+@@ -3081,11 +3081,9 @@
+ VEC (tree, heap) *vec_oprnds0 = NULL;
+ VEC (tree, heap) *vec_dsts = NULL, *interm_types = NULL, *tmp_vec_dsts = NULL;
+ tree last_oprnd, intermediate_type;
+-
+- /* FORNOW: not supported by basic block SLP vectorization. */
+- gcc_assert (loop_vinfo);
+-
+- if (!STMT_VINFO_RELEVANT_P (stmt_info))
++ bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
++
++ if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
+ return false;
+
+ if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
+@@ -3113,7 +3111,7 @@
+ && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
+ && CONVERT_EXPR_CODE_P (code))))
+ return false;
+- if (!vect_is_simple_use_1 (op0, loop_vinfo, NULL,
++ if (!vect_is_simple_use_1 (op0, loop_vinfo, bb_vinfo,
+ &def_stmt, &def, &dt[0], &vectype_in))
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+@@ -3360,11 +3358,9 @@
+ int multi_step_cvt = 0;
+ VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
+ VEC (tree, heap) *vec_dsts = NULL, *interm_types = NULL, *tmp_vec_dsts = NULL;
+-
+- /* FORNOW: not supported by basic block SLP vectorization. */
+- gcc_assert (loop_vinfo);
+-
+- if (!STMT_VINFO_RELEVANT_P (stmt_info))
++ bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
++
++ if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
+ return false;
+
+ if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
+@@ -3393,7 +3389,7 @@
+ && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
+ && CONVERT_EXPR_CODE_P (code))))
+ return false;
+- if (!vect_is_simple_use_1 (op0, loop_vinfo, NULL,
++ if (!vect_is_simple_use_1 (op0, loop_vinfo, bb_vinfo,
+ &def_stmt, &def, &dt[0], &vectype_in))
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+@@ -5153,7 +5149,9 @@
+ else
+ {
+ if (bb_vinfo)
+- ok = (vectorizable_shift (stmt, NULL, NULL, node)
++ ok = (vectorizable_type_promotion (stmt, NULL, NULL, node)
++ || vectorizable_type_demotion (stmt, NULL, NULL, node)
++ || vectorizable_shift (stmt, NULL, NULL, node)
+ || vectorizable_operation (stmt, NULL, NULL, node)
+ || vectorizable_assignment (stmt, NULL, NULL, node)
+ || vectorizable_load (stmt, NULL, NULL, node, NULL)
+@@ -5780,7 +5778,7 @@
+ {
+ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
+- struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
++ struct loop *vect_loop = NULL;
+ bool ordered_p;
+ enum machine_mode vec_mode;
+ enum insn_code icode1, icode2;
+@@ -5789,6 +5787,9 @@
+ tree wide_vectype = vectype_out;
+ enum tree_code c1, c2;
+
++ if (loop_info)
++ vect_loop = LOOP_VINFO_LOOP (loop_info);
++
+ /* The result of a vectorized widening operation usually requires two vectors
+ (because the widened results do not fit int one vector). The generated
+ vector results would normally be expected to be generated in the same
+@@ -5809,7 +5810,8 @@
+ iterations in parallel). We therefore don't allow to change the order
+ of the computation in the inner-loop during outer-loop vectorization. */
+
+- if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_by_reduction
++ if (vect_loop
++ && STMT_VINFO_RELEVANT (stmt_info) == vect_used_by_reduction
+ && !nested_in_vect_loop_p (vect_loop, stmt))
+ ordered_p = false;
+ else
+
--- /dev/null
+2011-10-17 Michael Hope <michael.hope@linaro.org>
+
+ Backport from mainline r178852:
+
+ 2011-09-14 Julian Brown <julian@codesourcery.com>
+
+ gcc/
+ * config/arm/arm.c (arm_override_options): Add unaligned_access
+ support.
+ (arm_file_start): Emit attribute for unaligned access as appropriate.
+ * config/arm/arm.md (UNSPEC_UNALIGNED_LOAD)
+ (UNSPEC_UNALIGNED_STORE): Add constants for unspecs.
+ (insv, extzv): Add unaligned-access support.
+ (extv): Change to expander. Likewise.
+ (extzv_t1, extv_regsi): Add helpers.
+ (unaligned_loadsi, unaligned_loadhis, unaligned_loadhiu)
+ (unaligned_storesi, unaligned_storehi): New.
+ (*extv_reg): New (previous extv implementation).
+ * config/arm/arm.opt (munaligned_access): Add option.
+ * config/arm/constraints.md (Uw): New constraint.
+ * expmed.c (store_bit_field_1): Adjust bitfield numbering according
+ to size of access, not size of unit, when BITS_BIG_ENDIAN !=
+ BYTES_BIG_ENDIAN. Don't use bitfield accesses for
+ volatile accesses when -fstrict-volatile-bitfields is in effect.
+ (extract_bit_field_1): Likewise.
+
+ Backport from mainline r172697:
+
+ 2011-04-19 Wei Guozhi <carrot@google.com>
+
+ PR target/47855
+ gcc/
+ * config/arm/arm-protos.h (thumb1_legitimate_address_p): New prototype.
+ * config/arm/arm.c (thumb1_legitimate_address_p): Remove the static
+ linkage.
+ * config/arm/constraints.md (Uu): New constraint.
+ * config/arm/arm.md (*arm_movqi_insn): Compute attr "length".
+
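As a rough illustration of what the unaligned-access backport above enables (this example is not part of the patch, and the struct and function names are invented): on ARMv6 and later cores, where -munaligned-access now defaults to on, a misaligned word inside a packed structure can typically be loaded with a single unaligned ldr, via patterns such as unaligned_loadsi below, rather than being assembled from individual byte loads.

#include <stdio.h>

/* The `length' field is misaligned because of the preceding byte.  With
   -munaligned-access the compiler may read it with one unaligned ldr;
   without it, the access is expanded into byte loads and shifts.  */
struct __attribute__ ((packed)) header
{
  unsigned char tag;
  unsigned int length;
};

unsigned int
get_length (const struct header *h)
{
  return h->length;
}

int
main (void)
{
  struct header h = { 1, 1234u };
  printf ("%u\n", get_length (&h));
  return 0;
}
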
+=== modified file 'gcc/config/arm/arm-protos.h'
+--- old/gcc/config/arm/arm-protos.h 2011-10-03 09:46:40 +0000
++++ new/gcc/config/arm/arm-protos.h 2011-10-11 01:56:19 +0000
+@@ -59,6 +59,7 @@
+ int);
+ extern rtx thumb_legitimize_reload_address (rtx *, enum machine_mode, int, int,
+ int);
++extern int thumb1_legitimate_address_p (enum machine_mode, rtx, int);
+ extern int arm_const_double_rtx (rtx);
+ extern int neg_const_double_rtx_ok_for_fpa (rtx);
+ extern int vfp3_const_double_rtx (rtx);
+
+=== modified file 'gcc/config/arm/arm.c'
+--- old/gcc/config/arm/arm.c 2011-10-03 09:46:40 +0000
++++ new/gcc/config/arm/arm.c 2011-10-11 02:31:01 +0000
+@@ -2065,6 +2065,28 @@
+ fix_cm3_ldrd = 0;
+ }
+
++ /* Enable -munaligned-access by default for
++ - all ARMv6 architecture-based processors
++ - ARMv7-A, ARMv7-R, and ARMv7-M architecture-based processors.
++
++ Disable -munaligned-access by default for
++ - all pre-ARMv6 architecture-based processors
++ - ARMv6-M architecture-based processors. */
++
++ if (unaligned_access == 2)
++ {
++ if (arm_arch6 && (arm_arch_notm || arm_arch7))
++ unaligned_access = 1;
++ else
++ unaligned_access = 0;
++ }
++ else if (unaligned_access == 1
++ && !(arm_arch6 && (arm_arch_notm || arm_arch7)))
++ {
++ warning (0, "target CPU does not support unaligned accesses");
++ unaligned_access = 0;
++ }
++
+ if (TARGET_THUMB1 && flag_schedule_insns)
+ {
+ /* Don't warn since it's on by default in -O2. */
+@@ -6106,7 +6128,7 @@
+ addresses based on the frame pointer or arg pointer until the
+ reload pass starts. This is so that eliminating such addresses
+ into stack based ones won't produce impossible code. */
+-static int
++int
+ thumb1_legitimate_address_p (enum machine_mode mode, rtx x, int strict_p)
+ {
+ /* ??? Not clear if this is right. Experiment. */
+@@ -22226,6 +22248,10 @@
+ val = 6;
+ asm_fprintf (asm_out_file, "\t.eabi_attribute 30, %d\n", val);
+
++ /* Tag_CPU_unaligned_access. */
++ asm_fprintf (asm_out_file, "\t.eabi_attribute 34, %d\n",
++ unaligned_access);
++
+ /* Tag_ABI_FP_16bit_format. */
+ if (arm_fp16_format)
+ asm_fprintf (asm_out_file, "\t.eabi_attribute 38, %d\n",
+
+=== modified file 'gcc/config/arm/arm.md'
+--- old/gcc/config/arm/arm.md 2011-10-03 09:47:33 +0000
++++ new/gcc/config/arm/arm.md 2011-10-11 02:31:01 +0000
+@@ -113,6 +113,10 @@
+ (UNSPEC_SYMBOL_OFFSET 27) ; The offset of the start of the symbol from
+ ; another symbolic address.
+ (UNSPEC_MEMORY_BARRIER 28) ; Represent a memory barrier.
++ (UNSPEC_UNALIGNED_LOAD 29) ; Used to represent ldr/ldrh instructions that access
++ ; unaligned locations, on architectures which support
++ ; that.
++ (UNSPEC_UNALIGNED_STORE 30) ; Same for str/strh.
+ ]
+ )
+
+@@ -2463,10 +2467,10 @@
+ ;;; this insv pattern, so this pattern needs to be reevalutated.
+
+ (define_expand "insv"
+- [(set (zero_extract:SI (match_operand:SI 0 "s_register_operand" "")
+- (match_operand:SI 1 "general_operand" "")
+- (match_operand:SI 2 "general_operand" ""))
+- (match_operand:SI 3 "reg_or_int_operand" ""))]
++ [(set (zero_extract (match_operand 0 "nonimmediate_operand" "")
++ (match_operand 1 "general_operand" "")
++ (match_operand 2 "general_operand" ""))
++ (match_operand 3 "reg_or_int_operand" ""))]
+ "TARGET_ARM || arm_arch_thumb2"
+ "
+ {
+@@ -2477,35 +2481,70 @@
+
+ if (arm_arch_thumb2)
+ {
+- bool use_bfi = TRUE;
+-
+- if (GET_CODE (operands[3]) == CONST_INT)
+- {
+- HOST_WIDE_INT val = INTVAL (operands[3]) & mask;
+-
+- if (val == 0)
+- {
+- emit_insn (gen_insv_zero (operands[0], operands[1],
+- operands[2]));
++ if (unaligned_access && MEM_P (operands[0])
++ && s_register_operand (operands[3], GET_MODE (operands[3]))
++ && (width == 16 || width == 32) && (start_bit % BITS_PER_UNIT) == 0)
++ {
++ rtx base_addr;
++
++ if (BYTES_BIG_ENDIAN)
++ start_bit = GET_MODE_BITSIZE (GET_MODE (operands[3])) - width
++ - start_bit;
++
++ if (width == 32)
++ {
++ base_addr = adjust_address (operands[0], SImode,
++ start_bit / BITS_PER_UNIT);
++ emit_insn (gen_unaligned_storesi (base_addr, operands[3]));
++ }
++ else
++ {
++ rtx tmp = gen_reg_rtx (HImode);
++
++ base_addr = adjust_address (operands[0], HImode,
++ start_bit / BITS_PER_UNIT);
++ emit_move_insn (tmp, gen_lowpart (HImode, operands[3]));
++ emit_insn (gen_unaligned_storehi (base_addr, tmp));
++ }
++ DONE;
++ }
++ else if (s_register_operand (operands[0], GET_MODE (operands[0])))
++ {
++ bool use_bfi = TRUE;
++
++ if (GET_CODE (operands[3]) == CONST_INT)
++ {
++ HOST_WIDE_INT val = INTVAL (operands[3]) & mask;
++
++ if (val == 0)
++ {
++ emit_insn (gen_insv_zero (operands[0], operands[1],
++ operands[2]));
++ DONE;
++ }
++
++ /* See if the set can be done with a single orr instruction. */
++ if (val == mask && const_ok_for_arm (val << start_bit))
++ use_bfi = FALSE;
++ }
++
++ if (use_bfi)
++ {
++ if (GET_CODE (operands[3]) != REG)
++ operands[3] = force_reg (SImode, operands[3]);
++
++ emit_insn (gen_insv_t2 (operands[0], operands[1], operands[2],
++ operands[3]));
+ DONE;
+ }
+-
+- /* See if the set can be done with a single orr instruction. */
+- if (val == mask && const_ok_for_arm (val << start_bit))
+- use_bfi = FALSE;
+- }
+-
+- if (use_bfi)
+- {
+- if (GET_CODE (operands[3]) != REG)
+- operands[3] = force_reg (SImode, operands[3]);
+-
+- emit_insn (gen_insv_t2 (operands[0], operands[1], operands[2],
+- operands[3]));
+- DONE;
+- }
++ }
++ else
++ FAIL;
+ }
+
++ if (!s_register_operand (operands[0], GET_MODE (operands[0])))
++ FAIL;
++
+ target = copy_rtx (operands[0]);
+ /* Avoid using a subreg as a subtarget, and avoid writing a paradoxical
+ subreg as the final target. */
+@@ -3697,12 +3736,10 @@
+ ;; to reduce register pressure later on.
+
+ (define_expand "extzv"
+- [(set (match_dup 4)
+- (ashift:SI (match_operand:SI 1 "register_operand" "")
+- (match_operand:SI 2 "const_int_operand" "")))
+- (set (match_operand:SI 0 "register_operand" "")
+- (lshiftrt:SI (match_dup 4)
+- (match_operand:SI 3 "const_int_operand" "")))]
++ [(set (match_operand 0 "s_register_operand" "")
++ (zero_extract (match_operand 1 "nonimmediate_operand" "")
++ (match_operand 2 "const_int_operand" "")
++ (match_operand 3 "const_int_operand" "")))]
+ "TARGET_THUMB1 || arm_arch_thumb2"
+ "
+ {
+@@ -3711,10 +3748,57 @@
+
+ if (arm_arch_thumb2)
+ {
+- emit_insn (gen_extzv_t2 (operands[0], operands[1], operands[2],
+- operands[3]));
+- DONE;
++ HOST_WIDE_INT width = INTVAL (operands[2]);
++ HOST_WIDE_INT bitpos = INTVAL (operands[3]);
++
++ if (unaligned_access && MEM_P (operands[1])
++ && (width == 16 || width == 32) && (bitpos % BITS_PER_UNIT) == 0)
++ {
++ rtx base_addr;
++
++ if (BYTES_BIG_ENDIAN)
++ bitpos = GET_MODE_BITSIZE (GET_MODE (operands[0])) - width
++ - bitpos;
++
++ if (width == 32)
++ {
++ base_addr = adjust_address (operands[1], SImode,
++ bitpos / BITS_PER_UNIT);
++ emit_insn (gen_unaligned_loadsi (operands[0], base_addr));
++ }
++ else
++ {
++ rtx dest = operands[0];
++ rtx tmp = gen_reg_rtx (SImode);
++
++ /* We may get a paradoxical subreg here. Strip it off. */
++ if (GET_CODE (dest) == SUBREG
++ && GET_MODE (dest) == SImode
++ && GET_MODE (SUBREG_REG (dest)) == HImode)
++ dest = SUBREG_REG (dest);
++
++ if (GET_MODE_BITSIZE (GET_MODE (dest)) != width)
++ FAIL;
++
++ base_addr = adjust_address (operands[1], HImode,
++ bitpos / BITS_PER_UNIT);
++ emit_insn (gen_unaligned_loadhiu (tmp, base_addr));
++ emit_move_insn (gen_lowpart (SImode, dest), tmp);
++ }
++ DONE;
++ }
++ else if (s_register_operand (operands[1], GET_MODE (operands[1])))
++ {
++ emit_insn (gen_extzv_t2 (operands[0], operands[1], operands[2],
++ operands[3]));
++ DONE;
++ }
++ else
++ FAIL;
+ }
++
++ if (!s_register_operand (operands[1], GET_MODE (operands[1])))
++ FAIL;
+
+ operands[3] = GEN_INT (rshift);
+
+@@ -3724,12 +3808,154 @@
+ DONE;
+ }
+
+- operands[2] = GEN_INT (lshift);
+- operands[4] = gen_reg_rtx (SImode);
++ emit_insn (gen_extzv_t1 (operands[0], operands[1], GEN_INT (lshift),
++ operands[3], gen_reg_rtx (SImode)));
++ DONE;
+ }"
+ )
+
+-(define_insn "extv"
++;; Helper for extzv, for the Thumb-1 register-shifts case.
++
++(define_expand "extzv_t1"
++ [(set (match_operand:SI 4 "s_register_operand" "")
++ (ashift:SI (match_operand:SI 1 "nonimmediate_operand" "")
++ (match_operand:SI 2 "const_int_operand" "")))
++ (set (match_operand:SI 0 "s_register_operand" "")
++ (lshiftrt:SI (match_dup 4)
++ (match_operand:SI 3 "const_int_operand" "")))]
++ "TARGET_THUMB1"
++ "")
++
++(define_expand "extv"
++ [(set (match_operand 0 "s_register_operand" "")
++ (sign_extract (match_operand 1 "nonimmediate_operand" "")
++ (match_operand 2 "const_int_operand" "")
++ (match_operand 3 "const_int_operand" "")))]
++ "arm_arch_thumb2"
++{
++ HOST_WIDE_INT width = INTVAL (operands[2]);
++ HOST_WIDE_INT bitpos = INTVAL (operands[3]);
++
++ if (unaligned_access && MEM_P (operands[1]) && (width == 16 || width == 32)
++ && (bitpos % BITS_PER_UNIT) == 0)
++ {
++ rtx base_addr;
++
++ if (BYTES_BIG_ENDIAN)
++ bitpos = GET_MODE_BITSIZE (GET_MODE (operands[0])) - width - bitpos;
++
++ if (width == 32)
++ {
++ base_addr = adjust_address (operands[1], SImode,
++ bitpos / BITS_PER_UNIT);
++ emit_insn (gen_unaligned_loadsi (operands[0], base_addr));
++ }
++ else
++ {
++ rtx dest = operands[0];
++ rtx tmp = gen_reg_rtx (SImode);
++
++ /* We may get a paradoxical subreg here. Strip it off. */
++ if (GET_CODE (dest) == SUBREG
++ && GET_MODE (dest) == SImode
++ && GET_MODE (SUBREG_REG (dest)) == HImode)
++ dest = SUBREG_REG (dest);
++
++ if (GET_MODE_BITSIZE (GET_MODE (dest)) != width)
++ FAIL;
++
++ base_addr = adjust_address (operands[1], HImode,
++ bitpos / BITS_PER_UNIT);
++ emit_insn (gen_unaligned_loadhis (tmp, base_addr));
++ emit_move_insn (gen_lowpart (SImode, dest), tmp);
++ }
++
++ DONE;
++ }
++ else if (!s_register_operand (operands[1], GET_MODE (operands[1])))
++ FAIL;
++ else if (GET_MODE (operands[0]) == SImode
++ && GET_MODE (operands[1]) == SImode)
++ {
++ emit_insn (gen_extv_regsi (operands[0], operands[1], operands[2],
++ operands[3]));
++ DONE;
++ }
++
++ FAIL;
++})
++
++; Helper to expand register forms of extv with the proper modes.
++
++(define_expand "extv_regsi"
++ [(set (match_operand:SI 0 "s_register_operand" "")
++ (sign_extract:SI (match_operand:SI 1 "s_register_operand" "")
++ (match_operand 2 "const_int_operand" "")
++ (match_operand 3 "const_int_operand" "")))]
++ ""
++{
++})
++
++; ARMv6+ unaligned load/store instructions (used for packed structure accesses).
++
++(define_insn "unaligned_loadsi"
++ [(set (match_operand:SI 0 "s_register_operand" "=l,r")
++ (unspec:SI [(match_operand:SI 1 "memory_operand" "Uw,m")]
++ UNSPEC_UNALIGNED_LOAD))]
++ "unaligned_access && TARGET_32BIT"
++ "ldr%?\t%0, %1\t@ unaligned"
++ [(set_attr "arch" "t2,any")
++ (set_attr "length" "2,4")
++ (set_attr "predicable" "yes")
++ (set_attr "type" "load1")])
++
++(define_insn "unaligned_loadhis"
++ [(set (match_operand:SI 0 "s_register_operand" "=l,r")
++ (sign_extend:SI
++ (unspec:HI [(match_operand:HI 1 "memory_operand" "Uw,m")]
++ UNSPEC_UNALIGNED_LOAD)))]
++ "unaligned_access && TARGET_32BIT"
++ "ldr%(sh%)\t%0, %1\t@ unaligned"
++ [(set_attr "arch" "t2,any")
++ (set_attr "length" "2,4")
++ (set_attr "predicable" "yes")
++ (set_attr "type" "load_byte")])
++
++(define_insn "unaligned_loadhiu"
++ [(set (match_operand:SI 0 "s_register_operand" "=l,r")
++ (zero_extend:SI
++ (unspec:HI [(match_operand:HI 1 "memory_operand" "Uw,m")]
++ UNSPEC_UNALIGNED_LOAD)))]
++ "unaligned_access && TARGET_32BIT"
++ "ldr%(h%)\t%0, %1\t@ unaligned"
++ [(set_attr "arch" "t2,any")
++ (set_attr "length" "2,4")
++ (set_attr "predicable" "yes")
++ (set_attr "type" "load_byte")])
++
++(define_insn "unaligned_storesi"
++ [(set (match_operand:SI 0 "memory_operand" "=Uw,m")
++ (unspec:SI [(match_operand:SI 1 "s_register_operand" "l,r")]
++ UNSPEC_UNALIGNED_STORE))]
++ "unaligned_access && TARGET_32BIT"
++ "str%?\t%1, %0\t@ unaligned"
++ [(set_attr "arch" "t2,any")
++ (set_attr "length" "2,4")
++ (set_attr "predicable" "yes")
++ (set_attr "type" "store1")])
++
++(define_insn "unaligned_storehi"
++ [(set (match_operand:HI 0 "memory_operand" "=Uw,m")
++ (unspec:HI [(match_operand:HI 1 "s_register_operand" "l,r")]
++ UNSPEC_UNALIGNED_STORE))]
++ "unaligned_access && TARGET_32BIT"
++ "str%(h%)\t%1, %0\t@ unaligned"
++ [(set_attr "arch" "t2,any")
++ (set_attr "length" "2,4")
++ (set_attr "predicable" "yes")
++ (set_attr "type" "store1")])
++
++(define_insn "*extv_reg"
+ [(set (match_operand:SI 0 "s_register_operand" "=r")
+ (sign_extract:SI (match_operand:SI 1 "s_register_operand" "r")
+ (match_operand:SI 2 "const_int_operand" "M")
+@@ -6038,8 +6264,8 @@
+
+
+ (define_insn "*arm_movqi_insn"
+- [(set (match_operand:QI 0 "nonimmediate_operand" "=r,r,r,m")
+- (match_operand:QI 1 "general_operand" "rI,K,m,r"))]
++ [(set (match_operand:QI 0 "nonimmediate_operand" "=r,r,l,Uu,r,m")
++ (match_operand:QI 1 "general_operand" "rI,K,Uu,l,m,r"))]
+ "TARGET_32BIT
+ && ( register_operand (operands[0], QImode)
+ || register_operand (operands[1], QImode))"
+@@ -6047,10 +6273,14 @@
+ mov%?\\t%0, %1
+ mvn%?\\t%0, #%B1
+ ldr%(b%)\\t%0, %1
++ str%(b%)\\t%1, %0
++ ldr%(b%)\\t%0, %1
+ str%(b%)\\t%1, %0"
+- [(set_attr "type" "*,*,load1,store1")
+- (set_attr "insn" "mov,mvn,*,*")
+- (set_attr "predicable" "yes")]
++ [(set_attr "type" "*,*,load1,store1,load1,store1")
++ (set_attr "insn" "mov,mvn,*,*,*,*")
++ (set_attr "predicable" "yes")
++ (set_attr "arch" "any,any,t2,t2,any,any")
++ (set_attr "length" "4,4,2,2,4,4")]
+ )
+
+ (define_insn "*thumb1_movqi_insn"
+
+=== modified file 'gcc/config/arm/arm.opt'
+--- old/gcc/config/arm/arm.opt 2011-09-19 07:44:24 +0000
++++ new/gcc/config/arm/arm.opt 2011-10-11 02:31:01 +0000
+@@ -173,3 +173,7 @@
+ Target Report Var(fix_cm3_ldrd) Init(2)
+ Avoid overlapping destination and address registers on LDRD instructions
+ that may trigger Cortex-M3 errata.
++
++munaligned-access
++Target Report Var(unaligned_access) Init(2)
++Enable unaligned word and halfword accesses to packed data.
+
+=== modified file 'gcc/config/arm/constraints.md'
+--- old/gcc/config/arm/constraints.md 2011-09-12 14:14:00 +0000
++++ new/gcc/config/arm/constraints.md 2011-10-11 02:31:01 +0000
+@@ -36,6 +36,7 @@
+ ;; The following memory constraints have been used:
+ ;; in ARM/Thumb-2 state: Q, Ut, Uv, Uy, Un, Um, Us
+ ;; in ARM state: Uq
++;; in Thumb state: Uu, Uw
+
+
+ (define_register_constraint "f" "TARGET_ARM ? FPA_REGS : NO_REGS"
+@@ -344,6 +345,27 @@
+ (and (match_code "mem")
+ (match_test "REG_P (XEXP (op, 0))")))
+
++(define_memory_constraint "Uu"
++ "@internal
++ In Thumb state an address that is valid in 16bit encoding."
++ (and (match_code "mem")
++ (match_test "TARGET_THUMB
++ && thumb1_legitimate_address_p (GET_MODE (op), XEXP (op, 0),
++ 0)")))
++
++; The 16-bit post-increment LDR/STR accepted by thumb1_legitimate_address_p
++; are actually LDM/STM instructions, so cannot be used to access unaligned
++; data.
++(define_memory_constraint "Uw"
++ "@internal
++ In Thumb state an address that is valid in 16bit encoding, and that can be
++ used for unaligned accesses."
++ (and (match_code "mem")
++ (match_test "TARGET_THUMB
++ && thumb1_legitimate_address_p (GET_MODE (op), XEXP (op, 0),
++ 0)
++ && GET_CODE (XEXP (op, 0)) != POST_INC")))
++
+ ;; We used to have constraint letters for S and R in ARM state, but
+ ;; all uses of these now appear to have been removed.
+
+
+=== modified file 'gcc/expmed.c'
+--- old/gcc/expmed.c 2011-05-22 19:02:59 +0000
++++ new/gcc/expmed.c 2011-10-11 02:31:01 +0000
+@@ -657,6 +657,10 @@
+ && GET_MODE (value) != BLKmode
+ && bitsize > 0
+ && GET_MODE_BITSIZE (op_mode) >= bitsize
++ /* Do not use insv for volatile bitfields when
++ -fstrict-volatile-bitfields is in effect. */
++ && !(MEM_P (op0) && MEM_VOLATILE_P (op0)
++ && flag_strict_volatile_bitfields > 0)
+ && ! ((REG_P (op0) || GET_CODE (op0) == SUBREG)
+ && (bitsize + bitpos > GET_MODE_BITSIZE (op_mode)))
+ && insn_data[CODE_FOR_insv].operand[1].predicate (GEN_INT (bitsize),
+@@ -700,19 +704,21 @@
+ copy_back = true;
+ }
+
+- /* On big-endian machines, we count bits from the most significant.
+- If the bit field insn does not, we must invert. */
+-
+- if (BITS_BIG_ENDIAN != BYTES_BIG_ENDIAN)
+- xbitpos = unit - bitsize - xbitpos;
+-
+ /* We have been counting XBITPOS within UNIT.
+ Count instead within the size of the register. */
+- if (BITS_BIG_ENDIAN && !MEM_P (xop0))
++ if (BYTES_BIG_ENDIAN && !MEM_P (xop0))
+ xbitpos += GET_MODE_BITSIZE (op_mode) - unit;
+
+ unit = GET_MODE_BITSIZE (op_mode);
+
++ /* If BITS_BIG_ENDIAN is zero on a BYTES_BIG_ENDIAN machine, we count
++ "backwards" from the size of the unit we are inserting into.
++ Otherwise, we count bits from the most significant on a
++ BYTES/BITS_BIG_ENDIAN machine. */
++
++ if (BITS_BIG_ENDIAN != BYTES_BIG_ENDIAN)
++ xbitpos = unit - bitsize - xbitpos;
++
+ /* Convert VALUE to op_mode (which insv insn wants) in VALUE1. */
+ value1 = value;
+ if (GET_MODE (value) != op_mode)
+@@ -1528,6 +1534,10 @@
+ if (ext_mode != MAX_MACHINE_MODE
+ && bitsize > 0
+ && GET_MODE_BITSIZE (ext_mode) >= bitsize
++ /* Do not use extv/extzv for volatile bitfields when
++ -fstrict-volatile-bitfields is in effect. */
++ && !(MEM_P (op0) && MEM_VOLATILE_P (op0)
++ && flag_strict_volatile_bitfields > 0)
+ /* If op0 is a register, we need it in EXT_MODE to make it
+ acceptable to the format of ext(z)v. */
+ && !(GET_CODE (op0) == SUBREG && GET_MODE (op0) != ext_mode)
+@@ -1552,17 +1562,20 @@
+ /* Get ref to first byte containing part of the field. */
+ xop0 = adjust_address (xop0, byte_mode, xoffset);
+
+- /* On big-endian machines, we count bits from the most significant.
+- If the bit field insn does not, we must invert. */
+- if (BITS_BIG_ENDIAN != BYTES_BIG_ENDIAN)
+- xbitpos = unit - bitsize - xbitpos;
+-
+ /* Now convert from counting within UNIT to counting in EXT_MODE. */
+- if (BITS_BIG_ENDIAN && !MEM_P (xop0))
++ if (BYTES_BIG_ENDIAN && !MEM_P (xop0))
+ xbitpos += GET_MODE_BITSIZE (ext_mode) - unit;
+
+ unit = GET_MODE_BITSIZE (ext_mode);
+
++ /* If BITS_BIG_ENDIAN is zero on a BYTES_BIG_ENDIAN machine, we count
++ "backwards" from the size of the unit we are extracting from.
++ Otherwise, we count bits from the most significant on a
++ BYTES/BITS_BIG_ENDIAN machine. */
++
++ if (BITS_BIG_ENDIAN != BYTES_BIG_ENDIAN)
++ xbitpos = unit - bitsize - xbitpos;
++
+ if (xtarget == 0)
+ xtarget = xspec_target = gen_reg_rtx (tmode);
+
+
--- /dev/null
+2011-10-17 Richard Sandiford <richard.sandiford@linaro.org>
+
+ gcc/
+ Backport from mainline:
+
+ 2011-10-10 Richard Sandiford <richard.sandiford@linaro.org>
+
+ * modulo-sched.c (ps_reg_move_info): Add num_consecutive_stages.
+ (SCHED_FIRST_REG_MOVE, SCHED_NREG_MOVES): Delete.
+ (node_sched_params): Remove first_reg_move and nreg_moves.
+ (ps_num_consecutive_stages, extend_node_sched_params): New functions.
+ (update_node_sched_params): Move up file.
+ (print_node_sched_params): Print the stage. Don't dump info related
+ to first_reg_move and nreg_moves.
+ (set_columns_for_row): New function.
+ (set_columns_for_ps): Move up file and use set_columns_for_row.
+ (schedule_reg_move): New function.
+ (schedule_reg_moves): Call extend_node_sched_params and
+ schedule_reg_move. Extend size of uses bitmap. Initialize
+ num_consecutive_stages. Return false if a move could not be
+ scheduled.
+ (apply_reg_moves): Don't emit moves here.
+ (permute_partial_schedule): Handle register moves.
+ (duplicate_insns_of_cycles): Remove for_prolog. Emit moves according
+ to the same stage-count test as ddg nodes.
+ (generate_prolog_epilog): Update calls accordingly.
+ (sms_schedule): Allow move-scheduling to add a new first stage.
+
+2011-10-17 Richard Sandiford <richard.sandiford@linaro.org>
+
+ gcc/
+ Backport from mainline:
+
+ 2011-10-10 Richard Sandiford <richard.sandiford@linaro.org>
+
+ * modulo-sched.c (ps_insn): Adjust comment.
+ (ps_reg_move_info): New structure.
+ (partial_schedule): Add reg_moves field.
+ (SCHED_PARAMS): Use node_sched_param_vec instead of node_sched_params.
+ (node_sched_params): Turn first_reg_move into an identifier.
+ (ps_reg_move): New function.
+ (ps_rtl_insn): Cope with register moves.
+ (ps_first_note): Adjust comment and assert that the instruction
+ isn't a register move.
+ (node_sched_params): Replace with...
+ (node_sched_param_vec): ...this vector.
+ (set_node_sched_params): Adjust accordingly.
+ (print_node_sched_params): Take a partial schedule instead of a ddg.
+ Use ps_rtl_insn and ps_reg_move.
+ (generate_reg_moves): Rename to...
+ (schedule_reg_moves): ...this. Remove rescan parameter. Record each
+ move in the partial schedule, but don't emit it here. Don't perform
+ register substitutions here either.
+ (apply_reg_moves): New function.
+ (duplicate_insns_of_cycles): Use register indices directly,
+ rather than finding instructions using PREV_INSN. Use ps_reg_move.
+ (sms_schedule): Call schedule_reg_moves before committing to
+ a partial schedule. Try the next ii if the schedule fails.
+ Use apply_reg_moves instead of generate_reg_moves. Adjust
+ call to print_node_sched_params. Free node_sched_param_vec
+ instead of node_sched_params.
+ (create_partial_schedule): Initialize reg_moves.
+ (free_partial_schedule): Free reg_moves.
+
+2011-10-17 Richard Sandiford <richard.sandiford@linaro.org>
+
+ gcc/
+ Backport from mainline:
+
+ 2011-10-10 Richard Sandiford <richard.sandiford@linaro.org>
+
+ * modulo-sched.c (ps_insn): Replace node field with an identifier.
+ (SCHED_ASAP): Replace with..
+ (NODE_ASAP): ...this macro.
+ (SCHED_PARAMS): New macro.
+ (SCHED_TIME, SCHED_FIRST_REG_MOVE, SCHED_NREG_MOVES, SCHED_ROW)
+ (SCHED_STAGE, SCHED_COLUMN): Redefine using SCHED_PARAMS.
+ (node_sched_params): Remove asap.
+ (ps_rtl_insn, ps_first_note): New functions.
+ (set_node_sched_params): Use XCNEWVEC. Don't copy across the
+ asap values.
+ (print_node_sched_params): Use SCHED_PARAMS and NODE_ASAP.
+ (generate_reg_moves): Pass ids to the SCHED_* macros.
+ (update_node_sched_params): Take a ps insn identifier rather than
+ a node as parameter. Use ps_rtl_insn.
+ (set_columns_for_ps): Update for above field and SCHED_* macro changes.
+ (permute_partial_schedule): Use ps_rtl_insn and ps_first_note.
+ (optimize_sc): Update for above field and SCHED_* macro changes.
+ Update calls to try_scheduling_node_in_cycle and
+ update_node_sched_params.
+ (duplicate_insns_of_cycles): Adjust for above field and SCHED_*
+ macro changes. Use ps_rtl_insn and ps_first_note.
+ (sms_schedule): Pass ids to the SCHED_* macros.
+ (get_sched_window): Adjust for above field and SCHED_* macro changes.
+ Use NODE_ASAP instead of SCHED_ASAP.
+ (try_scheduling_node_in_cycle): Remove node parameter. Update
+ call to ps_add_node_check_conflicts. Pass ids to the SCHED_*
+ macros.
+ (sms_schedule_by_order): Update call to try_scheduling_node_in_cycle.
+ (ps_insert_empty_row): Adjust for above field changes.
+ (compute_split_row): Use ids rather than nodes.
+ (verify_partial_schedule): Adjust for above field changes.
+ (print_partial_schedule): Use ps_rtl_insn.
+ (create_ps_insn): Take an id rather than a node.
+ (ps_insn_find_column): Adjust for above field changes.
+ Use ps_rtl_insn.
+ (ps_insn_advance_column): Adjust for above field changes.
+ (add_node_to_ps): Remove node parameter. Update call to
+ create_ps_insn.
+ (ps_has_conflicts): Use ps_rtl_insn.
+ (ps_add_node_check_conflicts): Replace node parameter with an id.
+
+2011-10-17 Richard Sandiford <richard.sandiford@linaro.org>
+
+ gcc/
+ Backport from mainline:
+
+ 2011-10-10 Richard Sandiford <richard.sandiford@linaro.org>
+
+ * modulo-sched.c (undo_replace_buff_elem): Delete.
+ (generate_reg_moves): Don't build and return an undo list.
+ (free_undo_replace_buff): Delete.
+ (sms_schedule): Adjust call to generate_reg_moves.
+ Don't call free_undo_replace_buff.
+
+2011-10-17 Richard Sandiford <richard.sandiford@linaro.org>
+
+ gcc/
+ Backport from mainline:
+
+ 2011-08-08 Richard Sandiford <richard.sandiford@linaro.org>
+
+ * modulo-sched.c (get_sched_window): Use a table for the debug output.
+ Print the current ii.
+ (sms_schedule_by_order): Reduce whitespace in dump line.
+
+2011-10-17 Richard Sandiford <richard.sandiford@linaro.org>
+
+ gcc/
+ Backport from mainline:
+
+ 2011-08-08 Richard Sandiford <richard.sandiford@linaro.org>
+
+ * modulo-sched.c (get_sched_window): Use just one loop for predecessors
+ and one loop for successors. Fix upper bound of memory range.
+
+=== modified file 'gcc/modulo-sched.c'
+--- old/gcc/modulo-sched.c 2011-10-02 06:56:53 +0000
++++ new/gcc/modulo-sched.c 2011-10-10 14:35:32 +0000
+@@ -124,8 +124,10 @@
+ /* A single instruction in the partial schedule. */
+ struct ps_insn
+ {
+- /* The corresponding DDG_NODE. */
+- ddg_node_ptr node;
++ /* Identifies the instruction to be scheduled. Values smaller than
++ the ddg's num_nodes refer directly to ddg nodes. A value of
++ X - num_nodes refers to register move X. */
++ int id;
+
+ /* The (absolute) cycle in which the PS instruction is scheduled.
+ Same as SCHED_TIME (node). */
+@@ -137,6 +139,33 @@
+
+ };
+
++/* Information about a register move that has been added to a partial
++ schedule. */
++struct ps_reg_move_info
++{
++ /* The source of the move is defined by the ps_insn with id DEF.
++ The destination is used by the ps_insns with the ids in USES. */
++ int def;
++ sbitmap uses;
++
++ /* The original form of USES' instructions used OLD_REG, but they
++ should now use NEW_REG. */
++ rtx old_reg;
++ rtx new_reg;
++
++ /* The number of consecutive stages that the move occupies. */
++ int num_consecutive_stages;
++
++ /* An instruction that sets NEW_REG to the correct value. The first
++ move associated with DEF will have an rhs of OLD_REG; later moves
++ use the result of the previous move. */
++ rtx insn;
++};
++
++typedef struct ps_reg_move_info ps_reg_move_info;
++DEF_VEC_O (ps_reg_move_info);
++DEF_VEC_ALLOC_O (ps_reg_move_info, heap);
++
+ /* Holds the partial schedule as an array of II rows. Each entry of the
+ array points to a linked list of PS_INSNs, which represents the
+ instructions that are scheduled for that row. */
+@@ -148,6 +177,10 @@
+ /* rows[i] points to linked list of insns scheduled in row i (0<=i<ii). */
+ ps_insn_ptr *rows;
+
++ /* All the moves added for this partial schedule. Index X has
++ a ps_insn id of X + g->num_nodes. */
++ VEC (ps_reg_move_info, heap) *reg_moves;
++
+ /* rows_length[i] holds the number of instructions in the row.
+ It is used only (as an optimization) to back off quickly from
+ trying to schedule a node in a full row; that is, to avoid running
+@@ -165,17 +198,6 @@
+ int stage_count; /* The stage count of the partial schedule. */
+ };
+
+-/* We use this to record all the register replacements we do in
+- the kernel so we can undo SMS if it is not profitable. */
+-struct undo_replace_buff_elem
+-{
+- rtx insn;
+- rtx orig_reg;
+- rtx new_reg;
+- struct undo_replace_buff_elem *next;
+-};
+-
+-
+
+ static partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history);
+ static void free_partial_schedule (partial_schedule_ptr);
+@@ -183,9 +205,7 @@
+ void print_partial_schedule (partial_schedule_ptr, FILE *);
+ static void verify_partial_schedule (partial_schedule_ptr, sbitmap);
+ static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
+- ddg_node_ptr node, int cycle,
+- sbitmap must_precede,
+- sbitmap must_follow);
++ int, int, sbitmap, sbitmap);
+ static void rotate_partial_schedule (partial_schedule_ptr, int);
+ void set_row_column_for_ps (partial_schedule_ptr);
+ static void ps_insert_empty_row (partial_schedule_ptr, int, sbitmap);
+@@ -201,43 +221,27 @@
+ static void permute_partial_schedule (partial_schedule_ptr, rtx);
+ static void generate_prolog_epilog (partial_schedule_ptr, struct loop *,
+ rtx, rtx);
+-static void duplicate_insns_of_cycles (partial_schedule_ptr,
+- int, int, int, rtx);
+ static int calculate_stage_count (partial_schedule_ptr, int);
+ static void calculate_must_precede_follow (ddg_node_ptr, int, int,
+ int, int, sbitmap, sbitmap, sbitmap);
+ static int get_sched_window (partial_schedule_ptr, ddg_node_ptr,
+ sbitmap, int, int *, int *, int *);
+-static bool try_scheduling_node_in_cycle (partial_schedule_ptr, ddg_node_ptr,
+- int, int, sbitmap, int *, sbitmap,
+- sbitmap);
++static bool try_scheduling_node_in_cycle (partial_schedule_ptr, int, int,
++ sbitmap, int *, sbitmap, sbitmap);
+ static void remove_node_from_ps (partial_schedule_ptr, ps_insn_ptr);
+
+-#define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap)
+-#define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time)
+-#define SCHED_FIRST_REG_MOVE(x) \
+- (((node_sched_params_ptr)(x)->aux.info)->first_reg_move)
+-#define SCHED_NREG_MOVES(x) \
+- (((node_sched_params_ptr)(x)->aux.info)->nreg_moves)
+-#define SCHED_ROW(x) (((node_sched_params_ptr)(x)->aux.info)->row)
+-#define SCHED_STAGE(x) (((node_sched_params_ptr)(x)->aux.info)->stage)
+-#define SCHED_COLUMN(x) (((node_sched_params_ptr)(x)->aux.info)->column)
++#define NODE_ASAP(node) ((node)->aux.count)
++
++#define SCHED_PARAMS(x) VEC_index (node_sched_params, node_sched_param_vec, x)
++#define SCHED_TIME(x) (SCHED_PARAMS (x)->time)
++#define SCHED_ROW(x) (SCHED_PARAMS (x)->row)
++#define SCHED_STAGE(x) (SCHED_PARAMS (x)->stage)
++#define SCHED_COLUMN(x) (SCHED_PARAMS (x)->column)
+
+ /* The scheduling parameters held for each node. */
+ typedef struct node_sched_params
+ {
+- int asap; /* A lower-bound on the absolute scheduling cycle. */
+- int time; /* The absolute scheduling cycle (time >= asap). */
+-
+- /* The following field (first_reg_move) is a pointer to the first
+- register-move instruction added to handle the modulo-variable-expansion
+- of the register defined by this node. This register-move copies the
+- original register defined by the node. */
+- rtx first_reg_move;
+-
+- /* The number of register-move instructions added, immediately preceding
+- first_reg_move. */
+- int nreg_moves;
++ int time; /* The absolute scheduling cycle. */
+
+ int row; /* Holds time % ii. */
+ int stage; /* Holds time / ii. */
+@@ -247,6 +251,9 @@
+ int column;
+ } *node_sched_params_ptr;
+
++typedef struct node_sched_params node_sched_params;
++DEF_VEC_O (node_sched_params);
++DEF_VEC_ALLOC_O (node_sched_params, heap);
+ \f
+ /* The following three functions are copied from the current scheduler
+ code in order to use sched_analyze() for computing the dependencies.
+@@ -296,6 +303,49 @@
+ 0
+ };
+
++/* Partial schedule instruction ID in PS is a register move. Return
++ information about it. */
++static struct ps_reg_move_info *
++ps_reg_move (partial_schedule_ptr ps, int id)
++{
++ gcc_checking_assert (id >= ps->g->num_nodes);
++ return VEC_index (ps_reg_move_info, ps->reg_moves, id - ps->g->num_nodes);
++}
++
++/* Return the rtl instruction that is being scheduled by partial schedule
++ instruction ID, which belongs to schedule PS. */
++static rtx
++ps_rtl_insn (partial_schedule_ptr ps, int id)
++{
++ if (id < ps->g->num_nodes)
++ return ps->g->nodes[id].insn;
++ else
++ return ps_reg_move (ps, id)->insn;
++}
++
++/* Partial schedule instruction ID, which belongs to PS, occurred in
++ the original (unscheduled) loop. Return the first instruction
++ in the loop that was associated with ps_rtl_insn (PS, ID).
++ If the instruction had some notes before it, this is the first
++ of those notes. */
++static rtx
++ps_first_note (partial_schedule_ptr ps, int id)
++{
++ gcc_assert (id < ps->g->num_nodes);
++ return ps->g->nodes[id].first_note;
++}
++
++/* Return the number of consecutive stages that are occupied by
++ partial schedule instruction ID in PS. */
++static int
++ps_num_consecutive_stages (partial_schedule_ptr ps, int id)
++{
++ if (id < ps->g->num_nodes)
++ return 1;
++ else
++ return ps_reg_move (ps, id)->num_consecutive_stages;
++}
++
+ /* Given HEAD and TAIL which are the first and last insns in a loop;
+ return the register which controls the loop. Return zero if it has
+ more than one occurrence in the loop besides the control part or the
+@@ -396,35 +446,59 @@
+ }
+
+
+-/* Points to the array that contains the sched data for each node. */
+-static node_sched_params_ptr node_sched_params;
++/* A vector that contains the sched data for each ps_insn. */
++static VEC (node_sched_params, heap) *node_sched_param_vec;
+
+-/* Allocate sched_params for each node and initialize it. Assumes that
+- the aux field of each node contain the asap bound (computed earlier),
+- and copies it into the sched_params field. */
++/* Allocate sched_params for each node and initialize it. */
+ static void
+ set_node_sched_params (ddg_ptr g)
+ {
+- int i;
+-
+- /* Allocate for each node in the DDG a place to hold the "sched_data". */
+- /* Initialize ASAP/ALAP/HIGHT to zero. */
+- node_sched_params = (node_sched_params_ptr)
+- xcalloc (g->num_nodes,
+- sizeof (struct node_sched_params));
+-
+- /* Set the pointer of the general data of the node to point to the
+- appropriate sched_params structure. */
+- for (i = 0; i < g->num_nodes; i++)
+- {
+- /* Watch out for aliasing problems? */
+- node_sched_params[i].asap = g->nodes[i].aux.count;
+- g->nodes[i].aux.info = &node_sched_params[i];
+- }
+-}
+-
+-static void
+-print_node_sched_params (FILE *file, int num_nodes, ddg_ptr g)
++ VEC_truncate (node_sched_params, node_sched_param_vec, 0);
++ VEC_safe_grow_cleared (node_sched_params, heap,
++ node_sched_param_vec, g->num_nodes);
++}
++
++/* Make sure that node_sched_param_vec has an entry for every move in PS. */
++static void
++extend_node_sched_params (partial_schedule_ptr ps)
++{
++ VEC_safe_grow_cleared (node_sched_params, heap, node_sched_param_vec,
++ ps->g->num_nodes + VEC_length (ps_reg_move_info,
++ ps->reg_moves));
++}
++
++/* Update the sched_params (time, row and stage) for node U using the II,
++ the CYCLE of U and MIN_CYCLE.
++ We're not simply taking the following
++ SCHED_STAGE (u) = CALC_STAGE_COUNT (SCHED_TIME (u), min_cycle, ii);
++ because the stages may not be aligned on cycle 0. */
++static void
++update_node_sched_params (int u, int ii, int cycle, int min_cycle)
++{
++ int sc_until_cycle_zero;
++ int stage;
++
++ SCHED_TIME (u) = cycle;
++ SCHED_ROW (u) = SMODULO (cycle, ii);
++
++ /* The calculation of stage count is done adding the number
++ of stages before cycle zero and after cycle zero. */
++ sc_until_cycle_zero = CALC_STAGE_COUNT (-1, min_cycle, ii);
++
++ if (SCHED_TIME (u) < 0)
++ {
++ stage = CALC_STAGE_COUNT (-1, SCHED_TIME (u), ii);
++ SCHED_STAGE (u) = sc_until_cycle_zero - stage;
++ }
++ else
++ {
++ stage = CALC_STAGE_COUNT (SCHED_TIME (u), 0, ii);
++ SCHED_STAGE (u) = sc_until_cycle_zero + stage - 1;
++ }
++}
++
++static void
++print_node_sched_params (FILE *file, int num_nodes, partial_schedule_ptr ps)
+ {
+ int i;
+
+@@ -432,22 +506,170 @@
+ return;
+ for (i = 0; i < num_nodes; i++)
+ {
+- node_sched_params_ptr nsp = &node_sched_params[i];
+- rtx reg_move = nsp->first_reg_move;
+- int j;
++ node_sched_params_ptr nsp = SCHED_PARAMS (i);
+
+ fprintf (file, "Node = %d; INSN = %d\n", i,
+- (INSN_UID (g->nodes[i].insn)));
+- fprintf (file, " asap = %d:\n", nsp->asap);
++ INSN_UID (ps_rtl_insn (ps, i)));
++ fprintf (file, " asap = %d:\n", NODE_ASAP (&ps->g->nodes[i]));
+ fprintf (file, " time = %d:\n", nsp->time);
+- fprintf (file, " nreg_moves = %d:\n", nsp->nreg_moves);
+- for (j = 0; j < nsp->nreg_moves; j++)
++ fprintf (file, " stage = %d:\n", nsp->stage);
++ }
++}
++
++/* Set SCHED_COLUMN for each instruction in row ROW of PS. */
++static void
++set_columns_for_row (partial_schedule_ptr ps, int row)
++{
++ ps_insn_ptr cur_insn;
++ int column;
++
++ column = 0;
++ for (cur_insn = ps->rows[row]; cur_insn; cur_insn = cur_insn->next_in_row)
++ SCHED_COLUMN (cur_insn->id) = column++;
++}
++
++/* Set SCHED_COLUMN for each instruction in PS. */
++static void
++set_columns_for_ps (partial_schedule_ptr ps)
++{
++ int row;
++
++ for (row = 0; row < ps->ii; row++)
++ set_columns_for_row (ps, row);
++}
++
++/* Try to schedule the move with ps_insn identifier I_REG_MOVE in PS.
++ Its single predecessor has already been scheduled, as has its
++ ddg node successors. (The move may have also another move as its
++ successor, in which case that successor will be scheduled later.)
++
++ The move is part of a chain that satisfies register dependencies
++ between a producing ddg node and various consuming ddg nodes.
++ If some of these dependencies have a distance of 1 (meaning that
++ the use is upward-exposed) then DISTANCE1_USES is nonnull and
++ contains the set of uses with distance-1 dependencies.
++ DISTANCE1_USES is null otherwise.
++
++ MUST_FOLLOW is a scratch bitmap that is big enough to hold
++ all current ps_insn ids.
++
++ Return true on success. */
++static bool
++schedule_reg_move (partial_schedule_ptr ps, int i_reg_move,
++ sbitmap distance1_uses, sbitmap must_follow)
++{
++ unsigned int u;
++ int this_time, this_distance, this_start, this_end, this_latency;
++ int start, end, c, ii;
++ sbitmap_iterator sbi;
++ ps_reg_move_info *move;
++ rtx this_insn;
++ ps_insn_ptr psi;
++
++ move = ps_reg_move (ps, i_reg_move);
++ ii = ps->ii;
++ if (dump_file)
++ {
++ fprintf (dump_file, "Scheduling register move INSN %d; ii = %d"
++ ", min cycle = %d\n\n", INSN_UID (move->insn), ii,
++ PS_MIN_CYCLE (ps));
++ print_rtl_single (dump_file, move->insn);
++ fprintf (dump_file, "\n%11s %11s %5s\n", "start", "end", "time");
++ fprintf (dump_file, "=========== =========== =====\n");
++ }
++
++ start = INT_MIN;
++ end = INT_MAX;
++
++ /* For dependencies of distance 1 between a producer ddg node A
++ and consumer ddg node B, we have a chain of dependencies:
++
++ A --(T,L1,1)--> M1 --(T,L2,0)--> M2 ... --(T,Ln,0)--> B
++
++ where Mi is the ith move. For dependencies of distance 0 between
++ a producer ddg node A and consumer ddg node C, we have a chain of
++ dependencies:
++
++ A --(T,L1',0)--> M1' --(T,L2',0)--> M2' ... --(T,Ln',0)--> C
++
++ where Mi' occupies the same position as Mi but occurs a stage later.
++ We can only schedule each move once, so if we have both types of
++ chain, we model the second as:
++
++ A --(T,L1',1)--> M1 --(T,L2',0)--> M2 ... --(T,Ln',-1)--> C
++
++ First handle the dependencies between the previously-scheduled
++ predecessor and the move. */
++ this_insn = ps_rtl_insn (ps, move->def);
++ this_latency = insn_latency (this_insn, move->insn);
++ this_distance = distance1_uses && move->def < ps->g->num_nodes ? 1 : 0;
++ this_time = SCHED_TIME (move->def) - this_distance * ii;
++ this_start = this_time + this_latency;
++ this_end = this_time + ii;
++ if (dump_file)
++ fprintf (dump_file, "%11d %11d %5d %d --(T,%d,%d)--> %d\n",
++ this_start, this_end, SCHED_TIME (move->def),
++ INSN_UID (this_insn), this_latency, this_distance,
++ INSN_UID (move->insn));
++
++ if (start < this_start)
++ start = this_start;
++ if (end > this_end)
++ end = this_end;
++
++ /* Handle the dependencies between the move and previously-scheduled
++ successors. */
++ EXECUTE_IF_SET_IN_SBITMAP (move->uses, 0, u, sbi)
++ {
++ this_insn = ps_rtl_insn (ps, u);
++ this_latency = insn_latency (move->insn, this_insn);
++ if (distance1_uses && !TEST_BIT (distance1_uses, u))
++ this_distance = -1;
++ else
++ this_distance = 0;
++ this_time = SCHED_TIME (u) + this_distance * ii;
++ this_start = this_time - ii;
++ this_end = this_time - this_latency;
++ if (dump_file)
++ fprintf (dump_file, "%11d %11d %5d %d --(T,%d,%d)--> %d\n",
++ this_start, this_end, SCHED_TIME (u), INSN_UID (move->insn),
++ this_latency, this_distance, INSN_UID (this_insn));
++
++ if (start < this_start)
++ start = this_start;
++ if (end > this_end)
++ end = this_end;
++ }
++
++ if (dump_file)
++ {
++ fprintf (dump_file, "----------- ----------- -----\n");
++ fprintf (dump_file, "%11d %11d %5s %s\n", start, end, "", "(max, min)");
++ }
++
++ sbitmap_zero (must_follow);
++ SET_BIT (must_follow, move->def);
++
++ start = MAX (start, end - (ii - 1));
++ for (c = end; c >= start; c--)
++ {
++ psi = ps_add_node_check_conflicts (ps, i_reg_move, c,
++ move->uses, must_follow);
++ if (psi)
+ {
+- fprintf (file, " reg_move = ");
+- print_rtl_single (file, reg_move);
+- reg_move = PREV_INSN (reg_move);
++ update_node_sched_params (i_reg_move, ii, c, PS_MIN_CYCLE (ps));
++ if (dump_file)
++ fprintf (dump_file, "\nScheduled register move INSN %d at"
++ " time %d, row %d\n\n", INSN_UID (move->insn), c,
++ SCHED_ROW (i_reg_move));
++ return true;
+ }
+ }
++
++ if (dump_file)
++ fprintf (dump_file, "\nNo available slot\n\n");
++
++ return false;
+ }
+
+ /*
+@@ -461,22 +683,23 @@
+ nreg_moves = ----------------------------------- + 1 - { dependence.
+ ii { 1 if not.
+ */
+-static struct undo_replace_buff_elem *
+-generate_reg_moves (partial_schedule_ptr ps, bool rescan)
++static bool
++schedule_reg_moves (partial_schedule_ptr ps)
+ {
+ ddg_ptr g = ps->g;
+ int ii = ps->ii;
+ int i;
+- struct undo_replace_buff_elem *reg_move_replaces = NULL;
+
+ for (i = 0; i < g->num_nodes; i++)
+ {
+ ddg_node_ptr u = &g->nodes[i];
+ ddg_edge_ptr e;
+ int nreg_moves = 0, i_reg_move;
+- sbitmap *uses_of_defs;
+- rtx last_reg_move;
+ rtx prev_reg, old_reg;
++ int first_move;
++ int distances[2];
++ sbitmap must_follow;
++ sbitmap distance1_uses;
+ rtx set = single_set (u->insn);
+
+ /* Skip instructions that do not set a register. */
+@@ -485,18 +708,21 @@
+
+ /* Compute the number of reg_moves needed for u, by looking at life
+ ranges started at u (excluding self-loops). */
++ distances[0] = distances[1] = false;
+ for (e = u->out; e; e = e->next_out)
+ if (e->type == TRUE_DEP && e->dest != e->src)
+ {
+- int nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
++ int nreg_moves4e = (SCHED_TIME (e->dest->cuid)
++ - SCHED_TIME (e->src->cuid)) / ii;
+
+ if (e->distance == 1)
+- nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;
++ nreg_moves4e = (SCHED_TIME (e->dest->cuid)
++ - SCHED_TIME (e->src->cuid) + ii) / ii;
+
+ /* If dest precedes src in the schedule of the kernel, then dest
+ will read before src writes and we can save one reg_copy. */
+- if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
+- && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
++ if (SCHED_ROW (e->dest->cuid) == SCHED_ROW (e->src->cuid)
++ && SCHED_COLUMN (e->dest->cuid) < SCHED_COLUMN (e->src->cuid))
+ nreg_moves4e--;
+
+ if (nreg_moves4e >= 1)
+@@ -513,125 +739,105 @@
+ gcc_assert (!autoinc_var_is_used_p (u->insn, e->dest->insn));
+ }
+
++ if (nreg_moves4e)
++ {
++ gcc_assert (e->distance < 2);
++ distances[e->distance] = true;
++ }
+ nreg_moves = MAX (nreg_moves, nreg_moves4e);
+ }
+
+ if (nreg_moves == 0)
+ continue;
+
++ /* Create NREG_MOVES register moves. */
++ first_move = VEC_length (ps_reg_move_info, ps->reg_moves);
++ VEC_safe_grow_cleared (ps_reg_move_info, heap, ps->reg_moves,
++ first_move + nreg_moves);
++ extend_node_sched_params (ps);
++
++ /* Record the moves associated with this node. */
++ first_move += ps->g->num_nodes;
++
++ /* Generate each move. */
++ old_reg = prev_reg = SET_DEST (single_set (u->insn));
++ for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
++ {
++ ps_reg_move_info *move = ps_reg_move (ps, first_move + i_reg_move);
++
++ move->def = i_reg_move > 0 ? first_move + i_reg_move - 1 : i;
++ move->uses = sbitmap_alloc (first_move + nreg_moves);
++ move->old_reg = old_reg;
++ move->new_reg = gen_reg_rtx (GET_MODE (prev_reg));
++ move->num_consecutive_stages = distances[0] && distances[1] ? 2 : 1;
++ move->insn = gen_move_insn (move->new_reg, copy_rtx (prev_reg));
++ sbitmap_zero (move->uses);
++
++ prev_reg = move->new_reg;
++ }
++
++ distance1_uses = distances[1] ? sbitmap_alloc (g->num_nodes) : NULL;
++
+ /* Every use of the register defined by node may require a different
+ copy of this register, depending on the time the use is scheduled.
+- Set a bitmap vector, telling which nodes use each copy of this
+- register. */
+- uses_of_defs = sbitmap_vector_alloc (nreg_moves, g->num_nodes);
+- sbitmap_vector_zero (uses_of_defs, nreg_moves);
++ Record which uses require which move results. */
+ for (e = u->out; e; e = e->next_out)
+ if (e->type == TRUE_DEP && e->dest != e->src)
+ {
+- int dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
++ int dest_copy = (SCHED_TIME (e->dest->cuid)
++ - SCHED_TIME (e->src->cuid)) / ii;
+
+ if (e->distance == 1)
+- dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;
++ dest_copy = (SCHED_TIME (e->dest->cuid)
++ - SCHED_TIME (e->src->cuid) + ii) / ii;
+
+- if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
+- && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
++ if (SCHED_ROW (e->dest->cuid) == SCHED_ROW (e->src->cuid)
++ && SCHED_COLUMN (e->dest->cuid) < SCHED_COLUMN (e->src->cuid))
+ dest_copy--;
+
+ if (dest_copy)
+- SET_BIT (uses_of_defs[dest_copy - 1], e->dest->cuid);
++ {
++ ps_reg_move_info *move;
++
++ move = ps_reg_move (ps, first_move + dest_copy - 1);
++ SET_BIT (move->uses, e->dest->cuid);
++ if (e->distance == 1)
++ SET_BIT (distance1_uses, e->dest->cuid);
++ }
+ }
+
+- /* Now generate the reg_moves, attaching relevant uses to them. */
+- SCHED_NREG_MOVES (u) = nreg_moves;
+- old_reg = prev_reg = copy_rtx (SET_DEST (single_set (u->insn)));
+- /* Insert the reg-moves right before the notes which precede
+- the insn they relates to. */
+- last_reg_move = u->first_note;
+-
++ must_follow = sbitmap_alloc (first_move + nreg_moves);
+ for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
++ if (!schedule_reg_move (ps, first_move + i_reg_move,
++ distance1_uses, must_follow))
++ break;
++ sbitmap_free (must_follow);
++ if (distance1_uses)
++ sbitmap_free (distance1_uses);
++ if (i_reg_move < nreg_moves)
++ return false;
++ }
++ return true;
++}
++
++/* Emit the moves associated with PS.  Apply the substitutions
++ associated with them. */
++static void
++apply_reg_moves (partial_schedule_ptr ps)
++{
++ ps_reg_move_info *move;
++ int i;
++
++ FOR_EACH_VEC_ELT (ps_reg_move_info, ps->reg_moves, i, move)
++ {
++ unsigned int i_use;
++ sbitmap_iterator sbi;
++
++ EXECUTE_IF_SET_IN_SBITMAP (move->uses, 0, i_use, sbi)
+ {
+- unsigned int i_use = 0;
+- rtx new_reg = gen_reg_rtx (GET_MODE (prev_reg));
+- rtx reg_move = gen_move_insn (new_reg, prev_reg);
+- sbitmap_iterator sbi;
+-
+- add_insn_before (reg_move, last_reg_move, NULL);
+- last_reg_move = reg_move;
+-
+- if (!SCHED_FIRST_REG_MOVE (u))
+- SCHED_FIRST_REG_MOVE (u) = reg_move;
+-
+- EXECUTE_IF_SET_IN_SBITMAP (uses_of_defs[i_reg_move], 0, i_use, sbi)
+- {
+- struct undo_replace_buff_elem *rep;
+-
+- rep = (struct undo_replace_buff_elem *)
+- xcalloc (1, sizeof (struct undo_replace_buff_elem));
+- rep->insn = g->nodes[i_use].insn;
+- rep->orig_reg = old_reg;
+- rep->new_reg = new_reg;
+-
+- if (! reg_move_replaces)
+- reg_move_replaces = rep;
+- else
+- {
+- rep->next = reg_move_replaces;
+- reg_move_replaces = rep;
+- }
+-
+- replace_rtx (g->nodes[i_use].insn, old_reg, new_reg);
+- if (rescan)
+- df_insn_rescan (g->nodes[i_use].insn);
+- }
+-
+- prev_reg = new_reg;
++ replace_rtx (ps->g->nodes[i_use].insn, move->old_reg, move->new_reg);
++ df_insn_rescan (ps->g->nodes[i_use].insn);
+ }
+- sbitmap_vector_free (uses_of_defs);
+- }
+- return reg_move_replaces;
+-}
+-
+-/* Free memory allocated for the undo buffer. */
+-static void
+-free_undo_replace_buff (struct undo_replace_buff_elem *reg_move_replaces)
+-{
+-
+- while (reg_move_replaces)
+- {
+- struct undo_replace_buff_elem *rep = reg_move_replaces;
+-
+- reg_move_replaces = reg_move_replaces->next;
+- free (rep);
+- }
+-}
+-
+-/* Update the sched_params (time, row and stage) for node U using the II,
+- the CYCLE of U and MIN_CYCLE.
+- We're not simply taking the following
+- SCHED_STAGE (u) = CALC_STAGE_COUNT (SCHED_TIME (u), min_cycle, ii);
+- because the stages may not be aligned on cycle 0. */
+-static void
+-update_node_sched_params (ddg_node_ptr u, int ii, int cycle, int min_cycle)
+-{
+- int sc_until_cycle_zero;
+- int stage;
+-
+- SCHED_TIME (u) = cycle;
+- SCHED_ROW (u) = SMODULO (cycle, ii);
+-
+- /* The calculation of stage count is done adding the number
+- of stages before cycle zero and after cycle zero. */
+- sc_until_cycle_zero = CALC_STAGE_COUNT (-1, min_cycle, ii);
+-
+- if (SCHED_TIME (u) < 0)
+- {
+- stage = CALC_STAGE_COUNT (-1, SCHED_TIME (u), ii);
+- SCHED_STAGE (u) = sc_until_cycle_zero - stage;
+- }
+- else
+- {
+- stage = CALC_STAGE_COUNT (SCHED_TIME (u), 0, ii);
+- SCHED_STAGE (u) = sc_until_cycle_zero + stage - 1;
+ }
+ }
+
+@@ -647,18 +853,19 @@
+ for (row = 0; row < ii; row++)
+ for (crr_insn = ps->rows[row]; crr_insn; crr_insn = crr_insn->next_in_row)
+ {
+- ddg_node_ptr u = crr_insn->node;
++ int u = crr_insn->id;
+ int normalized_time = SCHED_TIME (u) - amount;
+ int new_min_cycle = PS_MIN_CYCLE (ps) - amount;
+
+ if (dump_file)
+ {
+ /* Print the scheduling times after the rotation. */
++ rtx insn = ps_rtl_insn (ps, u);
++
+ fprintf (dump_file, "crr_insn->node=%d (insn id %d), "
+- "crr_insn->cycle=%d, min_cycle=%d", crr_insn->node->cuid,
+- INSN_UID (crr_insn->node->insn), normalized_time,
+- new_min_cycle);
+- if (JUMP_P (crr_insn->node->insn))
++ "crr_insn->cycle=%d, min_cycle=%d", u,
++ INSN_UID (insn), normalized_time, new_min_cycle);
++ if (JUMP_P (insn))
+ fprintf (dump_file, " (branch)");
+ fprintf (dump_file, "\n");
+ }
+@@ -671,22 +878,6 @@
+ }
+ }
+
+-/* Set SCHED_COLUMN of each node according to its position in PS. */
+-static void
+-set_columns_for_ps (partial_schedule_ptr ps)
+-{
+- int row;
+-
+- for (row = 0; row < ps->ii; row++)
+- {
+- ps_insn_ptr cur_insn = ps->rows[row];
+- int column = 0;
+-
+- for (; cur_insn; cur_insn = cur_insn->next_in_row)
+- SCHED_COLUMN (cur_insn->node) = column++;
+- }
+-}
+-
+ /* Permute the insns according to their order in PS, from row 0 to
+ row ii-1, and position them right before LAST. This schedules
+ the insns of the loop kernel. */
+@@ -699,9 +890,18 @@
+
+ for (row = 0; row < ii ; row++)
+ for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
+- if (PREV_INSN (last) != ps_ij->node->insn)
+- reorder_insns_nobb (ps_ij->node->first_note, ps_ij->node->insn,
+- PREV_INSN (last));
++ {
++ rtx insn = ps_rtl_insn (ps, ps_ij->id);
++
++ if (PREV_INSN (last) != insn)
++ {
++ if (ps_ij->id < ps->g->num_nodes)
++ reorder_insns_nobb (ps_first_note (ps, ps_ij->id), insn,
++ PREV_INSN (last));
++ else
++ add_insn_before (insn, last, NULL);
++ }
++ }
+ }
+
+ /* Set bitmaps TMP_FOLLOW and TMP_PRECEDE to MUST_FOLLOW and MUST_PRECEDE
+@@ -750,7 +950,7 @@
+ to row ii-1. If they are equal just bail out. */
+ stage_count = calculate_stage_count (ps, amount);
+ stage_count_curr =
+- calculate_stage_count (ps, SCHED_TIME (g->closing_branch) - (ii - 1));
++ calculate_stage_count (ps, SCHED_TIME (g->closing_branch->cuid) - (ii - 1));
+
+ if (stage_count == stage_count_curr)
+ {
+@@ -779,7 +979,7 @@
+ print_partial_schedule (ps, dump_file);
+ }
+
+- if (SMODULO (SCHED_TIME (g->closing_branch), ii) == ii - 1)
++ if (SMODULO (SCHED_TIME (g->closing_branch->cuid), ii) == ii - 1)
+ {
+ ok = true;
+ goto clear;
+@@ -794,7 +994,7 @@
+ {
+ bool success;
+ ps_insn_ptr next_ps_i;
+- int branch_cycle = SCHED_TIME (g->closing_branch);
++ int branch_cycle = SCHED_TIME (g->closing_branch->cuid);
+ int row = SMODULO (branch_cycle, ps->ii);
+ int num_splits = 0;
+ sbitmap must_precede, must_follow, tmp_precede, tmp_follow;
+@@ -850,13 +1050,12 @@
+ branch so we can remove it from it's current cycle. */
+ for (next_ps_i = ps->rows[row];
+ next_ps_i; next_ps_i = next_ps_i->next_in_row)
+- if (next_ps_i->node->cuid == g->closing_branch->cuid)
++ if (next_ps_i->id == g->closing_branch->cuid)
+ break;
+
+ remove_node_from_ps (ps, next_ps_i);
+ success =
+- try_scheduling_node_in_cycle (ps, g->closing_branch,
+- g->closing_branch->cuid, c,
++ try_scheduling_node_in_cycle (ps, g->closing_branch->cuid, c,
+ sched_nodes, &num_splits,
+ tmp_precede, tmp_follow);
+ gcc_assert (num_splits == 0);
+@@ -874,8 +1073,7 @@
+ must_precede, branch_cycle, start, end,
+ step);
+ success =
+- try_scheduling_node_in_cycle (ps, g->closing_branch,
+- g->closing_branch->cuid,
++ try_scheduling_node_in_cycle (ps, g->closing_branch->cuid,
+ branch_cycle, sched_nodes,
+ &num_splits, tmp_precede,
+ tmp_follow);
+@@ -889,7 +1087,7 @@
+ fprintf (dump_file,
+ "SMS success in moving branch to cycle %d\n", c);
+
+- update_node_sched_params (g->closing_branch, ii, c,
++ update_node_sched_params (g->closing_branch->cuid, ii, c,
+ PS_MIN_CYCLE (ps));
+ ok = true;
+ }
+@@ -905,7 +1103,7 @@
+
+ static void
+ duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
+- int to_stage, int for_prolog, rtx count_reg)
++ int to_stage, rtx count_reg)
+ {
+ int row;
+ ps_insn_ptr ps_ij;
+@@ -913,9 +1111,9 @@
+ for (row = 0; row < ps->ii; row++)
+ for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
+ {
+- ddg_node_ptr u_node = ps_ij->node;
+- int j, i_reg_moves;
+- rtx reg_move = NULL_RTX;
++ int u = ps_ij->id;
++ int first_u, last_u;
++ rtx u_insn;
+
+ /* Do not duplicate any insn which refers to count_reg as it
+ belongs to the control part.
+@@ -923,52 +1121,20 @@
+ be ignored.
+ TODO: This should be done by analyzing the control part of
+ the loop. */
+- if (reg_mentioned_p (count_reg, u_node->insn)
+- || JUMP_P (ps_ij->node->insn))
++ u_insn = ps_rtl_insn (ps, u);
++ if (reg_mentioned_p (count_reg, u_insn)
++ || JUMP_P (u_insn))
+ continue;
+
+- if (for_prolog)
+- {
+- /* SCHED_STAGE (u_node) >= from_stage == 0. Generate increasing
+- number of reg_moves starting with the second occurrence of
+- u_node, which is generated if its SCHED_STAGE <= to_stage. */
+- i_reg_moves = to_stage - SCHED_STAGE (u_node) + 1;
+- i_reg_moves = MAX (i_reg_moves, 0);
+- i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));
+-
+- /* The reg_moves start from the *first* reg_move backwards. */
+- if (i_reg_moves)
+- {
+- reg_move = SCHED_FIRST_REG_MOVE (u_node);
+- for (j = 1; j < i_reg_moves; j++)
+- reg_move = PREV_INSN (reg_move);
+- }
+- }
+- else /* It's for the epilog. */
+- {
+- /* SCHED_STAGE (u_node) <= to_stage. Generate all reg_moves,
+- starting to decrease one stage after u_node no longer occurs;
+- that is, generate all reg_moves until
+- SCHED_STAGE (u_node) == from_stage - 1. */
+- i_reg_moves = SCHED_NREG_MOVES (u_node)
+- - (from_stage - SCHED_STAGE (u_node) - 1);
+- i_reg_moves = MAX (i_reg_moves, 0);
+- i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));
+-
+- /* The reg_moves start from the *last* reg_move forwards. */
+- if (i_reg_moves)
+- {
+- reg_move = SCHED_FIRST_REG_MOVE (u_node);
+- for (j = 1; j < SCHED_NREG_MOVES (u_node); j++)
+- reg_move = PREV_INSN (reg_move);
+- }
+- }
+-
+- for (j = 0; j < i_reg_moves; j++, reg_move = NEXT_INSN (reg_move))
+- emit_insn (copy_rtx (PATTERN (reg_move)));
+- if (SCHED_STAGE (u_node) >= from_stage
+- && SCHED_STAGE (u_node) <= to_stage)
+- duplicate_insn_chain (u_node->first_note, u_node->insn);
++ first_u = SCHED_STAGE (u);
++ last_u = first_u + ps_num_consecutive_stages (ps, u) - 1;
++ if (from_stage <= last_u && to_stage >= first_u)
++ {
++ if (u < ps->g->num_nodes)
++ duplicate_insn_chain (ps_first_note (ps, u), u_insn);
++ else
++ emit_insn (copy_rtx (PATTERN (u_insn)));
++ }
+ }
+ }
+
+@@ -1002,7 +1168,7 @@
+ }
+
+ for (i = 0; i < last_stage; i++)
+- duplicate_insns_of_cycles (ps, 0, i, 1, count_reg);
++ duplicate_insns_of_cycles (ps, 0, i, count_reg);
+
+ /* Put the prolog on the entry edge. */
+ e = loop_preheader_edge (loop);
+@@ -1014,7 +1180,7 @@
+ start_sequence ();
+
+ for (i = 0; i < last_stage; i++)
+- duplicate_insns_of_cycles (ps, i + 1, last_stage, 0, count_reg);
++ duplicate_insns_of_cycles (ps, i + 1, last_stage, count_reg);
+
+ /* Put the epilogue on the exit edge. */
+ gcc_assert (single_exit (loop));
+@@ -1350,10 +1516,9 @@
+ {
+ rtx head, tail;
+ rtx count_reg, count_init;
+- int mii, rec_mii;
+- unsigned stage_count = 0;
++ int mii, rec_mii, stage_count, min_cycle;
+ HOST_WIDEST_INT loop_count = 0;
+- bool opt_sc_p = false;
++ bool opt_sc_p;
+
+ if (! (g = g_arr[loop->num]))
+ continue;
+@@ -1430,62 +1595,63 @@
+ fprintf (dump_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
+ rec_mii, mii, maxii);
+
+- /* After sms_order_nodes and before sms_schedule_by_order, to copy over
+- ASAP. */
+- set_node_sched_params (g);
+-
+- ps = sms_schedule_by_order (g, mii, maxii, node_order);
+-
+- if (ps)
++ for (;;)
+ {
+- /* Try to achieve optimized SC by normalizing the partial
+- schedule (having the cycles start from cycle zero).
+- The branch location must be placed in row ii-1 in the
+- final scheduling. If failed, shift all instructions to
+- position the branch in row ii-1. */
+- opt_sc_p = optimize_sc (ps, g);
+- if (opt_sc_p)
+- stage_count = calculate_stage_count (ps, 0);
+- else
++ set_node_sched_params (g);
++
++ stage_count = 0;
++ opt_sc_p = false;
++ ps = sms_schedule_by_order (g, mii, maxii, node_order);
++
++ if (ps)
+ {
+- /* Bring the branch to cycle ii-1. */
+- int amount = SCHED_TIME (g->closing_branch) - (ps->ii - 1);
++ /* Try to achieve optimized SC by normalizing the partial
++ schedule (having the cycles start from cycle zero).
++ The branch location must be placed in row ii-1 in the
++ final scheduling. If failed, shift all instructions to
++ position the branch in row ii-1. */
++ opt_sc_p = optimize_sc (ps, g);
++ if (opt_sc_p)
++ stage_count = calculate_stage_count (ps, 0);
++ else
++ {
++ /* Bring the branch to cycle ii-1. */
++ int amount = (SCHED_TIME (g->closing_branch->cuid)
++ - (ps->ii - 1));
+
++ if (dump_file)
++ fprintf (dump_file, "SMS schedule branch at cycle ii-1\n");
++
++ stage_count = calculate_stage_count (ps, amount);
++ }
++
++ gcc_assert (stage_count >= 1);
++ }
++
++ /* The default value of PARAM_SMS_MIN_SC is 2 as stage count of
++ 1 means that there is no interleaving between iterations thus
++ we let the scheduling passes do the job in this case. */
++ if (stage_count < PARAM_VALUE (PARAM_SMS_MIN_SC)
++ || (count_init && (loop_count <= stage_count))
++ || (flag_branch_probabilities && (trip_count <= stage_count)))
++ {
+ if (dump_file)
+- fprintf (dump_file, "SMS schedule branch at cycle ii-1\n");
+-
+- stage_count = calculate_stage_count (ps, amount);
+- }
+-
+- gcc_assert (stage_count >= 1);
+- PS_STAGE_COUNT (ps) = stage_count;
+- }
+-
+- /* The default value of PARAM_SMS_MIN_SC is 2 as stage count of
+- 1 means that there is no interleaving between iterations thus
+- we let the scheduling passes do the job in this case. */
+- if (stage_count < (unsigned) PARAM_VALUE (PARAM_SMS_MIN_SC)
+- || (count_init && (loop_count <= stage_count))
+- || (flag_branch_probabilities && (trip_count <= stage_count)))
+- {
+- if (dump_file)
+- {
+- fprintf (dump_file, "SMS failed... \n");
+- fprintf (dump_file, "SMS sched-failed (stage-count=%d, loop-count=", stage_count);
+- fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, loop_count);
+- fprintf (dump_file, ", trip-count=");
+- fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, trip_count);
+- fprintf (dump_file, ")\n");
+- }
+- }
+- else
+- {
+- struct undo_replace_buff_elem *reg_move_replaces;
++ {
++ fprintf (dump_file, "SMS failed... \n");
++ fprintf (dump_file, "SMS sched-failed (stage-count=%d,"
++ " loop-count=", stage_count);
++ fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, loop_count);
++ fprintf (dump_file, ", trip-count=");
++ fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, trip_count);
++ fprintf (dump_file, ")\n");
++ }
++ break;
++ }
+
+ if (!opt_sc_p)
+ {
+ /* Rotate the partial schedule to have the branch in row ii-1. */
+- int amount = SCHED_TIME (g->closing_branch) - (ps->ii - 1);
++ int amount = SCHED_TIME (g->closing_branch->cuid) - (ps->ii - 1);
+
+ reset_sched_times (ps, amount);
+ rotate_partial_schedule (ps, amount);
+@@ -1493,6 +1659,29 @@
+
+ set_columns_for_ps (ps);
+
++ min_cycle = PS_MIN_CYCLE (ps) - SMODULO (PS_MIN_CYCLE (ps), ps->ii);
++ if (!schedule_reg_moves (ps))
++ {
++ mii = ps->ii + 1;
++ free_partial_schedule (ps);
++ continue;
++ }
++
++ /* Moves that handle incoming values might have been added
++ to a new first stage. Bump the stage count if so.
++
++ ??? Perhaps we could consider rotating the schedule here
++ instead? */
++ if (PS_MIN_CYCLE (ps) < min_cycle)
++ {
++ reset_sched_times (ps, 0);
++ stage_count++;
++ }
++
++ /* The stage count should now be correct without rotation. */
++ gcc_checking_assert (stage_count == calculate_stage_count (ps, 0));
++ PS_STAGE_COUNT (ps) = stage_count;
++
+ canon_loop (loop);
+
+ if (dump_file)
+@@ -1531,17 +1720,16 @@
+ /* The life-info is not valid any more. */
+ df_set_bb_dirty (g->bb);
+
+- reg_move_replaces = generate_reg_moves (ps, true);
++ apply_reg_moves (ps);
+ if (dump_file)
+- print_node_sched_params (dump_file, g->num_nodes, g);
++ print_node_sched_params (dump_file, g->num_nodes, ps);
+ /* Generate prolog and epilog. */
+ generate_prolog_epilog (ps, loop, count_reg, count_init);
+-
+- free_undo_replace_buff (reg_move_replaces);
++ break;
+ }
+
+ free_partial_schedule (ps);
+- free (node_sched_params);
++ VEC_free (node_sched_params, heap, node_sched_param_vec);
+ free (node_order);
+ free_ddg (g);
+ }
+@@ -1643,9 +1831,11 @@
+
+ static int
+ get_sched_window (partial_schedule_ptr ps, ddg_node_ptr u_node,
+- sbitmap sched_nodes, int ii, int *start_p, int *step_p, int *end_p)
++ sbitmap sched_nodes, int ii, int *start_p, int *step_p,
++ int *end_p)
+ {
+ int start, step, end;
++ int early_start, late_start;
+ ddg_edge_ptr e;
+ sbitmap psp = sbitmap_alloc (ps->g->num_nodes);
+ sbitmap pss = sbitmap_alloc (ps->g->num_nodes);
+@@ -1653,6 +1843,8 @@
+ sbitmap u_node_succs = NODE_SUCCESSORS (u_node);
+ int psp_not_empty;
+ int pss_not_empty;
++ int count_preds;
++ int count_succs;
+
+ /* 1. compute sched window for u (start, end, step). */
+ sbitmap_zero (psp);
+@@ -1660,214 +1852,119 @@
+ psp_not_empty = sbitmap_a_and_b_cg (psp, u_node_preds, sched_nodes);
+ pss_not_empty = sbitmap_a_and_b_cg (pss, u_node_succs, sched_nodes);
+
+- if (psp_not_empty && !pss_not_empty)
+- {
+- int early_start = INT_MIN;
+-
+- end = INT_MAX;
+- for (e = u_node->in; e != 0; e = e->next_in)
+- {
+- ddg_node_ptr v_node = e->src;
+-
+- if (dump_file)
+- {
+- fprintf (dump_file, "\nProcessing edge: ");
+- print_ddg_edge (dump_file, e);
+- fprintf (dump_file,
+- "\nScheduling %d (%d) in psp_not_empty,"
+- " checking p %d (%d): ", u_node->cuid,
+- INSN_UID (u_node->insn), v_node->cuid, INSN_UID
+- (v_node->insn));
+- }
+-
+- if (TEST_BIT (sched_nodes, v_node->cuid))
+- {
+- int p_st = SCHED_TIME (v_node);
+-
+- early_start =
+- MAX (early_start, p_st + e->latency - (e->distance * ii));
+-
+- if (dump_file)
+- fprintf (dump_file,
+- "pred st = %d; early_start = %d; latency: %d",
+- p_st, early_start, e->latency);
+-
+- if (e->data_type == MEM_DEP)
+- end = MIN (end, SCHED_TIME (v_node) + ii - 1);
+- }
+- else if (dump_file)
+- fprintf (dump_file, "the node is not scheduled\n");
+- }
+- start = early_start;
+- end = MIN (end, early_start + ii);
+- /* Schedule the node close to it's predecessors. */
+- step = 1;
+-
+- if (dump_file)
+- fprintf (dump_file,
+- "\nScheduling %d (%d) in a window (%d..%d) with step %d\n",
+- u_node->cuid, INSN_UID (u_node->insn), start, end, step);
+- }
+-
+- else if (!psp_not_empty && pss_not_empty)
+- {
+- int late_start = INT_MAX;
+-
+- end = INT_MIN;
+- for (e = u_node->out; e != 0; e = e->next_out)
+- {
+- ddg_node_ptr v_node = e->dest;
+-
+- if (dump_file)
+- {
+- fprintf (dump_file, "\nProcessing edge:");
+- print_ddg_edge (dump_file, e);
+- fprintf (dump_file,
+- "\nScheduling %d (%d) in pss_not_empty,"
+- " checking s %d (%d): ", u_node->cuid,
+- INSN_UID (u_node->insn), v_node->cuid, INSN_UID
+- (v_node->insn));
+- }
+-
+- if (TEST_BIT (sched_nodes, v_node->cuid))
+- {
+- int s_st = SCHED_TIME (v_node);
+-
+- late_start = MIN (late_start,
+- s_st - e->latency + (e->distance * ii));
+-
+- if (dump_file)
+- fprintf (dump_file,
+- "succ st = %d; late_start = %d; latency = %d",
+- s_st, late_start, e->latency);
+-
+- if (e->data_type == MEM_DEP)
+- end = MAX (end, SCHED_TIME (v_node) - ii + 1);
+- if (dump_file)
+- fprintf (dump_file, "end = %d\n", end);
+-
+- }
+- else if (dump_file)
+- fprintf (dump_file, "the node is not scheduled\n");
+-
+- }
+- start = late_start;
+- end = MAX (end, late_start - ii);
+- /* Schedule the node close to it's successors. */
++ /* We first compute a forward range (start <= end), then decide whether
++ to reverse it. */
++ early_start = INT_MIN;
++ late_start = INT_MAX;
++ start = INT_MIN;
++ end = INT_MAX;
++ step = 1;
++
++ count_preds = 0;
++ count_succs = 0;
++
++ if (dump_file && (psp_not_empty || pss_not_empty))
++ {
++ fprintf (dump_file, "\nAnalyzing dependencies for node %d (INSN %d)"
++ "; ii = %d\n\n", u_node->cuid, INSN_UID (u_node->insn), ii);
++ fprintf (dump_file, "%11s %11s %11s %11s %5s\n",
++ "start", "early start", "late start", "end", "time");
++ fprintf (dump_file, "=========== =========== =========== ==========="
++ " =====\n");
++ }
++ /* Calculate early_start and limit end. Both bounds are inclusive. */
++ if (psp_not_empty)
++ for (e = u_node->in; e != 0; e = e->next_in)
++ {
++ int v = e->src->cuid;
++
++ if (TEST_BIT (sched_nodes, v))
++ {
++ int p_st = SCHED_TIME (v);
++ int earliest = p_st + e->latency - (e->distance * ii);
++ int latest = (e->data_type == MEM_DEP ? p_st + ii - 1 : INT_MAX);
++
++ if (dump_file)
++ {
++ fprintf (dump_file, "%11s %11d %11s %11d %5d",
++ "", earliest, "", latest, p_st);
++ print_ddg_edge (dump_file, e);
++ fprintf (dump_file, "\n");
++ }
++
++ early_start = MAX (early_start, earliest);
++ end = MIN (end, latest);
++
++ if (e->type == TRUE_DEP && e->data_type == REG_DEP)
++ count_preds++;
++ }
++ }
++
++ /* Calculate late_start and limit start. Both bounds are inclusive. */
++ if (pss_not_empty)
++ for (e = u_node->out; e != 0; e = e->next_out)
++ {
++ int v = e->dest->cuid;
++
++ if (TEST_BIT (sched_nodes, v))
++ {
++ int s_st = SCHED_TIME (v);
++ int earliest = (e->data_type == MEM_DEP ? s_st - ii + 1 : INT_MIN);
++ int latest = s_st - e->latency + (e->distance * ii);
++
++ if (dump_file)
++ {
++ fprintf (dump_file, "%11d %11s %11d %11s %5d",
++ earliest, "", latest, "", s_st);
++ print_ddg_edge (dump_file, e);
++ fprintf (dump_file, "\n");
++ }
++
++ start = MAX (start, earliest);
++ late_start = MIN (late_start, latest);
++
++ if (e->type == TRUE_DEP && e->data_type == REG_DEP)
++ count_succs++;
++ }
++ }
++
++ if (dump_file && (psp_not_empty || pss_not_empty))
++ {
++ fprintf (dump_file, "----------- ----------- ----------- -----------"
++ " -----\n");
++ fprintf (dump_file, "%11d %11d %11d %11d %5s %s\n",
++ start, early_start, late_start, end, "",
++ "(max, max, min, min)");
++ }
++
++ /* Get a target scheduling window no bigger than ii. */
++ if (early_start == INT_MIN && late_start == INT_MAX)
++ early_start = NODE_ASAP (u_node);
++ else if (early_start == INT_MIN)
++ early_start = late_start - (ii - 1);
++ late_start = MIN (late_start, early_start + (ii - 1));
++
++ /* Apply memory dependence limits. */
++ start = MAX (start, early_start);
++ end = MIN (end, late_start);
++
++ if (dump_file && (psp_not_empty || pss_not_empty))
++ fprintf (dump_file, "%11s %11d %11d %11s %5s final window\n",
++ "", start, end, "", "");
++
++ /* If there are at least as many successors as predecessors, schedule the
++ node close to its successors. */
++ if (pss_not_empty && count_succs >= count_preds)
++ {
++ int tmp = end;
++ end = start;
++ start = tmp;
+ step = -1;
+-
+- if (dump_file)
+- fprintf (dump_file,
+- "\nScheduling %d (%d) in a window (%d..%d) with step %d\n",
+- u_node->cuid, INSN_UID (u_node->insn), start, end, step);
+-
+- }
+-
+- else if (psp_not_empty && pss_not_empty)
+- {
+- int early_start = INT_MIN;
+- int late_start = INT_MAX;
+- int count_preds = 0;
+- int count_succs = 0;
+-
+- start = INT_MIN;
+- end = INT_MAX;
+- for (e = u_node->in; e != 0; e = e->next_in)
+- {
+- ddg_node_ptr v_node = e->src;
+-
+- if (dump_file)
+- {
+- fprintf (dump_file, "\nProcessing edge:");
+- print_ddg_edge (dump_file, e);
+- fprintf (dump_file,
+- "\nScheduling %d (%d) in psp_pss_not_empty,"
+- " checking p %d (%d): ", u_node->cuid, INSN_UID
+- (u_node->insn), v_node->cuid, INSN_UID
+- (v_node->insn));
+- }
+-
+- if (TEST_BIT (sched_nodes, v_node->cuid))
+- {
+- int p_st = SCHED_TIME (v_node);
+-
+- early_start = MAX (early_start,
+- p_st + e->latency
+- - (e->distance * ii));
+-
+- if (dump_file)
+- fprintf (dump_file,
+- "pred st = %d; early_start = %d; latency = %d",
+- p_st, early_start, e->latency);
+-
+- if (e->type == TRUE_DEP && e->data_type == REG_DEP)
+- count_preds++;
+-
+- if (e->data_type == MEM_DEP)
+- end = MIN (end, SCHED_TIME (v_node) + ii - 1);
+- }
+- else if (dump_file)
+- fprintf (dump_file, "the node is not scheduled\n");
+-
+- }
+- for (e = u_node->out; e != 0; e = e->next_out)
+- {
+- ddg_node_ptr v_node = e->dest;
+-
+- if (dump_file)
+- {
+- fprintf (dump_file, "\nProcessing edge:");
+- print_ddg_edge (dump_file, e);
+- fprintf (dump_file,
+- "\nScheduling %d (%d) in psp_pss_not_empty,"
+- " checking s %d (%d): ", u_node->cuid, INSN_UID
+- (u_node->insn), v_node->cuid, INSN_UID
+- (v_node->insn));
+- }
+-
+- if (TEST_BIT (sched_nodes, v_node->cuid))
+- {
+- int s_st = SCHED_TIME (v_node);
+-
+- late_start = MIN (late_start,
+- s_st - e->latency
+- + (e->distance * ii));
+-
+- if (dump_file)
+- fprintf (dump_file,
+- "succ st = %d; late_start = %d; latency = %d",
+- s_st, late_start, e->latency);
+-
+- if (e->type == TRUE_DEP && e->data_type == REG_DEP)
+- count_succs++;
+-
+- if (e->data_type == MEM_DEP)
+- start = MAX (start, SCHED_TIME (v_node) - ii + 1);
+- }
+- else if (dump_file)
+- fprintf (dump_file, "the node is not scheduled\n");
+-
+- }
+- start = MAX (start, early_start);
+- end = MIN (end, MIN (early_start + ii, late_start + 1));
+- step = 1;
+- /* If there are more successors than predecessors schedule the
+- node close to it's successors. */
+- if (count_succs >= count_preds)
+- {
+- int old_start = start;
+-
+- start = end - 1;
+- end = old_start - 1;
+- step = -1;
+- }
+- }
+- else /* psp is empty && pss is empty. */
+- {
+- start = SCHED_ASAP (u_node);
+- end = start + ii;
+- step = 1;
+- }
++ }
++
++ /* Now that we've finalized the window, make END an exclusive rather
++ than an inclusive bound. */
++ end += step;
+
+ *start_p = start;
+ *step_p = step;
+@@ -1880,10 +1977,10 @@
+ if (dump_file)
+ fprintf (dump_file, "\nEmpty window: start=%d, end=%d, step=%d\n",
+ start, end, step);
+- return -1;
++ return -1;
+ }
+
+- return 0;
++ return 0;
+ }
+
+ /* Calculate MUST_PRECEDE/MUST_FOLLOW bitmaps of U_NODE; which is the
+@@ -1939,7 +2036,7 @@
+ SCHED_TIME (e->src) - (e->distance * ii) == first_cycle_in_window */
+ for (e = u_node->in; e != 0; e = e->next_in)
+ if (TEST_BIT (sched_nodes, e->src->cuid)
+- && ((SCHED_TIME (e->src) - (e->distance * ii)) ==
++ && ((SCHED_TIME (e->src->cuid) - (e->distance * ii)) ==
+ first_cycle_in_window))
+ {
+ if (dump_file)
+@@ -1964,7 +2061,7 @@
+ SCHED_TIME (e->dest) + (e->distance * ii) == last_cycle_in_window */
+ for (e = u_node->out; e != 0; e = e->next_out)
+ if (TEST_BIT (sched_nodes, e->dest->cuid)
+- && ((SCHED_TIME (e->dest) + (e->distance * ii)) ==
++ && ((SCHED_TIME (e->dest->cuid) + (e->distance * ii)) ==
+ last_cycle_in_window))
+ {
+ if (dump_file)
+@@ -1988,7 +2085,7 @@
+ last row of the scheduling window) */
+
+ static bool
+-try_scheduling_node_in_cycle (partial_schedule_ptr ps, ddg_node_ptr u_node,
++try_scheduling_node_in_cycle (partial_schedule_ptr ps,
+ int u, int cycle, sbitmap sched_nodes,
+ int *num_splits, sbitmap must_precede,
+ sbitmap must_follow)
+@@ -1997,11 +2094,10 @@
+ bool success = 0;
+
+ verify_partial_schedule (ps, sched_nodes);
+- psi = ps_add_node_check_conflicts (ps, u_node, cycle,
+- must_precede, must_follow);
++ psi = ps_add_node_check_conflicts (ps, u, cycle, must_precede, must_follow);
+ if (psi)
+ {
+- SCHED_TIME (u_node) = cycle;
++ SCHED_TIME (u) = cycle;
+ SET_BIT (sched_nodes, u);
+ success = 1;
+ *num_splits = 0;
+@@ -2062,8 +2158,8 @@
+ &step, &end) == 0)
+ {
+ if (dump_file)
+- fprintf (dump_file, "\nTrying to schedule node %d \
+- INSN = %d in (%d .. %d) step %d\n", u, (INSN_UID
++ fprintf (dump_file, "\nTrying to schedule node %d "
++ "INSN = %d in (%d .. %d) step %d\n", u, (INSN_UID
+ (g->nodes[u].insn)), start, end, step);
+
+ gcc_assert ((step > 0 && start < end)
+@@ -2081,7 +2177,7 @@
+ &tmp_precede, must_precede,
+ c, start, end, step);
+ success =
+- try_scheduling_node_in_cycle (ps, u_node, u, c,
++ try_scheduling_node_in_cycle (ps, u, c,
+ sched_nodes,
+ &num_splits, tmp_precede,
+ tmp_follow);
+@@ -2181,7 +2277,7 @@
+ for (crr_insn = rows_new[row];
+ crr_insn; crr_insn = crr_insn->next_in_row)
+ {
+- ddg_node_ptr u = crr_insn->node;
++ int u = crr_insn->id;
+ int new_time = SCHED_TIME (u) + (SCHED_TIME (u) / ii);
+
+ SCHED_TIME (u) = new_time;
+@@ -2202,7 +2298,7 @@
+ for (crr_insn = rows_new[row + 1];
+ crr_insn; crr_insn = crr_insn->next_in_row)
+ {
+- ddg_node_ptr u = crr_insn->node;
++ int u = crr_insn->id;
+ int new_time = SCHED_TIME (u) + (SCHED_TIME (u) / ii) + 1;
+
+ SCHED_TIME (u) = new_time;
+@@ -2242,24 +2338,24 @@
+ {
+ ddg_edge_ptr e;
+ int lower = INT_MIN, upper = INT_MAX;
+- ddg_node_ptr crit_pred = NULL;
+- ddg_node_ptr crit_succ = NULL;
++ int crit_pred = -1;
++ int crit_succ = -1;
+ int crit_cycle;
+
+ for (e = u_node->in; e != 0; e = e->next_in)
+ {
+- ddg_node_ptr v_node = e->src;
++ int v = e->src->cuid;
+
+- if (TEST_BIT (sched_nodes, v_node->cuid)
+- && (low == SCHED_TIME (v_node) + e->latency - (e->distance * ii)))
+- if (SCHED_TIME (v_node) > lower)
++ if (TEST_BIT (sched_nodes, v)
++ && (low == SCHED_TIME (v) + e->latency - (e->distance * ii)))
++ if (SCHED_TIME (v) > lower)
+ {
+- crit_pred = v_node;
+- lower = SCHED_TIME (v_node);
++ crit_pred = v;
++ lower = SCHED_TIME (v);
+ }
+ }
+
+- if (crit_pred != NULL)
++ if (crit_pred >= 0)
+ {
+ crit_cycle = SCHED_TIME (crit_pred) + 1;
+ return SMODULO (crit_cycle, ii);
+@@ -2267,17 +2363,18 @@
+
+ for (e = u_node->out; e != 0; e = e->next_out)
+ {
+- ddg_node_ptr v_node = e->dest;
+- if (TEST_BIT (sched_nodes, v_node->cuid)
+- && (up == SCHED_TIME (v_node) - e->latency + (e->distance * ii)))
+- if (SCHED_TIME (v_node) < upper)
++ int v = e->dest->cuid;
++
++ if (TEST_BIT (sched_nodes, v)
++ && (up == SCHED_TIME (v) - e->latency + (e->distance * ii)))
++ if (SCHED_TIME (v) < upper)
+ {
+- crit_succ = v_node;
+- upper = SCHED_TIME (v_node);
++ crit_succ = v;
++ upper = SCHED_TIME (v);
+ }
+ }
+
+- if (crit_succ != NULL)
++ if (crit_succ >= 0)
+ {
+ crit_cycle = SCHED_TIME (crit_succ);
+ return SMODULO (crit_cycle, ii);
+@@ -2301,10 +2398,10 @@
+
+ for (crr_insn = ps->rows[row]; crr_insn; crr_insn = crr_insn->next_in_row)
+ {
+- ddg_node_ptr u = crr_insn->node;
++ int u = crr_insn->id;
+
+ length++;
+- gcc_assert (TEST_BIT (sched_nodes, u->cuid));
++ gcc_assert (TEST_BIT (sched_nodes, u));
+ /* ??? Test also that all nodes of sched_nodes are in ps, perhaps by
+ popcount (sched_nodes) == number of insns in ps. */
+ gcc_assert (SCHED_TIME (u) >= ps->min_cycle);
+@@ -2719,6 +2816,7 @@
+ partial_schedule_ptr ps = XNEW (struct partial_schedule);
+ ps->rows = (ps_insn_ptr *) xcalloc (ii, sizeof (ps_insn_ptr));
+ ps->rows_length = (int *) xcalloc (ii, sizeof (int));
++ ps->reg_moves = NULL;
+ ps->ii = ii;
+ ps->history = history;
+ ps->min_cycle = INT_MAX;
+@@ -2753,8 +2851,16 @@
+ static void
+ free_partial_schedule (partial_schedule_ptr ps)
+ {
++ ps_reg_move_info *move;
++ unsigned int i;
++
+ if (!ps)
+ return;
++
++ FOR_EACH_VEC_ELT (ps_reg_move_info, ps->reg_moves, i, move)
++ sbitmap_free (move->uses);
++ VEC_free (ps_reg_move_info, heap, ps->reg_moves);
++
+ free_ps_insns (ps);
+ free (ps->rows);
+ free (ps->rows_length);
+@@ -2796,12 +2902,12 @@
+ fprintf (dump, "\n[ROW %d ]: ", i);
+ while (ps_i)
+ {
+- if (JUMP_P (ps_i->node->insn))
+- fprintf (dump, "%d (branch), ",
+- INSN_UID (ps_i->node->insn));
++ rtx insn = ps_rtl_insn (ps, ps_i->id);
++
++ if (JUMP_P (insn))
++ fprintf (dump, "%d (branch), ", INSN_UID (insn));
+ else
+- fprintf (dump, "%d, ",
+- INSN_UID (ps_i->node->insn));
++ fprintf (dump, "%d, ", INSN_UID (insn));
+
+ ps_i = ps_i->next_in_row;
+ }
+@@ -2810,11 +2916,11 @@
+
+ /* Creates an object of PS_INSN and initializes it to the given parameters. */
+ static ps_insn_ptr
+-create_ps_insn (ddg_node_ptr node, int cycle)
++create_ps_insn (int id, int cycle)
+ {
+ ps_insn_ptr ps_i = XNEW (struct ps_insn);
+
+- ps_i->node = node;
++ ps_i->id = id;
+ ps_i->next_in_row = NULL;
+ ps_i->prev_in_row = NULL;
+ ps_i->cycle = cycle;
+@@ -2879,10 +2985,11 @@
+ next_ps_i;
+ next_ps_i = next_ps_i->next_in_row)
+ {
+- if (must_follow && TEST_BIT (must_follow, next_ps_i->node->cuid)
++ if (must_follow
++ && TEST_BIT (must_follow, next_ps_i->id)
+ && ! first_must_follow)
+ first_must_follow = next_ps_i;
+- if (must_precede && TEST_BIT (must_precede, next_ps_i->node->cuid))
++ if (must_precede && TEST_BIT (must_precede, next_ps_i->id))
+ {
+ /* If we have already met a node that must follow, then
+ there is no possible column. */
+@@ -2893,8 +3000,8 @@
+ }
+ /* The closing branch must be the last in the row. */
+ if (must_precede
+- && TEST_BIT (must_precede, next_ps_i->node->cuid)
+- && JUMP_P (next_ps_i->node->insn))
++ && TEST_BIT (must_precede, next_ps_i->id)
++ && JUMP_P (ps_rtl_insn (ps, next_ps_i->id)))
+ return false;
+
+ last_in_row = next_ps_i;
+@@ -2903,7 +3010,7 @@
+ /* The closing branch is scheduled as well. Make sure there is no
+ dependent instruction after it as the branch should be the last
+ instruction in the row. */
+- if (JUMP_P (ps_i->node->insn))
++ if (JUMP_P (ps_rtl_insn (ps, ps_i->id)))
+ {
+ if (first_must_follow)
+ return false;
+@@ -2954,7 +3061,6 @@
+ {
+ ps_insn_ptr prev, next;
+ int row;
+- ddg_node_ptr next_node;
+
+ if (!ps || !ps_i)
+ return false;
+@@ -2964,11 +3070,9 @@
+ if (! ps_i->next_in_row)
+ return false;
+
+- next_node = ps_i->next_in_row->node;
+-
+ /* Check if next_in_row is dependent on ps_i, both having same sched
+ times (typically ANTI_DEP). If so, ps_i cannot skip over it. */
+- if (must_follow && TEST_BIT (must_follow, next_node->cuid))
++ if (must_follow && TEST_BIT (must_follow, ps_i->next_in_row->id))
+ return false;
+
+ /* Advance PS_I over its next_in_row in the doubly linked list. */
+@@ -2999,7 +3103,7 @@
+ before/after (respectively) the node pointed to by PS_I when scheduled
+ in the same cycle. */
+ static ps_insn_ptr
+-add_node_to_ps (partial_schedule_ptr ps, ddg_node_ptr node, int cycle,
++add_node_to_ps (partial_schedule_ptr ps, int id, int cycle,
+ sbitmap must_precede, sbitmap must_follow)
+ {
+ ps_insn_ptr ps_i;
+@@ -3008,7 +3112,7 @@
+ if (ps->rows_length[row] >= issue_rate)
+ return NULL;
+
+- ps_i = create_ps_insn (node, cycle);
++ ps_i = create_ps_insn (id, cycle);
+
+ /* Finds and inserts PS_I according to MUST_FOLLOW and
+ MUST_PRECEDE. */
+@@ -3060,7 +3164,7 @@
+ crr_insn;
+ crr_insn = crr_insn->next_in_row)
+ {
+- rtx insn = crr_insn->node->insn;
++ rtx insn = ps_rtl_insn (ps, crr_insn->id);
+
+ if (!NONDEBUG_INSN_P (insn))
+ continue;
+@@ -3097,7 +3201,7 @@
+ cuid N must be come before/after (respectively) the node pointed to by
+ PS_I when scheduled in the same cycle. */
+ ps_insn_ptr
+-ps_add_node_check_conflicts (partial_schedule_ptr ps, ddg_node_ptr n,
++ps_add_node_check_conflicts (partial_schedule_ptr ps, int n,
+ int c, sbitmap must_precede,
+ sbitmap must_follow)
+ {
+
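The hunks above complete the switch from storing a ddg_node_ptr in each ps_insn to storing a plain integer id, with ps_rtl_insn used whenever the RTL instruction itself is needed. A minimal sketch of such an accessor, assuming ids below the number of DDG nodes name original instructions while larger ids index the reg_moves vector added earlier in this series (ps_reg_move_info carrying an insn field is an assumption here, not something shown in this hunk):

    /* Sketch only: map a partial-schedule id back to its RTL insn.
       Ids below g->num_nodes are original instructions; anything larger
       is assumed to be a register move recorded in ps->reg_moves.  */
    static rtx
    ps_rtl_insn (partial_schedule_ptr ps, int id)
    {
      if (id < ps->g->num_nodes)
        return ps->g->nodes[id].insn;
      return VEC_index (ps_reg_move_info, ps->reg_moves,
                        id - ps->g->num_nodes)->insn;
    }
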
--- /dev/null
+2011-10-19 Andrew Stubbs <ams@codesourcery.com>
+
+ Backport from FSF:
+
+ 2011-09-09 Andrew Stubbs <ams@codesourcery.com>
+
+ gcc/
+ * config/arm/arm-cores.def (generic-armv7-a): New architecture.
+ * config/arm/arm-tables.opt: Regenerate.
+ * config/arm/arm-tune.md: Regenerate.
+ * config/arm/arm.c (arm_file_start): Output .arch directive when
+ user passes -mcpu=generic-*.
+ (arm_issue_rate): Add genericv7a support.
+ * config/arm/arm.h (EXTRA_SPECS): Add asm_cpu_spec.
+ (ASM_CPU_SPEC): New define.
+ * config/arm/elf.h (ASM_SPEC): Use %(asm_cpu_spec).
+ * config/arm/semi.h (ASM_SPEC): Likewise.
+ * doc/invoke.texi (ARM Options): Document -mcpu=generic-*
+ and -mtune=generic-*.
+
+=== modified file 'gcc/config/arm/arm-cores.def'
+--- old/gcc/config/arm/arm-cores.def 2011-06-14 16:00:30 +0000
++++ new/gcc/config/arm/arm-cores.def 2011-10-19 16:46:51 +0000
+@@ -124,6 +124,7 @@
+ ARM_CORE("mpcore", mpcore, 6K, FL_LDSCHED | FL_VFPV2, 9e)
+ ARM_CORE("arm1156t2-s", arm1156t2s, 6T2, FL_LDSCHED, v6t2)
+ ARM_CORE("arm1156t2f-s", arm1156t2fs, 6T2, FL_LDSCHED | FL_VFPV2, v6t2)
++ARM_CORE("generic-armv7-a", genericv7a, 7A, FL_LDSCHED, cortex)
+ ARM_CORE("cortex-a5", cortexa5, 7A, FL_LDSCHED, cortex_a5)
+ ARM_CORE("cortex-a8", cortexa8, 7A, FL_LDSCHED, cortex)
+ ARM_CORE("cortex-a9", cortexa9, 7A, FL_LDSCHED, cortex_a9)
+@@ -135,3 +136,4 @@
+ ARM_CORE("cortex-m3", cortexm3, 7M, FL_LDSCHED, cortex)
+ ARM_CORE("cortex-m1", cortexm1, 6M, FL_LDSCHED, cortex)
+ ARM_CORE("cortex-m0", cortexm0, 6M, FL_LDSCHED, cortex)
++
+
+=== modified file 'gcc/config/arm/arm-tune.md'
+--- old/gcc/config/arm/arm-tune.md 2011-06-14 14:37:30 +0000
++++ new/gcc/config/arm/arm-tune.md 2011-10-19 16:46:51 +0000
+@@ -1,5 +1,5 @@
+ ;; -*- buffer-read-only: t -*-
+ ;; Generated automatically by gentune.sh from arm-cores.def
+ (define_attr "tune"
+- "arm2,arm250,arm3,arm6,arm60,arm600,arm610,arm620,arm7,arm7d,arm7di,arm70,arm700,arm700i,arm710,arm720,arm710c,arm7100,arm7500,arm7500fe,arm7m,arm7dm,arm7dmi,arm8,arm810,strongarm,strongarm110,strongarm1100,strongarm1110,fa526,fa626,arm7tdmi,arm7tdmis,arm710t,arm720t,arm740t,arm9,arm9tdmi,arm920,arm920t,arm922t,arm940t,ep9312,arm10tdmi,arm1020t,arm9e,arm946es,arm966es,arm968es,arm10e,arm1020e,arm1022e,xscale,iwmmxt,iwmmxt2,fa606te,fa626te,fmp626,fa726te,arm926ejs,arm1026ejs,arm1136js,arm1136jfs,arm1176jzs,arm1176jzfs,mpcorenovfp,mpcore,arm1156t2s,arm1156t2fs,cortexa5,cortexa8,cortexa9,cortexa15,cortexr4,cortexr4f,cortexr5,cortexm4,cortexm3,cortexm1,cortexm0"
++ "arm2,arm250,arm3,arm6,arm60,arm600,arm610,arm620,arm7,arm7d,arm7di,arm70,arm700,arm700i,arm710,arm720,arm710c,arm7100,arm7500,arm7500fe,arm7m,arm7dm,arm7dmi,arm8,arm810,strongarm,strongarm110,strongarm1100,strongarm1110,fa526,fa626,arm7tdmi,arm7tdmis,arm710t,arm720t,arm740t,arm9,arm9tdmi,arm920,arm920t,arm922t,arm940t,ep9312,arm10tdmi,arm1020t,arm9e,arm946es,arm966es,arm968es,arm10e,arm1020e,arm1022e,xscale,iwmmxt,iwmmxt2,fa606te,fa626te,fmp626,fa726te,arm926ejs,arm1026ejs,arm1136js,arm1136jfs,arm1176jzs,arm1176jzfs,mpcorenovfp,mpcore,arm1156t2s,arm1156t2fs,genericv7a,cortexa5,cortexa8,cortexa9,cortexa15,cortexr4,cortexr4f,cortexr5,cortexm4,cortexm3,cortexm1,cortexm0"
+ (const (symbol_ref "((enum attr_tune) arm_tune)")))
+
+=== modified file 'gcc/config/arm/arm.c'
+--- old/gcc/config/arm/arm.c 2011-10-11 02:31:01 +0000
++++ new/gcc/config/arm/arm.c 2011-10-19 16:46:51 +0000
+@@ -22185,6 +22185,8 @@
+ const char *fpu_name;
+ if (arm_selected_arch)
+ asm_fprintf (asm_out_file, "\t.arch %s\n", arm_selected_arch->name);
++ else if (strncmp (arm_selected_cpu->name, "generic", 7) == 0)
++ asm_fprintf (asm_out_file, "\t.arch %s\n", arm_selected_cpu->name + 8);
+ else
+ asm_fprintf (asm_out_file, "\t.cpu %s\n", arm_selected_cpu->name);
+
+@@ -23717,6 +23719,7 @@
+ case cortexr4:
+ case cortexr4f:
+ case cortexr5:
++ case genericv7a:
+ case cortexa5:
+ case cortexa8:
+ case cortexa9:
+
+=== modified file 'gcc/config/arm/arm.h'
+--- old/gcc/config/arm/arm.h 2011-09-05 14:32:11 +0000
++++ new/gcc/config/arm/arm.h 2011-10-19 16:46:51 +0000
+@@ -198,6 +198,7 @@
+ Do not define this macro if it does not need to do anything. */
+ #define EXTRA_SPECS \
+ { "subtarget_cpp_spec", SUBTARGET_CPP_SPEC }, \
++ { "asm_cpu_spec", ASM_CPU_SPEC }, \
+ SUBTARGET_EXTRA_SPECS
+
+ #ifndef SUBTARGET_EXTRA_SPECS
+@@ -2278,4 +2279,8 @@
+ instruction. */
+ #define MAX_LDM_STM_OPS 4
+
++#define ASM_CPU_SPEC \
++ " %{mcpu=generic-*:-march=%*;" \
++ " :%{mcpu=*:-mcpu=%*} %{march=*:-march=%*}}"
++
+ #endif /* ! GCC_ARM_H */
+
+=== modified file 'gcc/config/arm/elf.h'
+--- old/gcc/config/arm/elf.h 2009-06-21 19:48:15 +0000
++++ new/gcc/config/arm/elf.h 2011-10-19 16:46:51 +0000
+@@ -56,8 +56,7 @@
+ #define ASM_SPEC "\
+ %{mbig-endian:-EB} \
+ %{mlittle-endian:-EL} \
+-%{mcpu=*:-mcpu=%*} \
+-%{march=*:-march=%*} \
++%(asm_cpu_spec) \
+ %{mapcs-*:-mapcs-%*} \
+ %(subtarget_asm_float_spec) \
+ %{mthumb-interwork:-mthumb-interwork} \
+
+=== modified file 'gcc/config/arm/semi.h'
+--- old/gcc/config/arm/semi.h 2007-08-02 09:49:31 +0000
++++ new/gcc/config/arm/semi.h 2011-10-19 16:46:51 +0000
+@@ -65,8 +65,7 @@
+ #define ASM_SPEC "\
+ %{fpic|fpie: -k} %{fPIC|fPIE: -k} \
+ %{mbig-endian:-EB} \
+-%{mcpu=*:-mcpu=%*} \
+-%{march=*:-march=%*} \
++%(asm_cpu_spec) \
+ %{mapcs-float:-mfloat} \
+ %{msoft-float:-mfloat-abi=soft} %{mhard-float:-mfloat-abi=hard} \
+ %{mfloat-abi=*} %{mfpu=*} \
+
+=== modified file 'gcc/doc/invoke.texi'
+--- old/gcc/doc/invoke.texi 2011-08-13 08:32:32 +0000
++++ new/gcc/doc/invoke.texi 2011-10-19 16:46:51 +0000
+@@ -10215,6 +10215,10 @@
+ @samp{cortex-m0},
+ @samp{xscale}, @samp{iwmmxt}, @samp{iwmmxt2}, @samp{ep9312}.
+
++@option{-mcpu=generic-@var{arch}} is also permissible, and is
++equivalent to @option{-march=@var{arch} -mtune=generic-@var{arch}}.
++See @option{-mtune} for more information.
++
+ @item -mtune=@var{name}
+ @opindex mtune
+ This option is very similar to the @option{-mcpu=} option, except that
+@@ -10226,6 +10230,13 @@
+ For some ARM implementations better performance can be obtained by using
+ this option.
+
++@option{-mtune=generic-@var{arch}} specifies that GCC should tune the
++performance for a blend of processors within architecture @var{arch}.
++The aim is to generate code that runs well on the current most popular
++processors, balancing between optimizations that benefit some CPUs in the
++range, and avoiding performance pitfalls of other CPUs. The effects of
++this option may change in future GCC versions as CPU models come and go.
++
+ @item -march=@var{name}
+ @opindex march
+ This specifies the name of the target ARM architecture. GCC uses this
+
--- /dev/null
+2011-10-19 Andrew Stubbs <ams@codesourcery.com>
+
+ Backport from FSF:
+
+ 2011-10-18 Andrew Stubbs <ams@codesourcery.com>
+
+ * config/arm/driver-arm.c (host_detect_local_cpu): Close the file
+ before exiting.
+
+ 2011-10-18 Andrew Stubbs <ams@codesourcery.com>
+
+ gcc/
+ * config.host (arm*-*-linux*): Add driver-arm.o and x-arm.
+ * config/arm/arm.opt: Add 'native' processor_type and
+ arm_arch enum values.
+ * config/arm/arm.h (host_detect_local_cpu): New prototype.
+ (EXTRA_SPEC_FUNCTIONS): New define.
+ (MCPU_MTUNE_NATIVE_SPECS): New define.
+ (DRIVER_SELF_SPECS): New define.
+ * config/arm/driver-arm.c: New file.
+ * config/arm/x-arm: New file.
+ * doc/invoke.texi (ARM Options): Document -mcpu=native,
+ -mtune=native and -march=native.
+
+=== modified file 'gcc/config.host'
+--- old/gcc/config.host 2011-02-15 09:49:14 +0000
++++ new/gcc/config.host 2011-10-19 17:01:50 +0000
+@@ -100,6 +100,14 @@
+ esac
+
+ case ${host} in
++ arm*-*-linux*)
++ case ${target} in
++ arm*-*-*)
++ host_extra_gcc_objs="driver-arm.o"
++ host_xmake_file="${host_xmake_file} arm/x-arm"
++ ;;
++ esac
++ ;;
+ alpha*-*-linux*)
+ case ${target} in
+ alpha*-*-linux*)
+
+=== modified file 'gcc/config/arm/arm.h'
+--- old/gcc/config/arm/arm.h 2011-10-19 16:46:51 +0000
++++ new/gcc/config/arm/arm.h 2011-10-19 17:01:50 +0000
+@@ -2283,4 +2283,21 @@
+ " %{mcpu=generic-*:-march=%*;" \
+ " :%{mcpu=*:-mcpu=%*} %{march=*:-march=%*}}"
+
++/* -mcpu=native handling only makes sense with compiler running on
++ an ARM chip. */
++#if defined(__arm__)
++extern const char *host_detect_local_cpu (int argc, const char **argv);
++# define EXTRA_SPEC_FUNCTIONS \
++ { "local_cpu_detect", host_detect_local_cpu },
++
++# define MCPU_MTUNE_NATIVE_SPECS \
++ " %{march=native:%<march=native %:local_cpu_detect(arch)}" \
++ " %{mcpu=native:%<mcpu=native %:local_cpu_detect(cpu)}" \
++ " %{mtune=native:%<mtune=native %:local_cpu_detect(tune)}"
++#else
++# define MCPU_MTUNE_NATIVE_SPECS ""
++#endif
++
++#define DRIVER_SELF_SPECS MCPU_MTUNE_NATIVE_SPECS
++
+ #endif /* ! GCC_ARM_H */
+
+=== modified file 'gcc/config/arm/arm.opt'
+--- old/gcc/config/arm/arm.opt 2011-10-11 02:31:01 +0000
++++ new/gcc/config/arm/arm.opt 2011-10-19 17:01:50 +0000
+@@ -48,6 +48,11 @@
+ Target RejectNegative Joined
+ Specify the name of the target architecture
+
++; Other arm_arch values are loaded from arm-tables.opt
++; but that is a generated file and this is an odd-one-out.
++EnumValue
++Enum(arm_arch) String(native) Value(-1) DriverOnly
++
+ marm
+ Target RejectNegative InverseMask(THUMB) Undocumented
+
+@@ -153,6 +158,11 @@
+ Target RejectNegative Joined
+ Tune code for the given processor
+
++; Other processor_type values are loaded from arm-tables.opt
++; but that is a generated file and this is an odd-one-out.
++EnumValue
++Enum(processor_type) String(native) Value(-1) DriverOnly
++
+ mwords-little-endian
+ Target Report RejectNegative Mask(LITTLE_WORDS)
+ Assume big endian bytes, little endian words
+
+=== added file 'gcc/config/arm/driver-arm.c'
+--- old/gcc/config/arm/driver-arm.c 1970-01-01 00:00:00 +0000
++++ new/gcc/config/arm/driver-arm.c 2011-10-19 17:07:55 +0000
+@@ -0,0 +1,149 @@
++/* Subroutines for the gcc driver.
++ Copyright (C) 2011 Free Software Foundation, Inc.
++
++This file is part of GCC.
++
++GCC is free software; you can redistribute it and/or modify
++it under the terms of the GNU General Public License as published by
++the Free Software Foundation; either version 3, or (at your option)
++any later version.
++
++GCC is distributed in the hope that it will be useful,
++but WITHOUT ANY WARRANTY; without even the implied warranty of
++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++GNU General Public License for more details.
++
++You should have received a copy of the GNU General Public License
++along with GCC; see the file COPYING3. If not see
++<http://www.gnu.org/licenses/>. */
++
++#include "config.h"
++#include "system.h"
++#include "coretypes.h"
++#include "tm.h"
++#include "configargs.h"
++
++struct vendor_cpu {
++ const char *part_no;
++ const char *arch_name;
++ const char *cpu_name;
++};
++
++static struct vendor_cpu arm_cpu_table[] = {
++ {"0x926", "armv5te", "arm926ej-s"},
++ {"0xa26", "armv5te", "arm1026ej-s"},
++ {"0xb02", "armv6k", "mpcore"},
++ {"0xb36", "armv6j", "arm1136j-s"},
++ {"0xb56", "armv6t2", "arm1156t2-s"},
++ {"0xb76", "armv6zk", "arm1176jz-s"},
++ {"0xc05", "armv7-a", "cortex-a5"},
++ {"0xc08", "armv7-a", "cortex-a8"},
++ {"0xc09", "armv7-a", "cortex-a9"},
++ {"0xc0f", "armv7-a", "cortex-a15"},
++ {"0xc14", "armv7-r", "cortex-r4"},
++ {"0xc15", "armv7-r", "cortex-r5"},
++ {"0xc20", "armv6-m", "cortex-m0"},
++ {"0xc21", "armv6-m", "cortex-m1"},
++ {"0xc23", "armv7-m", "cortex-m3"},
++ {"0xc24", "armv7e-m", "cortex-m4"},
++ {NULL, NULL, NULL}
++};
++
++struct {
++ const char *vendor_no;
++ const struct vendor_cpu *vendor_parts;
++} vendors[] = {
++ {"0x41", arm_cpu_table},
++ {NULL, NULL}
++};
++
++/* This will be called by the spec parser in gcc.c when it sees
++ a %:local_cpu_detect(args) construct. Currently it will be called
++ with either "arch", "cpu" or "tune" as argument depending on if
++ -march=native, -mcpu=native or -mtune=native is to be substituted.
++
++ It returns a string containing new command line parameters to be
++ put at the place of the above two options, depending on what CPU
++ this is executed. E.g. "-march=armv7-a" on a Cortex-A8 for
++ -march=native. If the routine can't detect a known processor,
++ the -march or -mtune option is discarded.
++
++ ARGC and ARGV are set depending on the actual arguments given
++ in the spec. */
++const char *
++host_detect_local_cpu (int argc, const char **argv)
++{
++ const char *val = NULL;
++ char buf[128];
++ FILE *f = NULL;
++ bool arch;
++ const struct vendor_cpu *cpu_table = NULL;
++
++ if (argc < 1)
++ goto not_found;
++
++ arch = strcmp (argv[0], "arch") == 0;
++ if (!arch && strcmp (argv[0], "cpu") != 0 && strcmp (argv[0], "tune"))
++ goto not_found;
++
++ f = fopen ("/proc/cpuinfo", "r");
++ if (f == NULL)
++ goto not_found;
++
++ while (fgets (buf, sizeof (buf), f) != NULL)
++ {
++ /* Ensure that CPU implementer is ARM (0x41). */
++ if (strncmp (buf, "CPU implementer", sizeof ("CPU implementer") - 1) == 0)
++ {
++ int i;
++ for (i = 0; vendors[i].vendor_no != NULL; i++)
++ if (strstr (buf, vendors[i].vendor_no) != NULL)
++ {
++ cpu_table = vendors[i].vendor_parts;
++ break;
++ }
++ }
++
++ /* Detect arch/cpu. */
++ if (strncmp (buf, "CPU part", sizeof ("CPU part") - 1) == 0)
++ {
++ int i;
++
++ if (cpu_table == NULL)
++ goto not_found;
++
++ for (i = 0; cpu_table[i].part_no != NULL; i++)
++ if (strstr (buf, cpu_table[i].part_no) != NULL)
++ {
++ val = arch ? cpu_table[i].arch_name : cpu_table[i].cpu_name;
++ break;
++ }
++ break;
++ }
++ }
++
++ fclose (f);
++
++ if (val == NULL)
++ goto not_found;
++
++ return concat ("-m", argv[0], "=", val, NULL);
++
++not_found:
++ {
++ unsigned int i;
++ unsigned int opt;
++ const char *search[] = {NULL, "arch"};
++
++ if (f)
++ fclose (f);
++
++ search[0] = argv[0];
++ for (opt = 0; opt < ARRAY_SIZE (search); opt++)
++ for (i = 0; i < ARRAY_SIZE (configure_default_options); i++)
++ if (strcmp (configure_default_options[i].name, search[opt]) == 0)
++ return concat ("-m", search[opt], "=",
++ configure_default_options[i].value, NULL);
++ return NULL;
++ }
++}
+
+=== added file 'gcc/config/arm/x-arm'
+--- old/gcc/config/arm/x-arm 1970-01-01 00:00:00 +0000
++++ new/gcc/config/arm/x-arm 2011-10-19 17:01:50 +0000
+@@ -0,0 +1,3 @@
++driver-arm.o: $(srcdir)/config/arm/driver-arm.c \
++ $(CONFIG_H) $(SYSTEM_H)
++ $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) $<
+
+=== modified file 'gcc/doc/invoke.texi'
+--- old/gcc/doc/invoke.texi 2011-10-19 16:46:51 +0000
++++ new/gcc/doc/invoke.texi 2011-10-19 17:01:50 +0000
+@@ -10215,10 +10215,16 @@
+ @samp{cortex-m0},
+ @samp{xscale}, @samp{iwmmxt}, @samp{iwmmxt2}, @samp{ep9312}.
+
++
+ @option{-mcpu=generic-@var{arch}} is also permissible, and is
+ equivalent to @option{-march=@var{arch} -mtune=generic-@var{arch}}.
+ See @option{-mtune} for more information.
+
++@option{-mcpu=native} causes the compiler to auto-detect the CPU
++of the build computer. At present, this feature is only supported on
++Linux, and not all architectures are recognised. If the auto-detect is
++unsuccessful the option has no effect.
++
+ @item -mtune=@var{name}
+ @opindex mtune
+ This option is very similar to the @option{-mcpu=} option, except that
+@@ -10237,6 +10243,11 @@
+ range, and avoiding performance pitfalls of other CPUs. The effects of
+ this option may change in future GCC versions as CPU models come and go.
+
++@option{-mtune=native} causes the compiler to auto-detect the CPU
++of the build computer. At present, this feature is only supported on
++Linux, and not all architectures are recognised. If the auto-detect is
++unsuccessful the option has no effect.
++
+ @item -march=@var{name}
+ @opindex march
+ This specifies the name of the target ARM architecture. GCC uses this
+@@ -10250,6 +10261,11 @@
+ @samp{armv7}, @samp{armv7-a}, @samp{armv7-r}, @samp{armv7-m},
+ @samp{iwmmxt}, @samp{iwmmxt2}, @samp{ep9312}.
+
++@option{-march=native} causes the compiler to auto-detect the architecture
++of the build computer. At present, this feature is only supported on
++Linux, and not all architectures are recognised. If the auto-detect is
++unsuccessful the option has no effect.
++
+ @item -mfpu=@var{name}
+ @itemx -mfpe=@var{number}
+ @itemx -mfp=@var{number}
+
--- /dev/null
+2011-10-19 Andrew Stubbs <ams@codesourcery.com>
+
+ Backport from FSF:
+
+ 2011-10-18 Andrew Stubbs <ams@codesourcery.com>
+
+ PR tree-optimization/50717
+
+ gcc/
+ * tree-ssa-math-opts.c (is_widening_mult_p): Remove the 'type'
+ parameter. Calculate 'type' from stmt.
+ (convert_mult_to_widen): Update call to is_widening_mult_p.
+ (convert_plusminus_to_widen): Likewise.
+
+ gcc/testsuite/
+ * gcc.dg/pr50717-1.c: New file.
+ * gcc.target/arm/wmul-12.c: Correct types.
+ * gcc.target/arm/wmul-8.c: Correct types.
+
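The testsuite changes below illustrate the point of the fix: the type used by is_widening_mult_p now comes from the statement itself, so a product that the user truncates or keeps in a narrower type can no longer be merged into a widening multiply-and-accumulate. A minimal contrast, assuming an ARM target where the accumulate form maps to smlal (a sketch, not taken from the patch):

    long long
    ok (int *b, int *c, long long acc)
    {
      return acc + (long long) *b * *c;   /* widening multiply-accumulate */
    }

    long long
    not_ok (int *b, int *c, long long acc)
    {
      int tmp = *b * *c;                  /* 32-bit product first... */
      return acc + tmp;                   /* ...so no widening form here */
    }
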
+=== added file 'gcc/testsuite/gcc.dg/pr50717-1.c'
+--- old/gcc/testsuite/gcc.dg/pr50717-1.c 1970-01-01 00:00:00 +0000
++++ new/gcc/testsuite/gcc.dg/pr50717-1.c 2011-10-19 14:42:50 +0000
+@@ -0,0 +1,26 @@
++/* PR tree-optimization/50717 */
++/* Ensure that widening multiply-and-accumulate is not used where integer
++ type promotion or users' casts should prevent it. */
++
++/* { dg-options "-O2 -fdump-tree-widening_mul" } */
++
++long long
++f (unsigned int a, char b, long long c)
++{
++ return (a * b) + c;
++}
++
++int
++g (short a, short b, int c)
++{
++ return (short)(a * b) + c;
++}
++
++int
++h (char a, char b, int c)
++{
++ return (char)(a * b) + c;
++}
++
++/* { dg-final { scan-tree-dump-times "WIDEN_MULT_PLUS_EXPR" 0 "widening_mul" } } */
++/* { dg-final { cleanup-tree-dump "widening_mul" } } */
+
+=== modified file 'gcc/testsuite/gcc.target/arm/wmul-12.c'
+--- old/gcc/testsuite/gcc.target/arm/wmul-12.c 2011-07-22 15:46:42 +0000
++++ new/gcc/testsuite/gcc.target/arm/wmul-12.c 2011-10-19 14:42:50 +0000
+@@ -4,8 +4,8 @@
+ long long
+ foo (int *b, int *c)
+ {
+- int tmp = *b * *c;
+- return 10 + (long long)tmp;
++ long long tmp = (long long)*b * *c;
++ return 10 + tmp;
+ }
+
+ /* { dg-final { scan-assembler "smlal" } } */
+
+=== modified file 'gcc/testsuite/gcc.target/arm/wmul-8.c'
+--- old/gcc/testsuite/gcc.target/arm/wmul-8.c 2011-07-15 14:16:54 +0000
++++ new/gcc/testsuite/gcc.target/arm/wmul-8.c 2011-10-19 14:42:50 +0000
+@@ -4,7 +4,7 @@
+ long long
+ foo (long long a, int *b, int *c)
+ {
+- return a + *b * *c;
++ return a + (long long)*b * *c;
+ }
+
+ /* { dg-final { scan-assembler "smlal" } } */
+
+=== modified file 'gcc/tree-ssa-math-opts.c'
+--- old/gcc/tree-ssa-math-opts.c 2011-09-08 20:11:43 +0000
++++ new/gcc/tree-ssa-math-opts.c 2011-10-19 14:42:50 +0000
+@@ -1351,10 +1351,12 @@
+ and *TYPE2_OUT would give the operands of the multiplication. */
+
+ static bool
+-is_widening_mult_p (tree type, gimple stmt,
++is_widening_mult_p (gimple stmt,
+ tree *type1_out, tree *rhs1_out,
+ tree *type2_out, tree *rhs2_out)
+ {
++ tree type = TREE_TYPE (gimple_assign_lhs (stmt));
++
+ if (TREE_CODE (type) != INTEGER_TYPE
+ && TREE_CODE (type) != FIXED_POINT_TYPE)
+ return false;
+@@ -1416,7 +1418,7 @@
+ if (TREE_CODE (type) != INTEGER_TYPE)
+ return false;
+
+- if (!is_widening_mult_p (type, stmt, &type1, &rhs1, &type2, &rhs2))
++ if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
+ return false;
+
+ to_mode = TYPE_MODE (type);
+@@ -1592,7 +1594,7 @@
+ if (code == PLUS_EXPR
+ && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
+ {
+- if (!is_widening_mult_p (type, rhs1_stmt, &type1, &mult_rhs1,
++ if (!is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
+ &type2, &mult_rhs2))
+ return false;
+ add_rhs = rhs2;
+@@ -1600,7 +1602,7 @@
+ }
+ else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
+ {
+- if (!is_widening_mult_p (type, rhs2_stmt, &type1, &mult_rhs1,
++ if (!is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
+ &type2, &mult_rhs2))
+ return false;
+ add_rhs = rhs1;
+
--- /dev/null
+2011-10-21 Andrew Stubbs <ams@codesourcery.com>
+
+ Backport from FSF mainline:
+
+ 2011-10-21 Andrew Stubbs <ams@codesourcery.com>
+
+ PR target/50809
+
+ gcc/
+ * config/arm/driver-arm.c (vendors): Make static.
+
+=== modified file 'gcc/config/arm/driver-arm.c'
+--- old/gcc/config/arm/driver-arm.c 2011-10-19 17:07:55 +0000
++++ new/gcc/config/arm/driver-arm.c 2011-10-21 19:27:47 +0000
+@@ -49,7 +49,7 @@
+ {NULL, NULL, NULL}
+ };
+
+-struct {
++static struct {
+ const char *vendor_no;
+ const struct vendor_cpu *vendor_parts;
+ } vendors[] = {
+
--- /dev/null
+2011-10-27 Ira Rosen <ira.rosen@linaro.org>
+
+ Backport from mainline:
+
+ 2011-10-16 Ira Rosen <ira.rosen@linaro.org>
+
+ gcc/
+ * tree-vect-stmts.c (vectorizable_load): For SLP without permutation
+ treat the first load of the node as the first element in its
+ interleaving chain.
+ * tree-vect-slp.c (vect_get_and_check_slp_defs): Swap the operands if
+ necessary and possible.
+ (vect_build_slp_tree): Add new argument. Allow load groups of any size
+ in basic blocks. Keep all the loads for further permutation check.
+ Use the new argument to determine if there is a permutation. Update
+ the recursive calls.
+ (vect_supported_load_permutation_p): Allow subchains of interleaving
+ chains in basic block vectorization.
+ (vect_analyze_slp_instance): Update the call to vect_build_slp_tree.
+ Check load permutation based on the new parameter.
+ (vect_schedule_slp_instance): Don't start from the first element in
+ interleaving chain unless the loads are permuted.
+
+ gcc/testsuite/
+ * gcc.dg/vect/bb-slp-29.c: New test.
+
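Besides the subchain handling exercised by bb-slp-29.c below, the change to vect_get_and_check_slp_defs lets basic-block SLP groups build when two statements present the operands of a commutative operation in opposite orders. A sketch of the kind of block this targets (illustrative only, not a test from the patch):

    void
    bar (int *__restrict__ dst, int *__restrict__ src, int y)
    {
      dst[0] = src[0] + y;     /* load + parameter */
      dst[1] = y + src[1];     /* parameter + load: operands swapped */
      dst[2] = src[2] + y;
      dst[3] = y + src[3];     /* with the swap, all four statements can
                                  still form one SLP group */
    }
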
+=== added file 'gcc/testsuite/gcc.dg/vect/bb-slp-29.c'
+--- old/gcc/testsuite/gcc.dg/vect/bb-slp-29.c 1970-01-01 00:00:00 +0000
++++ new/gcc/testsuite/gcc.dg/vect/bb-slp-29.c 2011-10-23 11:29:25 +0000
+@@ -0,0 +1,59 @@
++/* { dg-require-effective-target vect_int } */
++
++#include <stdarg.h>
++#include "tree-vect.h"
++
++#define A 3
++#define B 4
++#define N 256
++
++short src[N], dst[N];
++
++void foo (short * __restrict__ dst, short * __restrict__ src, int h, int stride, int dummy)
++{
++ int i;
++ h /= 16;
++ for (i = 0; i < h; i++)
++ {
++ dst[0] = A*src[0] + B*src[1];
++ dst[1] = A*src[1] + B*src[2];
++ dst[2] = A*src[2] + B*src[3];
++ dst[3] = A*src[3] + B*src[4];
++ dst[4] = A*src[4] + B*src[5];
++ dst[5] = A*src[5] + B*src[6];
++ dst[6] = A*src[6] + B*src[7];
++ dst[7] = A*src[7] + B*src[8];
++ dst += stride;
++ src += stride;
++ if (dummy == 32)
++ abort ();
++ }
++}
++
++
++int main (void)
++{
++ int i;
++
++ check_vect ();
++
++ for (i = 0; i < N; i++)
++ {
++ dst[i] = 0;
++ src[i] = i;
++ }
++
++ foo (dst, src, N, 8, 0);
++
++ for (i = 0; i < N/2; i++)
++ {
++ if (dst[i] != A * src[i] + B * src[i+1])
++ abort ();
++ }
++
++ return 0;
++}
++
++/* { dg-final { scan-tree-dump-times "basic block vectorized using SLP" 1 "slp" { target { vect_int_mult && vect_element_align } } } } */
++/* { dg-final { cleanup-tree-dump "slp" } } */
++
+
+=== modified file 'gcc/tree-vect-slp.c'
+--- old/gcc/tree-vect-slp.c 2011-10-06 11:08:08 +0000
++++ new/gcc/tree-vect-slp.c 2011-10-23 11:29:25 +0000
+@@ -115,13 +115,15 @@
+ {
+ tree oprnd;
+ unsigned int i, number_of_oprnds;
+- tree def;
++ tree def[2];
+ gimple def_stmt;
+ enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
+ stmt_vec_info stmt_info =
+ vinfo_for_stmt (VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0));
+ enum gimple_rhs_class rhs_class;
+ struct loop *loop = NULL;
++ enum tree_code rhs_code;
++ bool different_types = false;
+
+ if (loop_vinfo)
+ loop = LOOP_VINFO_LOOP (loop_vinfo);
+@@ -133,7 +135,7 @@
+ {
+ oprnd = gimple_op (stmt, i + 1);
+
+- if (!vect_is_simple_use (oprnd, loop_vinfo, bb_vinfo, &def_stmt, &def,
++ if (!vect_is_simple_use (oprnd, loop_vinfo, bb_vinfo, &def_stmt, &def[i],
+ &dt[i])
+ || (!def_stmt && dt[i] != vect_constant_def))
+ {
+@@ -188,11 +190,11 @@
+ switch (gimple_code (def_stmt))
+ {
+ case GIMPLE_PHI:
+- def = gimple_phi_result (def_stmt);
++ def[i] = gimple_phi_result (def_stmt);
+ break;
+
+ case GIMPLE_ASSIGN:
+- def = gimple_assign_lhs (def_stmt);
++ def[i] = gimple_assign_lhs (def_stmt);
+ break;
+
+ default:
+@@ -206,8 +208,8 @@
+ {
+ /* op0 of the first stmt of the group - store its info. */
+ *first_stmt_dt0 = dt[i];
+- if (def)
+- *first_stmt_def0_type = TREE_TYPE (def);
++ if (def[i])
++ *first_stmt_def0_type = TREE_TYPE (def[i]);
+ else
+ *first_stmt_const_oprnd = oprnd;
+
+@@ -227,8 +229,8 @@
+ {
+ /* op1 of the first stmt of the group - store its info. */
+ *first_stmt_dt1 = dt[i];
+- if (def)
+- *first_stmt_def1_type = TREE_TYPE (def);
++ if (def[i])
++ *first_stmt_def1_type = TREE_TYPE (def[i]);
+ else
+ {
+ /* We assume that the stmt contains only one constant
+@@ -249,22 +251,53 @@
+ the def-stmt/s of the first stmt. */
+ if ((i == 0
+ && (*first_stmt_dt0 != dt[i]
+- || (*first_stmt_def0_type && def
++ || (*first_stmt_def0_type && def[0]
+ && !types_compatible_p (*first_stmt_def0_type,
+- TREE_TYPE (def)))))
++ TREE_TYPE (def[0])))))
+ || (i == 1
+ && (*first_stmt_dt1 != dt[i]
+- || (*first_stmt_def1_type && def
++ || (*first_stmt_def1_type && def[1]
+ && !types_compatible_p (*first_stmt_def1_type,
+- TREE_TYPE (def)))))
+- || (!def
++ TREE_TYPE (def[1])))))
++ || (!def[i]
+ && !types_compatible_p (TREE_TYPE (*first_stmt_const_oprnd),
+- TREE_TYPE (oprnd))))
++ TREE_TYPE (oprnd)))
++ || different_types)
+ {
+- if (vect_print_dump_info (REPORT_SLP))
+- fprintf (vect_dump, "Build SLP failed: different types ");
++ if (i != number_of_oprnds - 1)
++ different_types = true;
++ else
++ {
++ if (is_gimple_assign (stmt)
++ && (rhs_code = gimple_assign_rhs_code (stmt))
++ && TREE_CODE_CLASS (rhs_code) == tcc_binary
++ && commutative_tree_code (rhs_code)
++ && *first_stmt_dt0 == dt[1]
++ && *first_stmt_dt1 == dt[0]
++ && def[0] && def[1]
++ && !(*first_stmt_def0_type
++ && !types_compatible_p (*first_stmt_def0_type,
++ TREE_TYPE (def[1])))
++ && !(*first_stmt_def1_type
++ && !types_compatible_p (*first_stmt_def1_type,
++ TREE_TYPE (def[0]))))
++ {
++ if (vect_print_dump_info (REPORT_SLP))
++ {
++ fprintf (vect_dump, "Swapping operands of ");
++ print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
++ }
++ swap_tree_operands (stmt, gimple_assign_rhs1_ptr (stmt),
++ gimple_assign_rhs2_ptr (stmt));
++ }
++ else
++ {
++ if (vect_print_dump_info (REPORT_SLP))
++ fprintf (vect_dump, "Build SLP failed: different types ");
+
+- return false;
++ return false;
++ }
++ }
+ }
+ }
+ }
+@@ -278,10 +311,10 @@
+
+ case vect_internal_def:
+ case vect_reduction_def:
+- if (i == 0)
++ if ((i == 0 && !different_types) || (i == 1 && different_types))
+ VEC_safe_push (gimple, heap, *def_stmts0, def_stmt);
+ else
+- VEC_safe_push (gimple, heap, *def_stmts1, def_stmt);
++ VEC_safe_push (gimple, heap, *def_stmts1, def_stmt);
+ break;
+
+ default:
+@@ -289,7 +322,7 @@
+ if (vect_print_dump_info (REPORT_SLP))
+ {
+ fprintf (vect_dump, "Build SLP failed: illegal type of def ");
+- print_generic_expr (vect_dump, def, TDF_SLIM);
++ print_generic_expr (vect_dump, def[i], TDF_SLIM);
+ }
+
+ return false;
+@@ -312,7 +345,7 @@
+ int ncopies_for_cost, unsigned int *max_nunits,
+ VEC (int, heap) **load_permutation,
+ VEC (slp_tree, heap) **loads,
+- unsigned int vectorization_factor)
++ unsigned int vectorization_factor, bool *loads_permuted)
+ {
+ VEC (gimple, heap) *def_stmts0 = VEC_alloc (gimple, heap, group_size);
+ VEC (gimple, heap) *def_stmts1 = VEC_alloc (gimple, heap, group_size);
+@@ -523,7 +556,9 @@
+
+ /* Check that the size of interleaved loads group is not
+ greater than the SLP group size. */
+- if (DR_GROUP_SIZE (vinfo_for_stmt (stmt)) > ncopies * group_size)
++ if (loop_vinfo
++ && DR_GROUP_SIZE (vinfo_for_stmt (stmt))
++ > ncopies * group_size)
+ {
+ if (vect_print_dump_info (REPORT_SLP))
+ {
+@@ -644,19 +679,22 @@
+ /* Strided loads were reached - stop the recursion. */
+ if (stop_recursion)
+ {
++ VEC_safe_push (slp_tree, heap, *loads, *node);
+ if (permutation)
+ {
+- VEC_safe_push (slp_tree, heap, *loads, *node);
++
++ *loads_permuted = true;
+ *inside_cost
+ += targetm.vectorize.builtin_vectorization_cost (vec_perm, NULL, 0)
+ * group_size;
+ }
+ else
+- {
+- /* We don't check here complex numbers chains, so we keep them in
+- LOADS for further check in vect_supported_load_permutation_p. */
++ {
++ /* We don't check here complex numbers chains, so we set
++ LOADS_PERMUTED for further check in
++ vect_supported_load_permutation_p. */
+ if (rhs_code == REALPART_EXPR || rhs_code == IMAGPART_EXPR)
+- VEC_safe_push (slp_tree, heap, *loads, *node);
++ *loads_permuted = true;
+ }
+
+ return true;
+@@ -675,7 +713,7 @@
+ if (!vect_build_slp_tree (loop_vinfo, bb_vinfo, &left_node, group_size,
+ inside_cost, outside_cost, ncopies_for_cost,
+ max_nunits, load_permutation, loads,
+- vectorization_factor))
++ vectorization_factor, loads_permuted))
+ return false;
+
+ SLP_TREE_LEFT (*node) = left_node;
+@@ -693,7 +731,7 @@
+ if (!vect_build_slp_tree (loop_vinfo, bb_vinfo, &right_node, group_size,
+ inside_cost, outside_cost, ncopies_for_cost,
+ max_nunits, load_permutation, loads,
+- vectorization_factor))
++ vectorization_factor, loads_permuted))
+ return false;
+
+ SLP_TREE_RIGHT (*node) = right_node;
+@@ -879,8 +917,10 @@
+ bool supported, bad_permutation = false;
+ sbitmap load_index;
+ slp_tree node, other_complex_node;
+- gimple stmt, first = NULL, other_node_first;
++ gimple stmt, first = NULL, other_node_first, load, next_load, first_load;
+ unsigned complex_numbers = 0;
++ struct data_reference *dr;
++ bb_vec_info bb_vinfo;
+
+ /* FORNOW: permutations are only supported in SLP. */
+ if (!slp_instn)
+@@ -1040,6 +1080,76 @@
+ }
+ }
+
++ /* In basic block vectorization we allow any subchain of an interleaving
++ chain.
++ FORNOW: not supported in loop SLP because of realignment complications. */
++ bb_vinfo = STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt));
++ bad_permutation = false;
++ /* Check that for every node in the instance the loads form a subchain. */
++ if (bb_vinfo)
++ {
++ FOR_EACH_VEC_ELT (slp_tree, SLP_INSTANCE_LOADS (slp_instn), i, node)
++ {
++ next_load = NULL;
++ first_load = NULL;
++ FOR_EACH_VEC_ELT (gimple, SLP_TREE_SCALAR_STMTS (node), j, load)
++ {
++ if (!first_load)
++ first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (load));
++ else if (first_load
++ != DR_GROUP_FIRST_DR (vinfo_for_stmt (load)))
++ {
++ bad_permutation = true;
++ break;
++ }
++
++ if (j != 0 && next_load != load)
++ {
++ bad_permutation = true;
++ break;
++ }
++
++ next_load = DR_GROUP_NEXT_DR (vinfo_for_stmt (load));
++ }
++
++ if (bad_permutation)
++ break;
++ }
++
++ /* Check that the alignment of the first load in every subchain, i.e.,
++ the first statement in every load node, is supported. */
++ if (!bad_permutation)
++ {
++ FOR_EACH_VEC_ELT (slp_tree, SLP_INSTANCE_LOADS (slp_instn), i, node)
++ {
++ first_load = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0);
++ if (first_load
++ != DR_GROUP_FIRST_DR (vinfo_for_stmt (first_load)))
++ {
++ dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
++ if (vect_supportable_dr_alignment (dr, false)
++ == dr_unaligned_unsupported)
++ {
++ if (vect_print_dump_info (REPORT_SLP))
++ {
++ fprintf (vect_dump, "unsupported unaligned load ");
++ print_gimple_stmt (vect_dump, first_load, 0,
++ TDF_SLIM);
++ }
++ bad_permutation = true;
++ break;
++ }
++ }
++ }
++
++ if (!bad_permutation)
++ {
++ VEC_free (int, heap, SLP_INSTANCE_LOAD_PERMUTATION (slp_instn));
++ return true;
++ }
++ }
++ }
++
+ /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
+ GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
+ well (unless it's reduction). */
+@@ -1149,6 +1259,7 @@
+ VEC (int, heap) *load_permutation;
+ VEC (slp_tree, heap) *loads;
+ struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
++ bool loads_permuted = false;
+
+ if (dr)
+ {
+@@ -1238,7 +1349,7 @@
+ if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
+ &inside_cost, &outside_cost, ncopies_for_cost,
+ &max_nunits, &load_permutation, &loads,
+- vectorization_factor))
++ vectorization_factor, &loads_permuted))
+ {
+ /* Calculate the unrolling factor based on the smallest type. */
+ if (max_nunits > nunits)
+@@ -1263,7 +1374,8 @@
+ SLP_INSTANCE_LOADS (new_instance) = loads;
+ SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL;
+ SLP_INSTANCE_LOAD_PERMUTATION (new_instance) = load_permutation;
+- if (VEC_length (slp_tree, loads))
++
++ if (loads_permuted)
+ {
+ if (!vect_supported_load_permutation_p (new_instance, group_size,
+ load_permutation))
+@@ -2542,10 +2654,11 @@
+ /* Loads should be inserted before the first load. */
+ if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
+ && STMT_VINFO_STRIDED_ACCESS (stmt_info)
+- && !REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
++ && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))
++ && SLP_INSTANCE_LOAD_PERMUTATION (instance))
+ si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
+ else if (is_pattern_stmt_p (stmt_info))
+- si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
++ si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
+ else
+ si = gsi_for_stmt (stmt);
+
+
+=== modified file 'gcc/tree-vect-stmts.c'
+--- old/gcc/tree-vect-stmts.c 2011-10-16 12:16:07 +0000
++++ new/gcc/tree-vect-stmts.c 2011-10-23 11:29:25 +0000
+@@ -4285,6 +4285,11 @@
+ if (strided_load)
+ {
+ first_stmt = DR_GROUP_FIRST_DR (stmt_info);
++ if (slp
++ && !SLP_INSTANCE_LOAD_PERMUTATION (slp_node_instance)
++ && first_stmt != VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0))
++ first_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0);
++
+ /* Check if the chain of loads is already vectorized. */
+ if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
+ {
+
--- /dev/null
+ 2011-10-18 Ira Rosen <ira.rosen@linaro.org>
+
+ gcc/
+ * doc/md.texi (vec_widen_ushiftl_hi, vec_widen_ushiftl_lo,
+ vec_widen_sshiftl_hi, vec_widen_sshiftl_lo): Document.
+ * tree-pretty-print.c (dump_generic_node): Handle WIDEN_LSHIFT_EXPR,
+ VEC_WIDEN_LSHIFT_HI_EXPR and VEC_WIDEN_LSHIFT_LO_EXPR.
+ (op_code_prio): Likewise.
+ (op_symbol_code): Handle WIDEN_LSHIFT_EXPR.
+ * optabs.c (optab_for_tree_code): Handle
+ VEC_WIDEN_LSHIFT_HI_EXPR and VEC_WIDEN_LSHIFT_LO_EXPR.
+ (init_optabs): Initialize optab codes for vec_widen_u/sshiftl_hi/lo.
+ * optabs.h (enum optab_index): Add OTI_vec_widen_u/sshiftl_hi/lo.
+ * genopinit.c (optabs): Initialize the new optabs.
+ * expr.c (expand_expr_real_2): Handle
+ VEC_WIDEN_LSHIFT_HI_EXPR and VEC_WIDEN_LSHIFT_LO_EXPR.
+ * gimple-pretty-print.c (dump_binary_rhs): Likewise.
+ * tree-vectorizer.h (NUM_PATTERNS): Increase to 8.
+ * tree.def (WIDEN_LSHIFT_EXPR, VEC_WIDEN_LSHIFT_HI_EXPR,
+ VEC_WIDEN_LSHIFT_LO_EXPR): New.
+ * cfgexpand.c (expand_debug_expr): Handle new tree codes.
+ * tree-vect-patterns.c (vect_vect_recog_func_ptrs): Add
+ vect_recog_widen_shift_pattern.
+ (vect_handle_widen_mult_by_const): Rename...
+ (vect_handle_widen_op_by_const): ...to this. Handle shifts.
+ Add a new argument, update documentation.
+ (vect_recog_widen_mult_pattern): Assume that only second
+ operand can be constant. Update call to
+ vect_handle_widen_op_by_const.
+ (vect_recog_over_widening_pattern): Fix typo.
+ (vect_recog_widen_shift_pattern): New.
+ * tree-vect-stmts.c (vectorizable_type_promotion): Handle
+ widening shifts.
+ (supportable_widening_operation): Likewise.
+ * tree-inline.c (estimate_operator_cost): Handle new tree codes.
+ * tree-vect-generic.c (expand_vector_operations_1): Likewise.
+ * tree-cfg.c (verify_gimple_assign_binary): Likewise.
+ * config/arm/neon.md (neon_vec_<US>shiftl_<mode>): New.
+ (vec_widen_<US>shiftl_lo_<mode>, neon_vec_<US>shiftl_hi_<mode>,
+ vec_widen_<US>shiftl_hi_<mode>, neon_vec_<US>shift_left_<mode>):
+ Likewise.
+ * config/arm/predicates.md (const_neon_scalar_shift_amount_operand):
+ New.
+ * config/arm/iterators.md (V_innermode): New.
+ * tree-vect-slp.c (vect_build_slp_tree): Require same shift operand
+ for widening shift.
+
+ gcc/testsuite
+ * lib/target-supports.exp
+ (check_effective_target_vect_widen_shift): New.
+ * gcc.dg/vect/vect-widen-shift-s16.c: New.
+ * gcc.dg/vect/vect-widen-shift-s8.c: New.
+ * gcc.dg/vect/vect-widen-shift-u16.c: New.
+ * gcc.dg/vect/vect-widen-shift-u8.c: New.
+
+ 2011-10-06 Jakub Jelinek <jakub@redhat.com>
+
+ gcc/
+ * tree-vect-patterns.c (vect_pattern_recog_1): Use
+ vect_recog_func_ptr typedef for the first argument.
+ (vect_pattern_recog): Rename vect_recog_func_ptr variable
+ to vect_recog_func, use vect_recog_func_ptr typedef for it.
+
+ 2011-10-16 Ira Rosen <ira.rosen@linaro.org>
+
+ gcc/
+ PR tree-optimization/50727
+ * tree-vect-patterns.c (vect_operation_fits_smaller_type): Add
+ DEF_STMT to the list of statements to be replaced by the
+ pattern statements.
+
+ 2011-10-09 Ira Rosen <ira.rosen@linaro.org>
+
+ gcc/
+ PR tree-optimization/50635
+ * tree-vect-patterns.c (vect_handle_widen_mult_by_const): Add
+ DEF_STMT to the list of statements to be replaced by the
+ pattern statements.
+ (vect_handle_widen_mult_by_const): Don't check TYPE_OUT.
+
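The new widening-shift support targets code where a narrow value is promoted and then shifted left by a constant; the pattern recognizer rewrites the promotion plus shift as a single WIDEN_LSHIFT_EXPR, which NEON implements with vshll. A minimal example of the shape being recognized (the new vect-widen-shift-*.c tests below cover the same idea in more detail):

    void
    widen_shift (unsigned char *src, unsigned int *dst, int n)
    {
      int i;
      for (i = 0; i < n; i++)
        dst[i] = src[i] << 5;   /* char promoted, then shifted by a
                                   constant: one widening shift per
                                   element after pattern recognition */
    }
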
+=== modified file 'gcc/cfgexpand.c'
+--- old/gcc/cfgexpand.c 2011-07-01 09:19:21 +0000
++++ new/gcc/cfgexpand.c 2011-10-23 13:33:07 +0000
+@@ -3215,6 +3215,8 @@
+ case VEC_UNPACK_LO_EXPR:
+ case VEC_WIDEN_MULT_HI_EXPR:
+ case VEC_WIDEN_MULT_LO_EXPR:
++ case VEC_WIDEN_LSHIFT_HI_EXPR:
++ case VEC_WIDEN_LSHIFT_LO_EXPR:
+ return NULL;
+
+ /* Misc codes. */
+
+=== modified file 'gcc/config/arm/iterators.md'
+--- old/gcc/config/arm/iterators.md 2011-09-06 14:29:24 +0000
++++ new/gcc/config/arm/iterators.md 2011-10-23 13:33:07 +0000
+@@ -388,6 +388,9 @@
+ (define_mode_attr qhs_extenddi_cstr [(SI "r") (HI "rm") (QI "rUq")])
+ (define_mode_attr qhs_zextenddi_cstr [(SI "r") (HI "rm") (QI "rm")])
+
++;; Mode attribute for vshll.
++(define_mode_attr V_innermode [(V8QI "QI") (V4HI "HI") (V2SI "SI")])
++
+ ;;----------------------------------------------------------------------------
+ ;; Code attributes
+ ;;----------------------------------------------------------------------------
+
+=== modified file 'gcc/config/arm/neon.md'
+--- old/gcc/config/arm/neon.md 2011-10-03 01:32:17 +0000
++++ new/gcc/config/arm/neon.md 2011-10-23 13:33:07 +0000
+@@ -5316,6 +5316,44 @@
+ }
+ )
+
++(define_insn "neon_vec_<US>shiftl_<mode>"
++ [(set (match_operand:<V_widen> 0 "register_operand" "=w")
++ (SE:<V_widen> (ashift:VW (match_operand:VW 1 "register_operand" "w")
++ (match_operand:<V_innermode> 2 "const_neon_scalar_shift_amount_operand" ""))))]
++ "TARGET_NEON"
++{
++ return "vshll.<US><V_sz_elem> %q0, %P1, %2";
++}
++ [(set_attr "neon_type" "neon_shift_1")]
++)
++
++(define_expand "vec_widen_<US>shiftl_lo_<mode>"
++ [(match_operand:<V_unpack> 0 "register_operand" "")
++ (SE:<V_unpack> (match_operand:VU 1 "register_operand" ""))
++ (match_operand:SI 2 "immediate_operand" "i")]
++ "TARGET_NEON && !BYTES_BIG_ENDIAN"
++ {
++ emit_insn (gen_neon_vec_<US>shiftl_<V_half> (operands[0],
++ simplify_gen_subreg (<V_HALF>mode, operands[1], <MODE>mode, 0),
++ operands[2]));
++ DONE;
++ }
++)
++
++(define_expand "vec_widen_<US>shiftl_hi_<mode>"
++ [(match_operand:<V_unpack> 0 "register_operand" "")
++ (SE:<V_unpack> (match_operand:VU 1 "register_operand" ""))
++ (match_operand:SI 2 "immediate_operand" "i")]
++ "TARGET_NEON && !BYTES_BIG_ENDIAN"
++ {
++ emit_insn (gen_neon_vec_<US>shiftl_<V_half> (operands[0],
++ simplify_gen_subreg (<V_HALF>mode, operands[1], <MODE>mode,
++ GET_MODE_SIZE (<V_HALF>mode)),
++ operands[2]));
++ DONE;
++ }
++)
++
+ ;; Vectorize for non-neon-quad case
+ (define_insn "neon_unpack<US>_<mode>"
+ [(set (match_operand:<V_widen> 0 "register_operand" "=w")
+@@ -5392,6 +5430,34 @@
+ }
+ )
+
++(define_expand "vec_widen_<US>shiftl_hi_<mode>"
++ [(match_operand:<V_double_width> 0 "register_operand" "")
++ (SE:<V_double_width> (match_operand:VDI 1 "register_operand" ""))
++ (match_operand:SI 2 "immediate_operand" "i")]
++ "TARGET_NEON"
++ {
++ rtx tmpreg = gen_reg_rtx (<V_widen>mode);
++ emit_insn (gen_neon_vec_<US>shiftl_<mode> (tmpreg, operands[1], operands[2]));
++ emit_insn (gen_neon_vget_high<V_widen_l> (operands[0], tmpreg));
++
++ DONE;
++ }
++)
++
++(define_expand "vec_widen_<US>shiftl_lo_<mode>"
++ [(match_operand:<V_double_width> 0 "register_operand" "")
++ (SE:<V_double_width> (match_operand:VDI 1 "register_operand" ""))
++ (match_operand:SI 2 "immediate_operand" "i")]
++ "TARGET_NEON"
++ {
++ rtx tmpreg = gen_reg_rtx (<V_widen>mode);
++ emit_insn (gen_neon_vec_<US>shiftl_<mode> (tmpreg, operands[1], operands[2]));
++ emit_insn (gen_neon_vget_low<V_widen_l> (operands[0], tmpreg));
++
++ DONE;
++ }
++)
++
+ ;; The case when using all quad registers.
+ (define_insn "vec_pack_trunc_<mode>"
+ [(set (match_operand:<V_narrow_pack> 0 "register_operand" "=&w")
+
+=== modified file 'gcc/config/arm/predicates.md'
+--- old/gcc/config/arm/predicates.md 2011-10-10 11:43:28 +0000
++++ new/gcc/config/arm/predicates.md 2011-10-23 13:33:07 +0000
+@@ -136,6 +136,11 @@
+ (match_operand 0 "s_register_operand"))
+ (match_operand 0 "const_int_operand")))
+
++(define_predicate "const_neon_scalar_shift_amount_operand"
++ (and (match_code "const_int")
++ (match_test "((unsigned HOST_WIDE_INT) INTVAL (op)) <= GET_MODE_BITSIZE (mode)
++ && ((unsigned HOST_WIDE_INT) INTVAL (op)) > 0")))
++
+ (define_predicate "arm_add_operand"
+ (ior (match_operand 0 "arm_rhs_operand")
+ (match_operand 0 "arm_neg_immediate_operand")))
+
+=== modified file 'gcc/doc/md.texi'
+--- old/gcc/doc/md.texi 2011-08-13 08:32:32 +0000
++++ new/gcc/doc/md.texi 2011-10-23 13:33:07 +0000
+@@ -4230,6 +4230,17 @@
+ elements of the two vectors, and put the N/2 products of size 2*S in the
+ output vector (operand 0).
+
++@cindex @code{vec_widen_ushiftl_hi_@var{m}} instruction pattern
++@cindex @code{vec_widen_ushiftl_lo_@var{m}} instruction pattern
++@cindex @code{vec_widen_sshiftl_hi_@var{m}} instruction pattern
++@cindex @code{vec_widen_sshiftl_lo_@var{m}} instruction pattern
++@item @samp{vec_widen_ushiftl_hi_@var{m}}, @samp{vec_widen_ushiftl_lo_@var{m}}
++@itemx @samp{vec_widen_sshiftl_hi_@var{m}}, @samp{vec_widen_sshiftl_lo_@var{m}}
++Signed/Unsigned widening shift left. The first input (operand 1) is a vector
++with N signed/unsigned elements of size S@. Operand 2 is a constant. Shift
++the high/low elements of operand 1, and put the N/2 results of size 2*S in the
++output vector (operand 0).
++
+ @cindex @code{mulhisi3} instruction pattern
+ @item @samp{mulhisi3}
+ Multiply operands 1 and 2, which have mode @code{HImode}, and store
+
+=== modified file 'gcc/expr.c'
+--- old/gcc/expr.c 2011-08-25 11:42:09 +0000
++++ new/gcc/expr.c 2011-10-23 13:33:07 +0000
+@@ -8290,6 +8290,19 @@
+ return target;
+ }
+
++ case VEC_WIDEN_LSHIFT_HI_EXPR:
++ case VEC_WIDEN_LSHIFT_LO_EXPR:
++ {
++ tree oprnd0 = treeop0;
++ tree oprnd1 = treeop1;
++
++ expand_operands (oprnd0, oprnd1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
++ target = expand_widen_pattern_expr (ops, op0, op1, NULL_RTX,
++ target, unsignedp);
++ gcc_assert (target);
++ return target;
++ }
++
+ case VEC_PACK_TRUNC_EXPR:
+ case VEC_PACK_SAT_EXPR:
+ case VEC_PACK_FIX_TRUNC_EXPR:
+
+=== modified file 'gcc/genopinit.c'
+--- old/gcc/genopinit.c 2011-07-15 13:06:31 +0000
++++ new/gcc/genopinit.c 2011-10-23 13:33:07 +0000
+@@ -268,6 +268,10 @@
+ "set_optab_handler (vec_widen_umult_lo_optab, $A, CODE_FOR_$(vec_widen_umult_lo_$a$))",
+ "set_optab_handler (vec_widen_smult_hi_optab, $A, CODE_FOR_$(vec_widen_smult_hi_$a$))",
+ "set_optab_handler (vec_widen_smult_lo_optab, $A, CODE_FOR_$(vec_widen_smult_lo_$a$))",
++ "set_optab_handler (vec_widen_ushiftl_hi_optab, $A, CODE_FOR_$(vec_widen_ushiftl_hi_$a$))",
++ "set_optab_handler (vec_widen_ushiftl_lo_optab, $A, CODE_FOR_$(vec_widen_ushiftl_lo_$a$))",
++ "set_optab_handler (vec_widen_sshiftl_hi_optab, $A, CODE_FOR_$(vec_widen_sshiftl_hi_$a$))",
++ "set_optab_handler (vec_widen_sshiftl_lo_optab, $A, CODE_FOR_$(vec_widen_sshiftl_lo_$a$))",
+ "set_optab_handler (vec_unpacks_hi_optab, $A, CODE_FOR_$(vec_unpacks_hi_$a$))",
+ "set_optab_handler (vec_unpacks_lo_optab, $A, CODE_FOR_$(vec_unpacks_lo_$a$))",
+ "set_optab_handler (vec_unpacku_hi_optab, $A, CODE_FOR_$(vec_unpacku_hi_$a$))",
+
+=== modified file 'gcc/gimple-pretty-print.c'
+--- old/gcc/gimple-pretty-print.c 2011-05-05 15:42:22 +0000
++++ new/gcc/gimple-pretty-print.c 2011-10-23 13:33:07 +0000
+@@ -343,6 +343,8 @@
+ case VEC_EXTRACT_ODD_EXPR:
+ case VEC_INTERLEAVE_HIGH_EXPR:
+ case VEC_INTERLEAVE_LOW_EXPR:
++ case VEC_WIDEN_LSHIFT_HI_EXPR:
++ case VEC_WIDEN_LSHIFT_LO_EXPR:
+ for (p = tree_code_name [(int) code]; *p; p++)
+ pp_character (buffer, TOUPPER (*p));
+ pp_string (buffer, " <");
+
+=== modified file 'gcc/optabs.c'
+--- old/gcc/optabs.c 2011-08-11 15:46:01 +0000
++++ new/gcc/optabs.c 2011-10-23 13:33:07 +0000
+@@ -454,6 +454,14 @@
+ return TYPE_UNSIGNED (type) ?
+ vec_widen_umult_lo_optab : vec_widen_smult_lo_optab;
+
++ case VEC_WIDEN_LSHIFT_HI_EXPR:
++ return TYPE_UNSIGNED (type) ?
++ vec_widen_ushiftl_hi_optab : vec_widen_sshiftl_hi_optab;
++
++ case VEC_WIDEN_LSHIFT_LO_EXPR:
++ return TYPE_UNSIGNED (type) ?
++ vec_widen_ushiftl_lo_optab : vec_widen_sshiftl_lo_optab;
++
+ case VEC_UNPACK_HI_EXPR:
+ return TYPE_UNSIGNED (type) ?
+ vec_unpacku_hi_optab : vec_unpacks_hi_optab;
+@@ -6351,6 +6359,10 @@
+ init_optab (vec_widen_umult_lo_optab, UNKNOWN);
+ init_optab (vec_widen_smult_hi_optab, UNKNOWN);
+ init_optab (vec_widen_smult_lo_optab, UNKNOWN);
++ init_optab (vec_widen_ushiftl_hi_optab, UNKNOWN);
++ init_optab (vec_widen_ushiftl_lo_optab, UNKNOWN);
++ init_optab (vec_widen_sshiftl_hi_optab, UNKNOWN);
++ init_optab (vec_widen_sshiftl_lo_optab, UNKNOWN);
+ init_optab (vec_unpacks_hi_optab, UNKNOWN);
+ init_optab (vec_unpacks_lo_optab, UNKNOWN);
+ init_optab (vec_unpacku_hi_optab, UNKNOWN);
+
+=== modified file 'gcc/optabs.h'
+--- old/gcc/optabs.h 2011-07-27 14:12:45 +0000
++++ new/gcc/optabs.h 2011-10-23 13:33:07 +0000
+@@ -350,6 +350,12 @@
+ OTI_vec_widen_umult_lo,
+ OTI_vec_widen_smult_hi,
+ OTI_vec_widen_smult_lo,
++ /* Widening shift left.
++ The high/low part of the resulting vector is returned. */
++ OTI_vec_widen_ushiftl_hi,
++ OTI_vec_widen_ushiftl_lo,
++ OTI_vec_widen_sshiftl_hi,
++ OTI_vec_widen_sshiftl_lo,
+ /* Extract and widen the high/low part of a vector of signed or
+ floating point elements. */
+ OTI_vec_unpacks_hi,
+@@ -542,6 +548,10 @@
+ #define vec_widen_umult_lo_optab (&optab_table[OTI_vec_widen_umult_lo])
+ #define vec_widen_smult_hi_optab (&optab_table[OTI_vec_widen_smult_hi])
+ #define vec_widen_smult_lo_optab (&optab_table[OTI_vec_widen_smult_lo])
++#define vec_widen_ushiftl_hi_optab (&optab_table[OTI_vec_widen_ushiftl_hi])
++#define vec_widen_ushiftl_lo_optab (&optab_table[OTI_vec_widen_ushiftl_lo])
++#define vec_widen_sshiftl_hi_optab (&optab_table[OTI_vec_widen_sshiftl_hi])
++#define vec_widen_sshiftl_lo_optab (&optab_table[OTI_vec_widen_sshiftl_lo])
+ #define vec_unpacks_hi_optab (&optab_table[OTI_vec_unpacks_hi])
+ #define vec_unpacks_lo_optab (&optab_table[OTI_vec_unpacks_lo])
+ #define vec_unpacku_hi_optab (&optab_table[OTI_vec_unpacku_hi])
+
+=== added file 'gcc/testsuite/gcc.dg/vect/vect-widen-shift-s16.c'
+--- old/gcc/testsuite/gcc.dg/vect/vect-widen-shift-s16.c 1970-01-01 00:00:00 +0000
++++ new/gcc/testsuite/gcc.dg/vect/vect-widen-shift-s16.c 2011-10-23 13:33:07 +0000
+@@ -0,0 +1,107 @@
++/* { dg-require-effective-target vect_int } */
++/* { dg-require-effective-target vect_shift } */
++
++#include <stdarg.h>
++#include "tree-vect.h"
++
++#define N 64
++#define C 16
++
++__attribute__ ((noinline)) void
++foo (short *src, int *dst)
++{
++ int i;
++ short b, b0, b1, b2, b3, *s = src;
++ int *d = dst;
++
++ for (i = 0; i < N/4; i++)
++ {
++ b0 = *s++;
++ b1 = *s++;
++ b2 = *s++;
++ b3 = *s++;
++ *d = b0 << C;
++ d++;
++ *d = b1 << C;
++ d++;
++ *d = b2 << C;
++ d++;
++ *d = b3 << C;
++ d++;
++ }
++
++ s = src;
++ d = dst;
++ for (i = 0; i < N; i++)
++ {
++ b = *s++;
++ if (*d != b << C)
++ abort ();
++ d++;
++ }
++
++ s = src;
++ d = dst;
++ for (i = 0; i < N/4; i++)
++ {
++ b0 = *s++;
++ b1 = *s++;
++ b2 = *s++;
++ b3 = *s++;
++ *d = b0 << C;
++ d++;
++ *d = b1 << C;
++ d++;
++ *d = b2 << C;
++ d++;
++ *d = b3 << 6;
++ d++;
++ }
++
++ s = src;
++ d = dst;
++ for (i = 0; i < N/4; i++)
++ {
++ b = *s++;
++ if (*d != b << C)
++ abort ();
++ d++;
++ b = *s++;
++ if (*d != b << C)
++ abort ();
++ d++;
++ b = *s++;
++ if (*d != b << C)
++ abort ();
++ d++;
++ b = *s++;
++ if (*d != b << 6)
++ abort ();
++ d++;
++ }
++}
++
++int main (void)
++{
++ int i;
++ short in[N];
++ int out[N];
++
++ check_vect ();
++
++ for (i = 0; i < N; i++)
++ {
++ in[i] = i;
++ out[i] = 255;
++ __asm__ volatile ("");
++ }
++
++ foo (in, out);
++
++ return 0;
++}
++
++/* { dg-final { scan-tree-dump-times "vect_recog_widen_shift_pattern: detected" 8 "vect" { target vect_widen_shift } } } */
++/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */
++/* { dg-final { cleanup-tree-dump "vect" } } */
++
+
+=== added file 'gcc/testsuite/gcc.dg/vect/vect-widen-shift-s8.c'
+--- old/gcc/testsuite/gcc.dg/vect/vect-widen-shift-s8.c 1970-01-01 00:00:00 +0000
++++ new/gcc/testsuite/gcc.dg/vect/vect-widen-shift-s8.c 2011-10-23 13:33:07 +0000
+@@ -0,0 +1,58 @@
++/* { dg-require-effective-target vect_int } */
++/* { dg-require-effective-target vect_shift } */
++
++#include <stdarg.h>
++#include "tree-vect.h"
++
++#define N 64
++#define C 12
++
++__attribute__ ((noinline)) void
++foo (char *src, int *dst)
++{
++ int i;
++ char b, *s = src;
++ int *d = dst;
++
++ for (i = 0; i < N; i++)
++ {
++ b = *s++;
++ *d = b << C;
++ d++;
++ }
++
++ s = src;
++ d = dst;
++ for (i = 0; i < N; i++)
++ {
++ b = *s++;
++ if (*d != b << C)
++ abort ();
++ d++;
++ }
++}
++
++int main (void)
++{
++ int i;
++ char in[N];
++ int out[N];
++
++ check_vect ();
++
++ for (i = 0; i < N; i++)
++ {
++ in[i] = i;
++ out[i] = 255;
++ __asm__ volatile ("");
++ }
++
++ foo (in, out);
++
++ return 0;
++}
++
++/* { dg-final { scan-tree-dump-times "vect_recog_widen_shift_pattern: detected" 1 "vect" { target vect_widen_shift } } } */
++/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
++/* { dg-final { cleanup-tree-dump "vect" } } */
++
+
+=== added file 'gcc/testsuite/gcc.dg/vect/vect-widen-shift-u16.c'
+--- old/gcc/testsuite/gcc.dg/vect/vect-widen-shift-u16.c 1970-01-01 00:00:00 +0000
++++ new/gcc/testsuite/gcc.dg/vect/vect-widen-shift-u16.c 2011-10-23 13:33:07 +0000
+@@ -0,0 +1,58 @@
++/* { dg-require-effective-target vect_int } */
++/* { dg-require-effective-target vect_shift } */
++
++#include <stdarg.h>
++#include "tree-vect.h"
++
++#define N 64
++#define C 7
++
++__attribute__ ((noinline)) void
++foo (unsigned short *src, unsigned int *dst)
++{
++ int i;
++ unsigned short b, *s = src;
++ unsigned int *d = dst;
++
++ for (i = 0; i < N; i++)
++ {
++ b = *s++;
++ *d = b << C;
++ d++;
++ }
++
++ s = src;
++ d = dst;
++ for (i = 0; i < N; i++)
++ {
++ b = *s++;
++ if (*d != b << C)
++ abort ();
++ d++;
++ }
++}
++
++int main (void)
++{
++ int i;
++ unsigned short in[N];
++ unsigned int out[N];
++
++ check_vect ();
++
++ for (i = 0; i < N; i++)
++ {
++ in[i] = i;
++ out[i] = 255;
++ __asm__ volatile ("");
++ }
++
++ foo (in, out);
++
++ return 0;
++}
++
++/* { dg-final { scan-tree-dump-times "vect_recog_widen_shift_pattern: detected" 1 "vect" { target vect_widen_shift } } } */
++/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
++/* { dg-final { cleanup-tree-dump "vect" } } */
++
+
+=== added file 'gcc/testsuite/gcc.dg/vect/vect-widen-shift-u8.c'
+--- old/gcc/testsuite/gcc.dg/vect/vect-widen-shift-u8.c 1970-01-01 00:00:00 +0000
++++ new/gcc/testsuite/gcc.dg/vect/vect-widen-shift-u8.c 2011-10-23 13:33:07 +0000
+@@ -0,0 +1,65 @@
++/* { dg-require-effective-target vect_int } */
++/* { dg-require-effective-target vect_shift } */
++
++#include <stdarg.h>
++#include "tree-vect.h"
++
++#define N 64
++#define C1 10
++#define C2 5
++
++__attribute__ ((noinline)) void
++foo (unsigned char *src, unsigned int *dst1, unsigned int *dst2)
++{
++ int i;
++ unsigned char b, *s = src;
++ unsigned int *d1 = dst1, *d2 = dst2;
++
++ for (i = 0; i < N; i++)
++ {
++ b = *s++;
++ *d1 = b << C1;
++ d1++;
++ *d2 = b << C2;
++ d2++;
++ }
++
++ s = src;
++ d1 = dst1;
++ d2 = dst2;
++ for (i = 0; i < N; i++)
++ {
++ b = *s++;
++ if (*d1 != b << C1 || *d2 != b << C2)
++ abort ();
++ d1++;
++ d2++;
++ }
++}
++
++int main (void)
++{
++ int i;
++ unsigned char in[N];
++ unsigned int out1[N];
++ unsigned int out2[N];
++
++ check_vect ();
++
++ for (i = 0; i < N; i++)
++ {
++ in[i] = i;
++ out1[i] = 255;
++ out2[i] = 255;
++ __asm__ volatile ("");
++ }
++
++ foo (in, out1, out2);
++
++ return 0;
++}
++
++/* { dg-final { scan-tree-dump-times "vect_recog_widen_shift_pattern: detected" 1 "vect" { target vect_widen_shift } } } */
++/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
++/* { dg-final { cleanup-tree-dump "vect" } } */
++
+
+=== modified file 'gcc/testsuite/lib/target-supports.exp'
+--- old/gcc/testsuite/lib/target-supports.exp 2011-10-06 11:08:08 +0000
++++ new/gcc/testsuite/lib/target-supports.exp 2011-10-23 13:33:07 +0000
+@@ -2783,6 +2783,26 @@
+ }
+
+ # Return 1 if the target plus current options supports a vector
++# widening shift, 0 otherwise.
++#
++# This won't change for different subtargets so cache the result.
++
++proc check_effective_target_vect_widen_shift { } {
++ global et_vect_widen_shift_saved
++
++ if [info exists et_vect_widen_shift_saved] {
++ verbose "check_effective_target_vect_widen_shift: using cached result" 2
++ } else {
++ set et_vect_widen_shift_saved 0
++ if { ([istarget arm*-*-*] && [check_effective_target_arm_neon_ok]) } {
++ set et_vect_widen_shift_saved 1
++ }
++ }
++ verbose "check_effective_target_vect_widen_shift: returning $et_vect_widen_shift_saved" 2
++ return $et_vect_widen_shift_saved
++}
++
++# Return 1 if the target plus current options supports a vector
+ # dot-product of signed chars, 0 otherwise.
+ #
+ # This won't change for different subtargets so cache the result.
+
+=== modified file 'gcc/tree-cfg.c'
+--- old/gcc/tree-cfg.c 2011-07-15 13:44:50 +0000
++++ new/gcc/tree-cfg.c 2011-10-23 13:33:07 +0000
+@@ -3473,6 +3473,44 @@
+ return false;
+ }
+
++ case WIDEN_LSHIFT_EXPR:
++ {
++ if (!INTEGRAL_TYPE_P (lhs_type)
++ || !INTEGRAL_TYPE_P (rhs1_type)
++ || TREE_CODE (rhs2) != INTEGER_CST
++ || (2 * TYPE_PRECISION (rhs1_type) > TYPE_PRECISION (lhs_type)))
++ {
++ error ("type mismatch in widening vector shift expression");
++ debug_generic_expr (lhs_type);
++ debug_generic_expr (rhs1_type);
++ debug_generic_expr (rhs2_type);
++ return true;
++ }
++
++ return false;
++ }
++
++ case VEC_WIDEN_LSHIFT_HI_EXPR:
++ case VEC_WIDEN_LSHIFT_LO_EXPR:
++ {
++ if (TREE_CODE (rhs1_type) != VECTOR_TYPE
++ || TREE_CODE (lhs_type) != VECTOR_TYPE
++ || !INTEGRAL_TYPE_P (TREE_TYPE (rhs1_type))
++ || !INTEGRAL_TYPE_P (TREE_TYPE (lhs_type))
++ || TREE_CODE (rhs2) != INTEGER_CST
++ || (2 * TYPE_PRECISION (TREE_TYPE (rhs1_type))
++ > TYPE_PRECISION (TREE_TYPE (lhs_type))))
++ {
++ error ("type mismatch in widening vector shift expression");
++ debug_generic_expr (lhs_type);
++ debug_generic_expr (rhs1_type);
++ debug_generic_expr (rhs2_type);
++ return true;
++ }
++
++ return false;
++ }
++
+ case PLUS_EXPR:
+ case MINUS_EXPR:
+ {
+
+=== modified file 'gcc/tree-inline.c'
+--- old/gcc/tree-inline.c 2011-08-13 08:32:32 +0000
++++ new/gcc/tree-inline.c 2011-10-23 13:33:07 +0000
+@@ -3343,6 +3343,7 @@
+ case DOT_PROD_EXPR:
+ case WIDEN_MULT_PLUS_EXPR:
+ case WIDEN_MULT_MINUS_EXPR:
++ case WIDEN_LSHIFT_EXPR:
+
+ case VEC_WIDEN_MULT_HI_EXPR:
+ case VEC_WIDEN_MULT_LO_EXPR:
+@@ -3357,6 +3358,8 @@
+ case VEC_EXTRACT_ODD_EXPR:
+ case VEC_INTERLEAVE_HIGH_EXPR:
+ case VEC_INTERLEAVE_LOW_EXPR:
++ case VEC_WIDEN_LSHIFT_HI_EXPR:
++ case VEC_WIDEN_LSHIFT_LO_EXPR:
+
+ return 1;
+
+
+=== modified file 'gcc/tree-pretty-print.c'
+--- old/gcc/tree-pretty-print.c 2010-11-05 09:00:50 +0000
++++ new/gcc/tree-pretty-print.c 2011-10-23 13:33:07 +0000
+@@ -1539,6 +1539,7 @@
+ case RROTATE_EXPR:
+ case VEC_LSHIFT_EXPR:
+ case VEC_RSHIFT_EXPR:
++ case WIDEN_LSHIFT_EXPR:
+ case BIT_IOR_EXPR:
+ case BIT_XOR_EXPR:
+ case BIT_AND_EXPR:
+@@ -2209,6 +2210,22 @@
+ pp_string (buffer, " > ");
+ break;
+
++ case VEC_WIDEN_LSHIFT_HI_EXPR:
++ pp_string (buffer, " VEC_WIDEN_LSHIFT_HI_EXPR < ");
++ dump_generic_node (buffer, TREE_OPERAND (node, 0), spc, flags, false);
++ pp_string (buffer, ", ");
++ dump_generic_node (buffer, TREE_OPERAND (node, 1), spc, flags, false);
++ pp_string (buffer, " > ");
++ break;
++
++ case VEC_WIDEN_LSHIFT_LO_EXPR:
++ pp_string (buffer, " VEC_WIDEN_LSHIFT_HI_EXPR < ");
++ dump_generic_node (buffer, TREE_OPERAND (node, 0), spc, flags, false);
++ pp_string (buffer, ", ");
++ dump_generic_node (buffer, TREE_OPERAND (node, 1), spc, flags, false);
++ pp_string (buffer, " > ");
++ break;
++
+ case VEC_UNPACK_HI_EXPR:
+ pp_string (buffer, " VEC_UNPACK_HI_EXPR < ");
+ dump_generic_node (buffer, TREE_OPERAND (node, 0), spc, flags, false);
+@@ -2531,6 +2548,9 @@
+ case RSHIFT_EXPR:
+ case LROTATE_EXPR:
+ case RROTATE_EXPR:
++ case VEC_WIDEN_LSHIFT_HI_EXPR:
++ case VEC_WIDEN_LSHIFT_LO_EXPR:
++ case WIDEN_LSHIFT_EXPR:
+ return 11;
+
+ case WIDEN_SUM_EXPR:
+@@ -2706,6 +2726,9 @@
+ case VEC_RSHIFT_EXPR:
+ return "v>>";
+
++ case WIDEN_LSHIFT_EXPR:
++ return "w<<";
++
+ case POINTER_PLUS_EXPR:
+ return "+";
+
+
+=== modified file 'gcc/tree-vect-generic.c'
+--- old/gcc/tree-vect-generic.c 2011-02-08 14:16:50 +0000
++++ new/gcc/tree-vect-generic.c 2011-10-23 13:33:07 +0000
+@@ -552,7 +552,9 @@
+ || code == VEC_UNPACK_LO_EXPR
+ || code == VEC_PACK_TRUNC_EXPR
+ || code == VEC_PACK_SAT_EXPR
+- || code == VEC_PACK_FIX_TRUNC_EXPR)
++ || code == VEC_PACK_FIX_TRUNC_EXPR
++ || code == VEC_WIDEN_LSHIFT_HI_EXPR
++ || code == VEC_WIDEN_LSHIFT_LO_EXPR)
+ type = TREE_TYPE (rhs1);
+
+ /* Optabs will try converting a negation into a subtraction, so
+
+=== modified file 'gcc/tree-vect-patterns.c'
+--- old/gcc/tree-vect-patterns.c 2011-09-05 06:23:37 +0000
++++ new/gcc/tree-vect-patterns.c 2011-10-23 13:33:07 +0000
+@@ -48,12 +48,15 @@
+ static gimple vect_recog_pow_pattern (VEC (gimple, heap) **, tree *, tree *);
+ static gimple vect_recog_over_widening_pattern (VEC (gimple, heap) **, tree *,
+ tree *);
++static gimple vect_recog_widen_shift_pattern (VEC (gimple, heap) **,
++ tree *, tree *);
+ static vect_recog_func_ptr vect_vect_recog_func_ptrs[NUM_PATTERNS] = {
+ vect_recog_widen_mult_pattern,
+ vect_recog_widen_sum_pattern,
+ vect_recog_dot_prod_pattern,
+ vect_recog_pow_pattern,
+- vect_recog_over_widening_pattern};
++ vect_recog_over_widening_pattern,
++ vect_recog_widen_shift_pattern};
+
+
+ /* Function widened_name_p
+@@ -331,27 +334,38 @@
+ return pattern_stmt;
+ }
+
+-/* Handle two cases of multiplication by a constant. The first one is when
+- the constant, CONST_OPRND, fits the type (HALF_TYPE) of the second
+- operand (OPRND). In that case, we can peform widen-mult from HALF_TYPE to
+- TYPE.
++
++/* Handle widening operation by a constant. At the moment we support MULT_EXPR
++ and LSHIFT_EXPR.
++
++ For MULT_EXPR we check that CONST_OPRND fits HALF_TYPE, and for LSHIFT_EXPR
++ we check that CONST_OPRND is less than or equal to the size of HALF_TYPE.
+
+ Otherwise, if the type of the result (TYPE) is at least 4 times bigger than
+- HALF_TYPE, and CONST_OPRND fits an intermediate type (2 times smaller than
+- TYPE), we can perform widen-mult from the intermediate type to TYPE and
+- replace a_T = (TYPE) a_t; with a_it - (interm_type) a_t; */
++ HALF_TYPE, and there is an intermediate type (2 times smaller than TYPE)
++ that satisfies the above restrictions, we can perform a widening operation
++ from the intermediate type to TYPE and replace a_T = (TYPE) a_t;
++ with a_it = (interm_type) a_t; */
+
+ static bool
+-vect_handle_widen_mult_by_const (gimple stmt, tree const_oprnd, tree *oprnd,
+- VEC (gimple, heap) **stmts, tree type,
+- tree *half_type, gimple def_stmt)
++vect_handle_widen_op_by_const (gimple stmt, enum tree_code code,
++ tree const_oprnd, tree *oprnd,
++ VEC (gimple, heap) **stmts, tree type,
++ tree *half_type, gimple def_stmt)
+ {
+ tree new_type, new_oprnd, tmp;
+ gimple new_stmt;
+ loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (vinfo_for_stmt (stmt));
+ struct loop *loop = LOOP_VINFO_LOOP (loop_info);
+
+- if (int_fits_type_p (const_oprnd, *half_type))
++ if (code != MULT_EXPR && code != LSHIFT_EXPR)
++ return false;
++
++ if (((code == MULT_EXPR && int_fits_type_p (const_oprnd, *half_type))
++ || (code == LSHIFT_EXPR
++ && compare_tree_int (const_oprnd, TYPE_PRECISION (*half_type))
++ != 1))
++ && TYPE_PRECISION (type) == (TYPE_PRECISION (*half_type) * 2))
+ {
+ /* CONST_OPRND is a constant of HALF_TYPE. */
+ *oprnd = gimple_assign_rhs1 (def_stmt);
+@@ -364,14 +378,16 @@
+ || !vinfo_for_stmt (def_stmt))
+ return false;
+
+- /* TYPE is 4 times bigger than HALF_TYPE, try widen-mult for
++ /* TYPE is 4 times bigger than HALF_TYPE, try widening operation for
+ a type 2 times bigger than HALF_TYPE. */
+ new_type = build_nonstandard_integer_type (TYPE_PRECISION (type) / 2,
+ TYPE_UNSIGNED (type));
+- if (!int_fits_type_p (const_oprnd, new_type))
++ if ((code == MULT_EXPR && !int_fits_type_p (const_oprnd, new_type))
++ || (code == LSHIFT_EXPR
++ && compare_tree_int (const_oprnd, TYPE_PRECISION (new_type)) == 1))
+ return false;
+
+- /* Use NEW_TYPE for widen_mult. */
++ /* Use NEW_TYPE for widening operation. */
+ if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt)))
+ {
+ new_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
+@@ -381,6 +397,7 @@
+ || TREE_TYPE (gimple_assign_lhs (new_stmt)) != new_type)
+ return false;
+
++ VEC_safe_push (gimple, heap, *stmts, def_stmt);
+ *oprnd = gimple_assign_lhs (new_stmt);
+ }
+ else
+@@ -392,7 +409,6 @@
+ new_oprnd = make_ssa_name (tmp, NULL);
+ new_stmt = gimple_build_assign_with_ops (NOP_EXPR, new_oprnd, *oprnd,
+ NULL_TREE);
+- SSA_NAME_DEF_STMT (new_oprnd) = new_stmt;
+ STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt)) = new_stmt;
+ VEC_safe_push (gimple, heap, *stmts, def_stmt);
+ *oprnd = new_oprnd;
+@@ -402,7 +418,6 @@
+ return true;
+ }
+
+-
+ /* Function vect_recog_widen_mult_pattern
+
+ Try to find the following pattern:
+@@ -491,7 +506,7 @@
+ enum tree_code dummy_code;
+ int dummy_int;
+ VEC (tree, heap) *dummy_vec;
+- bool op0_ok, op1_ok;
++ bool op1_ok;
+
+ if (!is_gimple_assign (last_stmt))
+ return NULL;
+@@ -511,38 +526,23 @@
+ return NULL;
+
+ /* Check argument 0. */
+- op0_ok = widened_name_p (oprnd0, last_stmt, &half_type0, &def_stmt0, false);
++ if (!widened_name_p (oprnd0, last_stmt, &half_type0, &def_stmt0, false))
++ return NULL;
+ /* Check argument 1. */
+ op1_ok = widened_name_p (oprnd1, last_stmt, &half_type1, &def_stmt1, false);
+
+- /* In case of multiplication by a constant one of the operands may not match
+- the pattern, but not both. */
+- if (!op0_ok && !op1_ok)
+- return NULL;
+-
+- if (op0_ok && op1_ok)
++ if (op1_ok)
+ {
+ oprnd0 = gimple_assign_rhs1 (def_stmt0);
+ oprnd1 = gimple_assign_rhs1 (def_stmt1);
+ }
+- else if (!op0_ok)
+- {
+- if (TREE_CODE (oprnd0) == INTEGER_CST
+- && TREE_CODE (half_type1) == INTEGER_TYPE
+- && vect_handle_widen_mult_by_const (last_stmt, oprnd0, &oprnd1,
+- stmts, type,
+- &half_type1, def_stmt1))
+- half_type0 = half_type1;
+- else
+- return NULL;
+- }
+- else if (!op1_ok)
++ else
+ {
+ if (TREE_CODE (oprnd1) == INTEGER_CST
+ && TREE_CODE (half_type0) == INTEGER_TYPE
+- && vect_handle_widen_mult_by_const (last_stmt, oprnd1, &oprnd0,
+- stmts, type,
+- &half_type0, def_stmt0))
++ && vect_handle_widen_op_by_const (last_stmt, MULT_EXPR, oprnd1,
++ &oprnd0, stmts, type,
++ &half_type0, def_stmt0))
+ half_type1 = half_type0;
+ else
+ return NULL;
+@@ -998,6 +998,7 @@
+ || TREE_TYPE (gimple_assign_lhs (new_stmt)) != interm_type)
+ return false;
+
++ VEC_safe_push (gimple, heap, *stmts, def_stmt);
+ oprnd = gimple_assign_lhs (new_stmt);
+ }
+ else
+@@ -1128,7 +1129,7 @@
+ statetments, except for the case when the last statement in the
+ sequence doesn't have a corresponding pattern statement. In such
+ case we associate the last pattern statement with the last statement
+- in the sequence. Therefore, we only add an original statetement to
++ in the sequence. Therefore, we only add the original statement to
+ the list if we know that it is not the last. */
+ if (prev_stmt)
+ VEC_safe_push (gimple, heap, *stmts, prev_stmt);
+@@ -1215,6 +1216,231 @@
+ }
+
+
++/* Detect widening shift pattern:
++
++ type a_t;
++ TYPE a_T, res_T;
++
++ S1 a_t = ;
++ S2 a_T = (TYPE) a_t;
++ S3 res_T = a_T << CONST;
++
++ where type 'TYPE' is at least double the size of type 'type'.
++
++ Also detect unsigned cases:
++
++ unsigned type a_t;
++ unsigned TYPE u_res_T;
++ TYPE a_T, res_T;
++
++ S1 a_t = ;
++ S2 a_T = (TYPE) a_t;
++ S3 res_T = a_T << CONST;
++ S4 u_res_T = (unsigned TYPE) res_T;
++
++ And a case when 'TYPE' is 4 times bigger than 'type'. In that case we
++ create an additional pattern stmt for S2 to create a variable of an
++ intermediate type, and perform widen-shift on the intermediate type:
++
++ type a_t;
++ interm_type a_it;
++ TYPE a_T, res_T, res_T';
++
++ S1 a_t = ;
++ S2 a_T = (TYPE) a_t;
++ '--> a_it = (interm_type) a_t;
++ S3 res_T = a_T << CONST;
++ '--> res_T' = a_it <<* CONST;
++
++ Input/Output:
++
++ * STMTS: Contains a stmt from which the pattern search begins.
++ In case of unsigned widen-shift, the original stmt (S3) is replaced with S4
++ in STMTS. When an intermediate type is used and a pattern statement is
++ created for S2, we also put S2 here (before S3).
++
++ Output:
++
++ * TYPE_IN: The type of the input arguments to the pattern.
++
++ * TYPE_OUT: The type of the output of this pattern.
++
++ * Return value: A new stmt that will be used to replace the sequence of
++ stmts that constitute the pattern. In this case it will be:
++ WIDEN_LSHIFT_EXPR <a_t, CONST>. */
++
++static gimple
++vect_recog_widen_shift_pattern (VEC (gimple, heap) **stmts,
++ tree *type_in, tree *type_out)
++{
++ gimple last_stmt = VEC_pop (gimple, *stmts);
++ gimple def_stmt0;
++ tree oprnd0, oprnd1;
++ tree type, half_type0;
++ gimple pattern_stmt, orig_stmt = NULL;
++ tree vectype, vectype_out = NULL_TREE;
++ tree dummy;
++ tree var;
++ enum tree_code dummy_code;
++ int dummy_int;
++ VEC (tree, heap) * dummy_vec;
++ gimple use_stmt = NULL;
++ bool over_widen = false;
++
++ if (!is_gimple_assign (last_stmt) || !vinfo_for_stmt (last_stmt))
++ return NULL;
++
++ orig_stmt = last_stmt;
++ if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (last_stmt)))
++ {
++ /* This statement was also detected as over-widening operation (it can't
++ be any other pattern, because only over-widening detects shifts).
++ LAST_STMT is the final type demotion statement, but its related
++ statement is shift. We analyze the related statement to catch cases:
++
++ orig code:
++ type a_t;
++ itype res;
++ TYPE a_T, res_T;
++
++ S1 a_T = (TYPE) a_t;
++ S2 res_T = a_T << CONST;
++ S3 res = (itype)res_T;
++
++ (size of type * 2 <= size of itype
++ and size of itype * 2 <= size of TYPE)
++
++ code after over-widening pattern detection:
++
++ S1 a_T = (TYPE) a_t;
++ --> a_it = (itype) a_t;
++ S2 res_T = a_T << CONST;
++ S3 res = (itype)res_T; <--- LAST_STMT
++ --> res = a_it << CONST;
++
++ after widen_shift:
++
++ S1 a_T = (TYPE) a_t;
++ --> a_it = (itype) a_t; - redundant
++ S2 res_T = a_T << CONST;
++ S3 res = (itype)res_T;
++ --> res = a_t w<< CONST;
++
++ i.e., we replace the three statements with res = a_t w<< CONST. */
++ last_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (last_stmt));
++ over_widen = true;
++ }
++
++ if (gimple_assign_rhs_code (last_stmt) != LSHIFT_EXPR)
++ return NULL;
++
++ oprnd0 = gimple_assign_rhs1 (last_stmt);
++ oprnd1 = gimple_assign_rhs2 (last_stmt);
++ if (TREE_CODE (oprnd0) != SSA_NAME || TREE_CODE (oprnd1) != INTEGER_CST)
++ return NULL;
++
++ /* Check operand 0: it has to be defined by a type promotion. */
++ if (!widened_name_p (oprnd0, last_stmt, &half_type0, &def_stmt0, false))
++ return NULL;
++
++ /* Check operand 1: has to be positive. We check that it fits the type
++ in vect_handle_widen_op_by_const (). */
++ if (tree_int_cst_compare (oprnd1, size_zero_node) <= 0)
++ return NULL;
++
++ oprnd0 = gimple_assign_rhs1 (def_stmt0);
++ type = gimple_expr_type (last_stmt);
++
++ /* Check if this is a widening operation. */
++ if (!vect_handle_widen_op_by_const (last_stmt, LSHIFT_EXPR, oprnd1,
++ &oprnd0, stmts,
++ type, &half_type0, def_stmt0))
++ return NULL;
++
++ /* Handle unsigned case. Look for
++ S4 u_res_T = (unsigned TYPE) res_T;
++ Use unsigned TYPE as the type for WIDEN_LSHIFT_EXPR. */
++ if (TYPE_UNSIGNED (type) != TYPE_UNSIGNED (half_type0))
++ {
++ tree lhs = gimple_assign_lhs (last_stmt), use_lhs;
++ imm_use_iterator imm_iter;
++ use_operand_p use_p;
++ int nuses = 0;
++ tree use_type;
++
++ if (over_widen)
++ {
++ /* In case of over-widening pattern, S4 should be ORIG_STMT itself.
++ We check here that TYPE is the correct type for the operation,
++ i.e., it's the type of the original result. */
++ tree orig_type = gimple_expr_type (orig_stmt);
++ if ((TYPE_UNSIGNED (type) != TYPE_UNSIGNED (orig_type))
++ || (TYPE_PRECISION (type) != TYPE_PRECISION (orig_type)))
++ return NULL;
++ }
++ else
++ {
++ FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
++ {
++ if (is_gimple_debug (USE_STMT (use_p)))
++ continue;
++ use_stmt = USE_STMT (use_p);
++ nuses++;
++ }
++
++ if (nuses != 1 || !is_gimple_assign (use_stmt)
++ || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (use_stmt)))
++ return NULL;
++
++ use_lhs = gimple_assign_lhs (use_stmt);
++ use_type = TREE_TYPE (use_lhs);
++
++ if (!INTEGRAL_TYPE_P (use_type)
++ || (TYPE_UNSIGNED (type) == TYPE_UNSIGNED (use_type))
++ || (TYPE_PRECISION (type) != TYPE_PRECISION (use_type)))
++ return NULL;
++
++ type = use_type;
++ }
++ }
++
++ /* Pattern detected. */
++ if (vect_print_dump_info (REPORT_DETAILS))
++ fprintf (vect_dump, "vect_recog_widen_shift_pattern: detected: ");
++
++ /* Check target support. */
++ vectype = get_vectype_for_scalar_type (half_type0);
++ vectype_out = get_vectype_for_scalar_type (type);
++
++ if (!vectype
++ || !vectype_out
++ || !supportable_widening_operation (WIDEN_LSHIFT_EXPR, last_stmt,
++ vectype_out, vectype,
++ &dummy, &dummy, &dummy_code,
++ &dummy_code, &dummy_int,
++ &dummy_vec))
++ return NULL;
++
++ *type_in = vectype;
++ *type_out = vectype_out;
++
++ /* Pattern supported. Create a stmt to be used to replace the pattern. */
++ var = vect_recog_temp_ssa_var (type, NULL);
++ pattern_stmt =
++ gimple_build_assign_with_ops (WIDEN_LSHIFT_EXPR, var, oprnd0, oprnd1);
++
++ if (vect_print_dump_info (REPORT_DETAILS))
++ print_gimple_stmt (vect_dump, pattern_stmt, 0, TDF_SLIM);
++
++ if (use_stmt)
++ last_stmt = use_stmt;
++ else
++ last_stmt = orig_stmt;
++
++ VEC_safe_push (gimple, heap, *stmts, last_stmt);
++ return pattern_stmt;
++}
++
+ /* Mark statements that are involved in a pattern. */
+
+ static inline void
+@@ -1278,7 +1504,8 @@
+ static void
+ vect_pattern_recog_1 (
+ gimple (* vect_recog_func) (VEC (gimple, heap) **, tree *, tree *),
+- gimple_stmt_iterator si)
++ gimple_stmt_iterator si,
++ VEC (gimple, heap) **stmts_to_replace)
+ {
+ gimple stmt = gsi_stmt (si), pattern_stmt;
+ stmt_vec_info stmt_info;
+@@ -1288,14 +1515,14 @@
+ enum tree_code code;
+ int i;
+ gimple next;
+- VEC (gimple, heap) *stmts_to_replace = VEC_alloc (gimple, heap, 1);
+
+- VEC_quick_push (gimple, stmts_to_replace, stmt);
+- pattern_stmt = (* vect_recog_func) (&stmts_to_replace, &type_in, &type_out);
++ VEC_truncate (gimple, *stmts_to_replace, 0);
++ VEC_quick_push (gimple, *stmts_to_replace, stmt);
++ pattern_stmt = (* vect_recog_func) (stmts_to_replace, &type_in, &type_out);
+ if (!pattern_stmt)
+ return;
+
+- stmt = VEC_last (gimple, stmts_to_replace);
++ stmt = VEC_last (gimple, *stmts_to_replace);
+ stmt_info = vinfo_for_stmt (stmt);
+ loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+
+@@ -1303,8 +1530,6 @@
+ {
+ /* No need to check target support (already checked by the pattern
+ recognition function). */
+- if (type_out)
+- gcc_assert (VECTOR_MODE_P (TYPE_MODE (type_out)));
+ pattern_vectype = type_out ? type_out : type_in;
+ }
+ else
+@@ -1360,8 +1585,8 @@
+ /* It is possible that additional pattern stmts are created and inserted in
+ STMTS_TO_REPLACE. We create a stmt_info for each of them, and mark the
+ relevant statements. */
+- for (i = 0; VEC_iterate (gimple, stmts_to_replace, i, stmt)
+- && (unsigned) i < (VEC_length (gimple, stmts_to_replace) - 1);
++ for (i = 0; VEC_iterate (gimple, *stmts_to_replace, i, stmt)
++ && (unsigned) i < (VEC_length (gimple, *stmts_to_replace) - 1);
+ i++)
+ {
+ stmt_info = vinfo_for_stmt (stmt);
+@@ -1374,8 +1599,6 @@
+
+ vect_mark_pattern_stmts (stmt, pattern_stmt, NULL_TREE);
+ }
+-
+- VEC_free (gimple, heap, stmts_to_replace);
+ }
+
+
+@@ -1465,6 +1688,7 @@
+ gimple_stmt_iterator si;
+ unsigned int i, j;
+ gimple (* vect_recog_func_ptr) (VEC (gimple, heap) **, tree *, tree *);
++ VEC (gimple, heap) *stmts_to_replace = VEC_alloc (gimple, heap, 1);
+
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "=== vect_pattern_recog ===");
+@@ -1480,8 +1704,11 @@
+ for (j = 0; j < NUM_PATTERNS; j++)
+ {
+ vect_recog_func_ptr = vect_vect_recog_func_ptrs[j];
+- vect_pattern_recog_1 (vect_recog_func_ptr, si);
++ vect_pattern_recog_1 (vect_recog_func_ptr, si,
++ &stmts_to_replace);
+ }
+ }
+ }
++
++ VEC_free (gimple, heap, stmts_to_replace);
+ }
+
+=== modified file 'gcc/tree-vect-slp.c'
+--- old/gcc/tree-vect-slp.c 2011-10-23 11:29:25 +0000
++++ new/gcc/tree-vect-slp.c 2011-10-27 11:27:59 +0000
+@@ -480,6 +480,11 @@
+ }
+ }
+ }
++ else if (rhs_code == WIDEN_LSHIFT_EXPR)
++ {
++ need_same_oprnds = true;
++ first_op1 = gimple_assign_rhs2 (stmt);
++ }
+ }
+ else
+ {
+
+=== modified file 'gcc/tree-vect-stmts.c'
+--- old/gcc/tree-vect-stmts.c 2011-10-23 11:29:25 +0000
++++ new/gcc/tree-vect-stmts.c 2011-10-27 11:27:59 +0000
+@@ -3359,6 +3359,7 @@
+ VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
+ VEC (tree, heap) *vec_dsts = NULL, *interm_types = NULL, *tmp_vec_dsts = NULL;
+ bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
++ unsigned int k;
+
+ if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
+ return false;
+@@ -3375,7 +3376,8 @@
+
+ code = gimple_assign_rhs_code (stmt);
+ if (!CONVERT_EXPR_CODE_P (code)
+- && code != WIDEN_MULT_EXPR)
++ && code != WIDEN_MULT_EXPR
++ && code != WIDEN_LSHIFT_EXPR)
+ return false;
+
+ scalar_dest = gimple_assign_lhs (stmt);
+@@ -3403,7 +3405,7 @@
+ bool ok;
+
+ op1 = gimple_assign_rhs2 (stmt);
+- if (code == WIDEN_MULT_EXPR)
++ if (code == WIDEN_MULT_EXPR || code == WIDEN_LSHIFT_EXPR)
+ {
+ /* For WIDEN_MULT_EXPR, if OP0 is a constant, use the type of
+ OP1. */
+@@ -3480,7 +3482,7 @@
+ fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
+ ncopies);
+
+- if (code == WIDEN_MULT_EXPR)
++ if (code == WIDEN_MULT_EXPR || code == WIDEN_LSHIFT_EXPR)
+ {
+ if (CONSTANT_CLASS_P (op0))
+ op0 = fold_convert (TREE_TYPE (op1), op0);
+@@ -3521,6 +3523,8 @@
+ if (op_type == binary_op)
+ vec_oprnds1 = VEC_alloc (tree, heap, 1);
+ }
++ else if (code == WIDEN_LSHIFT_EXPR)
++ vec_oprnds1 = VEC_alloc (tree, heap, slp_node->vec_stmts_size);
+
+ /* In case the vectorization factor (VF) is bigger than the number
+ of elements that we can fit in a vectype (nunits), we have to generate
+@@ -3534,15 +3538,33 @@
+ if (j == 0)
+ {
+ if (slp_node)
+- vect_get_slp_defs (op0, op1, slp_node, &vec_oprnds0,
+- &vec_oprnds1, -1);
+- else
++ {
++ if (code == WIDEN_LSHIFT_EXPR)
++ {
++ vec_oprnd1 = op1;
++ /* Store vec_oprnd1 for every vector stmt to be created
++ for SLP_NODE. We check during the analysis that all
++ the shift arguments are the same. */
++ for (k = 0; k < slp_node->vec_stmts_size - 1; k++)
++ VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
++
++ vect_get_slp_defs (op0, NULL_TREE, slp_node, &vec_oprnds0, NULL,
++ -1);
++ }
++ else
++ vect_get_slp_defs (op0, op1, slp_node, &vec_oprnds0,
++ &vec_oprnds1, -1);
++ }
++ else
+ {
+ vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
+ VEC_quick_push (tree, vec_oprnds0, vec_oprnd0);
+ if (op_type == binary_op)
+ {
+- vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
++ if (code == WIDEN_LSHIFT_EXPR)
++ vec_oprnd1 = op1;
++ else
++ vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
+ VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
+ }
+ }
+@@ -3553,7 +3575,10 @@
+ VEC_replace (tree, vec_oprnds0, 0, vec_oprnd0);
+ if (op_type == binary_op)
+ {
+- vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
++ if (code == WIDEN_LSHIFT_EXPR)
++ vec_oprnd1 = op1;
++ else
++ vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
+ VEC_replace (tree, vec_oprnds1, 0, vec_oprnd1);
+ }
+ }
+@@ -5853,6 +5878,19 @@
+ }
+ break;
+
++ case WIDEN_LSHIFT_EXPR:
++ if (BYTES_BIG_ENDIAN)
++ {
++ c1 = VEC_WIDEN_LSHIFT_HI_EXPR;
++ c2 = VEC_WIDEN_LSHIFT_LO_EXPR;
++ }
++ else
++ {
++ c2 = VEC_WIDEN_LSHIFT_HI_EXPR;
++ c1 = VEC_WIDEN_LSHIFT_LO_EXPR;
++ }
++ break;
++
+ CASE_CONVERT:
+ if (BYTES_BIG_ENDIAN)
+ {
+
+=== modified file 'gcc/tree-vectorizer.h'
+--- old/gcc/tree-vectorizer.h 2011-09-05 06:23:37 +0000
++++ new/gcc/tree-vectorizer.h 2011-10-23 13:33:07 +0000
+@@ -896,7 +896,7 @@
+ Additional pattern recognition functions can (and will) be added
+ in the future. */
+ typedef gimple (* vect_recog_func_ptr) (VEC (gimple, heap) **, tree *, tree *);
+-#define NUM_PATTERNS 5
++#define NUM_PATTERNS 6
+ void vect_pattern_recog (loop_vec_info);
+
+ /* In tree-vectorizer.c. */
+
+=== modified file 'gcc/tree.def'
+--- old/gcc/tree.def 2011-01-21 14:14:12 +0000
++++ new/gcc/tree.def 2011-10-23 13:33:07 +0000
+@@ -1092,6 +1092,19 @@
+ is subtracted from t3. */
+ DEFTREECODE (WIDEN_MULT_MINUS_EXPR, "widen_mult_minus_expr", tcc_expression, 3)
+
++/* Widening shift left.
++ The first operand is of type t1.
++ The second operand is the number of bits to shift by; it need not be the
++ same type as the first operand and result.
++ Note that the result is undefined if the second operand is larger
++ than or equal to the first operand's type size.
++ The type of the entire expression is t2, such that t2 is at least twice
++ the size of t1.
++ WIDEN_LSHIFT_EXPR is equivalent to first widening (promoting)
++ the first argument from type t1 to type t2, and then shifting it
++ by the second argument. */
++DEFTREECODE (WIDEN_LSHIFT_EXPR, "widen_lshift_expr", tcc_binary, 2)
++
+ /* Fused multiply-add.
+ All operands and the result are of the same type. No intermediate
+ rounding is performed after multiplying operand one with operand two
+@@ -1147,6 +1160,16 @@
+ DEFTREECODE (VEC_INTERLEAVE_HIGH_EXPR, "vec_interleavehigh_expr", tcc_binary, 2)
+ DEFTREECODE (VEC_INTERLEAVE_LOW_EXPR, "vec_interleavelow_expr", tcc_binary, 2)
+
++/* Widening vector shift left in bits.
++ Operand 0 is a vector to be shifted with N elements of size S.
++ Operand 1 is an integer shift amount in bits.
++ The result of the operation is N elements of size 2*S.
++ VEC_WIDEN_LSHIFT_HI_EXPR computes the N/2 high results.
++ VEC_WIDEN_LSHIFT_LO_EXPR computes the N/2 low results.
++ */
++DEFTREECODE (VEC_WIDEN_LSHIFT_HI_EXPR, "widen_lshift_hi_expr", tcc_binary, 2)
++DEFTREECODE (VEC_WIDEN_LSHIFT_LO_EXPR, "widen_lshift_lo_expr", tcc_binary, 2)
++
+ /* PREDICT_EXPR. Specify hint for branch prediction. The
+ PREDICT_EXPR_PREDICTOR specify predictor and PREDICT_EXPR_OUTCOME the
+ outcome (0 for not taken and 1 for taken). Once the profile is guessed
+
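For reference, the loops targeted by the new widen-shift pattern have the shape sketched below. This is an illustrative example only, mirroring the testcases added above; the function and variable names are not part of the patch. A narrow operand is promoted and then shifted left by a constant, and vect_recog_widen_shift_pattern folds the promotion and the shift into a single WIDEN_LSHIFT_EXPR, which is expanded as VEC_WIDEN_LSHIFT_HI_EXPR/VEC_WIDEN_LSHIFT_LO_EXPR on targets providing a widening vector shift (e.g. ARM NEON).

/* Illustrative sketch only: the scalar shape that
   vect_recog_widen_shift_pattern recognizes.  The promotion in the
   cast and the shift by a constant are combined into one widening
   shift, out[i] = in[i] w<< 3.  */
void
widen_shift_sketch (unsigned short *in, unsigned int *out, int n)
{
  int i;
  for (i = 0; i < n; i++)
    out[i] = (unsigned int) in[i] << 3;
}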
--- /dev/null
+2011-11-04 Revital Eres <revital.eres@linaro.org>
+
+ Backport from mainline -r180673:
+
+ gcc/
+ * modulo-sched.c (generate_prolog_epilog): Mark prolog
+ and epilog as BB_DISABLE_SCHEDULE.
+ (mark_loop_unsched): New function.
+ (sms_schedule): Call it.
+
+=== modified file 'gcc/modulo-sched.c'
+--- old/gcc/modulo-sched.c 2011-10-10 14:35:32 +0000
++++ new/gcc/modulo-sched.c 2011-10-30 05:31:00 +0000
+@@ -1173,6 +1173,8 @@
+ /* Put the prolog on the entry edge. */
+ e = loop_preheader_edge (loop);
+ split_edge_and_insert (e, get_insns ());
++ if (!flag_resched_modulo_sched)
++ e->dest->flags |= BB_DISABLE_SCHEDULE;
+
+ end_sequence ();
+
+@@ -1186,9 +1188,24 @@
+ gcc_assert (single_exit (loop));
+ e = single_exit (loop);
+ split_edge_and_insert (e, get_insns ());
++ if (!flag_resched_modulo_sched)
++ e->dest->flags |= BB_DISABLE_SCHEDULE;
++
+ end_sequence ();
+ }
+
++/* Mark LOOP as software pipelined so the later
++ scheduling passes don't touch it. */
++static void
++mark_loop_unsched (struct loop *loop)
++{
++ unsigned i;
++ basic_block *bbs = get_loop_body (loop);
++
++ for (i = 0; i < loop->num_nodes; i++)
++ bbs[i]->flags |= BB_DISABLE_SCHEDULE;
++}
++
+ /* Return true if all the BBs of the loop are empty except the
+ loop header. */
+ static bool
+@@ -1714,9 +1731,10 @@
+ permute_partial_schedule (ps, g->closing_branch->first_note);
+
+ /* Mark this loop as software pipelined so the later
+- scheduling passes doesn't touch it. */
++ scheduling passes don't touch it. */
+ if (! flag_resched_modulo_sched)
+- g->bb->flags |= BB_DISABLE_SCHEDULE;
++ mark_loop_unsched (loop);
++
+ /* The life-info is not valid any more. */
+ df_set_bb_dirty (g->bb);
+
+
--- /dev/null
+2011-11-02 Andrew Stubbs <ams@codesourcery.com>
+
+ Backport from FSF mainline:
+
+ 2011-11-01 Andrew Stubbs <ams@codesourcery.com>
+
+ gcc/
+ * config/arm/bpabi.h (BE8_LINK_SPEC): Recognize generic-armv7 tuning.
+
+=== modified file 'gcc/config/arm/bpabi.h'
+--- old/gcc/config/arm/bpabi.h 2010-12-20 17:48:51 +0000
++++ new/gcc/config/arm/bpabi.h 2011-11-02 21:02:53 +0000
+@@ -56,7 +56,8 @@
+ "|march=armv4|mcpu=fa526|mcpu=fa626:--fix-v4bx}"
+
+ #define BE8_LINK_SPEC " %{mbig-endian:%{march=armv7-a|mcpu=cortex-a5"\
+- "|mcpu=cortex-a8|mcpu=cortex-a9|mcpu=cortex-a15:%{!r:--be8}}}"
++ "|mcpu=cortex-a8|mcpu=cortex-a9|mcpu=cortex-a15|mcpu=generic-armv7-a"\
++ ":%{!r:--be8}}}"
+
+ /* Tell the assembler to build BPABI binaries. */
+ #undef SUBTARGET_EXTRA_ASM_SPEC
+
--- /dev/null
+2011-11-17 Ira Rosen <ira.rosen@linaro.org>
+
+ Backport from mainline:
+
+ 2011-11-03 Ira Rosen <ira.rosen@linaro.org>
+
+ gcc/
+ * tree-vectorizer.h (slp_void_p): New.
+ (struct _slp_tree): Replace left and right with children. Update
+ documentation.
+ (struct _slp_oprnd_info): New.
+ (vect_get_vec_defs): Declare.
+ (vect_get_slp_defs): Update arguments.
+ * tree-vect-loop.c (vect_create_epilog_for_reduction): Call
+ vect_get_vec_defs instead of vect_get_slp_defs.
+ (vectorizable_reduction): Likewise.
+ * tree-vect-stmts.c (vect_get_vec_defs): Remove static, add argument.
+ Update call to vect_get_slp_defs.
+ (vectorizable_conversion): Update call to vect_get_vec_defs.
+ (vectorizable_assignment, vectorizable_shift,
+ vectorizable_operation): Likewise.
+ (vectorizable_type_demotion): Call vect_get_vec_defs instead of
+ vect_get_slp_defs.
+ (vectorizable_type_promotion, vectorizable_store): Likewise.
+ (vect_analyze_stmt): Fix typo.
+ * tree-vect-slp.c (vect_free_slp_tree): Update SLP tree traversal.
+ (vect_print_slp_tree, vect_mark_slp_stmts,
+ vect_mark_slp_stmts_relevant, vect_slp_rearrange_stmts,
+ vect_detect_hybrid_slp_stmts, vect_slp_analyze_node_operations,
+ vect_schedule_slp_instance): Likewise.
+ (vect_create_new_slp_node): New.
+ (vect_create_oprnd_info, vect_free_oprnd_info): Likewise.
+ (vect_get_and_check_slp_defs): Pass information about defs using
+ oprnds_info, allow any number of operands.
+ (vect_build_slp_tree): Likewise. Update calls to
+ vect_get_and_check_slp_defs. Fix comments.
+ (vect_analyze_slp_instance): Move node creation to
+ vect_create_new_slp_node.
+ (vect_get_slp_defs): Allow any number of operands.
+
+ 2011-11-11 Jakub Jelinek <jakub@redhat.com>
+
+ gcc/
+ * tree-vect-slp.c (vect_free_slp_tree): Also free SLP_TREE_CHILDREN
+ vector.
+ (vect_create_new_slp_node): Don't allocate node before checking stmt
+ type.
+ (vect_free_oprnd_info): Remove FREE_DEF_STMTS argument, always
+ free def_stmts vectors and additionally free oprnd_info.
+ (vect_build_slp_tree): Adjust callers. Call it even if
+ stop_recursion. If vect_create_new_slp_node or
+ vect_build_slp_tree fails, properly handle freeing memory.
+ If it succeeded, clear def_stmts in oprnd_info.
+
+=== modified file 'gcc/tree-vect-loop.c'
+--- old/gcc/tree-vect-loop.c 2011-09-05 06:23:37 +0000
++++ new/gcc/tree-vect-loop.c 2011-11-14 11:38:08 +0000
+@@ -3282,8 +3282,8 @@
+
+ /* Get the loop-entry arguments. */
+ if (slp_node)
+- vect_get_slp_defs (reduction_op, NULL_TREE, slp_node, &vec_initial_defs,
+- NULL, reduc_index);
++ vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
++ NULL, slp_node, reduc_index);
+ else
+ {
+ vec_initial_defs = VEC_alloc (tree, heap, 1);
+@@ -4451,8 +4451,8 @@
+ }
+
+ if (slp_node)
+- vect_get_slp_defs (op0, op1, slp_node, &vec_oprnds0, &vec_oprnds1,
+- -1);
++ vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
++ slp_node, -1);
+ else
+ {
+ loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
+
+=== modified file 'gcc/tree-vect-slp.c'
+--- old/gcc/tree-vect-slp.c 2011-10-27 11:27:59 +0000
++++ new/gcc/tree-vect-slp.c 2011-11-14 11:38:08 +0000
+@@ -67,15 +67,16 @@
+ static void
+ vect_free_slp_tree (slp_tree node)
+ {
++ int i;
++ slp_void_p child;
++
+ if (!node)
+ return;
+
+- if (SLP_TREE_LEFT (node))
+- vect_free_slp_tree (SLP_TREE_LEFT (node));
+-
+- if (SLP_TREE_RIGHT (node))
+- vect_free_slp_tree (SLP_TREE_RIGHT (node));
+-
++ FOR_EACH_VEC_ELT (slp_void_p, SLP_TREE_CHILDREN (node), i, child)
++ vect_free_slp_tree ((slp_tree) child);
++
++ VEC_free (slp_void_p, heap, SLP_TREE_CHILDREN (node));
+ VEC_free (gimple, heap, SLP_TREE_SCALAR_STMTS (node));
+
+ if (SLP_TREE_VEC_STMTS (node))
+@@ -96,48 +97,116 @@
+ }
+
+
+-/* Get the defs for the rhs of STMT (collect them in DEF_STMTS0/1), check that
+- they are of a legal type and that they match the defs of the first stmt of
+- the SLP group (stored in FIRST_STMT_...). */
++/* Create an SLP node for SCALAR_STMTS. */
++
++static slp_tree
++vect_create_new_slp_node (VEC (gimple, heap) *scalar_stmts)
++{
++ slp_tree node;
++ gimple stmt = VEC_index (gimple, scalar_stmts, 0);
++ unsigned int nops;
++
++ if (is_gimple_call (stmt))
++ nops = gimple_call_num_args (stmt);
++ else if (is_gimple_assign (stmt))
++ nops = gimple_num_ops (stmt) - 1;
++ else
++ return NULL;
++
++ node = XNEW (struct _slp_tree);
++ SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
++ SLP_TREE_VEC_STMTS (node) = NULL;
++ SLP_TREE_CHILDREN (node) = VEC_alloc (slp_void_p, heap, nops);
++ SLP_TREE_OUTSIDE_OF_LOOP_COST (node) = 0;
++ SLP_TREE_INSIDE_OF_LOOP_COST (node) = 0;
++
++ return node;
++}
++
++
++/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
++ operand. */
++static VEC (slp_oprnd_info, heap) *
++vect_create_oprnd_info (int nops, int group_size)
++{
++ int i;
++ slp_oprnd_info oprnd_info;
++ VEC (slp_oprnd_info, heap) *oprnds_info;
++
++ oprnds_info = VEC_alloc (slp_oprnd_info, heap, nops);
++ for (i = 0; i < nops; i++)
++ {
++ oprnd_info = XNEW (struct _slp_oprnd_info);
++ oprnd_info->def_stmts = VEC_alloc (gimple, heap, group_size);
++ oprnd_info->first_dt = vect_uninitialized_def;
++ oprnd_info->first_def_type = NULL_TREE;
++ oprnd_info->first_const_oprnd = NULL_TREE;
++ oprnd_info->first_pattern = false;
++ VEC_quick_push (slp_oprnd_info, oprnds_info, oprnd_info);
++ }
++
++ return oprnds_info;
++}
++
++
++/* Free operands info. */
++
++static void
++vect_free_oprnd_info (VEC (slp_oprnd_info, heap) **oprnds_info)
++{
++ int i;
++ slp_oprnd_info oprnd_info;
++
++ FOR_EACH_VEC_ELT (slp_oprnd_info, *oprnds_info, i, oprnd_info)
++ {
++ VEC_free (gimple, heap, oprnd_info->def_stmts);
++ XDELETE (oprnd_info);
++ }
++
++ VEC_free (slp_oprnd_info, heap, *oprnds_info);
++}
++
++
++/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
++ they are of a valid type and that they match the defs of the first stmt of
++ the SLP group (stored in OPRNDS_INFO). */
+
+ static bool
+ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
+ slp_tree slp_node, gimple stmt,
+- VEC (gimple, heap) **def_stmts0,
+- VEC (gimple, heap) **def_stmts1,
+- enum vect_def_type *first_stmt_dt0,
+- enum vect_def_type *first_stmt_dt1,
+- tree *first_stmt_def0_type,
+- tree *first_stmt_def1_type,
+- tree *first_stmt_const_oprnd,
+- int ncopies_for_cost,
+- bool *pattern0, bool *pattern1)
++ int ncopies_for_cost, bool first,
++ VEC (slp_oprnd_info, heap) **oprnds_info)
+ {
+ tree oprnd;
+ unsigned int i, number_of_oprnds;
+- tree def[2];
++ tree def, def_op0 = NULL_TREE;
+ gimple def_stmt;
+- enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
+- stmt_vec_info stmt_info =
+- vinfo_for_stmt (VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0));
+- enum gimple_rhs_class rhs_class;
++ enum vect_def_type dt = vect_uninitialized_def;
++ enum vect_def_type dt_op0 = vect_uninitialized_def;
++ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
++ tree lhs = gimple_get_lhs (stmt);
+ struct loop *loop = NULL;
+ enum tree_code rhs_code;
+ bool different_types = false;
++ bool pattern = false;
++ slp_oprnd_info oprnd_info, oprnd0_info, oprnd1_info;
+
+ if (loop_vinfo)
+ loop = LOOP_VINFO_LOOP (loop_vinfo);
+
+- rhs_class = get_gimple_rhs_class (gimple_assign_rhs_code (stmt));
+- number_of_oprnds = gimple_num_ops (stmt) - 1; /* RHS only */
++ if (is_gimple_call (stmt))
++ number_of_oprnds = gimple_call_num_args (stmt);
++ else
++ number_of_oprnds = gimple_num_ops (stmt) - 1;
+
+ for (i = 0; i < number_of_oprnds; i++)
+ {
+ oprnd = gimple_op (stmt, i + 1);
++ oprnd_info = VEC_index (slp_oprnd_info, *oprnds_info, i);
+
+- if (!vect_is_simple_use (oprnd, loop_vinfo, bb_vinfo, &def_stmt, &def[i],
+- &dt[i])
+- || (!def_stmt && dt[i] != vect_constant_def))
++ if (!vect_is_simple_use (oprnd, loop_vinfo, bb_vinfo, &def_stmt, &def,
++ &dt)
++ || (!def_stmt && dt != vect_constant_def))
+ {
+ if (vect_print_dump_info (REPORT_SLP))
+ {
+@@ -158,29 +227,24 @@
+ && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
+ && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
+ {
+- if (!*first_stmt_dt0)
+- *pattern0 = true;
+- else
+- {
+- if (i == 1 && !*first_stmt_dt1)
+- *pattern1 = true;
+- else if ((i == 0 && !*pattern0) || (i == 1 && !*pattern1))
+- {
+- if (vect_print_dump_info (REPORT_DETAILS))
+- {
+- fprintf (vect_dump, "Build SLP failed: some of the stmts"
+- " are in a pattern, and others are not ");
+- print_generic_expr (vect_dump, oprnd, TDF_SLIM);
+- }
++ pattern = true;
++ if (!first && !oprnd_info->first_pattern)
++ {
++ if (vect_print_dump_info (REPORT_DETAILS))
++ {
++ fprintf (vect_dump, "Build SLP failed: some of the stmts"
++ " are in a pattern, and others are not ");
++ print_generic_expr (vect_dump, oprnd, TDF_SLIM);
++ }
+
+- return false;
+- }
++ return false;
+ }
+
+ def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
+- dt[i] = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));
++ dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));
+
+- if (*dt == vect_unknown_def_type)
++ if (dt == vect_unknown_def_type
++ || STMT_VINFO_PATTERN_DEF_STMT (vinfo_for_stmt (def_stmt)))
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "Unsupported pattern.");
+@@ -190,11 +254,11 @@
+ switch (gimple_code (def_stmt))
+ {
+ case GIMPLE_PHI:
+- def[i] = gimple_phi_result (def_stmt);
++ def = gimple_phi_result (def_stmt);
+ break;
+
+ case GIMPLE_ASSIGN:
+- def[i] = gimple_assign_lhs (def_stmt);
++ def = gimple_assign_lhs (def_stmt);
+ break;
+
+ default:
+@@ -204,117 +268,125 @@
+ }
+ }
+
+- if (!*first_stmt_dt0)
++ if (first)
+ {
+- /* op0 of the first stmt of the group - store its info. */
+- *first_stmt_dt0 = dt[i];
+- if (def[i])
+- *first_stmt_def0_type = TREE_TYPE (def[i]);
+- else
+- *first_stmt_const_oprnd = oprnd;
++ oprnd_info->first_dt = dt;
++ oprnd_info->first_pattern = pattern;
++ if (def)
++ {
++ oprnd_info->first_def_type = TREE_TYPE (def);
++ oprnd_info->first_const_oprnd = NULL_TREE;
++ }
++ else
++ {
++ oprnd_info->first_def_type = NULL_TREE;
++ oprnd_info->first_const_oprnd = oprnd;
++ }
+
+- /* Analyze costs (for the first stmt of the group only). */
+- if (rhs_class != GIMPLE_SINGLE_RHS)
+- /* Not memory operation (we don't call this functions for loads). */
+- vect_model_simple_cost (stmt_info, ncopies_for_cost, dt, slp_node);
+- else
+- /* Store. */
+- vect_model_store_cost (stmt_info, ncopies_for_cost, false,
+- dt[0], slp_node);
++ if (i == 0)
++ {
++ def_op0 = def;
++ dt_op0 = dt;
++ /* Analyze costs (for the first stmt of the group only). */
++ if (REFERENCE_CLASS_P (lhs))
++ /* Store. */
++ vect_model_store_cost (stmt_info, ncopies_for_cost, false,
++ dt, slp_node);
++ else
++ /* Not memory operation (we don't call this function for
++ loads). */
++ vect_model_simple_cost (stmt_info, ncopies_for_cost, &dt,
++ slp_node);
++ }
+ }
+
+ else
+ {
+- if (!*first_stmt_dt1 && i == 1)
+- {
+- /* op1 of the first stmt of the group - store its info. */
+- *first_stmt_dt1 = dt[i];
+- if (def[i])
+- *first_stmt_def1_type = TREE_TYPE (def[i]);
+- else
+- {
+- /* We assume that the stmt contains only one constant
+- operand. We fail otherwise, to be on the safe side. */
+- if (*first_stmt_const_oprnd)
+- {
+- if (vect_print_dump_info (REPORT_SLP))
+- fprintf (vect_dump, "Build SLP failed: two constant "
+- "oprnds in stmt");
+- return false;
+- }
+- *first_stmt_const_oprnd = oprnd;
+- }
+- }
+- else
+- {
+- /* Not first stmt of the group, check that the def-stmt/s match
+- the def-stmt/s of the first stmt. */
+- if ((i == 0
+- && (*first_stmt_dt0 != dt[i]
+- || (*first_stmt_def0_type && def[0]
+- && !types_compatible_p (*first_stmt_def0_type,
+- TREE_TYPE (def[0])))))
+- || (i == 1
+- && (*first_stmt_dt1 != dt[i]
+- || (*first_stmt_def1_type && def[1]
+- && !types_compatible_p (*first_stmt_def1_type,
+- TREE_TYPE (def[1])))))
+- || (!def[i]
+- && !types_compatible_p (TREE_TYPE (*first_stmt_const_oprnd),
+- TREE_TYPE (oprnd)))
+- || different_types)
+- {
+- if (i != number_of_oprnds - 1)
+- different_types = true;
++ /* Not first stmt of the group, check that the def-stmt/s match
++ the def-stmt/s of the first stmt. Allow different definition
++ types for reduction chains: the first stmt must be a
++ vect_reduction_def (a phi node), and the rest
++ vect_internal_def. */
++ if (((oprnd_info->first_dt != dt
++ && !(oprnd_info->first_dt == vect_reduction_def
++ && dt == vect_internal_def))
++ || (oprnd_info->first_def_type != NULL_TREE
++ && def
++ && !types_compatible_p (oprnd_info->first_def_type,
++ TREE_TYPE (def))))
++ || (!def
++ && !types_compatible_p (TREE_TYPE (oprnd_info->first_const_oprnd),
++ TREE_TYPE (oprnd)))
++ || different_types)
++ {
++ if (number_of_oprnds != 2)
++ {
++ if (vect_print_dump_info (REPORT_SLP))
++ fprintf (vect_dump, "Build SLP failed: different types ");
++
++ return false;
++ }
++
++ /* Try to swap operands in case of binary operation. */
++ if (i == 0)
++ different_types = true;
++ else
++ {
++ oprnd0_info = VEC_index (slp_oprnd_info, *oprnds_info, 0);
++ if (is_gimple_assign (stmt)
++ && (rhs_code = gimple_assign_rhs_code (stmt))
++ && TREE_CODE_CLASS (rhs_code) == tcc_binary
++ && commutative_tree_code (rhs_code)
++ && oprnd0_info->first_dt == dt
++ && oprnd_info->first_dt == dt_op0
++ && def_op0 && def
++ && !(oprnd0_info->first_def_type
++ && !types_compatible_p (oprnd0_info->first_def_type,
++ TREE_TYPE (def)))
++ && !(oprnd_info->first_def_type
++ && !types_compatible_p (oprnd_info->first_def_type,
++ TREE_TYPE (def_op0))))
++ {
++ if (vect_print_dump_info (REPORT_SLP))
++ {
++ fprintf (vect_dump, "Swapping operands of ");
++ print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
++ }
++
++ swap_tree_operands (stmt, gimple_assign_rhs1_ptr (stmt),
++ gimple_assign_rhs2_ptr (stmt));
++ }
+ else
+- {
+- if (is_gimple_assign (stmt)
+- && (rhs_code = gimple_assign_rhs_code (stmt))
+- && TREE_CODE_CLASS (rhs_code) == tcc_binary
+- && commutative_tree_code (rhs_code)
+- && *first_stmt_dt0 == dt[1]
+- && *first_stmt_dt1 == dt[0]
+- && def[0] && def[1]
+- && !(*first_stmt_def0_type
+- && !types_compatible_p (*first_stmt_def0_type,
+- TREE_TYPE (def[1])))
+- && !(*first_stmt_def1_type
+- && !types_compatible_p (*first_stmt_def1_type,
+- TREE_TYPE (def[0]))))
+- {
+- if (vect_print_dump_info (REPORT_SLP))
+- {
+- fprintf (vect_dump, "Swapping operands of ");
+- print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+- }
+- swap_tree_operands (stmt, gimple_assign_rhs1_ptr (stmt),
+- gimple_assign_rhs2_ptr (stmt));
+- }
+- else
+- {
+- if (vect_print_dump_info (REPORT_SLP))
+- fprintf (vect_dump, "Build SLP failed: different types ");
+-
+- return false;
+- }
+- }
++ {
++ if (vect_print_dump_info (REPORT_SLP))
++ fprintf (vect_dump, "Build SLP failed: different types ");
++
++ return false;
++ }
+ }
+ }
+ }
+
+ /* Check the types of the definitions. */
+- switch (dt[i])
++ switch (dt)
+ {
+ case vect_constant_def:
+ case vect_external_def:
++ case vect_reduction_def:
+ break;
+
+ case vect_internal_def:
+- case vect_reduction_def:
+- if ((i == 0 && !different_types) || (i == 1 && different_types))
+- VEC_safe_push (gimple, heap, *def_stmts0, def_stmt);
++ if (different_types)
++ {
++ oprnd0_info = VEC_index (slp_oprnd_info, *oprnds_info, 0);
++ oprnd1_info = VEC_index (slp_oprnd_info, *oprnds_info, 1);
++ if (i == 0)
++ VEC_quick_push (gimple, oprnd1_info->def_stmts, def_stmt);
++ else
++ VEC_quick_push (gimple, oprnd0_info->def_stmts, def_stmt);
++ }
+ else
+- VEC_safe_push (gimple, heap, *def_stmts1, def_stmt);
++ VEC_quick_push (gimple, oprnd_info->def_stmts, def_stmt);
+ break;
+
+ default:
+@@ -322,7 +394,7 @@
+ if (vect_print_dump_info (REPORT_SLP))
+ {
+ fprintf (vect_dump, "Build SLP failed: illegal type of def ");
+- print_generic_expr (vect_dump, def[i], TDF_SLIM);
++ print_generic_expr (vect_dump, def, TDF_SLIM);
+ }
+
+ return false;
+@@ -347,15 +419,10 @@
+ VEC (slp_tree, heap) **loads,
+ unsigned int vectorization_factor, bool *loads_permuted)
+ {
+- VEC (gimple, heap) *def_stmts0 = VEC_alloc (gimple, heap, group_size);
+- VEC (gimple, heap) *def_stmts1 = VEC_alloc (gimple, heap, group_size);
+ unsigned int i;
+ VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (*node);
+ gimple stmt = VEC_index (gimple, stmts, 0);
+- enum vect_def_type first_stmt_dt0 = vect_uninitialized_def;
+- enum vect_def_type first_stmt_dt1 = vect_uninitialized_def;
+ enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
+- tree first_stmt_def1_type = NULL_TREE, first_stmt_def0_type = NULL_TREE;
+ tree lhs;
+ bool stop_recursion = false, need_same_oprnds = false;
+ tree vectype, scalar_type, first_op1 = NULL_TREE;
+@@ -364,13 +431,21 @@
+ int icode;
+ enum machine_mode optab_op2_mode;
+ enum machine_mode vec_mode;
+- tree first_stmt_const_oprnd = NULL_TREE;
+ struct data_reference *first_dr;
+- bool pattern0 = false, pattern1 = false;
+ HOST_WIDE_INT dummy;
+ bool permutation = false;
+ unsigned int load_place;
+ gimple first_load, prev_first_load = NULL;
++ VEC (slp_oprnd_info, heap) *oprnds_info;
++ unsigned int nops;
++ slp_oprnd_info oprnd_info;
++
++ if (is_gimple_call (stmt))
++ nops = gimple_call_num_args (stmt);
++ else
++ nops = gimple_num_ops (stmt) - 1;
++
++ oprnds_info = vect_create_oprnd_info (nops, group_size);
+
+ /* For every stmt in NODE find its def stmt/s. */
+ FOR_EACH_VEC_ELT (gimple, stmts, i, stmt)
+@@ -391,6 +466,7 @@
+ print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+ }
+
++ vect_free_oprnd_info (&oprnds_info);
+ return false;
+ }
+
+@@ -400,10 +476,11 @@
+ if (vect_print_dump_info (REPORT_SLP))
+ {
+ fprintf (vect_dump,
+- "Build SLP failed: not GIMPLE_ASSIGN nor GIMPLE_CALL");
++ "Build SLP failed: not GIMPLE_ASSIGN nor GIMPLE_CALL ");
+ print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+ }
+
++ vect_free_oprnd_info (&oprnds_info);
+ return false;
+ }
+
+@@ -416,6 +493,8 @@
+ fprintf (vect_dump, "Build SLP failed: unsupported data-type ");
+ print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
+ }
++
++ vect_free_oprnd_info (&oprnds_info);
+ return false;
+ }
+
+@@ -462,6 +541,7 @@
+ {
+ if (vect_print_dump_info (REPORT_SLP))
+ fprintf (vect_dump, "Build SLP failed: no optab.");
++ vect_free_oprnd_info (&oprnds_info);
+ return false;
+ }
+ icode = (int) optab_handler (optab, vec_mode);
+@@ -470,6 +550,7 @@
+ if (vect_print_dump_info (REPORT_SLP))
+ fprintf (vect_dump, "Build SLP failed: "
+ "op not supported by target.");
++ vect_free_oprnd_info (&oprnds_info);
+ return false;
+ }
+ optab_op2_mode = insn_data[icode].operand[2].mode;
+@@ -506,6 +587,7 @@
+ print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+ }
+
++ vect_free_oprnd_info (&oprnds_info);
+ return false;
+ }
+
+@@ -519,6 +601,7 @@
+ print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+ }
+
++ vect_free_oprnd_info (&oprnds_info);
+ return false;
+ }
+ }
+@@ -530,15 +613,12 @@
+ {
+ /* Store. */
+ if (!vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo, *node,
+- stmt, &def_stmts0, &def_stmts1,
+- &first_stmt_dt0,
+- &first_stmt_dt1,
+- &first_stmt_def0_type,
+- &first_stmt_def1_type,
+- &first_stmt_const_oprnd,
+- ncopies_for_cost,
+- &pattern0, &pattern1))
+- return false;
++ stmt, ncopies_for_cost,
++ (i == 0), &oprnds_info))
++ {
++ vect_free_oprnd_info (&oprnds_info);
++ return false;
++ }
+ }
+ else
+ {
+@@ -556,6 +636,7 @@
+ print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+ }
+
++ vect_free_oprnd_info (&oprnds_info);
+ return false;
+ }
+
+@@ -573,6 +654,7 @@
+ print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+ }
+
++ vect_free_oprnd_info (&oprnds_info);
+ return false;
+ }
+
+@@ -593,6 +675,7 @@
+ print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+ }
+
++ vect_free_oprnd_info (&oprnds_info);
+ return false;
+ }
+ }
+@@ -612,6 +695,7 @@
+ print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+ }
+
++ vect_free_oprnd_info (&oprnds_info);
+ return false;
+ }
+
+@@ -639,7 +723,7 @@
+ {
+ if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
+ {
+- /* Not strided load. */
++ /* Not strided load. */
+ if (vect_print_dump_info (REPORT_SLP))
+ {
+ fprintf (vect_dump, "Build SLP failed: not strided load ");
+@@ -647,6 +731,7 @@
+ }
+
+ /* FORNOW: Not strided loads are not supported. */
++ vect_free_oprnd_info (&oprnds_info);
+ return false;
+ }
+
+@@ -661,19 +746,18 @@
+ print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+ }
+
++ vect_free_oprnd_info (&oprnds_info);
+ return false;
+ }
+
+ /* Find the def-stmts. */
+ if (!vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo, *node, stmt,
+- &def_stmts0, &def_stmts1,
+- &first_stmt_dt0, &first_stmt_dt1,
+- &first_stmt_def0_type,
+- &first_stmt_def1_type,
+- &first_stmt_const_oprnd,
+- ncopies_for_cost,
+- &pattern0, &pattern1))
+- return false;
++ ncopies_for_cost, (i == 0),
++ &oprnds_info))
++ {
++ vect_free_oprnd_info (&oprnds_info);
++ return false;
++ }
+ }
+ }
+
+@@ -702,46 +786,37 @@
+ *loads_permuted = true;
+ }
+
++ vect_free_oprnd_info (&oprnds_info);
+ return true;
+ }
+
+ /* Create SLP_TREE nodes for the definition node/s. */
+- if (first_stmt_dt0 == vect_internal_def)
+- {
+- slp_tree left_node = XNEW (struct _slp_tree);
+- SLP_TREE_SCALAR_STMTS (left_node) = def_stmts0;
+- SLP_TREE_VEC_STMTS (left_node) = NULL;
+- SLP_TREE_LEFT (left_node) = NULL;
+- SLP_TREE_RIGHT (left_node) = NULL;
+- SLP_TREE_OUTSIDE_OF_LOOP_COST (left_node) = 0;
+- SLP_TREE_INSIDE_OF_LOOP_COST (left_node) = 0;
+- if (!vect_build_slp_tree (loop_vinfo, bb_vinfo, &left_node, group_size,
+- inside_cost, outside_cost, ncopies_for_cost,
+- max_nunits, load_permutation, loads,
+- vectorization_factor, loads_permuted))
+- return false;
+-
+- SLP_TREE_LEFT (*node) = left_node;
+- }
+-
+- if (first_stmt_dt1 == vect_internal_def)
+- {
+- slp_tree right_node = XNEW (struct _slp_tree);
+- SLP_TREE_SCALAR_STMTS (right_node) = def_stmts1;
+- SLP_TREE_VEC_STMTS (right_node) = NULL;
+- SLP_TREE_LEFT (right_node) = NULL;
+- SLP_TREE_RIGHT (right_node) = NULL;
+- SLP_TREE_OUTSIDE_OF_LOOP_COST (right_node) = 0;
+- SLP_TREE_INSIDE_OF_LOOP_COST (right_node) = 0;
+- if (!vect_build_slp_tree (loop_vinfo, bb_vinfo, &right_node, group_size,
+- inside_cost, outside_cost, ncopies_for_cost,
+- max_nunits, load_permutation, loads,
+- vectorization_factor, loads_permuted))
+- return false;
+-
+- SLP_TREE_RIGHT (*node) = right_node;
+- }
+-
++ FOR_EACH_VEC_ELT (slp_oprnd_info, oprnds_info, i, oprnd_info)
++ {
++ slp_tree child;
++
++ if (oprnd_info->first_dt != vect_internal_def)
++ continue;
++
++ child = vect_create_new_slp_node (oprnd_info->def_stmts);
++ if (!child
++ || !vect_build_slp_tree (loop_vinfo, bb_vinfo, &child, group_size,
++ inside_cost, outside_cost, ncopies_for_cost,
++ max_nunits, load_permutation, loads,
++ vectorization_factor, loads_permuted))
++ {
++ if (child)
++ oprnd_info->def_stmts = NULL;
++ vect_free_slp_tree (child);
++ vect_free_oprnd_info (&oprnds_info);
++ return false;
++ }
++
++ oprnd_info->def_stmts = NULL;
++ VEC_quick_push (slp_void_p, SLP_TREE_CHILDREN (*node), child);
++ }
++
++ vect_free_oprnd_info (&oprnds_info);
+ return true;
+ }
+
+@@ -751,6 +826,7 @@
+ {
+ int i;
+ gimple stmt;
++ slp_void_p child;
+
+ if (!node)
+ return;
+@@ -763,8 +839,8 @@
+ }
+ fprintf (vect_dump, "\n");
+
+- vect_print_slp_tree (SLP_TREE_LEFT (node));
+- vect_print_slp_tree (SLP_TREE_RIGHT (node));
++ FOR_EACH_VEC_ELT (slp_void_p, SLP_TREE_CHILDREN (node), i, child)
++ vect_print_slp_tree ((slp_tree) child);
+ }
+
+
+@@ -778,6 +854,7 @@
+ {
+ int i;
+ gimple stmt;
++ slp_void_p child;
+
+ if (!node)
+ return;
+@@ -786,8 +863,8 @@
+ if (j < 0 || i == j)
+ STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;
+
+- vect_mark_slp_stmts (SLP_TREE_LEFT (node), mark, j);
+- vect_mark_slp_stmts (SLP_TREE_RIGHT (node), mark, j);
++ FOR_EACH_VEC_ELT (slp_void_p, SLP_TREE_CHILDREN (node), i, child)
++ vect_mark_slp_stmts ((slp_tree) child, mark, j);
+ }
+
+
+@@ -799,6 +876,7 @@
+ int i;
+ gimple stmt;
+ stmt_vec_info stmt_info;
++ slp_void_p child;
+
+ if (!node)
+ return;
+@@ -811,8 +889,8 @@
+ STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
+ }
+
+- vect_mark_slp_stmts_relevant (SLP_TREE_LEFT (node));
+- vect_mark_slp_stmts_relevant (SLP_TREE_RIGHT (node));
++ FOR_EACH_VEC_ELT (slp_void_p, SLP_TREE_CHILDREN (node), i, child)
++ vect_mark_slp_stmts_relevant ((slp_tree) child);
+ }
+
+
+@@ -885,12 +963,13 @@
+ gimple stmt;
+ VEC (gimple, heap) *tmp_stmts;
+ unsigned int index, i;
++ slp_void_p child;
+
+ if (!node)
+ return;
+
+- vect_slp_rearrange_stmts (SLP_TREE_LEFT (node), group_size, permutation);
+- vect_slp_rearrange_stmts (SLP_TREE_RIGHT (node), group_size, permutation);
++ FOR_EACH_VEC_ELT (slp_void_p, SLP_TREE_CHILDREN (node), i, child)
++ vect_slp_rearrange_stmts ((slp_tree) child, group_size, permutation);
+
+ gcc_assert (group_size == VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node)));
+ tmp_stmts = VEC_alloc (gimple, heap, group_size);
+@@ -1253,7 +1332,7 @@
+ gimple stmt)
+ {
+ slp_instance new_instance;
+- slp_tree node = XNEW (struct _slp_tree);
++ slp_tree node;
+ unsigned int group_size = DR_GROUP_SIZE (vinfo_for_stmt (stmt));
+ unsigned int unrolling_factor = 1, nunits;
+ tree vectype, scalar_type = NULL_TREE;
+@@ -1265,6 +1344,7 @@
+ VEC (slp_tree, heap) *loads;
+ struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
+ bool loads_permuted = false;
++ VEC (gimple, heap) *scalar_stmts;
+
+ if (dr)
+ {
+@@ -1308,39 +1388,26 @@
+ }
+
+ /* Create a node (a root of the SLP tree) for the packed strided stores. */
+- SLP_TREE_SCALAR_STMTS (node) = VEC_alloc (gimple, heap, group_size);
++ scalar_stmts = VEC_alloc (gimple, heap, group_size);
+ next = stmt;
+ if (dr)
+ {
+ /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
+ while (next)
+ {
+- VEC_safe_push (gimple, heap, SLP_TREE_SCALAR_STMTS (node), next);
++ VEC_safe_push (gimple, heap, scalar_stmts, next);
+ next = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
+ }
+ }
+ else
+ {
+ /* Collect reduction statements. */
+- for (i = 0; VEC_iterate (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo), i,
+- next);
+- i++)
+- {
+- VEC_safe_push (gimple, heap, SLP_TREE_SCALAR_STMTS (node), next);
+- if (vect_print_dump_info (REPORT_DETAILS))
+- {
+- fprintf (vect_dump, "pushing reduction into node: ");
+- print_gimple_stmt (vect_dump, next, 0, TDF_SLIM);
+- }
+- }
++ VEC (gimple, heap) *reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
++ for (i = 0; VEC_iterate (gimple, reductions, i, next); i++)
++ VEC_safe_push (gimple, heap, scalar_stmts, next);
+ }
+
+- SLP_TREE_VEC_STMTS (node) = NULL;
+- SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
+- SLP_TREE_LEFT (node) = NULL;
+- SLP_TREE_RIGHT (node) = NULL;
+- SLP_TREE_OUTSIDE_OF_LOOP_COST (node) = 0;
+- SLP_TREE_INSIDE_OF_LOOP_COST (node) = 0;
++ node = vect_create_new_slp_node (scalar_stmts);
+
+ /* Calculate the number of vector stmts to create based on the unrolling
+ factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
+@@ -1517,6 +1584,7 @@
+ imm_use_iterator imm_iter;
+ gimple use_stmt;
+ stmt_vec_info stmt_vinfo;
++ slp_void_p child;
+
+ if (!node)
+ return;
+@@ -1534,8 +1602,8 @@
+ == vect_reduction_def))
+ vect_mark_slp_stmts (node, hybrid, i);
+
+- vect_detect_hybrid_slp_stmts (SLP_TREE_LEFT (node));
+- vect_detect_hybrid_slp_stmts (SLP_TREE_RIGHT (node));
++ FOR_EACH_VEC_ELT (slp_void_p, SLP_TREE_CHILDREN (node), i, child)
++ vect_detect_hybrid_slp_stmts ((slp_tree) child);
+ }
+
+
+@@ -1625,13 +1693,14 @@
+ bool dummy;
+ int i;
+ gimple stmt;
++ slp_void_p child;
+
+ if (!node)
+ return true;
+
+- if (!vect_slp_analyze_node_operations (bb_vinfo, SLP_TREE_LEFT (node))
+- || !vect_slp_analyze_node_operations (bb_vinfo, SLP_TREE_RIGHT (node)))
+- return false;
++ FOR_EACH_VEC_ELT (slp_void_p, SLP_TREE_CHILDREN (node), i, child)
++ if (!vect_slp_analyze_node_operations (bb_vinfo, (slp_tree) child))
++ return false;
+
+ FOR_EACH_VEC_ELT (gimple, SLP_TREE_SCALAR_STMTS (node), i, stmt)
+ {
+@@ -2207,88 +2276,102 @@
+ If the scalar definitions are loop invariants or constants, collect them and
+ call vect_get_constant_vectors() to create vector stmts.
+ Otherwise, the def-stmts must be already vectorized and the vectorized stmts
+- must be stored in the LEFT/RIGHT node of SLP_NODE, and we call
+- vect_get_slp_vect_defs() to retrieve them.
+- If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand (from
+- the right node. This is used when the second operand must remain scalar. */
++ must be stored in the corresponding child of SLP_NODE, and we call
++ vect_get_slp_vect_defs () to retrieve them. */
+
+ void
+-vect_get_slp_defs (tree op0, tree op1, slp_tree slp_node,
+- VEC (tree,heap) **vec_oprnds0,
+- VEC (tree,heap) **vec_oprnds1, int reduc_index)
++vect_get_slp_defs (VEC (tree, heap) *ops, slp_tree slp_node,
++ VEC (slp_void_p, heap) **vec_oprnds, int reduc_index)
+ {
+- gimple first_stmt;
+- enum tree_code code;
+- int number_of_vects;
++ gimple first_stmt, first_def;
++ int number_of_vects = 0, i;
++ unsigned int child_index = 0;
+ HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
++ slp_tree child = NULL;
++ VEC (tree, heap) *vec_defs;
++ tree oprnd, def_lhs;
++ bool vectorized_defs;
+
+ first_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0);
+- /* The number of vector defs is determined by the number of vector statements
+- in the node from which we get those statements. */
+- if (SLP_TREE_LEFT (slp_node))
+- number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_LEFT (slp_node));
+- else
+- {
+- number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
+- /* Number of vector stmts was calculated according to LHS in
+- vect_schedule_slp_instance(), fix it by replacing LHS with RHS, if
+- necessary. See vect_get_smallest_scalar_type () for details. */
+- vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
+- &rhs_size_unit);
+- if (rhs_size_unit != lhs_size_unit)
+- {
+- number_of_vects *= rhs_size_unit;
+- number_of_vects /= lhs_size_unit;
+- }
++ FOR_EACH_VEC_ELT (tree, ops, i, oprnd)
++ {
++ /* For each operand we check if it has vectorized definitions in a child
++ node or we need to create them (for invariants and constants). We
++ check if the LHS of the first stmt of the next child matches OPRND.
++ If it does, we found the correct child. Otherwise, we call
++ vect_get_constant_vectors (), and not advance CHILD_INDEX in order
++ to check this child node for the next operand. */
++ vectorized_defs = false;
++ if (VEC_length (slp_void_p, SLP_TREE_CHILDREN (slp_node)) > child_index)
++ {
++ child = (slp_tree) VEC_index (slp_void_p,
++ SLP_TREE_CHILDREN (slp_node),
++ child_index);
++ first_def = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (child), 0);
++
++ /* At the end of a pattern sequence we have a use of the original stmt,
++ so we need to compare OPRND with the original def. */
++ if (is_pattern_stmt_p (vinfo_for_stmt (first_def))
++ && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first_stmt))
++ && !is_pattern_stmt_p (vinfo_for_stmt (first_stmt)))
++ first_def = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
++
++ if (is_gimple_call (first_def))
++ def_lhs = gimple_call_lhs (first_def);
++ else
++ def_lhs = gimple_assign_lhs (first_def);
++
++ if (operand_equal_p (oprnd, def_lhs, 0))
++ {
++ /* The number of vector defs is determined by the number of
++ vector statements in the node from which we get those
++ statements. */
++ number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
++ vectorized_defs = true;
++ child_index++;
++ }
++ }
++
++ if (!vectorized_defs)
++ {
++ if (i == 0)
++ {
++ number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
++ /* Number of vector stmts was calculated according to LHS in
++ vect_schedule_slp_instance (), fix it by replacing LHS with
++ RHS, if necessary. See vect_get_smallest_scalar_type () for
++ details. */
++ vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
++ &rhs_size_unit);
++ if (rhs_size_unit != lhs_size_unit)
++ {
++ number_of_vects *= rhs_size_unit;
++ number_of_vects /= lhs_size_unit;
++ }
++ }
++ }
++
++ /* Allocate memory for vectorized defs. */
++ vec_defs = VEC_alloc (tree, heap, number_of_vects);
++
++ /* For reduction defs we call vect_get_constant_vectors (), since we are
++ looking for initial loop invariant values. */
++ if (vectorized_defs && reduc_index == -1)
++ /* The defs are already vectorized. */
++ vect_get_slp_vect_defs (child, &vec_defs);
++ else
++ /* Build vectors from scalar defs. */
++ vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
++ number_of_vects, reduc_index);
++
++ VEC_quick_push (slp_void_p, *vec_oprnds, (slp_void_p) vec_defs);
++
++ /* For reductions, we only need initial values. */
++ if (reduc_index != -1)
++ return;
+ }
+-
+- /* Allocate memory for vectorized defs. */
+- *vec_oprnds0 = VEC_alloc (tree, heap, number_of_vects);
+-
+- /* SLP_NODE corresponds either to a group of stores or to a group of
+- unary/binary operations. We don't call this function for loads.
+- For reduction defs we call vect_get_constant_vectors(), since we are
+- looking for initial loop invariant values. */
+- if (SLP_TREE_LEFT (slp_node) && reduc_index == -1)
+- /* The defs are already vectorized. */
+- vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
+- else
+- /* Build vectors from scalar defs. */
+- vect_get_constant_vectors (op0, slp_node, vec_oprnds0, 0, number_of_vects,
+- reduc_index);
+-
+- if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
+- /* Since we don't call this function with loads, this is a group of
+- stores. */
+- return;
+-
+- /* For reductions, we only need initial values. */
+- if (reduc_index != -1)
+- return;
+-
+- code = gimple_assign_rhs_code (first_stmt);
+- if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS || !vec_oprnds1)
+- return;
+-
+- /* The number of vector defs is determined by the number of vector statements
+- in the node from which we get those statements. */
+- if (SLP_TREE_RIGHT (slp_node))
+- number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_RIGHT (slp_node));
+- else
+- number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
+-
+- *vec_oprnds1 = VEC_alloc (tree, heap, number_of_vects);
+-
+- if (SLP_TREE_RIGHT (slp_node))
+- /* The defs are already vectorized. */
+- vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
+- else
+- /* Build vectors from scalar defs. */
+- vect_get_constant_vectors (op1, slp_node, vec_oprnds1, 1, number_of_vects,
+- -1);
+ }
+
+-
+ /* Create NCOPIES permutation statements using the mask MASK_BYTES (by
+ building a vector of type MASK_TYPE from it) and two input vectors placed in
+ DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
+@@ -2605,14 +2688,14 @@
+ tree vectype;
+ int i;
+ slp_tree loads_node;
++ slp_void_p child;
+
+ if (!node)
+ return false;
+
+- vect_schedule_slp_instance (SLP_TREE_LEFT (node), instance,
+- vectorization_factor);
+- vect_schedule_slp_instance (SLP_TREE_RIGHT (node), instance,
+- vectorization_factor);
++ FOR_EACH_VEC_ELT (slp_void_p, SLP_TREE_CHILDREN (node), i, child)
++ vect_schedule_slp_instance ((slp_tree) child, instance,
++ vectorization_factor);
+
+ stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0);
+ stmt_info = vinfo_for_stmt (stmt);
+
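The tree-vect-slp.c hunks above replace the fixed SLP_TREE_LEFT/SLP_TREE_RIGHT pair with a children vector, so every recursive walk becomes a loop over SLP_TREE_CHILDREN. The following minimal sketch of that idiom is illustrative only, not part of the patch and not compilable outside the GCC tree; the helper name walk_slp_tree is made up.

static void
walk_slp_tree (slp_tree node)
{
  int i;
  slp_void_p child;

  if (!node)
    return;

  /* Visit the scalar stmts of this node here, e.g. via
     SLP_TREE_SCALAR_STMTS (node).  */

  /* The children vector stores untyped slp_void_p elements, so each one
     is cast back to slp_tree before recursing.  */
  FOR_EACH_VEC_ELT (slp_void_p, SLP_TREE_CHILDREN (node), i, child)
    walk_slp_tree ((slp_tree) child);
}
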
+=== modified file 'gcc/tree-vect-stmts.c'
+--- old/gcc/tree-vect-stmts.c 2011-10-27 11:27:59 +0000
++++ new/gcc/tree-vect-stmts.c 2011-11-14 11:38:08 +0000
+@@ -1419,16 +1419,35 @@
+ }
+
+
+-/* Get vectorized definitions for OP0 and OP1, or SLP_NODE if it is not
+- NULL. */
++/* Get vectorized definitions for OP0 and OP1.
++ REDUC_INDEX is the index of reduction operand in case of reduction,
++ and -1 otherwise. */
+
+-static void
++void
+ vect_get_vec_defs (tree op0, tree op1, gimple stmt,
+- VEC(tree,heap) **vec_oprnds0, VEC(tree,heap) **vec_oprnds1,
+- slp_tree slp_node)
++ VEC (tree, heap) **vec_oprnds0,
++ VEC (tree, heap) **vec_oprnds1,
++ slp_tree slp_node, int reduc_index)
+ {
+ if (slp_node)
+- vect_get_slp_defs (op0, op1, slp_node, vec_oprnds0, vec_oprnds1, -1);
++ {
++ int nops = (op1 == NULL_TREE) ? 1 : 2;
++ VEC (tree, heap) *ops = VEC_alloc (tree, heap, nops);
++ VEC (slp_void_p, heap) *vec_defs = VEC_alloc (slp_void_p, heap, nops);
++
++ VEC_quick_push (tree, ops, op0);
++ if (op1)
++ VEC_quick_push (tree, ops, op1);
++
++ vect_get_slp_defs (ops, slp_node, &vec_defs, reduc_index);
++
++ *vec_oprnds0 = (VEC (tree, heap) *) VEC_index (slp_void_p, vec_defs, 0);
++ if (op1)
++ *vec_oprnds1 = (VEC (tree, heap) *) VEC_index (slp_void_p, vec_defs, 1);
++
++ VEC_free (tree, heap, ops);
++ VEC_free (slp_void_p, heap, vec_defs);
++ }
+ else
+ {
+ tree vec_oprnd;
+@@ -2016,7 +2035,8 @@
+ for (j = 0; j < ncopies; j++)
+ {
+ if (j == 0)
+- vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node);
++ vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node,
++ -1);
+ else
+ vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, NULL);
+
+@@ -2221,7 +2241,7 @@
+ {
+ /* Handle uses. */
+ if (j == 0)
+- vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node);
++ vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node, -1);
+ else
+ vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds, NULL);
+
+@@ -2576,10 +2596,10 @@
+ operand 1 should be of a vector type (the usual case). */
+ if (vec_oprnd1)
+ vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
+- slp_node);
++ slp_node, -1);
+ else
+ vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
+- slp_node);
++ slp_node, -1);
+ }
+ else
+ vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
+@@ -2887,10 +2907,10 @@
+ {
+ if (op_type == binary_op || op_type == ternary_op)
+ vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
+- slp_node);
++ slp_node, -1);
+ else
+ vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
+- slp_node);
++ slp_node, -1);
+ if (op_type == ternary_op)
+ {
+ vec_oprnds2 = VEC_alloc (tree, heap, 1);
+@@ -3202,7 +3222,8 @@
+ {
+ /* Handle uses. */
+ if (slp_node)
+- vect_get_slp_defs (op0, NULL_TREE, slp_node, &vec_oprnds0, NULL, -1);
++ vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
++ slp_node, -1);
+ else
+ {
+ VEC_free (tree, heap, vec_oprnds0);
+@@ -3548,12 +3569,12 @@
+ for (k = 0; k < slp_node->vec_stmts_size - 1; k++)
+ VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
+
+- vect_get_slp_defs (op0, NULL_TREE, slp_node, &vec_oprnds0, NULL,
+- -1);
++ vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
++ slp_node, -1);
+ }
+ else
+- vect_get_slp_defs (op0, op1, slp_node, &vec_oprnds0,
+- &vec_oprnds1, -1);
++ vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0,
++ &vec_oprnds1, slp_node, -1);
+ }
+ else
+ {
+@@ -3796,6 +3817,7 @@
+ vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
+ first_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0);
+ first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
++ op = gimple_assign_rhs1 (first_stmt);
+ }
+ else
+ /* VEC_NUM is the number of vect stmts to be created for this
+@@ -3878,8 +3900,8 @@
+ if (slp)
+ {
+ /* Get vectorized arguments for SLP_NODE. */
+- vect_get_slp_defs (NULL_TREE, NULL_TREE, slp_node, &vec_oprnds,
+- NULL, -1);
++ vect_get_vec_defs (op, NULL_TREE, stmt, &vec_oprnds,
++ NULL, slp_node, -1);
+
+ vec_oprnd = VEC_index (tree, vec_oprnds, 0);
+ }
+@@ -5040,7 +5062,7 @@
+ In basic blocks we only analyze statements that are a part of some SLP
+ instance, therefore, all the statements are relevant.
+
+- Pattern statement need to be analyzed instead of the original statement
++ Pattern statement needs to be analyzed instead of the original statement
+ if the original statement is not relevant. Otherwise, we analyze both
+ statements. */
+
+
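The tree-vect-stmts.c hunks above route every former vect_get_slp_defs caller through vect_get_vec_defs, which now also takes a reduction index. A hedged sketch of the resulting call site for a two-operand statement follows; it is an illustrative fragment only, where op0, op1, stmt and slp_node stand for whatever the surrounding vectorizable_* routine already has in scope.

  VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;

  /* slp_node may be NULL, in which case the non-SLP path is taken;
     -1 says the statement is not part of a reduction.  */
  vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
                     slp_node, -1);
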
+=== modified file 'gcc/tree-vectorizer.h'
+--- old/gcc/tree-vectorizer.h 2011-10-23 13:33:07 +0000
++++ new/gcc/tree-vectorizer.h 2011-11-14 11:38:08 +0000
+@@ -73,15 +73,15 @@
+ /************************************************************************
+ SLP
+ ************************************************************************/
++typedef void *slp_void_p;
++DEF_VEC_P (slp_void_p);
++DEF_VEC_ALLOC_P (slp_void_p, heap);
+
+-/* A computation tree of an SLP instance. Each node corresponds to a group of
++/* A computation tree of an SLP instance. Each node corresponds to a group of
+ stmts to be packed in a SIMD stmt. */
+ typedef struct _slp_tree {
+- /* Only binary and unary operations are supported. LEFT child corresponds to
+- the first operand and RIGHT child to the second if the operation is
+- binary. */
+- struct _slp_tree *left;
+- struct _slp_tree *right;
++ /* Nodes that contain the def-stmts of this node's statements' operands. */
++ VEC (slp_void_p, heap) *children;
+ /* A group of scalar stmts to be vectorized together. */
+ VEC (gimple, heap) *stmts;
+ /* Vectorized stmt/s. */
+@@ -146,14 +146,32 @@
+ #define SLP_INSTANCE_LOADS(S) (S)->loads
+ #define SLP_INSTANCE_FIRST_LOAD_STMT(S) (S)->first_load
+
+-#define SLP_TREE_LEFT(S) (S)->left
+-#define SLP_TREE_RIGHT(S) (S)->right
++#define SLP_TREE_CHILDREN(S) (S)->children
+ #define SLP_TREE_SCALAR_STMTS(S) (S)->stmts
+ #define SLP_TREE_VEC_STMTS(S) (S)->vec_stmts
+ #define SLP_TREE_NUMBER_OF_VEC_STMTS(S) (S)->vec_stmts_size
+ #define SLP_TREE_OUTSIDE_OF_LOOP_COST(S) (S)->cost.outside_of_loop
+ #define SLP_TREE_INSIDE_OF_LOOP_COST(S) (S)->cost.inside_of_loop
+
++/* This structure is used in creation of an SLP tree. Each instance
++ corresponds to the same operand in a group of scalar stmts in an SLP
++ node. */
++typedef struct _slp_oprnd_info
++{
++ /* Def-stmts for the operands. */
++ VEC (gimple, heap) *def_stmts;
++ /* Information about the first statement, its vector def-type, type, the
++ operand itself in case it's constant, and an indication if it's a pattern
++ stmt. */
++ enum vect_def_type first_dt;
++ tree first_def_type;
++ tree first_const_oprnd;
++ bool first_pattern;
++} *slp_oprnd_info;
++
++DEF_VEC_P(slp_oprnd_info);
++DEF_VEC_ALLOC_P(slp_oprnd_info, heap);
++
+
+ typedef struct _vect_peel_info
+ {
+@@ -819,6 +837,8 @@
+ unsigned int *, unsigned int *);
+ extern void vect_get_store_cost (struct data_reference *, int, unsigned int *);
+ extern bool vect_supportable_shift (enum tree_code, tree);
++extern void vect_get_vec_defs (tree, tree, gimple, VEC (tree, heap) **,
++ VEC (tree, heap) **, slp_tree, int);
+
+ /* In tree-vect-data-refs.c. */
+ extern bool vect_can_force_dr_alignment_p (const_tree, unsigned int);
+@@ -885,8 +905,9 @@
+ extern bool vect_analyze_slp (loop_vec_info, bb_vec_info);
+ extern void vect_make_slp_decision (loop_vec_info);
+ extern void vect_detect_hybrid_slp (loop_vec_info);
+-extern void vect_get_slp_defs (tree, tree, slp_tree, VEC (tree,heap) **,
+- VEC (tree,heap) **, int);
++extern void vect_get_slp_defs (VEC (tree, heap) *, slp_tree,
++ VEC (slp_void_p, heap) **, int);
++
+ extern LOC find_bb_location (basic_block);
+ extern bb_vec_info vect_slp_analyze_bb (basic_block);
+ extern void vect_slp_transform_bb (basic_block);
+
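The new _slp_oprnd_info record above collects, per operand position, the def-stmts gathered while an SLP node is being built. A hedged sketch of creating one such record follows; the allocation and initialization shown are assumptions for illustration, not code taken from this patch.

  slp_oprnd_info oprnd_info = XNEW (struct _slp_oprnd_info);

  /* One def-stmt slot per scalar stmt in the group (assumed here).  */
  oprnd_info->def_stmts = VEC_alloc (gimple, heap, group_size);
  oprnd_info->first_dt = vect_uninitialized_def;
  oprnd_info->first_def_type = NULL_TREE;
  oprnd_info->first_const_oprnd = NULL_TREE;
  oprnd_info->first_pattern = false;
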
--- /dev/null
+2011-11-21 Michael Hope <michael.hope@linaro.org>
+
+ Backport from mainline r180131:
+
+ 2011-10-18 Julian Brown <julian@codesourcery.com>
+
+ gcc/
+ * config/arm/arm.c (arm_block_move_unaligned_straight)
+ (arm_adjust_block_mem, arm_block_move_unaligned_loop)
+ (arm_movmemqi_unaligned): New.
+ (arm_gen_movmemqi): Support unaligned block copies.
+
+ gcc/testsuite/
+ * lib/target-supports.exp (check_effective_target_arm_unaligned): New.
+ * gcc.target/arm/unaligned-memcpy-1.c: New.
+ * gcc.target/arm/unaligned-memcpy-2.c: New.
+ * gcc.target/arm/unaligned-memcpy-3.c: New.
+ * gcc.target/arm/unaligned-memcpy-4.c: New.
+
+ 2011-09-15 James Greenhalgh <james.greenhalgh@arm.com>
+
+ gcc/
+ * config/arm/arm.h (TARGET_CPU_CPP_BUILTINS): New builtin macro.
+
+=== modified file 'gcc/config/arm/arm.c'
+--- old/gcc/config/arm/arm.c 2011-10-26 11:38:30 +0000
++++ new/gcc/config/arm/arm.c 2011-11-21 01:45:54 +0000
+@@ -10803,6 +10803,335 @@
+ return true;
+ }
+
++/* Copy a block of memory using plain ldr/str/ldrh/strh instructions, to permit
++ unaligned copies on processors which support unaligned semantics for those
++ instructions. INTERLEAVE_FACTOR can be used to attempt to hide load latency
++ (using more registers) by doing e.g. load/load/store/store for a factor of 2.
++ An interleave factor of 1 (the minimum) will perform no interleaving.
++ Load/store multiple are used for aligned addresses where possible. */
++
++static void
++arm_block_move_unaligned_straight (rtx dstbase, rtx srcbase,
++ HOST_WIDE_INT length,
++ unsigned int interleave_factor)
++{
++ rtx *regs = XALLOCAVEC (rtx, interleave_factor);
++ int *regnos = XALLOCAVEC (int, interleave_factor);
++ HOST_WIDE_INT block_size_bytes = interleave_factor * UNITS_PER_WORD;
++ HOST_WIDE_INT i, j;
++ HOST_WIDE_INT remaining = length, words;
++ rtx halfword_tmp = NULL, byte_tmp = NULL;
++ rtx dst, src;
++ bool src_aligned = MEM_ALIGN (srcbase) >= BITS_PER_WORD;
++ bool dst_aligned = MEM_ALIGN (dstbase) >= BITS_PER_WORD;
++ HOST_WIDE_INT srcoffset, dstoffset;
++ HOST_WIDE_INT src_autoinc, dst_autoinc;
++ rtx mem, addr;
++
++ gcc_assert (1 <= interleave_factor && interleave_factor <= 4);
++
++ /* Use hard registers if we have aligned source or destination so we can use
++ load/store multiple with contiguous registers. */
++ if (dst_aligned || src_aligned)
++ for (i = 0; i < interleave_factor; i++)
++ regs[i] = gen_rtx_REG (SImode, i);
++ else
++ for (i = 0; i < interleave_factor; i++)
++ regs[i] = gen_reg_rtx (SImode);
++
++ dst = copy_addr_to_reg (XEXP (dstbase, 0));
++ src = copy_addr_to_reg (XEXP (srcbase, 0));
++
++ srcoffset = dstoffset = 0;
++
++ /* Calls to arm_gen_load_multiple and arm_gen_store_multiple update SRC/DST.
++ For copying the last bytes we want to subtract this offset again. */
++ src_autoinc = dst_autoinc = 0;
++
++ for (i = 0; i < interleave_factor; i++)
++ regnos[i] = i;
++
++ /* Copy BLOCK_SIZE_BYTES chunks. */
++
++ for (i = 0; i + block_size_bytes <= length; i += block_size_bytes)
++ {
++ /* Load words. */
++ if (src_aligned && interleave_factor > 1)
++ {
++ emit_insn (arm_gen_load_multiple (regnos, interleave_factor, src,
++ TRUE, srcbase, &srcoffset));
++ src_autoinc += UNITS_PER_WORD * interleave_factor;
++ }
++ else
++ {
++ for (j = 0; j < interleave_factor; j++)
++ {
++ addr = plus_constant (src, srcoffset + j * UNITS_PER_WORD
++ - src_autoinc);
++ mem = adjust_automodify_address (srcbase, SImode, addr,
++ srcoffset + j * UNITS_PER_WORD);
++ emit_insn (gen_unaligned_loadsi (regs[j], mem));
++ }
++ srcoffset += block_size_bytes;
++ }
++
++ /* Store words. */
++ if (dst_aligned && interleave_factor > 1)
++ {
++ emit_insn (arm_gen_store_multiple (regnos, interleave_factor, dst,
++ TRUE, dstbase, &dstoffset));
++ dst_autoinc += UNITS_PER_WORD * interleave_factor;
++ }
++ else
++ {
++ for (j = 0; j < interleave_factor; j++)
++ {
++ addr = plus_constant (dst, dstoffset + j * UNITS_PER_WORD
++ - dst_autoinc);
++ mem = adjust_automodify_address (dstbase, SImode, addr,
++ dstoffset + j * UNITS_PER_WORD);
++ emit_insn (gen_unaligned_storesi (mem, regs[j]));
++ }
++ dstoffset += block_size_bytes;
++ }
++
++ remaining -= block_size_bytes;
++ }
++
++ /* Copy any whole words left (note these aren't interleaved with any
++ subsequent halfword/byte load/stores in the interests of simplicity). */
++
++ words = remaining / UNITS_PER_WORD;
++
++ gcc_assert (words < interleave_factor);
++
++ if (src_aligned && words > 1)
++ {
++ emit_insn (arm_gen_load_multiple (regnos, words, src, TRUE, srcbase,
++ &srcoffset));
++ src_autoinc += UNITS_PER_WORD * words;
++ }
++ else
++ {
++ for (j = 0; j < words; j++)
++ {
++ addr = plus_constant (src,
++ srcoffset + j * UNITS_PER_WORD - src_autoinc);
++ mem = adjust_automodify_address (srcbase, SImode, addr,
++ srcoffset + j * UNITS_PER_WORD);
++ emit_insn (gen_unaligned_loadsi (regs[j], mem));
++ }
++ srcoffset += words * UNITS_PER_WORD;
++ }
++
++ if (dst_aligned && words > 1)
++ {
++ emit_insn (arm_gen_store_multiple (regnos, words, dst, TRUE, dstbase,
++ &dstoffset));
++ dst_autoinc += words * UNITS_PER_WORD;
++ }
++ else
++ {
++ for (j = 0; j < words; j++)
++ {
++ addr = plus_constant (dst,
++ dstoffset + j * UNITS_PER_WORD - dst_autoinc);
++ mem = adjust_automodify_address (dstbase, SImode, addr,
++ dstoffset + j * UNITS_PER_WORD);
++ emit_insn (gen_unaligned_storesi (mem, regs[j]));
++ }
++ dstoffset += words * UNITS_PER_WORD;
++ }
++
++ remaining -= words * UNITS_PER_WORD;
++
++ gcc_assert (remaining < 4);
++
++ /* Copy a halfword if necessary. */
++
++ if (remaining >= 2)
++ {
++ halfword_tmp = gen_reg_rtx (SImode);
++
++ addr = plus_constant (src, srcoffset - src_autoinc);
++ mem = adjust_automodify_address (srcbase, HImode, addr, srcoffset);
++ emit_insn (gen_unaligned_loadhiu (halfword_tmp, mem));
++
++ /* Either write out immediately, or delay until we've loaded the last
++ byte, depending on interleave factor. */
++ if (interleave_factor == 1)
++ {
++ addr = plus_constant (dst, dstoffset - dst_autoinc);
++ mem = adjust_automodify_address (dstbase, HImode, addr, dstoffset);
++ emit_insn (gen_unaligned_storehi (mem,
++ gen_lowpart (HImode, halfword_tmp)));
++ halfword_tmp = NULL;
++ dstoffset += 2;
++ }
++
++ remaining -= 2;
++ srcoffset += 2;
++ }
++
++ gcc_assert (remaining < 2);
++
++ /* Copy last byte. */
++
++ if ((remaining & 1) != 0)
++ {
++ byte_tmp = gen_reg_rtx (SImode);
++
++ addr = plus_constant (src, srcoffset - src_autoinc);
++ mem = adjust_automodify_address (srcbase, QImode, addr, srcoffset);
++ emit_move_insn (gen_lowpart (QImode, byte_tmp), mem);
++
++ if (interleave_factor == 1)
++ {
++ addr = plus_constant (dst, dstoffset - dst_autoinc);
++ mem = adjust_automodify_address (dstbase, QImode, addr, dstoffset);
++ emit_move_insn (mem, gen_lowpart (QImode, byte_tmp));
++ byte_tmp = NULL;
++ dstoffset++;
++ }
++
++ remaining--;
++ srcoffset++;
++ }
++
++ /* Store last halfword if we haven't done so already. */
++
++ if (halfword_tmp)
++ {
++ addr = plus_constant (dst, dstoffset - dst_autoinc);
++ mem = adjust_automodify_address (dstbase, HImode, addr, dstoffset);
++ emit_insn (gen_unaligned_storehi (mem,
++ gen_lowpart (HImode, halfword_tmp)));
++ dstoffset += 2;
++ }
++
++ /* Likewise for last byte. */
++
++ if (byte_tmp)
++ {
++ addr = plus_constant (dst, dstoffset - dst_autoinc);
++ mem = adjust_automodify_address (dstbase, QImode, addr, dstoffset);
++ emit_move_insn (mem, gen_lowpart (QImode, byte_tmp));
++ dstoffset++;
++ }
++
++ gcc_assert (remaining == 0 && srcoffset == dstoffset);
++}
++
++/* From mips_adjust_block_mem:
++
++ Helper function for doing a loop-based block operation on memory
++ reference MEM. Each iteration of the loop will operate on LENGTH
++ bytes of MEM.
++
++ Create a new base register for use within the loop and point it to
++ the start of MEM. Create a new memory reference that uses this
++ register. Store them in *LOOP_REG and *LOOP_MEM respectively. */
++
++static void
++arm_adjust_block_mem (rtx mem, HOST_WIDE_INT length, rtx *loop_reg,
++ rtx *loop_mem)
++{
++ *loop_reg = copy_addr_to_reg (XEXP (mem, 0));
++
++ /* Although the new mem does not refer to a known location,
++ it does keep up to LENGTH bytes of alignment. */
++ *loop_mem = change_address (mem, BLKmode, *loop_reg);
++ set_mem_align (*loop_mem, MIN (MEM_ALIGN (mem), length * BITS_PER_UNIT));
++}
++
++/* From mips_block_move_loop:
++
++ Move LENGTH bytes from SRC to DEST using a loop that moves BYTES_PER_ITER
++ bytes at a time. LENGTH must be at least BYTES_PER_ITER. Assume that
++ the memory regions do not overlap. */
++
++static void
++arm_block_move_unaligned_loop (rtx dest, rtx src, HOST_WIDE_INT length,
++ unsigned int interleave_factor,
++ HOST_WIDE_INT bytes_per_iter)
++{
++ rtx label, src_reg, dest_reg, final_src, test;
++ HOST_WIDE_INT leftover;
++
++ leftover = length % bytes_per_iter;
++ length -= leftover;
++
++ /* Create registers and memory references for use within the loop. */
++ arm_adjust_block_mem (src, bytes_per_iter, &src_reg, &src);
++ arm_adjust_block_mem (dest, bytes_per_iter, &dest_reg, &dest);
++
++ /* Calculate the value that SRC_REG should have after the last iteration of
++ the loop. */
++ final_src = expand_simple_binop (Pmode, PLUS, src_reg, GEN_INT (length),
++ 0, 0, OPTAB_WIDEN);
++
++ /* Emit the start of the loop. */
++ label = gen_label_rtx ();
++ emit_label (label);
++
++ /* Emit the loop body. */
++ arm_block_move_unaligned_straight (dest, src, bytes_per_iter,
++ interleave_factor);
++
++ /* Move on to the next block. */
++ emit_move_insn (src_reg, plus_constant (src_reg, bytes_per_iter));
++ emit_move_insn (dest_reg, plus_constant (dest_reg, bytes_per_iter));
++
++ /* Emit the loop condition. */
++ test = gen_rtx_NE (VOIDmode, src_reg, final_src);
++ emit_jump_insn (gen_cbranchsi4 (test, src_reg, final_src, label));
++
++ /* Mop up any left-over bytes. */
++ if (leftover)
++ arm_block_move_unaligned_straight (dest, src, leftover, interleave_factor);
++}
++
++/* Emit a block move when either the source or destination is unaligned (not
++ aligned to a four-byte boundary). This may need further tuning depending on
++ core type, optimize_size setting, etc. */
++
++static int
++arm_movmemqi_unaligned (rtx *operands)
++{
++ HOST_WIDE_INT length = INTVAL (operands[2]);
++
++ if (optimize_size)
++ {
++ bool src_aligned = MEM_ALIGN (operands[1]) >= BITS_PER_WORD;
++ bool dst_aligned = MEM_ALIGN (operands[0]) >= BITS_PER_WORD;
++ /* Inlined memcpy using ldr/str/ldrh/strh can be quite big: try to limit
++ size of code if optimizing for size. We'll use ldm/stm if src_aligned
++ or dst_aligned though: allow more interleaving in those cases since the
++ resulting code can be smaller. */
++ unsigned int interleave_factor = (src_aligned || dst_aligned) ? 2 : 1;
++ HOST_WIDE_INT bytes_per_iter = (src_aligned || dst_aligned) ? 8 : 4;
++
++ if (length > 12)
++ arm_block_move_unaligned_loop (operands[0], operands[1], length,
++ interleave_factor, bytes_per_iter);
++ else
++ arm_block_move_unaligned_straight (operands[0], operands[1], length,
++ interleave_factor);
++ }
++ else
++ {
++ /* Note that the loop created by arm_block_move_unaligned_loop may be
++ subject to loop unrolling, which makes tuning this condition a little
++ redundant. */
++ if (length > 32)
++ arm_block_move_unaligned_loop (operands[0], operands[1], length, 4, 16);
++ else
++ arm_block_move_unaligned_straight (operands[0], operands[1], length, 4);
++ }
++
++ return 1;
++}
++
+ int
+ arm_gen_movmemqi (rtx *operands)
+ {
+@@ -10815,8 +11144,13 @@
+
+ if (GET_CODE (operands[2]) != CONST_INT
+ || GET_CODE (operands[3]) != CONST_INT
+- || INTVAL (operands[2]) > 64
+- || INTVAL (operands[3]) & 3)
++ || INTVAL (operands[2]) > 64)
++ return 0;
++
++ if (unaligned_access && (INTVAL (operands[3]) & 3) != 0)
++ return arm_movmemqi_unaligned (operands);
++
++ if (INTVAL (operands[3]) & 3)
+ return 0;
+
+ dstbase = operands[0];
+
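For a feel of what arm_block_move_unaligned_straight emits, its byte-count decomposition can be reproduced with plain arithmetic. The standalone sketch below is not part of the patch; it only mirrors the block/word/halfword/byte split for a given length and interleave factor, and for length 15 it predicts the three word pairs, one ldrh/strh pair and one ldrb/strb pair that unaligned-memcpy-1.c below checks for.

#include <stdio.h>

int
main (void)
{
  /* Mirror of the splitting logic: copy whole blocks of
     interleave_factor words, then leftover whole words, then a
     halfword, then a final byte.  */
  long length = 15;                 /* bytes to copy              */
  long interleave_factor = 4;       /* registers used per block   */
  long word = 4;                    /* UNITS_PER_WORD on ARM      */
  long block = interleave_factor * word;

  long blocks = length / block;
  long remaining = length % block;
  long words = remaining / word;
  remaining -= words * word;
  int halfword = remaining >= 2;    /* one ldrh/strh pair if set  */
  remaining -= halfword ? 2 : 0;
  int byte = (remaining & 1) != 0;  /* one ldrb/strb pair if set  */

  printf ("blocks=%ld words=%ld halfword=%d byte=%d\n",
          blocks, words, halfword, byte);
  return 0;
}
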
+=== modified file 'gcc/config/arm/arm.h'
+--- old/gcc/config/arm/arm.h 2011-10-19 17:01:50 +0000
++++ new/gcc/config/arm/arm.h 2011-11-21 01:45:54 +0000
+@@ -47,6 +47,8 @@
+ { \
+ if (TARGET_DSP_MULTIPLY) \
+ builtin_define ("__ARM_FEATURE_DSP"); \
++ if (unaligned_access) \
++ builtin_define ("__ARM_FEATURE_UNALIGNED"); \
+ /* Define __arm__ even when in thumb mode, for \
+ consistency with armcc. */ \
+ builtin_define ("__arm__"); \
+
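The arm.h hunk above makes the compiler predefine __ARM_FEATURE_UNALIGNED whenever unaligned access is enabled, which is also what the new arm_unaligned effective-target check and the tests below key off. A minimal user-level sketch, assuming an ARM toolchain with this patch applied (the function name is made up for illustration):

#include <string.h>

void
copy15 (char *dest, const char *src)
{
#ifdef __ARM_FEATURE_UNALIGNED
  /* With this patch the 15-byte copy may be expanded inline using
     unaligned ldr/str/ldrh/strh instead of a byte-at-a-time loop.  */
#endif
  memcpy (dest, src, 15);
}
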
+=== added file 'gcc/testsuite/gcc.target/arm/unaligned-memcpy-1.c'
+--- old/gcc/testsuite/gcc.target/arm/unaligned-memcpy-1.c 1970-01-01 00:00:00 +0000
++++ new/gcc/testsuite/gcc.target/arm/unaligned-memcpy-1.c 2011-10-19 22:56:19 +0000
+@@ -0,0 +1,19 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_unaligned } */
++/* { dg-options "-O2" } */
++
++#include <string.h>
++
++void unknown_alignment (char *dest, char *src)
++{
++ memcpy (dest, src, 15);
++}
++
++/* We should see three unaligned word loads and store pairs, one unaligned
++ ldrh/strh pair, and an ldrb/strb pair. Sanity check that. */
++
++/* { dg-final { scan-assembler-times "@ unaligned" 8 } } */
++/* { dg-final { scan-assembler-times "ldrh" 1 } } */
++/* { dg-final { scan-assembler-times "strh" 1 } } */
++/* { dg-final { scan-assembler-times "ldrb" 1 } } */
++/* { dg-final { scan-assembler-times "strb" 1 } } */
+
+=== added file 'gcc/testsuite/gcc.target/arm/unaligned-memcpy-2.c'
+--- old/gcc/testsuite/gcc.target/arm/unaligned-memcpy-2.c 1970-01-01 00:00:00 +0000
++++ new/gcc/testsuite/gcc.target/arm/unaligned-memcpy-2.c 2011-10-19 22:56:19 +0000
+@@ -0,0 +1,21 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_unaligned } */
++/* { dg-options "-O2" } */
++
++#include <string.h>
++
++char dest[16];
++
++void aligned_dest (char *src)
++{
++ memcpy (dest, src, 15);
++}
++
++/* Expect a multi-word store for the main part of the copy, but subword
++ loads/stores for the remainder. */
++
++/* { dg-final { scan-assembler-times "stmia" 1 } } */
++/* { dg-final { scan-assembler-times "ldrh" 1 } } */
++/* { dg-final { scan-assembler-times "strh" 1 } } */
++/* { dg-final { scan-assembler-times "ldrb" 1 } } */
++/* { dg-final { scan-assembler-times "strb" 1 } } */
+
+=== added file 'gcc/testsuite/gcc.target/arm/unaligned-memcpy-3.c'
+--- old/gcc/testsuite/gcc.target/arm/unaligned-memcpy-3.c 1970-01-01 00:00:00 +0000
++++ new/gcc/testsuite/gcc.target/arm/unaligned-memcpy-3.c 2011-10-19 22:56:19 +0000
+@@ -0,0 +1,21 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_unaligned } */
++/* { dg-options "-O2" } */
++
++#include <string.h>
++
++char src[16];
++
++void aligned_src (char *dest)
++{
++ memcpy (dest, src, 15);
++}
++
++/* Expect a multi-word load for the main part of the copy, but subword
++ loads/stores for the remainder. */
++
++/* { dg-final { scan-assembler-times "ldmia" 1 } } */
++/* { dg-final { scan-assembler-times "ldrh" 1 } } */
++/* { dg-final { scan-assembler-times "strh" 1 } } */
++/* { dg-final { scan-assembler-times "ldrb" 1 } } */
++/* { dg-final { scan-assembler-times "strb" 1 } } */
+
+=== added file 'gcc/testsuite/gcc.target/arm/unaligned-memcpy-4.c'
+--- old/gcc/testsuite/gcc.target/arm/unaligned-memcpy-4.c 1970-01-01 00:00:00 +0000
++++ new/gcc/testsuite/gcc.target/arm/unaligned-memcpy-4.c 2011-10-19 22:56:19 +0000
+@@ -0,0 +1,18 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_unaligned } */
++/* { dg-options "-O2" } */
++
++#include <string.h>
++
++char src[16];
++char dest[16];
++
++void aligned_both (void)
++{
++ memcpy (dest, src, 15);
++}
++
++/* We know both src and dest to be aligned: expect multiword loads/stores. */
++
++/* { dg-final { scan-assembler-times "ldmia" 1 } } */
++/* { dg-final { scan-assembler-times "stmia" 1 } } */
+
+=== modified file 'gcc/testsuite/lib/target-supports.exp'
+--- old/gcc/testsuite/lib/target-supports.exp 2011-10-23 13:33:07 +0000
++++ new/gcc/testsuite/lib/target-supports.exp 2011-11-21 01:45:54 +0000
+@@ -1894,6 +1894,18 @@
+ }]
+ }
+
++# Return 1 if this is an ARM target that supports unaligned word/halfword
++# load/store instructions.
++
++proc check_effective_target_arm_unaligned { } {
++ return [check_no_compiler_messages arm_unaligned assembly {
++ #ifndef __ARM_FEATURE_UNALIGNED
++ #error no unaligned support
++ #endif
++ int i;
++ }]
++}
++
+ # Add the options needed for NEON. We need either -mfloat-abi=softfp
+ # or -mfloat-abi=hard, but if one is already specified by the
+ # multilib, use it. Similarly, if a -mfpu option already enables
+
file://linaro/gcc-4.6-linaro-r106806.patch \
file://linaro/gcc-4.6-linaro-r106807.patch \
file://linaro/gcc-4.6-linaro-r106811.patch \
+file://linaro/gcc-4.6-linaro-r106814.patch \
+file://linaro/gcc-4.6-linaro-r106815.patch \
+file://linaro/gcc-4.6-linaro-r106816.patch \
+file://linaro/gcc-4.6-linaro-r106817.patch \
+file://linaro/gcc-4.6-linaro-r106818.patch \
+file://linaro/gcc-4.6-linaro-r106819.patch \
+file://linaro/gcc-4.6-linaro-r106820.patch \
+file://linaro/gcc-4.6-linaro-r106821.patch \
+file://linaro/gcc-4.6-linaro-r106825.patch \
+file://linaro/gcc-4.6-linaro-r106826.patch \
+file://linaro/gcc-4.6-linaro-r106827.patch \
+file://linaro/gcc-4.6-linaro-r106828.patch \
+file://linaro/gcc-4.6-linaro-r106829.patch \
+file://linaro/gcc-4.6-linaro-r106830.patch \
+file://linaro/gcc-4.6-linaro-r106831.patch \
+file://linaro/gcc-4.6-linaro-r106832.patch \
+file://linaro/gcc-4.6-linaro-r106833.patch \
+file://linaro/gcc-4.6-linaro-r106834.patch \
+file://linaro/gcc-4.6-linaro-r106836.patch \
+file://linaro/gcc-4.6-linaro-r106839.patch \
+file://linaro/gcc-4.6-linaro-r106840.patch \
+file://linaro/gcc-4.6-linaro-r106841.patch \
"
# this will prepend this layer to FILESPATH
FILESEXTRAPATHS := "${THISDIR}/gcc-4.6"
-PRINC = "2"
+PRINC = "3"
ARM_INSTRUCTION_SET = "arm"