+2011-11-07 Jakub Jelinek <jakub@redhat.com>
+
+ PR tree-optimization/50789
+ * tree-vect-stmts.c (process_use): Add force argument, avoid
+ exist_non_indexing_operands_for_use_p check if true.
+ (vect_mark_stmts_to_be_vectorized): Adjust callers. Handle
+ STMT_VINFO_GATHER_P.
+ (gen_perm_mask): New function.
+ (perm_mask_for_reverse): Use it.
+ (reverse_vec_element): Rename to...
+ (permute_vec_elements): ... this. Add Y and MASK_VEC arguments,
+ generalize for any permutations.
+ (vectorizable_load): Adjust caller. Handle STMT_VINFO_GATHER_P.
+ * target.def (TARGET_VECTORIZE_BUILTIN_GATHER): New hook.
+ * doc/tm.texi.in (TARGET_VECTORIZE_BUILTIN_GATHER): Document it.
+ * doc/tm.texi: Regenerate.
+ * tree-data-ref.c (initialize_data_dependence_relation,
+ compute_self_dependence): No longer static.
+ * tree-data-ref.h (initialize_data_dependence_relation,
+ compute_self_dependence): New prototypes.
+ * tree-vect-data-refs.c (vect_check_gather): New function.
+ (vect_analyze_data_refs): Detect possible gather load data
+ refs.
+ * tree-vectorizer.h (struct _stmt_vec_info): Add gather_p field.
+ (STMT_VINFO_GATHER_P): Define.
+ (vect_check_gather): New prototype.
+ * config/i386/i386-builtin-types.def: Add types for alternate
+ gather builtins.
+ * config/i386/sse.md (AVXMODE48P_DI): Remove.
+ (VEC_GATHER_MODE): Rename mode_attr to...
+ (VEC_GATHER_IDXSI): ... this.
+ (VEC_GATHER_IDXDI, VEC_GATHER_SRCDI): New mode_attrs.
+ (avx2_gathersi<mode>, *avx2_gathersi<mode>): Use <VEC_GATHER_IDXSI>
+ instead of <VEC_GATHER_MODE>.
+ (avx2_gatherdi<mode>): Use <VEC_GATHER_IDXDI> instead of
+ <<AVXMODE48P_DI> and <VEC_GATHER_SRCDI> instead of VEC_GATHER_MODE
+ on src and mask operands.
+ (*avx2_gatherdi<mode>): Likewise. Use VEC_GATHER_MODE iterator
+ instead of AVXMODE48P_DI.
+ (avx2_gatherdi<mode>256, *avx2_gatherdi<mode>256): Removed.
+ * config/i386/i386.c (enum ix86_builtins): Add
+ IX86_BUILTIN_GATHERALTSIV4DF, IX86_BUILTIN_GATHERALTDIV8SF,
+ IX86_BUILTIN_GATHERALTSIV4DI and IX86_BUILTIN_GATHERALTDIV8SI.
+ (ix86_init_mmx_sse_builtins): Create those builtins.
+ (ix86_expand_builtin): Handle those builtins and adjust expansions
+ of other gather builtins.
+ (ix86_vectorize_builtin_gather): New function.
+ (TARGET_VECTORIZE_BUILTIN_GATHER): Define.
+
2011-11-07 Uros Bizjak <ubizjak@gmail.com>
* config/i386/f16cintrin: Remove extra _X86INTRIN_H_INCLUDED check.
DEF_FUNCTION_TYPE (V2DF, V2DF, PCDOUBLE, V4SI, V2DF, INT)
DEF_FUNCTION_TYPE (V4DF, V4DF, PCDOUBLE, V4SI, V4DF, INT)
+DEF_FUNCTION_TYPE (V4DF, V4DF, PCDOUBLE, V8SI, V4DF, INT)
DEF_FUNCTION_TYPE (V2DF, V2DF, PCDOUBLE, V2DI, V2DF, INT)
DEF_FUNCTION_TYPE (V4DF, V4DF, PCDOUBLE, V4DI, V4DF, INT)
DEF_FUNCTION_TYPE (V4SF, V4SF, PCFLOAT, V4SI, V4SF, INT)
DEF_FUNCTION_TYPE (V8SF, V8SF, PCFLOAT, V8SI, V8SF, INT)
DEF_FUNCTION_TYPE (V4SF, V4SF, PCFLOAT, V2DI, V4SF, INT)
DEF_FUNCTION_TYPE (V4SF, V4SF, PCFLOAT, V4DI, V4SF, INT)
+DEF_FUNCTION_TYPE (V8SF, V8SF, PCFLOAT, V4DI, V8SF, INT)
DEF_FUNCTION_TYPE (V2DI, V2DI, PCINT64, V4SI, V2DI, INT)
DEF_FUNCTION_TYPE (V4DI, V4DI, PCINT64, V4SI, V4DI, INT)
+DEF_FUNCTION_TYPE (V4DI, V4DI, PCINT64, V8SI, V4DI, INT)
DEF_FUNCTION_TYPE (V2DI, V2DI, PCINT64, V2DI, V2DI, INT)
DEF_FUNCTION_TYPE (V4DI, V4DI, PCINT64, V4DI, V4DI, INT)
DEF_FUNCTION_TYPE (V4SI, V4SI, PCINT, V4SI, V4SI, INT)
DEF_FUNCTION_TYPE (V8SI, V8SI, PCINT, V8SI, V8SI, INT)
DEF_FUNCTION_TYPE (V4SI, V4SI, PCINT, V2DI, V4SI, INT)
DEF_FUNCTION_TYPE (V4SI, V4SI, PCINT, V4DI, V4SI, INT)
+DEF_FUNCTION_TYPE (V8SI, V8SI, PCINT, V4DI, V8SI, INT)
DEF_FUNCTION_TYPE_ALIAS (V2DF_FTYPE_V2DF, ROUND)
DEF_FUNCTION_TYPE_ALIAS (V4DF_FTYPE_V4DF, ROUND)
IX86_BUILTIN_GATHERDIV4SI,
IX86_BUILTIN_GATHERDIV8SI,
+ /* Alternate 4 element gather for the vectorizer where
+ all operands are 32-byte wide. */
+ IX86_BUILTIN_GATHERALTSIV4DF,
+ IX86_BUILTIN_GATHERALTDIV8SF,
+ IX86_BUILTIN_GATHERALTSIV4DI,
+ IX86_BUILTIN_GATHERALTDIV8SI,
+
/* TFmode support builtins. */
IX86_BUILTIN_INFQ,
IX86_BUILTIN_HUGE_VALQ,
V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
IX86_BUILTIN_GATHERDIV8SI);
+ def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
+ V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
+ IX86_BUILTIN_GATHERALTSIV4DF);
+
+ def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
+ V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
+ IX86_BUILTIN_GATHERALTDIV8SF);
+
+ def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
+ V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
+ IX86_BUILTIN_GATHERALTSIV4DI);
+
+ def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
+ V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
+ IX86_BUILTIN_GATHERALTDIV8SI);
+
/* MMX access to the vec_init patterns. */
def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
icode = CODE_FOR_avx2_gatherdiv4sf;
goto gather_gen;
case IX86_BUILTIN_GATHERDIV8SF:
- icode = CODE_FOR_avx2_gatherdiv4sf256;
+ icode = CODE_FOR_avx2_gatherdiv8sf;
goto gather_gen;
case IX86_BUILTIN_GATHERSIV2DI:
icode = CODE_FOR_avx2_gathersiv2di;
icode = CODE_FOR_avx2_gatherdiv4si;
goto gather_gen;
case IX86_BUILTIN_GATHERDIV8SI:
- icode = CODE_FOR_avx2_gatherdiv4si256;
+ icode = CODE_FOR_avx2_gatherdiv8si;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHERALTSIV4DF:
+ icode = CODE_FOR_avx2_gathersiv4df;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHERALTDIV8SF:
+ icode = CODE_FOR_avx2_gatherdiv8sf;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHERALTSIV4DI:
+ icode = CODE_FOR_avx2_gathersiv4df;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHERALTDIV8SI:
+ icode = CODE_FOR_avx2_gatherdiv8si;
+ goto gather_gen;
gather_gen:
arg0 = CALL_EXPR_ARG (exp, 0);
mode3 = insn_data[icode].operand[4].mode;
mode4 = insn_data[icode].operand[5].mode;
- if (target == NULL_RTX)
- target = gen_reg_rtx (insn_data[icode].operand[0].mode);
+ if (target == NULL_RTX
+ || GET_MODE (target) != insn_data[icode].operand[0].mode)
+ subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
+ else
+ subtarget = target;
+
+ if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
+ || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
+ {
+ rtx half = gen_reg_rtx (V4SImode);
+ if (!nonimmediate_operand (op2, V8SImode))
+ op2 = copy_to_mode_reg (V8SImode, op2);
+ emit_insn (gen_vec_extract_lo_v8si (half, op2));
+ op2 = half;
+ }
+ else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
+ || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
+ {
+ rtx (*gen) (rtx, rtx);
+ rtx half = gen_reg_rtx (mode0);
+ if (mode0 == V4SFmode)
+ gen = gen_vec_extract_lo_v8sf;
+ else
+ gen = gen_vec_extract_lo_v8si;
+ if (!nonimmediate_operand (op0, GET_MODE (op0)))
+ op0 = copy_to_mode_reg (GET_MODE (op0), op0);
+ emit_insn (gen (half, op0));
+ op0 = half;
+ if (!nonimmediate_operand (op3, GET_MODE (op3)))
+ op3 = copy_to_mode_reg (GET_MODE (op3), op3);
+ emit_insn (gen (half, op3));
+ op3 = half;
+ }
/* Force memory operand only with base register here. But we
don't want to do it on memory operand for other builtin
error ("last argument must be scale 1, 2, 4, 8");
return const0_rtx;
}
- pat = GEN_FCN (icode) (target, op0, op1, op2, op3, op4);
+ pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
if (! pat)
return const0_rtx;
emit_insn (pat);
+
+ if (fcode == IX86_BUILTIN_GATHERDIV8SF
+ || fcode == IX86_BUILTIN_GATHERDIV8SI)
+ {
+ enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
+ ? V4SFmode : V4SImode;
+ if (target == NULL_RTX)
+ target = gen_reg_rtx (tmode);
+ if (tmode == V4SFmode)
+ emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
+ else
+ emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
+ }
+ else
+ target = subtarget;
+
return target;
default:
return new_fndecl;
}
+/* Returns a decl of a function that implements gather load with
+ memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE.
+ Return NULL_TREE if it is not available. */
+
+static tree
+ix86_vectorize_builtin_gather (const_tree mem_vectype,
+ const_tree index_type, int scale)
+{
+ bool si;
+ enum ix86_builtins code;
+
+ if (! TARGET_AVX2)
+ return NULL_TREE;
+
+ if ((TREE_CODE (index_type) != INTEGER_TYPE
+ && !POINTER_TYPE_P (index_type))
+ || (TYPE_MODE (index_type) != SImode
+ && TYPE_MODE (index_type) != DImode))
+ return NULL_TREE;
+
+ if (TYPE_PRECISION (index_type) > POINTER_SIZE)
+ return NULL_TREE;
+
+ /* v*gather* insn sign extends index to pointer mode. */
+ if (TYPE_PRECISION (index_type) < POINTER_SIZE
+ && TYPE_UNSIGNED (index_type))
+ return NULL_TREE;
+
+ if (scale <= 0
+ || scale > 8
+ || (scale & (scale - 1)) != 0)
+ return NULL_TREE;
+
+ si = TYPE_MODE (index_type) == SImode;
+ switch (TYPE_MODE (mem_vectype))
+ {
+ case V2DFmode:
+ code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
+ break;
+ case V4DFmode:
+ code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
+ break;
+ case V2DImode:
+ code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
+ break;
+ case V4DImode:
+ code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
+ break;
+ case V4SFmode:
+ code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
+ break;
+ case V8SFmode:
+ code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
+ break;
+ case V4SImode:
+ code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
+ break;
+ case V8SImode:
+ code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
+ break;
+ default:
+ return NULL_TREE;
+ }
+
+ return ix86_builtins[code];
+}
+
/* Returns a code for a target-specific builtin that implements
reciprocal of the function, or NULL_TREE if not available. */
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
ix86_builtin_vectorized_function
+#undef TARGET_VECTORIZE_BUILTIN_GATHER
+#define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
+
#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
;; Mix-n-match
(define_mode_iterator AVX256MODE2P [V8SI V8SF V4DF])
-(define_mode_iterator AVXMODE48P_DI
- [V2DI V2DF V4DI V4DF V4SF V4SI])
-(define_mode_attr AVXMODE48P_DI
- [(V2DI "V2DI") (V2DF "V2DI")
- (V4DI "V4DI") (V4DF "V4DI")
- (V4SI "V2DI") (V4SF "V2DI")
- (V8SI "V4DI") (V8SF "V4DI")])
-
(define_mode_iterator FMAMODE [SF DF V4SF V2DF V8SF V4DF])
;; Mapping of immediate bits for blend instructions
;; For gather* insn patterns
(define_mode_iterator VEC_GATHER_MODE
[V2DI V2DF V4DI V4DF V4SI V4SF V8SI V8SF])
-(define_mode_attr VEC_GATHER_MODE
+(define_mode_attr VEC_GATHER_IDXSI
[(V2DI "V4SI") (V2DF "V4SI")
(V4DI "V4SI") (V4DF "V4SI")
(V4SI "V4SI") (V4SF "V4SI")
(V8SI "V8SI") (V8SF "V8SI")])
+(define_mode_attr VEC_GATHER_IDXDI
+ [(V2DI "V2DI") (V2DF "V2DI")
+ (V4DI "V4DI") (V4DF "V4DI")
+ (V4SI "V2DI") (V4SF "V2DI")
+ (V8SI "V4DI") (V8SF "V4DI")])
+(define_mode_attr VEC_GATHER_SRCDI
+ [(V2DI "V2DI") (V2DF "V2DF")
+ (V4DI "V4DI") (V4DF "V4DF")
+ (V4SI "V4SI") (V4SF "V4SF")
+ (V8SI "V4SI") (V8SF "V4SF")])
(define_expand "avx2_gathersi<mode>"
[(parallel [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "")
(mem:<ssescalarmode>
(match_par_dup 7
[(match_operand 2 "vsib_address_operand" "")
- (match_operand:<VEC_GATHER_MODE> 3 "register_operand" "")
+ (match_operand:<VEC_GATHER_IDXSI>
+ 3 "register_operand" "")
(match_operand:SI 5 "const1248_operand " "")]))
(mem:BLK (scratch))
(match_operand:VEC_GATHER_MODE 4 "register_operand" "")]
(match_operator:<ssescalarmode> 7 "vsib_mem_operator"
[(unspec:P
[(match_operand:P 3 "vsib_address_operand" "p")
- (match_operand:<VEC_GATHER_MODE> 4 "register_operand" "x")
+ (match_operand:<VEC_GATHER_IDXSI> 4 "register_operand" "x")
(match_operand:SI 6 "const1248_operand" "n")]
UNSPEC_VSIBADDR)])
(mem:BLK (scratch))
(define_expand "avx2_gatherdi<mode>"
[(parallel [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "")
(unspec:VEC_GATHER_MODE
- [(match_operand:VEC_GATHER_MODE 1 "register_operand" "")
+ [(match_operand:<VEC_GATHER_SRCDI> 1 "register_operand" "")
(mem:<ssescalarmode>
(match_par_dup 7
[(match_operand 2 "vsib_address_operand" "")
- (match_operand:<AVXMODE48P_DI> 3 "register_operand" "")
+ (match_operand:<VEC_GATHER_IDXDI>
+ 3 "register_operand" "")
(match_operand:SI 5 "const1248_operand " "")]))
(mem:BLK (scratch))
- (match_operand:VEC_GATHER_MODE 4 "register_operand" "")]
+ (match_operand:<VEC_GATHER_SRCDI>
+ 4 "register_operand" "")]
UNSPEC_GATHER))
(clobber (match_scratch:VEC_GATHER_MODE 6 ""))])]
"TARGET_AVX2"
})
(define_insn "*avx2_gatherdi<mode>"
- [(set (match_operand:AVXMODE48P_DI 0 "register_operand" "=&x")
- (unspec:AVXMODE48P_DI
- [(match_operand:AVXMODE48P_DI 2 "register_operand" "0")
+ [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "=&x")
+ (unspec:VEC_GATHER_MODE
+ [(match_operand:<VEC_GATHER_SRCDI> 2 "register_operand" "0")
(match_operator:<ssescalarmode> 7 "vsib_mem_operator"
[(unspec:P
[(match_operand:P 3 "vsib_address_operand" "p")
- (match_operand:<AVXMODE48P_DI> 4 "register_operand" "x")
+ (match_operand:<VEC_GATHER_IDXDI> 4 "register_operand" "x")
(match_operand:SI 6 "const1248_operand" "n")]
UNSPEC_VSIBADDR)])
(mem:BLK (scratch))
- (match_operand:AVXMODE48P_DI 5 "register_operand" "1")]
+ (match_operand:<VEC_GATHER_SRCDI> 5 "register_operand" "1")]
UNSPEC_GATHER))
- (clobber (match_scratch:AVXMODE48P_DI 1 "=&x"))]
- "TARGET_AVX2"
- "v<sseintprefix>gatherq<ssemodesuffix>\t{%1, %7, %0|%0, %7, %1}"
- [(set_attr "type" "ssemov")
- (set_attr "prefix" "vex")
- (set_attr "mode" "<sseinsnmode>")])
-
-;; Special handling for VEX.256 with float arguments
-;; since there're still xmms as operands
-(define_expand "avx2_gatherdi<mode>256"
- [(parallel [(set (match_operand:VI4F_128 0 "register_operand" "")
- (unspec:VI4F_128
- [(match_operand:VI4F_128 1 "register_operand" "")
- (mem:<ssescalarmode>
- (match_par_dup 7
- [(match_operand 2 "vsib_address_operand" "")
- (match_operand:V4DI 3 "register_operand" "")
- (match_operand:SI 5 "const1248_operand " "")]))
- (mem:BLK (scratch))
- (match_operand:VI4F_128 4 "register_operand" "")]
- UNSPEC_GATHER))
- (clobber (match_scratch:VI4F_128 6 ""))])]
- "TARGET_AVX2"
-{
- operands[7]
- = gen_rtx_UNSPEC (Pmode, gen_rtvec (3, operands[2], operands[3],
- operands[5]), UNSPEC_VSIBADDR);
-})
-
-(define_insn "*avx2_gatherdi<mode>256"
- [(set (match_operand:VI4F_128 0 "register_operand" "=x")
- (unspec:VI4F_128
- [(match_operand:VI4F_128 2 "register_operand" "0")
- (match_operator:<ssescalarmode> 7 "vsib_mem_operator"
- [(unspec:P
- [(match_operand:P 3 "vsib_address_operand" "p")
- (match_operand:V4DI 4 "register_operand" "x")
- (match_operand:SI 6 "const1248_operand" "n")]
- UNSPEC_VSIBADDR)])
- (mem:BLK (scratch))
- (match_operand:VI4F_128 5 "register_operand" "1")]
- UNSPEC_GATHER))
- (clobber (match_scratch:VI4F_128 1 "=&x"))]
+ (clobber (match_scratch:VEC_GATHER_MODE 1 "=&x"))]
"TARGET_AVX2"
- "v<sseintprefix>gatherq<ssemodesuffix>\t{%1, %7, %0|%0, %7, %1}"
+ "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %7, %2|%2, %7, %5}"
[(set_attr "type" "ssemov")
(set_attr "prefix" "vex")
(set_attr "mode" "<sseinsnmode>")])
The default is zero which means to not iterate over other vector sizes.
@end deftypefn
+@deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_GATHER (const_tree @var{mem_vectype}, const_tree @var{index_type}, int @var{scale})
+Target builtin that implements vector gather operation. @var{mem_vectype}
+is the vector type of the load and @var{index_type} is scalar type of
+the index, scaled by @var{scale}.
+The default is @code{NULL_TREE} which means to not vectorize gather
+loads.
+@end deftypefn
+
@node Anchored Addresses
@section Anchored Addresses
@cindex anchored addresses
The default is zero which means to not iterate over other vector sizes.
@end deftypefn
+@hook TARGET_VECTORIZE_BUILTIN_GATHER
+Target builtin that implements vector gather operation. @var{mem_vectype}
+is the vector type of the load and @var{index_type} is scalar type of
+the index, scaled by @var{scale}.
+The default is @code{NULL_TREE} which means to not vectorize gather
+loads.
+@end deftypefn
+
@node Anchored Addresses
@section Anchored Addresses
@cindex anchored addresses
(void),
default_autovectorize_vector_sizes)
+/* Target builtin that implements vector gather operation. */
+DEFHOOK
+(builtin_gather,
+ "",
+ tree,
+ (const_tree mem_vectype, const_tree index_type, int scale),
+ NULL)
+
HOOK_VECTOR_END (vectorize)
#undef HOOK_PREFIX
+2011-11-07 Jakub Jelinek <jakub@redhat.com>
+
+ PR tree-optimization/50789
+ * gcc.target/i386/avx2-gather-1.c: New test.
+ * gcc.target/i386/avx2-gather-2.c: New test.
+ * gcc.target/i386/avx2-gather-3.c: New test.
+ * gcc.target/i386/avx2-gather-4.c: New test.
+
2011-11-07 Uros Bizjak <ubizjak@gmail.com>
* gcc.target/i386/pr49781-1.c (dg-options): Add -mtune=generic.
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target avx2 } */
+/* { dg-options "-O3 -mavx2" } */
+
+#include "avx2-check.h"
+
+#define N 1024
+float vf1[N+16], vf2[N];
+double vd1[N+16], vd2[N];
+int k[N];
+long l[N];
+short n[N];
+
+__attribute__((noinline, noclone)) void
+f1 (void)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ vf2[i] = vf1[k[i]];
+}
+
+__attribute__((noinline, noclone)) void
+f2 (void)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ n[i] = (int) vf1[k[i]];
+}
+
+__attribute__((noinline, noclone)) void
+f3 (int x)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ vf2[i] = vf1[k[i] + x];
+}
+
+__attribute__((noinline, noclone)) void
+f4 (int x)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ n[i] = (int) vf1[k[i] + x];
+}
+
+__attribute__((noinline, noclone)) void
+f5 (void)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ vd2[i] = vd1[k[i]];
+}
+
+__attribute__((noinline, noclone)) void
+f6 (void)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ n[i] = (int) vd1[k[i]];
+}
+
+__attribute__((noinline, noclone)) void
+f7 (int x)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ vd2[i] = vd1[k[i] + x];
+}
+
+__attribute__((noinline, noclone)) void
+f8 (int x)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ n[i] = (int) vd1[k[i] + x];
+}
+
+__attribute__((noinline, noclone)) void
+f9 (void)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ vf2[i] = vf1[l[i]];
+}
+
+__attribute__((noinline, noclone)) void
+f10 (void)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ n[i] = (int) vf1[l[i]];
+}
+
+__attribute__((noinline, noclone)) void
+f11 (long x)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ vf2[i] = vf1[l[i] + x];
+}
+
+__attribute__((noinline, noclone)) void
+f12 (long x)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ n[i] = (int) vf1[l[i] + x];
+}
+
+__attribute__((noinline, noclone)) void
+f13 (void)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ vd2[i] = vd1[l[i]];
+}
+
+__attribute__((noinline, noclone)) void
+f14 (void)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ n[i] = (int) vd1[l[i]];
+}
+
+__attribute__((noinline, noclone)) void
+f15 (long x)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ vd2[i] = vd1[l[i] + x];
+}
+
+__attribute__((noinline, noclone)) void
+f16 (long x)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ n[i] = (int) vd1[l[i] + x];
+}
+
+static void
+avx2_test (void)
+{
+ int i;
+
+ for (i = 0; i < N + 16; i++)
+ {
+ asm ("");
+ vf1[i] = 17.0f + i;
+ vd1[i] = 19.0 + i;
+ }
+ for (i = 0; i < N; i++)
+ {
+ asm ("");
+ k[i] = (i * 731) & (N - 1);
+ l[i] = (i * 657) & (N - 1);
+ }
+
+ f1 ();
+ f2 ();
+ for (i = 0; i < N; i++)
+ if (vf2[i] != ((i * 731) & (N - 1)) + 17
+ || n[i] != ((i * 731) & (N - 1)) + 17)
+ abort ();
+
+ f3 (12);
+ f4 (14);
+ for (i = 0; i < N; i++)
+ if (vf2[i] != ((i * 731) & (N - 1)) + 17 + 12
+ || n[i] != ((i * 731) & (N - 1)) + 17 + 14)
+ abort ();
+
+ f5 ();
+ f6 ();
+ for (i = 0; i < N; i++)
+ if (vd2[i] != ((i * 731) & (N - 1)) + 19
+ || n[i] != ((i * 731) & (N - 1)) + 19)
+ abort ();
+
+ f7 (7);
+ f8 (9);
+ for (i = 0; i < N; i++)
+ if (vd2[i] != ((i * 731) & (N - 1)) + 19 + 7
+ || n[i] != ((i * 731) & (N - 1)) + 19 + 9)
+ abort ();
+
+ f9 ();
+ f10 ();
+ for (i = 0; i < N; i++)
+ if (vf2[i] != ((i * 657) & (N - 1)) + 17
+ || n[i] != ((i * 657) & (N - 1)) + 17)
+ abort ();
+
+ f11 (2);
+ f12 (4);
+ for (i = 0; i < N; i++)
+ if (vf2[i] != ((i * 657) & (N - 1)) + 17 + 2
+ || n[i] != ((i * 657) & (N - 1)) + 17 + 4)
+ abort ();
+
+ f13 ();
+ f14 ();
+ for (i = 0; i < N; i++)
+ if (vd2[i] != ((i * 657) & (N - 1)) + 19
+ || n[i] != ((i * 657) & (N - 1)) + 19)
+ abort ();
+
+ f15 (13);
+ f16 (15);
+ for (i = 0; i < N; i++)
+ if (vd2[i] != ((i * 657) & (N - 1)) + 19 + 13
+ || n[i] != ((i * 657) & (N - 1)) + 19 + 15)
+ abort ();
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx2 -fdump-tree-vect-details" } */
+
+#include "avx2-gather-1.c"
+
+/* { dg-final { scan-tree-dump-times "note: vectorized 1 loops in function" 16 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target avx2 } */
+/* { dg-options "-O3 -mavx2 -ffast-math" } */
+
+#include "avx2-check.h"
+
+#define N 1024
+float f[N];
+double d[N];
+int k[N];
+float *l[N];
+double *n[N];
+int **m[N];
+long **o[N];
+long q[N];
+long *r[N];
+int *s[N];
+
+__attribute__((noinline, noclone)) float
+f1 (void)
+{
+ int i;
+ float g = 0.0;
+ for (i = 0; i < N / 2; i++)
+ g += f[k[i]];
+ return g;
+}
+
+__attribute__((noinline, noclone)) float
+f2 (float *p)
+{
+ int i;
+ float g = 0.0;
+ for (i = 0; i < N / 2; i++)
+ g += p[k[i]];
+ return g;
+}
+
+__attribute__((noinline, noclone)) float
+f3 (void)
+{
+ int i;
+ float g = 0.0;
+ for (i = 0; i < N / 2; i++)
+ g += *l[i];
+ return g;
+}
+
+__attribute__((noinline, noclone)) int
+f4 (void)
+{
+ int i;
+ int g = 0;
+ for (i = 0; i < N / 2; i++)
+ g += **m[i];
+ return g;
+}
+
+__attribute__((noinline, noclone)) double
+f5 (void)
+{
+ int i;
+ double g = 0.0;
+ for (i = 0; i < N / 2; i++)
+ g += d[k[i]];
+ return g;
+}
+
+__attribute__((noinline, noclone)) double
+f6 (double *p)
+{
+ int i;
+ double g = 0.0;
+ for (i = 0; i < N / 2; i++)
+ g += p[k[i]];
+ return g;
+}
+
+__attribute__((noinline, noclone)) double
+f7 (void)
+{
+ int i;
+ double g = 0.0;
+ for (i = 0; i < N / 2; i++)
+ g += *n[i];
+ return g;
+}
+
+__attribute__((noinline, noclone)) int
+f8 (void)
+{
+ int i;
+ int g = 0;
+ for (i = 0; i < N / 2; i++)
+ g += **o[i];
+ return g;
+}
+
+__attribute__((noinline, noclone)) float
+f9 (void)
+{
+ int i;
+ float g = 0.0;
+ for (i = 0; i < N / 2; i++)
+ g += f[q[i]];
+ return g;
+}
+
+__attribute__((noinline, noclone)) float
+f10 (float *p)
+{
+ int i;
+ float g = 0.0;
+ for (i = 0; i < N / 2; i++)
+ g += p[q[i]];
+ return g;
+}
+
+__attribute__((noinline, noclone)) double
+f11 (void)
+{
+ int i;
+ double g = 0.0;
+ for (i = 0; i < N / 2; i++)
+ g += d[q[i]];
+ return g;
+}
+
+__attribute__((noinline, noclone)) double
+f12 (double *p)
+{
+ int i;
+ double g = 0.0;
+ for (i = 0; i < N / 2; i++)
+ g += p[q[i]];
+ return g;
+}
+
+static void
+avx2_test (void)
+{
+ int i;
+
+ for (i = 0; i < N; i++)
+ {
+ asm ("");
+ f[i] = -256.0f + i;
+ d[i] = -258.0 + i;
+ k[i] = (i * 731) & (N - 1);
+ q[i] = (i * 657) & (N - 1);
+ l[i] = &f[(i * 239) & (N - 1)];
+ n[i] = &d[(i * 271) & (N - 1)];
+ r[i] = &q[(i * 323) & (N - 1)];
+ s[i] = &k[(i * 565) & (N - 1)];
+ m[i] = &s[(i * 13) & (N - 1)];
+ o[i] = &r[(i * 19) & (N - 1)];
+ }
+
+ if (f1 () != 136448.0f || f2 (f) != 136448.0f || f3 () != 130304.0)
+ abort ();
+ if (f4 () != 261376 || f5 () != 135424.0 || f6 (d) != 135424.0)
+ abort ();
+ if (f7 () != 129280.0 || f8 () != 259840L || f9 () != 130816.0f)
+ abort ();
+ if (f10 (f) != 130816.0f || f11 () != 129792.0 || f12 (d) != 129792.0)
+ abort ();
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target avx2 } */
+/* { dg-options "-O3 -mavx2" } */
+
+#include "avx2-check.h"
+
+#define N 1024
+int a[N], b[N], c[N], d[N];
+
+__attribute__((noinline, noclone)) void
+foo (float *__restrict p, float *__restrict q, float *__restrict r,
+ long s1, long s2, long s3)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ p[i] = q[a[i] * s1 + b[i] * s2 + s3] * r[c[i] * s1 + d[i] * s2 + s3];
+}
+
+static void
+avx2_test (void)
+{
+ int i;
+ float e[N], f[N], g[N];
+ for (i = 0; i < N; i++)
+ {
+ a[i] = (i * 7) & (N / 8 - 1);
+ b[i] = (i * 13) & (N / 8 - 1);
+ c[i] = (i * 23) & (N / 8 - 1);
+ d[i] = (i * 5) & (N / 8 - 1);
+ e[i] = 16.5 + i;
+ f[i] = 127.5 - i;
+ }
+ foo (g, e, f, 3, 2, 4);
+ for (i = 0; i < N; i++)
+ if (g[i] != (float) ((20.5 + a[i] * 3 + b[i] * 2)
+ * (123.5 - c[i] * 3 - d[i] * 2)))
+ abort ();
+}
return refs_may_alias_p (addr_a, addr_b);
}
-static void compute_self_dependence (struct data_dependence_relation *);
-
/* Initialize a data dependence relation between data accesses A and
B. NB_LOOPS is the number of loops surrounding the references: the
size of the classic distance/direction vectors. */
-static struct data_dependence_relation *
+struct data_dependence_relation *
initialize_data_dependence_relation (struct data_reference *a,
struct data_reference *b,
VEC (loop_p, heap) *loop_nest)
/* This computes the dependence relation for the same data
reference into DDR. */
-static void
+void
compute_self_dependence (struct data_dependence_relation *ddr)
{
unsigned int i;
/* Data references and dependences detectors.
- Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
+ Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
Free Software Foundation, Inc.
Contributed by Sebastian Pop <pop@cri.ensmp.fr>
VEC (data_reference_p, heap) **);
struct data_reference *create_data_ref (loop_p, loop_p, tree, gimple, bool);
extern bool find_loop_nest (struct loop *, VEC (loop_p, heap) **);
+extern struct data_dependence_relation *initialize_data_dependence_relation
+ (struct data_reference *, struct data_reference *, VEC (loop_p, heap) *);
+extern void compute_self_dependence (struct data_dependence_relation *);
extern void compute_all_dependences (VEC (data_reference_p, heap) *,
VEC (ddr_p, heap) **, VEC (loop_p, heap) *,
bool);
return true;
}
+/* Check whether a non-affine read in stmt is suitable for gather load
+ and if so, return a builtin decl for that operation. */
+
+tree
+vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
+ tree *offp, int *scalep)
+{
+ HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
+ struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
+ tree offtype = NULL_TREE;
+ tree decl, base, off;
+ enum machine_mode pmode;
+ int punsignedp, pvolatilep;
+
+ /* The gather builtins need address of the form
+ loop_invariant + vector * {1, 2, 4, 8}
+ or
+ loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
+ Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
+ of loop invariants/SSA_NAMEs defined in the loop, with casts,
+ multiplications and additions in it. To get a vector, we need
+ a single SSA_NAME that will be defined in the loop and will
+ contain everything that is not loop invariant and that can be
+ vectorized. The following code attempts to find such a preexistng
+ SSA_NAME OFF and put the loop invariants into a tree BASE
+ that can be gimplified before the loop. */
+ base = get_inner_reference (DR_REF (dr), &pbitsize, &pbitpos, &off,
+ &pmode, &punsignedp, &pvolatilep, false);
+ gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
+
+ if (TREE_CODE (base) == MEM_REF)
+ {
+ if (!integer_zerop (TREE_OPERAND (base, 1)))
+ {
+ if (off == NULL_TREE)
+ {
+ double_int moff = mem_ref_offset (base);
+ off = double_int_to_tree (sizetype, moff);
+ }
+ else
+ off = size_binop (PLUS_EXPR, off,
+ fold_convert (sizetype, TREE_OPERAND (base, 1)));
+ }
+ base = TREE_OPERAND (base, 0);
+ }
+ else
+ base = build_fold_addr_expr (base);
+
+ if (off == NULL_TREE)
+ off = size_zero_node;
+
+ /* If base is not loop invariant, either off is 0, then we start with just
+ the constant offset in the loop invariant BASE and continue with base
+ as OFF, otherwise give up.
+ We could handle that case by gimplifying the addition of base + off
+ into some SSA_NAME and use that as off, but for now punt. */
+ if (!expr_invariant_in_loop_p (loop, base))
+ {
+ if (!integer_zerop (off))
+ return NULL_TREE;
+ off = base;
+ base = size_int (pbitpos / BITS_PER_UNIT);
+ }
+ /* Otherwise put base + constant offset into the loop invariant BASE
+ and continue with OFF. */
+ else
+ {
+ base = fold_convert (sizetype, base);
+ base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
+ }
+
+ /* OFF at this point may be either a SSA_NAME or some tree expression
+ from get_inner_reference. Try to peel off loop invariants from it
+ into BASE as long as possible. */
+ STRIP_NOPS (off);
+ while (offtype == NULL_TREE)
+ {
+ enum tree_code code;
+ tree op0, op1, add = NULL_TREE;
+
+ if (TREE_CODE (off) == SSA_NAME)
+ {
+ gimple def_stmt = SSA_NAME_DEF_STMT (off);
+
+ if (expr_invariant_in_loop_p (loop, off))
+ return NULL_TREE;
+
+ if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
+ break;
+
+ op0 = gimple_assign_rhs1 (def_stmt);
+ code = gimple_assign_rhs_code (def_stmt);
+ op1 = gimple_assign_rhs2 (def_stmt);
+ }
+ else
+ {
+ if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
+ return NULL_TREE;
+ code = TREE_CODE (off);
+ extract_ops_from_tree (off, &code, &op0, &op1);
+ }
+ switch (code)
+ {
+ case POINTER_PLUS_EXPR:
+ case PLUS_EXPR:
+ if (expr_invariant_in_loop_p (loop, op0))
+ {
+ add = op0;
+ off = op1;
+ do_add:
+ add = fold_convert (sizetype, add);
+ if (scale != 1)
+ add = size_binop (MULT_EXPR, add, size_int (scale));
+ base = size_binop (PLUS_EXPR, base, add);
+ continue;
+ }
+ if (expr_invariant_in_loop_p (loop, op1))
+ {
+ add = op1;
+ off = op0;
+ goto do_add;
+ }
+ break;
+ case MINUS_EXPR:
+ if (expr_invariant_in_loop_p (loop, op1))
+ {
+ add = fold_convert (sizetype, op1);
+ add = size_binop (MINUS_EXPR, size_zero_node, add);
+ off = op0;
+ goto do_add;
+ }
+ break;
+ case MULT_EXPR:
+ if (scale == 1 && host_integerp (op1, 0))
+ {
+ scale = tree_low_cst (op1, 0);
+ off = op0;
+ continue;
+ }
+ break;
+ case SSA_NAME:
+ off = op0;
+ continue;
+ CASE_CONVERT:
+ if (!POINTER_TYPE_P (TREE_TYPE (op0))
+ && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
+ break;
+ if (TYPE_PRECISION (TREE_TYPE (op0))
+ == TYPE_PRECISION (TREE_TYPE (off)))
+ {
+ off = op0;
+ continue;
+ }
+ if (TYPE_PRECISION (TREE_TYPE (op0))
+ < TYPE_PRECISION (TREE_TYPE (off)))
+ {
+ off = op0;
+ offtype = TREE_TYPE (off);
+ STRIP_NOPS (off);
+ continue;
+ }
+ break;
+ default:
+ break;
+ }
+ break;
+ }
+
+ /* If at the end OFF still isn't a SSA_NAME or isn't
+ defined in the loop, punt. */
+ if (TREE_CODE (off) != SSA_NAME
+ || expr_invariant_in_loop_p (loop, off))
+ return NULL_TREE;
+
+ if (offtype == NULL_TREE)
+ offtype = TREE_TYPE (off);
+
+ decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
+ offtype, scale);
+ if (decl == NULL_TREE)
+ return NULL_TREE;
+
+ if (basep)
+ *basep = base;
+ if (offp)
+ *offp = off;
+ if (scalep)
+ *scalep = scale;
+ return decl;
+}
+
/* Function vect_analyze_data_refs.
gimple stmt;
stmt_vec_info stmt_info;
tree base, offset, init;
+ bool gather = false;
int vf;
if (!dr || !DR_REF (dr))
/* Check that analysis of the data-ref succeeded. */
if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
- || !DR_STEP (dr))
+ || !DR_STEP (dr))
{
- if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
- {
- fprintf (vect_dump, "not vectorized: data ref analysis failed ");
- print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
- }
+ /* If target supports vector gather loads, see if they can't
+ be used. */
+ if (loop_vinfo
+ && DR_IS_READ (dr)
+ && !TREE_THIS_VOLATILE (DR_REF (dr))
+ && targetm.vectorize.builtin_gather != NULL
+ && !nested_in_vect_loop_p (loop, stmt))
+ {
+ struct data_reference *newdr
+ = create_data_ref (NULL, loop_containing_stmt (stmt),
+ DR_REF (dr), stmt, true);
+ gcc_assert (newdr != NULL && DR_REF (newdr));
+ if (DR_BASE_ADDRESS (newdr)
+ && DR_OFFSET (newdr)
+ && DR_INIT (newdr)
+ && DR_STEP (newdr)
+ && integer_zerop (DR_STEP (newdr)))
+ {
+ dr = newdr;
+ gather = true;
+ }
+ else
+ free_data_ref (newdr);
+ }
- if (bb_vinfo)
- {
- STMT_VINFO_VECTORIZABLE (stmt_info) = false;
- stop_bb_analysis = true;
- continue;
- }
+ if (!gather)
+ {
+ if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
+ {
+ fprintf (vect_dump, "not vectorized: data ref analysis "
+ "failed ");
+ print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+ }
- return false;
+ if (bb_vinfo)
+ {
+ STMT_VINFO_VECTORIZABLE (stmt_info) = false;
+ stop_bb_analysis = true;
+ continue;
+ }
+
+ return false;
+ }
}
if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
continue;
}
- return false;
+ if (gather)
+ free_data_ref (dr);
+ return false;
}
if (TREE_THIS_VOLATILE (DR_REF (dr)))
continue;
}
+ if (gather)
+ free_data_ref (dr);
return false;
}
continue;
}
+ if (gather)
+ free_data_ref (dr);
return false;
}
stop_bb_analysis = true;
continue;
}
- else
- return false;
+
+ if (gather)
+ {
+ STMT_VINFO_DATA_REF (stmt_info) = NULL;
+ free_data_ref (dr);
+ }
+ return false;
}
/* Adjust the minimal vectorization factor according to the
vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
if (vf > *min_vf)
*min_vf = vf;
+
+ if (gather)
+ {
+ unsigned int j, k, n;
+ struct data_reference *olddr
+ = VEC_index (data_reference_p, datarefs, i);
+ VEC (ddr_p, heap) *ddrs = LOOP_VINFO_DDRS (loop_vinfo);
+ struct data_dependence_relation *ddr, *newddr;
+ bool bad = false;
+ tree off;
+ VEC (loop_p, heap) *nest = LOOP_VINFO_LOOP_NEST (loop_vinfo);
+
+ if (!vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL)
+ || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
+ {
+ if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
+ {
+ fprintf (vect_dump,
+ "not vectorized: not suitable for gather ");
+ print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+ }
+ return false;
+ }
+
+ n = VEC_length (data_reference_p, datarefs) - 1;
+ for (j = 0, k = i - 1; j < i; j++)
+ {
+ ddr = VEC_index (ddr_p, ddrs, k);
+ gcc_assert (DDR_B (ddr) == olddr);
+ newddr = initialize_data_dependence_relation (DDR_A (ddr), dr,
+ nest);
+ VEC_replace (ddr_p, ddrs, k, newddr);
+ free_dependence_relation (ddr);
+ if (!bad
+ && DR_IS_WRITE (DDR_A (newddr))
+ && DDR_ARE_DEPENDENT (newddr) != chrec_known)
+ bad = true;
+ k += --n;
+ }
+
+ k++;
+ n = k + VEC_length (data_reference_p, datarefs) - i - 1;
+ for (; k < n; k++)
+ {
+ ddr = VEC_index (ddr_p, ddrs, k);
+ gcc_assert (DDR_A (ddr) == olddr);
+ newddr = initialize_data_dependence_relation (dr, DDR_B (ddr),
+ nest);
+ VEC_replace (ddr_p, ddrs, k, newddr);
+ free_dependence_relation (ddr);
+ if (!bad
+ && DR_IS_WRITE (DDR_B (newddr))
+ && DDR_ARE_DEPENDENT (newddr) != chrec_known)
+ bad = true;
+ }
+
+ k = VEC_length (ddr_p, ddrs)
+ - VEC_length (data_reference_p, datarefs) + i;
+ ddr = VEC_index (ddr_p, ddrs, k);
+ gcc_assert (DDR_A (ddr) == olddr && DDR_B (ddr) == olddr);
+ newddr = initialize_data_dependence_relation (dr, dr, nest);
+ compute_self_dependence (newddr);
+ VEC_replace (ddr_p, ddrs, k, newddr);
+ free_dependence_relation (ddr);
+ VEC_replace (data_reference_p, datarefs, i, dr);
+
+ if (bad)
+ {
+ if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
+ {
+ fprintf (vect_dump,
+ "not vectorized: data dependence conflict"
+ " prevents gather");
+ print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+ }
+ return false;
+ }
+
+ STMT_VINFO_GATHER_P (stmt_info) = true;
+ }
}
return true;
- LIVE_P, RELEVANT - enum values to be set in the STMT_VINFO of the stmt
that defined USE. This is done by calling mark_relevant and passing it
the WORKLIST (to add DEF_STMT to the WORKLIST in case it is relevant).
+ - FORCE is true if exist_non_indexing_operands_for_use_p check shouldn't
+ be performed.
Outputs:
Generally, LIVE_P and RELEVANT are used to define the liveness and
static bool
process_use (gimple stmt, tree use, loop_vec_info loop_vinfo, bool live_p,
- enum vect_relevant relevant, VEC(gimple,heap) **worklist)
+ enum vect_relevant relevant, VEC(gimple,heap) **worklist,
+ bool force)
{
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
/* case 1: we are only interested in uses that need to be vectorized. Uses
that are used for address computation are not considered relevant. */
- if (!exist_non_indexing_operands_for_use_p (use, stmt))
+ if (!force && !exist_non_indexing_operands_for_use_p (use, stmt))
return true;
if (!vect_is_simple_use (use, loop_vinfo, NULL, &def_stmt, &def, &dt))
break;
}
- if (is_pattern_stmt_p (vinfo_for_stmt (stmt)))
+ if (is_pattern_stmt_p (stmt_vinfo))
{
/* Pattern statements are not inserted into the code, so
FOR_EACH_PHI_OR_STMT_USE optimizes their operands out, and we
if (rhs_code == COND_EXPR && COMPARISON_CLASS_P (op))
{
if (!process_use (stmt, TREE_OPERAND (op, 0), loop_vinfo,
- live_p, relevant, &worklist)
+ live_p, relevant, &worklist, false)
|| !process_use (stmt, TREE_OPERAND (op, 1), loop_vinfo,
- live_p, relevant, &worklist))
+ live_p, relevant, &worklist, false))
{
VEC_free (gimple, heap, worklist);
return false;
{
op = gimple_op (stmt, i);
if (!process_use (stmt, op, loop_vinfo, live_p, relevant,
- &worklist))
+ &worklist, false))
{
VEC_free (gimple, heap, worklist);
return false;
{
tree arg = gimple_call_arg (stmt, i);
if (!process_use (stmt, arg, loop_vinfo, live_p, relevant,
- &worklist))
+ &worklist, false))
{
VEC_free (gimple, heap, worklist);
return false;
{
tree op = USE_FROM_PTR (use_p);
if (!process_use (stmt, op, loop_vinfo, live_p, relevant,
- &worklist))
+ &worklist, false))
{
VEC_free (gimple, heap, worklist);
return false;
}
}
+
+ if (STMT_VINFO_GATHER_P (stmt_vinfo))
+ {
+ tree off;
+ tree decl = vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
+ gcc_assert (decl);
+ if (!process_use (stmt, off, loop_vinfo, live_p, relevant,
+ &worklist, true))
+ {
+ VEC_free (gimple, heap, worklist);
+ return false;
+ }
+ }
} /* while worklist */
VEC_free (gimple, heap, worklist);
return true;
}
-/* Given a vector type VECTYPE returns a builtin DECL to be used
- for vector permutation and returns the mask that implements
- reversal of the vector elements. If that is impossible to do,
- returns NULL. */
+/* Given a vector type VECTYPE and permutation SEL returns
+ the VECTOR_CST mask that implements the permutation of the
+ vector elements. If that is impossible to do, returns NULL. */
static tree
-perm_mask_for_reverse (tree vectype)
+gen_perm_mask (tree vectype, unsigned char *sel)
{
tree mask_elt_type, mask_type, mask_vec;
int i, nunits;
- unsigned char *sel;
nunits = TYPE_VECTOR_SUBPARTS (vectype);
- sel = XALLOCAVEC (unsigned char, nunits);
-
- for (i = 0; i < nunits; ++i)
- sel[i] = nunits - 1 - i;
if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
return NULL;
mask_type = get_vectype_for_scalar_type (mask_elt_type);
mask_vec = NULL;
- for (i = 0; i < nunits; i++)
- mask_vec = tree_cons (NULL, build_int_cst (mask_elt_type, i), mask_vec);
+ for (i = nunits - 1; i >= 0; i--)
+ mask_vec = tree_cons (NULL, build_int_cst (mask_elt_type, sel[i]),
+ mask_vec);
mask_vec = build_vector (mask_type, mask_vec);
return mask_vec;
}
-/* Given a vector variable X, that was generated for the scalar LHS of
- STMT, generate instructions to reverse the vector elements of X,
- insert them a *GSI and return the permuted vector variable. */
+/* Given a vector type VECTYPE returns the VECTOR_CST mask that implements
+ reversal of the vector elements. If that is impossible to do,
+ returns NULL. */
static tree
-reverse_vec_elements (tree x, gimple stmt, gimple_stmt_iterator *gsi)
+perm_mask_for_reverse (tree vectype)
+{
+ int i, nunits;
+ unsigned char *sel;
+
+ nunits = TYPE_VECTOR_SUBPARTS (vectype);
+ sel = XALLOCAVEC (unsigned char, nunits);
+
+ for (i = 0; i < nunits; ++i)
+ sel[i] = nunits - 1 - i;
+
+ return gen_perm_mask (vectype, sel);
+}
+
+/* Given a vector variable X and Y, that was generated for the scalar
+ STMT, generate instructions to permute the vector elements of X and Y
+ using permutation mask MASK_VEC, insert them at *GSI and return the
+ permuted vector variable. */
+
+static tree
+permute_vec_elements (tree x, tree y, tree mask_vec, gimple stmt,
+ gimple_stmt_iterator *gsi)
{
tree vectype = TREE_TYPE (x);
- tree mask_vec, perm_dest, data_ref;
+ tree perm_dest, data_ref;
gimple perm_stmt;
- mask_vec = perm_mask_for_reverse (vectype);
-
perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
+ data_ref = make_ssa_name (perm_dest, NULL);
/* Generate the permute statement. */
- perm_stmt = gimple_build_assign_with_ops3 (VEC_PERM_EXPR, perm_dest,
- x, x, mask_vec);
- data_ref = make_ssa_name (perm_dest, perm_stmt);
- gimple_set_lhs (perm_stmt, data_ref);
+ perm_stmt = gimple_build_assign_with_ops3 (VEC_PERM_EXPR, data_ref,
+ x, y, mask_vec);
vect_finish_stmt_generation (stmt, perm_stmt, gsi);
return data_ref;
bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
int vf;
tree aggr_type;
+ tree gather_base = NULL_TREE, gather_off = NULL_TREE;
+ tree gather_off_vectype = NULL_TREE, gather_decl = NULL_TREE;
+ int gather_scale = 1;
+ enum vect_def_type gather_dt = vect_unknown_def_type;
if (loop_vinfo)
{
{
strided_load = true;
/* FORNOW */
- gcc_assert (! nested_in_vect_loop);
+ gcc_assert (! nested_in_vect_loop && !STMT_VINFO_GATHER_P (stmt_info));
first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
if (!slp && !PURE_SLP_STMT (stmt_info))
if (negative)
{
- gcc_assert (!strided_load);
+ gcc_assert (!strided_load && !STMT_VINFO_GATHER_P (stmt_info));
alignment_support_scheme = vect_supportable_dr_alignment (dr, false);
if (alignment_support_scheme != dr_aligned
&& alignment_support_scheme != dr_unaligned_supported)
}
}
+ if (STMT_VINFO_GATHER_P (stmt_info))
+ {
+ gimple def_stmt;
+ tree def;
+ gather_decl = vect_check_gather (stmt, loop_vinfo, &gather_base,
+ &gather_off, &gather_scale);
+ gcc_assert (gather_decl);
+ if (!vect_is_simple_use_1 (gather_off, loop_vinfo, bb_vinfo,
+ &def_stmt, &def, &gather_dt,
+ &gather_off_vectype))
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "gather index use not simple.");
+ return false;
+ }
+ }
+
if (!vec_stmt) /* transformation not required. */
{
STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
/** Transform. **/
+ if (STMT_VINFO_GATHER_P (stmt_info))
+ {
+ tree vec_oprnd0 = NULL_TREE, op;
+ tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gather_decl));
+ tree rettype, srctype, ptrtype, idxtype, masktype, scaletype;
+ tree ptr, mask, var, scale, perm_mask = NULL_TREE, prev_res = NULL_TREE;
+ edge pe = loop_preheader_edge (loop);
+ gimple_seq seq;
+ basic_block new_bb;
+ enum { NARROW, NONE, WIDEN } modifier;
+ int gather_off_nunits = TYPE_VECTOR_SUBPARTS (gather_off_vectype);
+
+ if (nunits == gather_off_nunits)
+ modifier = NONE;
+ else if (nunits == gather_off_nunits / 2)
+ {
+ unsigned char *sel = XALLOCAVEC (unsigned char, gather_off_nunits);
+ modifier = WIDEN;
+
+ for (i = 0; i < gather_off_nunits; ++i)
+ sel[i] = i | nunits;
+
+ perm_mask = gen_perm_mask (gather_off_vectype, sel);
+ gcc_assert (perm_mask != NULL_TREE);
+ }
+ else if (nunits == gather_off_nunits * 2)
+ {
+ unsigned char *sel = XALLOCAVEC (unsigned char, nunits);
+ modifier = NARROW;
+
+ for (i = 0; i < nunits; ++i)
+ sel[i] = i < gather_off_nunits
+ ? i : i + nunits - gather_off_nunits;
+
+ perm_mask = gen_perm_mask (vectype, sel);
+ gcc_assert (perm_mask != NULL_TREE);
+ ncopies *= 2;
+ }
+ else
+ gcc_unreachable ();
+
+ rettype = TREE_TYPE (TREE_TYPE (gather_decl));
+ srctype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+ ptrtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+ idxtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+ masktype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+ scaletype = TREE_VALUE (arglist);
+ gcc_checking_assert (types_compatible_p (srctype, rettype)
+ && types_compatible_p (srctype, masktype));
+
+ vec_dest = vect_create_destination_var (scalar_dest, vectype);
+
+ ptr = fold_convert (ptrtype, gather_base);
+ if (!is_gimple_min_invariant (ptr))
+ {
+ ptr = force_gimple_operand (ptr, &seq, true, NULL_TREE);
+ new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
+ gcc_assert (!new_bb);
+ }
+
+ /* Currently we support only unconditional gather loads,
+ so mask should be all ones. */
+ if (TREE_CODE (TREE_TYPE (masktype)) == INTEGER_TYPE)
+ mask = build_int_cst (TREE_TYPE (masktype), -1);
+ else if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (masktype)))
+ {
+ REAL_VALUE_TYPE r;
+ long tmp[6];
+ for (j = 0; j < 6; ++j)
+ tmp[j] = -1;
+ real_from_target (&r, tmp, TYPE_MODE (TREE_TYPE (masktype)));
+ mask = build_real (TREE_TYPE (masktype), r);
+ }
+ else
+ gcc_unreachable ();
+ mask = build_vector_from_val (masktype, mask);
+ mask = vect_init_vector (stmt, mask, masktype, NULL);
+
+ scale = build_int_cst (scaletype, gather_scale);
+
+ prev_stmt_info = NULL;
+ for (j = 0; j < ncopies; ++j)
+ {
+ if (modifier == WIDEN && (j & 1))
+ op = permute_vec_elements (vec_oprnd0, vec_oprnd0,
+ perm_mask, stmt, gsi);
+ else if (j == 0)
+ op = vec_oprnd0
+ = vect_get_vec_def_for_operand (gather_off, stmt, NULL);
+ else
+ op = vec_oprnd0
+ = vect_get_vec_def_for_stmt_copy (gather_dt, vec_oprnd0);
+
+ if (!useless_type_conversion_p (idxtype, TREE_TYPE (op)))
+ {
+ gcc_assert (TYPE_VECTOR_SUBPARTS (TREE_TYPE (op))
+ == TYPE_VECTOR_SUBPARTS (idxtype));
+ var = vect_get_new_vect_var (idxtype, vect_simple_var, NULL);
+ add_referenced_var (var);
+ var = make_ssa_name (var, NULL);
+ op = build1 (VIEW_CONVERT_EXPR, idxtype, op);
+ new_stmt
+ = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR, var,
+ op, NULL_TREE);
+ vect_finish_stmt_generation (stmt, new_stmt, gsi);
+ op = var;
+ }
+
+ new_stmt
+ = gimple_build_call (gather_decl, 5, mask, ptr, op, mask, scale);
+
+ if (!useless_type_conversion_p (vectype, rettype))
+ {
+ gcc_assert (TYPE_VECTOR_SUBPARTS (vectype)
+ == TYPE_VECTOR_SUBPARTS (rettype));
+ var = vect_get_new_vect_var (rettype, vect_simple_var, NULL);
+ add_referenced_var (var);
+ op = make_ssa_name (var, new_stmt);
+ gimple_call_set_lhs (new_stmt, op);
+ vect_finish_stmt_generation (stmt, new_stmt, gsi);
+ var = make_ssa_name (vec_dest, NULL);
+ op = build1 (VIEW_CONVERT_EXPR, vectype, op);
+ new_stmt
+ = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR, var, op,
+ NULL_TREE);
+ }
+ else
+ {
+ var = make_ssa_name (vec_dest, new_stmt);
+ gimple_call_set_lhs (new_stmt, var);
+ }
+
+ vect_finish_stmt_generation (stmt, new_stmt, gsi);
+
+ if (modifier == NARROW)
+ {
+ if ((j & 1) == 0)
+ {
+ prev_res = var;
+ continue;
+ }
+ var = permute_vec_elements (prev_res, var,
+ perm_mask, stmt, gsi);
+ new_stmt = SSA_NAME_DEF_STMT (var);
+ }
+
+ if (prev_stmt_info == NULL)
+ STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
+ else
+ STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+ prev_stmt_info = vinfo_for_stmt (new_stmt);
+ }
+ return true;
+ }
+
if (strided_load)
{
first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
if (negative)
{
- new_temp = reverse_vec_elements (new_temp, stmt, gsi);
+ tree perm_mask = perm_mask_for_reverse (vectype);
+ new_temp = permute_vec_elements (new_temp, new_temp,
+ perm_mask, stmt, gsi);
new_stmt = SSA_NAME_DEF_STMT (new_temp);
}
/* Is this statement vectorizable or should it be skipped in (partial)
vectorization. */
bool vectorizable;
+
+ /* For loads only, true if this is a gather load. */
+ bool gather_p;
} *stmt_vec_info;
/* Access Functions. */
#define STMT_VINFO_VEC_STMT(S) (S)->vectorized_stmt
#define STMT_VINFO_VECTORIZABLE(S) (S)->vectorizable
#define STMT_VINFO_DATA_REF(S) (S)->data_ref_info
+#define STMT_VINFO_GATHER_P(S) (S)->gather_p
#define STMT_VINFO_DR_BASE_ADDRESS(S) (S)->dr_base_address
#define STMT_VINFO_DR_INIT(S) (S)->dr_init
extern bool vect_verify_datarefs_alignment (loop_vec_info, bb_vec_info);
extern bool vect_analyze_data_ref_accesses (loop_vec_info, bb_vec_info);
extern bool vect_prune_runtime_alias_test_list (loop_vec_info);
+extern tree vect_check_gather (gimple, loop_vec_info, tree *, tree *,
+ int *);
extern bool vect_analyze_data_refs (loop_vec_info, bb_vec_info, int *);
extern tree vect_create_data_ref_ptr (gimple, tree, struct loop *, tree,
tree *, gimple_stmt_iterator *,