From cf7aa6a3b79ac25df266aa4fcfe6c059243602aa Mon Sep 17 00:00:00 2001 From: Alan Lawrence Date: Thu, 13 Nov 2014 16:27:37 +0000 Subject: [PATCH] [Vectorizer] Use a VEC_PERM_EXPR instead of VEC_RSHIFT_EXPR; expand appropriate VEC_PERM_EXPRs using vec_shr_optab * optabs.c (can_vec_perm_p): Update comment, does not consider vec_shr. (shift_amt_for_vec_perm_mask): New. (expand_vec_perm_1): Use vec_shr_optab if second vector is const0_rtx and mask appropriate. * tree-vect-loop.c (calc_vec_perm_mask_for_shift): New. (have_whole_vector_shift): New. (vect_model_reduction_cost): Call have_whole_vector_shift instead of looking for vec_shr_optab. (vect_create_epilog_for_reduction): Likewise; also rename local variable have_whole_vector_shift to reduce_with_shift; output VEC_PERM_EXPRs instead of VEC_RSHIFT_EXPRs. * tree-vect-stmts.c (vect_gen_perm_mask_checked): Extend comment. From-SVN: r217509 --- gcc/ChangeLog | 17 +++++++++++ gcc/optabs.c | 48 ++++++++++++++++++++++++++++-- gcc/tree-vect-loop.c | 81 ++++++++++++++++++++++++++++++++++++--------------- gcc/tree-vect-stmts.c | 3 +- 4 files changed, 122 insertions(+), 27 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 1ff6318..99f18b4 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,5 +1,22 @@ 2014-11-13 Alan Lawrence + * optabs.c (can_vec_perm_p): Update comment, does not consider vec_shr. + (shift_amt_for_vec_perm_mask): New. + (expand_vec_perm_1): Use vec_shr_optab if second vector is const0_rtx + and mask appropriate. + + * tree-vect-loop.c (calc_vec_perm_mask_for_shift): New. + (have_whole_vector_shift): New. + (vect_model_reduction_cost): Call have_whole_vector_shift instead of + looking for vec_shr_optab. + (vect_create_epilog_for_reduction): Likewise; also rename local variable + have_whole_vector_shift to reduce_with_shift; output VEC_PERM_EXPRs + instead of VEC_RSHIFT_EXPRs. + + * tree-vect-stmts.c (vect_gen_perm_mask_checked): Extend comment. + +2014-11-13 Alan Lawrence + * tree-vectorizer.h (vect_gen_perm_mask): Remove. (vect_gen_perm_mask_checked, vect_gen_perm_mask_any): New. diff --git a/gcc/optabs.c b/gcc/optabs.c index 3376f2d..4ddd9cc 100644 --- a/gcc/optabs.c +++ b/gcc/optabs.c @@ -6567,8 +6567,11 @@ vector_compare_rtx (enum tree_code tcode, tree t_op0, tree t_op1, return gen_rtx_fmt_ee (rcode, VOIDmode, ops[0].value, ops[1].value); } -/* Return true if VEC_PERM_EXPR can be expanded using SIMD extensions - of the CPU. SEL may be NULL, which stands for an unknown constant. */ +/* Return true if VEC_PERM_EXPR of arbitrary input vectors can be expanded using + SIMD extensions of the CPU. SEL may be NULL, which stands for an unknown + constant. Note that additional permutations representing whole-vector shifts + may also be handled via the vec_shr optab, but only where the second input + vector is entirely constant zeroes; this case is not dealt with here. */ bool can_vec_perm_p (machine_mode mode, bool variable, @@ -6621,6 +6624,36 @@ can_vec_perm_p (machine_mode mode, bool variable, return true; } +/* Checks if vec_perm mask SEL is a constant equivalent to a shift of the first + vec_perm operand, assuming the second operand is a constant vector of zeroes. + Return the shift distance in bits if so, or NULL_RTX if the vec_perm is not a + shift. */ +static rtx +shift_amt_for_vec_perm_mask (rtx sel) +{ + unsigned int i, first, nelt = GET_MODE_NUNITS (GET_MODE (sel)); + unsigned int bitsize = GET_MODE_BITSIZE (GET_MODE_INNER (GET_MODE (sel))); + + if (GET_CODE (sel) != CONST_VECTOR) + return NULL_RTX; + + first = INTVAL (CONST_VECTOR_ELT (sel, 0)); + if (first >= 2*nelt) + return NULL_RTX; + for (i = 1; i < nelt; i++) + { + int idx = INTVAL (CONST_VECTOR_ELT (sel, i)); + unsigned int expected = (i + first) & (2 * nelt - 1); + /* Indices into the second vector are all equivalent. */ + if (idx < 0 || (MIN (nelt, (unsigned) idx) != MIN (nelt, expected))) + return NULL_RTX; + } + + if (BYTES_BIG_ENDIAN) + first = (2 * nelt) - first; + return GEN_INT (first * bitsize); +} + /* A subroutine of expand_vec_perm for expanding one vec_perm insn. */ static rtx @@ -6649,6 +6682,17 @@ expand_vec_perm_1 (enum insn_code icode, rtx target, else { create_input_operand (&ops[1], v0, tmode); + /* See if this can be handled with a vec_shr. We only do this if the + second vector is all zeroes. */ + enum insn_code shift_code = optab_handler (vec_shr_optab, GET_MODE (v0)); + if (v1 == CONST0_RTX (GET_MODE (v1)) && shift_code) + if (rtx shift_amt = shift_amt_for_vec_perm_mask (sel)) + { + create_convert_operand_from_type (&ops[2], shift_amt, + sizetype_tab[(int) stk_sizetype]); + if (maybe_expand_insn (shift_code, 3, ops)) + return ops[0].value; + } create_input_operand (&ops[2], v1, tmode); } diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index f3f02fa..7aa5862 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -3083,6 +3083,41 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, *ret_min_profitable_estimate = min_profitable_estimate; } +/* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET + vector elements (not bits) for a vector of mode MODE. */ +static void +calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset, + unsigned char *sel) +{ + unsigned int i, nelt = GET_MODE_NUNITS (mode); + + for (i = 0; i < nelt; i++) + sel[i] = (BYTES_BIG_ENDIAN ? i - offset : i + offset) & (2*nelt - 1); +} + +/* Checks whether the target supports whole-vector shifts for vectors of mode + MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_ + it supports vec_perm_const with masks for all necessary shift amounts. */ +static bool +have_whole_vector_shift (enum machine_mode mode) +{ + if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing) + return true; + + if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing) + return false; + + unsigned int i, nelt = GET_MODE_NUNITS (mode); + unsigned char *sel = XALLOCAVEC (unsigned char, nelt); + + for (i = nelt/2; i >= 1; i/=2) + { + calc_vec_perm_mask_for_shift (mode, i, sel); + if (!can_vec_perm_p (mode, false, sel)) + return false; + } + return true; +} /* TODO: Close dependency between vect_model_*_cost and vectorizable_* functions. Design better to avoid maintenance issues. */ @@ -3185,7 +3220,7 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code, /* We have a whole vector shift available. */ if (VECTOR_MODE_P (mode) && optab_handler (optab, mode) != CODE_FOR_nothing - && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing) + && have_whole_vector_shift (mode)) { /* Final reduction via vector shifts and the reduction operator. Also requires scalar extract. */ @@ -3788,7 +3823,6 @@ get_initial_def_for_reduction (gimple stmt, tree init_val, return init_def; } - /* Function vect_create_epilog_for_reduction Create code at the loop-epilog to finalize the result of a reduction @@ -4212,18 +4246,11 @@ vect_create_epilog_for_reduction (vec vect_defs, gimple stmt, } else { - enum tree_code shift_code = ERROR_MARK; - bool have_whole_vector_shift = true; - int bit_offset; + bool reduce_with_shift = have_whole_vector_shift (mode); int element_bitsize = tree_to_uhwi (bitsize); int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype)); tree vec_temp; - if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing) - shift_code = VEC_RSHIFT_EXPR; - else - have_whole_vector_shift = false; - /* Regardless of whether we have a whole vector shift, if we're emulating the operation via tree-vect-generic, we don't want to use it. Only the first round of the reduction is likely @@ -4231,18 +4258,24 @@ vect_create_epilog_for_reduction (vec vect_defs, gimple stmt, /* ??? It might be better to emit a reduction tree code here, so that tree-vect-generic can expand the first round via bit tricks. */ if (!VECTOR_MODE_P (mode)) - have_whole_vector_shift = false; + reduce_with_shift = false; else { optab optab = optab_for_tree_code (code, vectype, optab_default); if (optab_handler (optab, mode) == CODE_FOR_nothing) - have_whole_vector_shift = false; + reduce_with_shift = false; } - if (have_whole_vector_shift && !slp_reduc) + if (reduce_with_shift && !slp_reduc) { + int nelements = vec_size_in_bits / element_bitsize; + unsigned char *sel = XALLOCAVEC (unsigned char, nelements); + + int elt_offset; + + tree zero_vec = build_zero_cst (vectype); /*** Case 2: Create: - for (offset = VS/2; offset >= element_size; offset/=2) + for (offset = nelements/2; offset >= 1; offset/=2) { Create: va' = vec_shift Create: va = vop @@ -4254,14 +4287,15 @@ vect_create_epilog_for_reduction (vec vect_defs, gimple stmt, vec_dest = vect_create_destination_var (scalar_dest, vectype); new_temp = new_phi_result; - for (bit_offset = vec_size_in_bits/2; - bit_offset >= element_bitsize; - bit_offset /= 2) + for (elt_offset = nelements / 2; + elt_offset >= 1; + elt_offset /= 2) { - tree bitpos = size_int (bit_offset); - - epilog_stmt = gimple_build_assign_with_ops (shift_code, - vec_dest, new_temp, bitpos); + calc_vec_perm_mask_for_shift (mode, elt_offset, sel); + tree mask = vect_gen_perm_mask_any (vectype, sel); + epilog_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, + vec_dest, new_temp, + zero_vec, mask); new_name = make_ssa_name (vec_dest, epilog_stmt); gimple_assign_set_lhs (epilog_stmt, new_name); gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); @@ -4277,8 +4311,6 @@ vect_create_epilog_for_reduction (vec vect_defs, gimple stmt, } else { - tree rhs; - /*** Case 3: Create: s = extract_field for (offset = element_size; @@ -4296,11 +4328,12 @@ vect_create_epilog_for_reduction (vec vect_defs, gimple stmt, vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype)); FOR_EACH_VEC_ELT (new_phis, i, new_phi) { + int bit_offset; if (gimple_code (new_phi) == GIMPLE_PHI) vec_temp = PHI_RESULT (new_phi); else vec_temp = gimple_assign_lhs (new_phi); - rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, + tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, bitsize_zero_node); epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 3d40f0f..f877fea 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -5495,7 +5495,8 @@ vect_gen_perm_mask_any (tree vectype, const unsigned char *sel) return mask_vec; } -/* Checked version of vect_gen_perm_mask_any. Asserts can_vec_perm_p. */ +/* Checked version of vect_gen_perm_mask_any. Asserts can_vec_perm_p, + i.e. that the target supports the pattern _for arbitrary input vectors_. */ tree vect_gen_perm_mask_checked (tree vectype, const unsigned char *sel) -- 2.7.4