From dae00b1695df005699a0e1d3b39405c3674c2904 Mon Sep 17 00:00:00 2001
From: Richard Henderson
Date: Wed, 5 Oct 2011 10:40:58 -0700
Subject: [PATCH] i386: Rewrite ix86_expand_vshuffle.

  1: Handle TARGET_XOP.
  2: Reduce code duplication.
  3: Use ASHIFT instead of MULT for scaling.
  4: Fix errors in building convert-to-v16qi indices.
  5: Handle v2di without sse4.1.

From-SVN: r179564
---
 gcc/ChangeLog                 |   6 ++
 gcc/config/i386/i386-protos.h |   2 +-
 gcc/config/i386/i386.c        | 208 ++++++++++++++++++++----------------------
 gcc/config/i386/sse.md        |   4 +-
 4 files changed, 109 insertions(+), 111 deletions(-)

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index b4f9ac2..800d69a 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -6,6 +6,12 @@
 	code duplication.  Do update_stmt here ...
 	(expand_vector_operations_1): ... not here.
 
+	* config/i386/i386.c (ix86_expand_vshuffle): Never fail.  Handle
+	TARGET_XOP.  Fix pshufb constant vector creation.  Reduce code
+	duplication.  Handle V2DI without SSE4.1.
+	* config/i386/i386-protos.h (ix86_expand_vshuffle): Update decl.
+	* config/i386/i386.md (vshuffle): Remove assert for ok.
+
 2011-10-05  Uros Bizjak
 
 	* config/i386/i386.c (distance_non_agu_define): Simplify calculation
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 99327ed..0bbfa9b 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -123,7 +123,7 @@ extern bool ix86_expand_int_movcc (rtx[]);
 extern bool ix86_expand_fp_movcc (rtx[]);
 extern bool ix86_expand_fp_vcond (rtx[]);
 extern bool ix86_expand_int_vcond (rtx[]);
-extern bool ix86_expand_vshuffle (rtx[]);
+extern void ix86_expand_vshuffle (rtx[]);
 extern void ix86_expand_sse_unpack (rtx[], bool, bool);
 extern bool ix86_expand_int_addcc (rtx[]);
 extern rtx ix86_expand_call (rtx, rtx, rtx, rtx, rtx, bool);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index ba24400..faad3a5 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -19237,145 +19237,139 @@ ix86_expand_int_vcond (rtx operands[])
   return true;
 }
 
-bool
+void
 ix86_expand_vshuffle (rtx operands[])
 {
   rtx target = operands[0];
   rtx op0 = operands[1];
   rtx op1 = operands[2];
   rtx mask = operands[3];
-  rtx new_mask, vt, t1, t2, w_vector;
+  rtx vt, vec[16];
   enum machine_mode mode = GET_MODE (op0);
   enum machine_mode maskmode = GET_MODE (mask);
-  enum machine_mode maskinner = GET_MODE_INNER (mode);
-  rtx vec[16];
-  int w, i, j;
-  bool one_operand_shuffle = op0 == op1;
+  int w, e, i;
+  bool one_operand_shuffle = rtx_equal_p (op0, op1);
 
-  gcc_assert ((TARGET_SSSE3 || TARGET_AVX) && GET_MODE_BITSIZE (mode) == 128);
+  gcc_checking_assert (GET_MODE_BITSIZE (mode) == 128);
 
   /* Number of elements in the vector.  */
-  w = GET_MODE_BITSIZE (maskmode) / GET_MODE_BITSIZE (maskinner);
-
-  /* generate w_vector = {w, w, ...}  */
-  for (i = 0; i < w; i++)
-    vec[i] = GEN_INT (w);
-  w_vector = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
-
-  /* mask = mask & {w-1, w-1, w-1,...}  */
-  for (i = 0; i < w; i++)
-    vec[i] = GEN_INT (w - 1);
+  w = GET_MODE_NUNITS (mode);
+  e = GET_MODE_UNIT_SIZE (mode);
 
-  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
-  new_mask = expand_simple_binop (maskmode, AND, mask, vt,
-                                  NULL_RTX, 0, OPTAB_DIRECT);
-
-  /* If the original vector mode is V16QImode, we can just
-     use pshufb directly.  */
-  if (mode == V16QImode && one_operand_shuffle)
+  if (TARGET_XOP)
     {
-      t1 = gen_reg_rtx (V16QImode);
-      emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, new_mask));
-      emit_insn (gen_rtx_SET (VOIDmode, target, t1));
-      return true;
+      /* The XOP VPPERM insn supports three inputs.  By ignoring the
+         one_operand_shuffle special case, we avoid creating another
+         set of constant vectors in memory.  */
+      one_operand_shuffle = false;
+
+      /* mask = mask & {2*w-1, ...} */
+      vt = GEN_INT (2*w - 1);
     }
-  else if (mode == V16QImode)
+  else
    {
-      rtx xops[6];
-
-      t1 = gen_reg_rtx (V16QImode);
-      t2 = gen_reg_rtx (V16QImode);
-      emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, new_mask));
-      emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, new_mask));
-
-      /* mask = mask & {w, w, ...}  */
-      mask = expand_simple_binop (V16QImode, AND, mask, w_vector,
-                                  NULL_RTX, 0, OPTAB_DIRECT);
-      xops[0] = target;
-      xops[1] = operands[1];
-      xops[2] = operands[2];
-      xops[3] = gen_rtx_EQ (mode, mask, w_vector);
-      xops[4] = t1;
-      xops[5] = t2;
-
-      return ix86_expand_int_vcond (xops);
+      /* mask = mask & {w-1, ...} */
+      vt = GEN_INT (w - 1);
     }
 
-  /* mask = mask * {w, w, ...}  */
-  new_mask = expand_simple_binop (maskmode, MULT, new_mask, w_vector,
-                                  NULL_RTX, 0, OPTAB_DIRECT);
-
-  /* Convert mask to vector of chars.  */
-  new_mask = simplify_gen_subreg (V16QImode, new_mask, maskmode, 0);
-  new_mask = force_reg (V16QImode, new_mask);
-
-  /* Build a helper mask wich we will use in pshufb
-     (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
-     (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}
-     ...  */
-  for (i = 0; i < w; i++)
-    for (j = 0; j < 16/w; j++)
-      vec[i*w+j] = GEN_INT (i*16/w);
-  vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
-  vt = force_reg (V16QImode, vt);
-
-  t1 = gen_reg_rtx (V16QImode);
-  emit_insn (gen_ssse3_pshufbv16qi3 (t1, new_mask, vt));
-  new_mask = t1;
-
-  /* Convert it into the byte positions by doing
-     new_mask = new_mask + {0,1,..,16/w, 0,1,..,16/w, ...}  */
   for (i = 0; i < w; i++)
-    for (j = 0; j < 16/w; j++)
-      vec[i*w+j] = GEN_INT (j);
+    vec[i] = vt;
+  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
+  mask = expand_simple_binop (maskmode, AND, mask, vt,
+                              NULL_RTX, 0, OPTAB_DIRECT);
 
-  vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
-  new_mask = expand_simple_binop (V16QImode, PLUS, new_mask, vt,
+  /* For non-QImode operations, convert the word permutation control
+     into a byte permutation control.  */
+  if (mode != V16QImode)
+    {
+      mask = expand_simple_binop (maskmode, ASHIFT, mask,
+                                  GEN_INT (exact_log2 (e)),
                                   NULL_RTX, 0, OPTAB_DIRECT);
-
-  t1 = gen_reg_rtx (V16QImode);
+
+      /* Convert mask to vector of chars.  */
+      mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
+
+      /* Replicate each of the input bytes into byte positions:
+         (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
+         (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
+         (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}.  */
+      for (i = 0; i < 16; ++i)
+        vec[i] = GEN_INT (i/e * e);
+      vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
+      vt = force_const_mem (V16QImode, vt);
+      if (TARGET_XOP)
+        emit_insn (gen_xop_pperm (mask, mask, mask, vt));
+      else
+        emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
 
-  /* Convert OP0 to vector of chars.  */
-  op0 = simplify_gen_subreg (V16QImode, op0, mode, 0);
-  op0 = force_reg (V16QImode, op0);
-  emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, new_mask));
+      /* Convert it into the byte positions by doing
+         mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...}  */
+      for (i = 0; i < 16; ++i)
+        vec[i] = GEN_INT (i % e);
+      vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
+      vt = force_const_mem (V16QImode, vt);
+      emit_insn (gen_addv16qi3 (mask, mask, vt));
+    }
 
-  if (one_operand_shuffle)
+  /* The actual shuffle operations all operate on V16QImode.  */
+  op0 = gen_lowpart (V16QImode, op0);
+  op1 = gen_lowpart (V16QImode, op1);
+  target = gen_lowpart (V16QImode, target);
+
+  if (TARGET_XOP)
     {
-      /* Convert it back from vector of chars to the original mode.  */
-      t1 = simplify_gen_subreg (mode, t1, V16QImode, 0);
-      emit_insn (gen_rtx_SET (VOIDmode, target, t1));
-      return true;
+      emit_insn (gen_xop_pperm (target, op0, op1, mask));
+    }
+  else if (one_operand_shuffle)
+    {
+      emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
     }
   else
     {
-      rtx xops[6];
+      rtx xops[6], t1, t2;
+      bool ok;
 
+      /* Shuffle the two input vectors independently.  */
+      t1 = gen_reg_rtx (V16QImode);
       t2 = gen_reg_rtx (V16QImode);
-
-      /* Convert OP1 to vector of chars.  */
-      op1 = simplify_gen_subreg (V16QImode, op1, mode, 0);
-      op1 = force_reg (V16QImode, op1);
-      emit_insn (gen_ssse3_pshufbv16qi3 (t1, op1, new_mask));
-
-      /* mask = mask & {w, w, ...}  */
-      mask = expand_simple_binop (V16QImode, AND, mask, w_vector,
+      emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
+      emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
+
+      /* Then merge them together.  The key is whether any given control
+         element contained a bit set that indicates the second word.  */
+      mask = operands[3];
+      vt = GEN_INT (w);
+      if (maskmode == V2DImode && !TARGET_SSE4_1)
+        {
+          /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
+             more shuffle to convert the V2DI input mask into a V4SI
+             input mask.  At which point the masking that expand_int_vcond
+             will work as desired.  */
+          rtx t3 = gen_reg_rtx (V4SImode);
+          emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
+                                        const0_rtx, const0_rtx,
+                                        const2_rtx, const2_rtx));
+          mask = t3;
+          maskmode = V4SImode;
+          e = w = 4;
+        }
+
+      for (i = 0; i < w; i++)
+        vec[i] = vt;
+      vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
+      vt = force_reg (maskmode, vt);
+      mask = expand_simple_binop (maskmode, AND, mask, vt,
                                   NULL_RTX, 0, OPTAB_DIRECT);
 
-      t1 = simplify_gen_subreg (mode, t1, V16QImode, 0);
-      t2 = simplify_gen_subreg (mode, t2, V16QImode, 0);
-
-      xops[0] = target;
-      xops[1] = operands[1];
-      xops[2] = operands[2];
-      xops[3] = gen_rtx_EQ (mode, mask, w_vector);
-      xops[4] = t1;
-      xops[5] = t2;
-
-      return ix86_expand_int_vcond (xops);
+      xops[0] = gen_lowpart (maskmode, operands[0]);
+      xops[1] = gen_lowpart (maskmode, t2);
+      xops[2] = gen_lowpart (maskmode, t1);
+      xops[3] = gen_rtx_EQ (maskmode, mask, vt);
+      xops[4] = mask;
+      xops[5] = vt;
+      ok = ix86_expand_int_vcond (xops);
+      gcc_assert (ok);
     }
-
-  return false;
 }
 
 /* Unpack OP[1] into the next wider integer vector type.  UNSIGNED_P is
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 251cdde..ee9cf0b 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -6229,12 +6229,10 @@
    (match_operand: 3 "register_operand" "")]
   "TARGET_SSSE3 || TARGET_AVX"
 {
-  bool ok = ix86_expand_vshuffle (operands);
-  gcc_assert (ok);
+  ix86_expand_vshuffle (operands);
   DONE;
 })
 
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; Parallel bitwise logical operations
-- 
2.7.4