From: Richard Henderson <rth@redhat.com>
Date: Mon, 30 Nov 2009 18:26:55 +0000 (-0800)
Subject: Implement vec_perm broadcast, and tidy lots of patterns to help.
X-Git-Tag: upstream/12.2.0~95757
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=5e04b3b69417b07f66ed919b64e47055d3a2f9db;p=platform%2Fupstream%2Fgcc.git

Implement vec_perm broadcast, and tidy lots of patterns to help.

From-SVN: r154836
---

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index d5fb075..9c8294c 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,50 @@
+2009-11-30  Richard Henderson  <rth@redhat.com>
+
+	* config/i386/i386.c (ix86_vec_interleave_v2df_operator_ok): New.
+	(bdesc_special_args): Update insn codes.
+	(avx_vpermilp_parallel): Correct range check.
+	(ix86_rtx_costs): Handle vector permutation rtx codes.
+	(struct expand_vec_perm_d): Move earlier.
+	(get_mode_wider_vector): New.
+	(expand_vec_perm_broadcast_1): New.
+	(ix86_expand_vector_init_duplicate): Use it.  Tidy AVX modes.
+	(expand_vec_perm_broadcast): New.
+	(ix86_expand_vec_perm_builtin_1): Use it.
+	* config/i386/i386-protos.h: Update.
+	* config/i386/predicates.md (avx_vbroadcast_operand): New.
+	* config/i386/sse.md (AVX256MODE24P): New.
+	(ssescalarmodesuffix2s): New.
+	(avxhalfvecmode, avxscalarmode): Fill out to all modes.
+	(avxmodesuffixf2c): Add V8SI, V4DI.
+	(vec_dupv4sf): New expander.
+	(*vec_dupv4sf_avx): Add vbroadcastss alternative.
+	(*vec_set<mode>_0_avx, *vec_set<mode>_0_sse4_1): Macro-ize for
+	V4SF and V4SI.  Move C alternatives to front.  Add insertps and
+	pinsrd alternatives.
+	(*vec_set<mode>_0_sse2): Split out from ...
+	(vec_set<mode>_0): Macro-ize for V4SF and V4SI.
+	(vec_interleave_highv2df, vec_interleave_lowv2df): Require register
+	destination; use ix86_vec_interleave_v2df_operator_ok instead of
+	ix86_fixup_binary_operands.
+	(*avx_interleave_highv2df, *avx_interleave_lowv2df): Add movddup.
+	(*sse3_interleave_highv2df, *sse3_interleave_lowv2df): New.
+	(*avx_movddup, *sse3_movddup): Remove.  New splitter from
+	vec_select form to vec_duplicate form.
+	(*sse2_interleave_highv2df, *sse2_interleave_lowv2df): Use
+	ix86_vec_interleave_v2df_operator_ok.
+	(avx_movddup256, avx_unpcklpd256): Change to expanders, merge into ...
+	(*avx_unpcklpd256): ... here.
+	(*vec_dupv4si_avx): New.
+	(*vec_dupv2di_avx): Add movddup alternative.
+	(*vec_dupv2di_sse3): New.
+	(vec_dup<mode>): Replace avx_vbroadcasts<avxmodesuffixf2c><avxmodesuffix>
+	and avx_vbroadcastss256; represent with vec_duplicate instead of
+	nested vec_concat operations.
+	(avx_vbroadcastf128_<mode>): Rename from
+	avx_vbroadcastf128_p<avxmodesuffixf2c>256.
+	(*avx_vperm_broadcast_v4sf): New.
+	(*avx_vperm_broadcast_<mode>): New.
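
[Illustrative note, plain C, not part of the patch: a "broadcast"
permutation selects one source element into every result lane.  This is
the shape expand_vec_perm_broadcast below matches (d->op0 == d->op1 and
all d->perm[i] equal), after which expand_vec_perm_broadcast_1 picks a
cheap x86 lowering such as vbroadcastss, movddup, pshufd, or an
interleave sequence.

    /* Broadcast element ELT of SRC into every lane of DST.  */
    void
    broadcast4f (float dst[4], const float src[4], unsigned elt)
    {
      unsigned i;
      for (i = 0; i < 4; ++i)
        dst[i] = src[elt];    /* the same ELT for every output lane */
    }
]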
+
 2009-11-30  Martin Jambor  <mjambor@suse.cz>
 
 	PR middle-end/42196
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 1e94cde..1451e79 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -86,6 +86,7 @@ extern void ix86_expand_binary_operator (enum rtx_code,
 					 enum machine_mode, rtx[]);
 extern int ix86_binary_operator_ok (enum rtx_code, enum machine_mode, rtx[]);
 extern bool ix86_lea_for_add_ok (enum rtx_code, rtx, rtx[]);
+extern bool ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high);
 extern bool ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn);
 extern bool ix86_agi_dependent (rtx set_insn, rtx use_insn);
 extern void ix86_expand_unary_operator (enum rtx_code, enum machine_mode,
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index fa84e32..21be89f 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -13849,6 +13849,19 @@ ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
   return TRUE;
 }
 
+/* Return TRUE if the operands to a vec_interleave_{high,low}v2df
+   are ok, keeping in mind the possible movddup alternative.  */
+
+bool
+ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
+{
+  if (MEM_P (operands[0]))
+    return rtx_equal_p (operands[0], operands[1 + high]);
+  if (MEM_P (operands[1]) && MEM_P (operands[2]))
+    return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
+  return true;
+}
+
 /* Post-reload splitter for converting an SF or DFmode value
    in an SSE register into an unsigned SImode.  */
 
@@ -21480,11 +21493,11 @@ static const struct builtin_description bdesc_special_args[] =
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastss, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastsd256, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastss256, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_pd256, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_ps256, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
@@ -24597,7 +24610,7 @@ avx_vpermilp_parallel (rtx par, enum machine_mode mode)
       if (!CONST_INT_P (er))
 	return 0;
       ei = INTVAL (er);
-      if (ei >= nelt)
+      if (ei >= 2 * nelt)
 	return 0;
       ipar[i] = ei;
     }
@@ -25713,6 +25726,16 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int *total, bool speed)
       *total = 0;
       return false;
 
+    case VEC_SELECT:
+    case VEC_CONCAT:
+    case VEC_MERGE:
+    case VEC_DUPLICATE:
+      /* ??? Assume all of these vector manipulation patterns are
+	 recognizable.  In which case they all pretty much have the
+	 same cost.  */
+      *total = COSTS_N_INSNS (1);
+      return true;
+
     default:
       return false;
     }
@@ -26547,16 +26570,43 @@ x86_emit_floatuns (rtx operands[2])
   emit_label (donelab);
 }
 
+/* AVX does not support 32-byte integer vector operations,
+   thus the longest vector we are faced with is V16QImode.  */
+#define MAX_VECT_LEN 16
+
+struct expand_vec_perm_d
+{
+  rtx target, op0, op1;
+  unsigned char perm[MAX_VECT_LEN];
+  enum machine_mode vmode;
+  unsigned char nelt;
+  bool testing_p;
+};
+
+static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
+static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
+
+/* Get a vector mode of the same size as the original but with elements
+   twice as wide.  This is only guaranteed to apply to integral vectors.  */
+
+static inline enum machine_mode
+get_mode_wider_vector (enum machine_mode o)
+{
+  /* ??? Rely on the ordering that genmodes.c gives to vectors.  */
+  enum machine_mode n = GET_MODE_WIDER_MODE (o);
+  gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
+  gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
+  return n;
+}
+
 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
    with all elements equal to VAR.  Return true if successful.  */
-/* ??? Call into the vec_perm support to implement the broadcast.  */
 
 static bool
 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
 				   rtx target, rtx val)
 {
-  enum machine_mode hmode, smode, wsmode, wvmode;
-  rtx x;
+  bool ok;
 
   switch (mode)
     {
@@ -26566,13 +26616,28 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
 	return false;
       /* FALLTHRU */
 
+    case V4DFmode:
+    case V4DImode:
+    case V8SFmode:
+    case V8SImode:
     case V2DFmode:
     case V2DImode:
     case V4SFmode:
     case V4SImode:
-      val = force_reg (GET_MODE_INNER (mode), val);
-      x = gen_rtx_VEC_DUPLICATE (mode, val);
-      emit_insn (gen_rtx_SET (VOIDmode, target, x));
+      {
+	rtx insn, dup;
+
+	/* First attempt to recognize VAL as-is.  */
+	dup = gen_rtx_VEC_DUPLICATE (mode, val);
+	insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
+	if (recog_memoized (insn) < 0)
+	  {
+	    /* If that fails, force VAL into a register.  */
+	    XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
+	    ok = recog_memoized (insn) >= 0;
+	    gcc_assert (ok);
+	  }
+      }
       return true;
 
     case V4HImode:
@@ -26580,130 +26645,87 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
 	return false;
       if (TARGET_SSE || TARGET_3DNOW_A)
 	{
+	  rtx x;
+
 	  val = gen_lowpart (SImode, val);
 	  x = gen_rtx_TRUNCATE (HImode, val);
 	  x = gen_rtx_VEC_DUPLICATE (mode, x);
 	  emit_insn (gen_rtx_SET (VOIDmode, target, x));
 	  return true;
 	}
-      else
-	{
-	  smode = HImode;
-	  wsmode = SImode;
-	  wvmode = V2SImode;
-	  goto widen;
-	}
+      goto widen;
 
     case V8QImode:
       if (!mmx_ok)
 	return false;
-      smode = QImode;
-      wsmode = HImode;
-      wvmode = V4HImode;
       goto widen;
+
     case V8HImode:
       if (TARGET_SSE2)
 	{
+	  struct expand_vec_perm_d dperm;
 	  rtx tmp1, tmp2;
-	  /* Extend HImode to SImode using a paradoxical SUBREG.  */
+
+	permute:
+	  memset (&dperm, 0, sizeof (dperm));
+	  dperm.target = target;
+	  dperm.vmode = mode;
+	  dperm.nelt = GET_MODE_NUNITS (mode);
+	  dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
+
+	  /* Extend to SImode using a paradoxical SUBREG.  */
 	  tmp1 = gen_reg_rtx (SImode);
 	  emit_move_insn (tmp1, gen_lowpart (SImode, val));
-	  /* Insert the SImode value as low element of V4SImode vector.  */
-	  tmp2 = gen_reg_rtx (V4SImode);
-	  tmp1 = gen_rtx_VEC_MERGE (V4SImode,
-				    gen_rtx_VEC_DUPLICATE (V4SImode, tmp1),
-				    CONST0_RTX (V4SImode),
-				    const1_rtx);
-	  emit_insn (gen_rtx_SET (VOIDmode, tmp2, tmp1));
-	  /* Cast the V4SImode vector back to a V8HImode vector.  */
-	  tmp1 = gen_reg_rtx (V8HImode);
-	  emit_move_insn (tmp1, gen_lowpart (V8HImode, tmp2));
-	  /* Duplicate the low short through the whole low SImode word.  */
-	  emit_insn (gen_vec_interleave_lowv8hi (tmp1, tmp1, tmp1));
-	  /* Cast the V8HImode vector back to a V4SImode vector.  */
-	  tmp2 = gen_reg_rtx (V4SImode);
-	  emit_move_insn (tmp2, gen_lowpart (V4SImode, tmp1));
-	  /* Replicate the low element of the V4SImode vector.  */
-	  emit_insn (gen_sse2_pshufd (tmp2, tmp2, const0_rtx));
-	  /* Cast the V2SImode back to V8HImode, and store in target.  */
-	  emit_move_insn (target, gen_lowpart (V8HImode, tmp2));
-	  return true;
+
+	  /* Insert the SImode value as low element of a V4SImode vector.  */
+	  tmp2 = gen_lowpart (V4SImode, dperm.op0);
+	  emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
+
+	  ok = (expand_vec_perm_1 (&dperm)
+		|| expand_vec_perm_broadcast_1 (&dperm));
+	  gcc_assert (ok);
+	  return ok;
 	}
-      smode = HImode;
-      wsmode = SImode;
-      wvmode = V4SImode;
       goto widen;
+
    case V16QImode:
      if (TARGET_SSE2)
-	{
-	  rtx tmp1, tmp2;
-	  /* Extend QImode to SImode using a paradoxical SUBREG.  */
-	  tmp1 = gen_reg_rtx (SImode);
-	  emit_move_insn (tmp1, gen_lowpart (SImode, val));
-	  /* Insert the SImode value as low element of V4SImode vector.  */
-	  tmp2 = gen_reg_rtx (V4SImode);
-	  tmp1 = gen_rtx_VEC_MERGE (V4SImode,
-				    gen_rtx_VEC_DUPLICATE (V4SImode, tmp1),
-				    CONST0_RTX (V4SImode),
-				    const1_rtx);
-	  emit_insn (gen_rtx_SET (VOIDmode, tmp2, tmp1));
-	  /* Cast the V4SImode vector back to a V16QImode vector.  */
-	  tmp1 = gen_reg_rtx (V16QImode);
-	  emit_move_insn (tmp1, gen_lowpart (V16QImode, tmp2));
-	  /* Duplicate the low byte through the whole low SImode word.  */
-	  emit_insn (gen_vec_interleave_lowv16qi (tmp1, tmp1, tmp1));
-	  emit_insn (gen_vec_interleave_lowv16qi (tmp1, tmp1, tmp1));
-	  /* Cast the V16QImode vector back to a V4SImode vector.  */
-	  tmp2 = gen_reg_rtx (V4SImode);
-	  emit_move_insn (tmp2, gen_lowpart (V4SImode, tmp1));
-	  /* Replicate the low element of the V4SImode vector.  */
-	  emit_insn (gen_sse2_pshufd (tmp2, tmp2, const0_rtx));
-	  /* Cast the V2SImode back to V16QImode, and store in target.  */
-	  emit_move_insn (target, gen_lowpart (V16QImode, tmp2));
-	  return true;
-	}
-      smode = QImode;
-      wsmode = HImode;
-      wvmode = V8HImode;
+	goto permute;
       goto widen;
+
    widen:
      /* Replicate the value once into the next wider mode and recurse.  */
-      val = convert_modes (wsmode, smode, val, true);
-      x = expand_simple_binop (wsmode, ASHIFT, val,
-			       GEN_INT (GET_MODE_BITSIZE (smode)),
-			       NULL_RTX, 1, OPTAB_LIB_WIDEN);
-      val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
-
-      x = gen_reg_rtx (wvmode);
-      if (!ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val))
-	gcc_unreachable ();
-      emit_move_insn (target, gen_lowpart (mode, x));
-      return true;
+      {
+	enum machine_mode smode, wsmode, wvmode;
+	rtx x;
+
+	smode = GET_MODE_INNER (mode);
+	wvmode = get_mode_wider_vector (mode);
+	wsmode = GET_MODE_INNER (wvmode);
+
+	val = convert_modes (wsmode, smode, val, true);
+	x = expand_simple_binop (wsmode, ASHIFT, val,
+				 GEN_INT (GET_MODE_BITSIZE (smode)),
+				 NULL_RTX, 1, OPTAB_LIB_WIDEN);
+	val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
+
+	x = gen_lowpart (wvmode, target);
+	ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
+	gcc_assert (ok);
+	return ok;
+      }
 
-    case V4DFmode:
-      hmode = V2DFmode;
-      goto half;
-    case V4DImode:
-      hmode = V2DImode;
-      goto half;
-    case V8SFmode:
-      hmode = V4SFmode;
-      goto half;
-    case V8SImode:
-      hmode = V4SImode;
-      goto half;
    case V16HImode:
-      hmode = V8HImode;
-      goto half;
    case V32QImode:
-      hmode = V16QImode;
-      goto half;
-half:
      {
-	rtx tmp = gen_reg_rtx (hmode);
-	ix86_expand_vector_init_duplicate (mmx_ok, hmode, tmp, val);
-	emit_insn (gen_rtx_SET (VOIDmode, target,
-				gen_rtx_VEC_CONCAT (mode, tmp, tmp)));
+	enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
+	rtx x = gen_reg_rtx (hvmode);
+
+	ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
+	gcc_assert (ok);
+
+	x = gen_rtx_VEC_CONCAT (mode, x, x);
+	emit_insn (gen_rtx_SET (VOIDmode, target, x));
      }
      return true;
 
@@ -29085,19 +29107,6 @@ ix86_vectorize_builtin_vec_perm (tree vec_type, tree *mask_type)
   return ix86_builtins[(int) fcode];
 }
 
-/* AVX does not support 32-byte integer vector operations,
-   thus the longest vector we are faced with is V16QImode.  */
-#define MAX_VECT_LEN 16
-
-struct expand_vec_perm_d
-{
-  rtx target, op0, op1;
-  unsigned char perm[MAX_VECT_LEN];
-  enum machine_mode vmode;
-  unsigned char nelt;
-  bool testing_p;
-};
-
 /* Return a vector mode with twice as many elements as VMODE.  */
 /* ??? Consider moving this to a table generated by genmodes.c.  */
 
@@ -29739,8 +29748,8 @@ expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
   return true;
 }
 
-/* A subroutine of ix86_expand_vec_perm_builtin_1.  Pattern match
-   extract-even and extract-odd permutations.  */
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement extract-even
+   and extract-odd permutations.  */
 
 static bool
 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
@@ -29855,6 +29864,9 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
   return true;
 }
 
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Pattern match
+   extract-even and extract-odd permutations.  */
+
 static bool
 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
 {
@@ -29871,6 +29883,84 @@ expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
   return expand_vec_perm_even_odd_1 (d, odd);
 }
 
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement broadcast
+   permutations.  We assume that expand_vec_perm_1 has already failed.  */
+
+static bool
+expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
+{
+  unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
+  enum machine_mode vmode = d->vmode;
+  unsigned char perm2[4];
+  rtx op0 = d->op0;
+  bool ok;
+
+  switch (vmode)
+    {
+    case V4DFmode:
+    case V8SFmode:
+      /* These are special-cased in sse.md so that we can optionally
+	 use the vbroadcast instruction.  They expand to two insns
+	 if the input happens to be in a register.  */
+      gcc_unreachable ();
+
+    case V2DFmode:
+    case V2DImode:
+    case V4SFmode:
+    case V4SImode:
+      /* These are always implementable using standard shuffle patterns.  */
+      gcc_unreachable ();
+
+    case V8HImode:
+    case V16QImode:
+      /* These can be implemented via interleave.  We save one insn by
+	 stopping once we have promoted to V4SImode and then use pshufd.  */
+      do
+	{
+	  optab otab = vec_interleave_low_optab;
+
+	  if (elt >= nelt2)
+	    {
+	      otab = vec_interleave_high_optab;
+	      elt -= nelt2;
+	    }
+	  nelt2 /= 2;
+
+	  op0 = expand_binop (vmode, otab, op0, op0, NULL, 0, OPTAB_DIRECT);
+	  vmode = get_mode_wider_vector (vmode);
+	  op0 = gen_lowpart (vmode, op0);
+	}
+      while (vmode != V4SImode);
+
+      memset (perm2, elt, 4);
+      ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4);
+      gcc_assert (ok);
+      return true;
+
+    default:
+      gcc_unreachable ();
+    }
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Pattern match
+   broadcast permutations.  */
+
+static bool
+expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
+{
+  unsigned i, elt, nelt = d->nelt;
+
+  if (d->op0 != d->op1)
+    return false;
+
+  elt = d->perm[0];
+  for (i = 1; i < nelt; ++i)
+    if (d->perm[i] != elt)
+      return false;
+
+  return expand_vec_perm_broadcast_1 (d);
+}
+
 /* The guts of ix86_expand_vec_perm_builtin, also used by the ok hook.
    With all of the interface bits taken care of, perform the expansion
    in D and return true on success.  */
@@ -29878,8 +29968,7 @@ expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
 static bool
 ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d)
 {
-  /* First things first -- check if the instruction is implementable
-     with a single instruction.  */
+  /* Try a single instruction expansion.  */
   if (expand_vec_perm_1 (d))
     return true;
 
@@ -29894,13 +29983,16 @@ ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d)
   if (expand_vec_perm_interleave2 (d))
     return true;
 
+  if (expand_vec_perm_broadcast (d))
+    return true;
+
   /* Try sequences of three instructions.  */
 
   if (expand_vec_perm_pshufb2 (d))
     return true;
 
   /* ??? Look for narrow permutations whose element orderings would
-     allow the promition to a wider mode.  */
+     allow the promotion to a wider mode.  */
 
   /* ??? Look for sequences of interleave or a wider permute that place
      the data into the correct lanes for a half-vector shuffle like
@@ -29912,8 +30004,6 @@ ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d)
   if (expand_vec_perm_even_odd (d))
     return true;
 
-  /* ??? Pattern match broadcast.  */
-
   return false;
 }
 
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 50a68d9..8f901cd 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1241,3 +1241,20 @@
 (define_predicate "avx_vperm2f128_v4df_operand"
   (and (match_code "parallel")
        (match_test "avx_vperm2f128_parallel (op, V4DFmode)")))
+
+;; Return 1 if OP is a parallel for a vbroadcast permute.
+
+(define_predicate "avx_vbroadcast_operand"
+  (and (match_code "parallel")
+       (match_code "const_int" "a"))
+{
+  rtx elt = XVECEXP (op, 0, 0);
+  int i, nelt = XVECLEN (op, 0);
+
+  /* Don't bother checking there are the right number of operands,
+     merely that they're all identical.  */
+  for (i = 1; i < nelt; ++i)
+    if (XVECEXP (op, 0, i) != elt)
+      return false;
+  return true;
+})
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index b73820b..08a3b5b 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -54,6 +54,7 @@
 (define_mode_iterator AVX256MODEF2P [V8SF V4DF])
 (define_mode_iterator AVX256MODE2P [V8SI V8SF V4DF])
+(define_mode_iterator AVX256MODE24P [V8SI V8SF V4DI V4DF])
 (define_mode_iterator AVX256MODE4P [V4DI V4DF])
 (define_mode_iterator AVX256MODE8P [V8SI V8SF])
 (define_mode_iterator AVXMODEF2P [V4SF V2DF V8SF V4DF])
@@ -96,6 +97,8 @@
 (define_mode_attr ssemodesuffixf2c [(V4SF "s") (V2DF "d")])
 
+(define_mode_attr ssescalarmodesuffix2s [(V4SF "ss") (V4SI "d")])
+
 ;; Mapping of the max integer size for xop rotate immediate constraint
 (define_mode_attr sserotatemax [(V16QI "7") (V8HI "15") (V4SI "31") (V2DI "63")])
 
@@ -125,17 +128,18 @@
   [(V16QI "V4SF") (V8HI "V4SF") (V4SI "V4SF") (V2DI "V4SF")
    (V32QI "V8SF") (V16HI "V8SF") (V8SI "V8SF") (V4DI "V8SF")])
 (define_mode_attr avxhalfvecmode
-  [(V4SF "V2SF") (V32QI "V16QI") (V16HI "V8HI") (V8SI "V4SI")
-   (V4DI "V2DI") (V8SF "V4SF") (V4DF "V2DF")])
+  [(V32QI "V16QI") (V16HI "V8HI") (V8SI "V4SI") (V4DI "V2DI")
+   (V8SF "V4SF") (V4DF "V2DF")
+   (V16QI "V8QI") (V8HI "V4HI") (V4SI "V2SI") (V4SF "V2SF")])
 (define_mode_attr avxscalarmode
-  [(V16QI "QI") (V8HI "HI") (V4SI "SI") (V4SF "SF") (V2DF "DF")
-   (V8SF "SF") (V4DF "DF")])
+  [(V16QI "QI") (V8HI "HI") (V4SI "SI") (V2DI "DI") (V4SF "SF") (V2DF "DF")
+   (V32QI "QI") (V16HI "HI") (V8SI "SI") (V4DI "DI") (V8SF "SF") (V4DF "DF")])
 (define_mode_attr avxcvtvecmode
   [(V4SF "V4SI") (V8SF "V8SI") (V4SI "V4SF") (V8SI "V8SF")])
 (define_mode_attr avxpermvecmode
   [(V2DF "V2DI") (V4SF "V4SI") (V4DF "V4DI") (V8SF "V8SI")])
 (define_mode_attr avxmodesuffixf2c
-  [(V4SF "s") (V2DF "d") (V8SF "s") (V4DF "d")])
+  [(V4SF "s") (V2DF "d") (V8SI "s") (V8SF "s") (V4DI "d") (V4DF "d")])
 (define_mode_attr avxmodesuffixp
  [(V2DF "pd") (V4SI "si") (V4SF "ps") (V8SF "ps") (V8SI "si") (V4DF "pd")])
 
@@ -4012,14 +4016,27 @@
   [(set_attr "type" "ssemov")
    (set_attr "mode" "SF")])
 
+(define_expand "vec_dupv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "")
+	(vec_duplicate:V4SF
+	  (match_operand:SF 1 "nonimmediate_operand" "")))]
+  "TARGET_SSE"
+{
+  if (!TARGET_AVX)
+    operands[1] = force_reg (V4SFmode, operands[1]);
+})
+
 (define_insn "*vec_dupv4sf_avx"
-  [(set (match_operand:V4SF 0 "register_operand" "=x")
+  [(set (match_operand:V4SF 0 "register_operand" "=x,x")
 	(vec_duplicate:V4SF
-	  (match_operand:SF 1 "register_operand" "x")))]
+	  (match_operand:SF 1 "nonimmediate_operand" "x,m")))]
   "TARGET_AVX"
-  "vshufps\t{$0, %1, %1, %0|%0, %1, %1, 0}"
-  [(set_attr "type" "sselog1")
-   (set_attr "length_immediate" "1")
+  "@
+   vshufps\t{$0, %1, %1, %0|%0, %1, %1, 0}
+   vbroadcastss\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sselog1,ssemov")
"type" "sselog1,ssemov") + (set_attr "length_immediate" "1,0") + (set_attr "prefix_extra" "0,1") (set_attr "prefix" "vex") (set_attr "mode" "V4SF")]) @@ -4125,35 +4142,78 @@ DONE; }) -(define_insn "*vec_setv4sf_0_avx" - [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,x,m") - (vec_merge:V4SF - (vec_duplicate:V4SF - (match_operand:SF 2 "general_operand" " x,m,*r,x*rfF")) - (match_operand:V4SF 1 "vector_move_operand" " x,C,C ,0") +(define_insn "*vec_set_0_avx" + [(set (match_operand:SSEMODE4S 0 "nonimmediate_operand" "=x,x, x,x, x,m") + (vec_merge:SSEMODE4S + (vec_duplicate:SSEMODE4S + (match_operand: 2 + "general_operand" " x,m,*r,x,*rm,x*rfF")) + (match_operand:SSEMODE4S 1 "vector_move_operand" " C,C, C,x, x,0") (const_int 1)))] "TARGET_AVX" "@ - vmovss\t{%2, %1, %0|%0, %1, %2} - vmovss\t{%2, %0|%0, %2} + vinsertps\t{$0xe, %2, %2, %0|%0, %2, %2, 0xe} + vmov\t{%2, %0|%0, %2} vmovd\t{%2, %0|%0, %2} + vmovss\t{%2, %1, %0|%0, %1, %2} + vpinsrd\t{$0, %2, %1, %0|%0, %1, %2, 0} + #" + [(set_attr "type" "sselog,ssemov,ssemov,ssemov,sselog,*") + (set_attr "prefix_extra" "*,*,*,*,1,*") + (set_attr "length_immediate" "*,*,*,*,1,*") + (set_attr "prefix" "vex") + (set_attr "mode" "SF,,SI,SF,TI,*")]) + +(define_insn "*vec_set_0_sse4_1" + [(set (match_operand:SSEMODE4S 0 "nonimmediate_operand" "=x,x, x,x, x,m") + (vec_merge:SSEMODE4S + (vec_duplicate:SSEMODE4S + (match_operand: 2 + "general_operand" " x,m,*r,x,*rm,*rfF")) + (match_operand:SSEMODE4S 1 "vector_move_operand" " C,C, C,0, 0,0") + (const_int 1)))] + "TARGET_SSE4_1" + "@ + insertps\t{$0xe, %2, %0|%0, %2, 0xe} + mov\t{%2, %0|%0, %2} + movd\t{%2, %0|%0, %2} + movss\t{%2, %0|%0, %2} + pinsrd\t{$0, %2, %0|%0, %2, 0} + #" + [(set_attr "type" "sselog,ssemov,ssemov,ssemov,sselog,*") + (set_attr "prefix_extra" "*,*,*,*,1,*") + (set_attr "length_immediate" "*,*,*,*,1,*") + (set_attr "mode" "SF,,SI,SF,TI,*")]) + +(define_insn "*vec_set_0_sse2" + [(set (match_operand:SSEMODE4S 0 "nonimmediate_operand" "=x, x,x,m") + (vec_merge:SSEMODE4S + (vec_duplicate:SSEMODE4S + (match_operand: 2 + "general_operand" " m,*r,x,x*rfF")) + (match_operand:SSEMODE4S 1 "vector_move_operand" " C, C,0,0") + (const_int 1)))] + "TARGET_SSE2" + "@ + mov\t{%2, %0|%0, %2} + movd\t{%2, %0|%0, %2} + movss\t{%2, %0|%0, %2} #" [(set_attr "type" "ssemov") - (set_attr "prefix" "vex") - (set_attr "mode" "SF")]) - -(define_insn "vec_setv4sf_0" - [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,Y2,m") - (vec_merge:V4SF - (vec_duplicate:V4SF - (match_operand:SF 2 "general_operand" " x,m,*r,x*rfF")) - (match_operand:V4SF 1 "vector_move_operand" " 0,C,C ,0") + (set_attr "mode" ",SI,SF,*")]) + +(define_insn "vec_set_0" + [(set (match_operand:SSEMODE4S 0 "nonimmediate_operand" "=x,x,m") + (vec_merge:SSEMODE4S + (vec_duplicate:SSEMODE4S + (match_operand: 2 + "general_operand" " m,x,x*rfF")) + (match_operand:SSEMODE4S 1 "vector_move_operand" " C,0,0") (const_int 1)))] "TARGET_SSE" "@ movss\t{%2, %0|%0, %2} movss\t{%2, %0|%0, %2} - movd\t{%2, %0|%0, %2} #" [(set_attr "type" "ssemov") (set_attr "mode" "SF")]) @@ -4484,7 +4544,7 @@ (set_attr "mode" "V4DF")]) (define_expand "vec_interleave_highv2df" - [(set (match_operand:V2DF 0 "nonimmediate_operand" "") + [(set (match_operand:V2DF 0 "register_operand" "") (vec_select:V2DF (vec_concat:V4DF (match_operand:V2DF 1 "nonimmediate_operand" "") @@ -4492,24 +4552,46 @@ (parallel [(const_int 1) (const_int 3)])))] "TARGET_SSE2" - "ix86_fixup_binary_operands (UNKNOWN, V2DFmode, operands);") +{ + if (!ix86_vec_interleave_v2df_operator_ok 
+    operands[2] = force_reg (V2DFmode, operands[2]);
+})
 
 (define_insn "*avx_interleave_highv2df"
-  [(set (match_operand:V2DF 0 "nonimmediate_operand"     "=x,x,m")
+  [(set (match_operand:V2DF 0 "nonimmediate_operand"     "=x,x,x,m")
 	(vec_select:V2DF
 	  (vec_concat:V4DF
-	    (match_operand:V2DF 1 "nonimmediate_operand" " x,o,x")
-	    (match_operand:V2DF 2 "nonimmediate_operand" " x,x,0"))
+	    (match_operand:V2DF 1 "nonimmediate_operand" " x,o,o,x")
+	    (match_operand:V2DF 2 "nonimmediate_operand" " x,1,x,0"))
 	  (parallel [(const_int 1)
 		     (const_int 3)])))]
-  "TARGET_AVX && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "TARGET_AVX && ix86_vec_interleave_v2df_operator_ok (operands, 1)"
   "@
    vunpckhpd\t{%2, %1, %0|%0, %1, %2}
+   vmovddup\t{%H1, %0|%0, %H1}
    vmovlpd\t{%H1, %2, %0|%0, %2, %H1}
    vmovhpd\t{%1, %0|%0, %1}"
-  [(set_attr "type" "sselog,ssemov,ssemov")
+  [(set_attr "type" "sselog,sselog,ssemov,ssemov")
   (set_attr "prefix" "vex")
-   (set_attr "mode" "V2DF,V1DF,V1DF")])
+   (set_attr "mode" "V2DF,V2DF,V1DF,V1DF")])
+
+(define_insn "*sse3_interleave_highv2df"
+  [(set (match_operand:V2DF 0 "nonimmediate_operand"     "=x,x,x,m")
+	(vec_select:V2DF
+	  (vec_concat:V4DF
+	    (match_operand:V2DF 1 "nonimmediate_operand" " 0,o,o,x")
+	    (match_operand:V2DF 2 "nonimmediate_operand" " x,1,0,0"))
+	  (parallel [(const_int 1)
+		     (const_int 3)])))]
+  "TARGET_SSE3 && ix86_vec_interleave_v2df_operator_ok (operands, 1)"
+  "@
+   unpckhpd\t{%2, %0|%0, %2}
+   movddup\t{%H1, %0|%0, %H1}
+   movlpd\t{%H1, %0|%0, %H1}
+   movhpd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sselog,sselog,ssemov,ssemov")
+   (set_attr "prefix_data16" "*,*,1,1")
+   (set_attr "mode" "V2DF,V2DF,V1DF,V1DF")])
 
 (define_insn "*sse2_interleave_highv2df"
   [(set (match_operand:V2DF 0 "nonimmediate_operand"     "=x,x,m")
 	(vec_select:V2DF
 	  (vec_concat:V4DF
 	    (match_operand:V2DF 2 "nonimmediate_operand" " x,0,0"))
 	  (parallel [(const_int 1)
 		     (const_int 3)])))]
-  "TARGET_SSE2 && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "TARGET_SSE2 && ix86_vec_interleave_v2df_operator_ok (operands, 1)"
   "@
    unpckhpd\t{%2, %0|%0, %2}
    movlpd\t{%H1, %0|%0, %H1}
@@ -4528,85 +4610,48 @@
    (set_attr "prefix_data16" "*,1,1")
    (set_attr "mode" "V2DF,V1DF,V1DF")])
 
-(define_insn "avx_movddup256"
-  [(set (match_operand:V4DF 0 "register_operand" "=x")
+;; Recall that the 256-bit unpck insns only shuffle within their lanes.
+(define_expand "avx_movddup256" + [(set (match_operand:V4DF 0 "register_operand" "") (vec_select:V4DF (vec_concat:V8DF - (match_operand:V4DF 1 "nonimmediate_operand" "xm") + (match_operand:V4DF 1 "nonimmediate_operand" "") (match_dup 1)) - (parallel [(const_int 0) (const_int 2) - (const_int 4) (const_int 6)])))] + (parallel [(const_int 0) (const_int 4) + (const_int 2) (const_int 6)])))] "TARGET_AVX" - "vmovddup\t{%1, %0|%0, %1}" - [(set_attr "type" "sselog1") - (set_attr "prefix" "vex") - (set_attr "mode" "V4DF")]) - -(define_insn "*avx_movddup" - [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,o") - (vec_select:V2DF - (vec_concat:V4DF - (match_operand:V2DF 1 "nonimmediate_operand" "xm,x") - (match_dup 1)) - (parallel [(const_int 0) - (const_int 2)])))] - "TARGET_AVX && !(MEM_P (operands[0]) && MEM_P (operands[1]))" - "@ - vmovddup\t{%1, %0|%0, %1} - #" - [(set_attr "type" "sselog1,ssemov") - (set_attr "prefix" "vex") - (set_attr "mode" "V2DF")]) - -(define_insn "*sse3_movddup" - [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,o") - (vec_select:V2DF - (vec_concat:V4DF - (match_operand:V2DF 1 "nonimmediate_operand" "xm,x") - (match_dup 1)) - (parallel [(const_int 0) - (const_int 2)])))] - "TARGET_SSE3 && !(MEM_P (operands[0]) && MEM_P (operands[1]))" - "@ - movddup\t{%1, %0|%0, %1} - #" - [(set_attr "type" "sselog1,ssemov") - (set_attr "mode" "V2DF")]) - -(define_split - [(set (match_operand:V2DF 0 "memory_operand" "") - (vec_select:V2DF - (vec_concat:V4DF - (match_operand:V2DF 1 "register_operand" "") - (match_dup 1)) - (parallel [(const_int 0) - (const_int 2)])))] - "TARGET_SSE3 && reload_completed" - [(const_int 0)] -{ - rtx low = gen_rtx_REG (DFmode, REGNO (operands[1])); - emit_move_insn (adjust_address (operands[0], DFmode, 0), low); - emit_move_insn (adjust_address (operands[0], DFmode, 8), low); - DONE; -}) + "") -;; Recall that the 256-bit unpck insns only shuffle within their lanes. 
-(define_insn "avx_unpcklpd256" - [(set (match_operand:V4DF 0 "register_operand" "=x") +(define_expand "avx_unpcklpd256" + [(set (match_operand:V4DF 0 "register_operand" "") (vec_select:V4DF (vec_concat:V8DF - (match_operand:V4DF 1 "register_operand" "x") - (match_operand:V4DF 2 "nonimmediate_operand" "xm")) + (match_operand:V4DF 1 "register_operand" "") + (match_operand:V4DF 2 "nonimmediate_operand" "")) (parallel [(const_int 0) (const_int 4) (const_int 2) (const_int 6)])))] "TARGET_AVX" - "vunpcklpd\t{%2, %1, %0|%0, %1, %2}" + "") + +(define_insn "*avx_unpcklpd256" + [(set (match_operand:V4DF 0 "register_operand" "=x,x") + (vec_select:V4DF + (vec_concat:V8DF + (match_operand:V4DF 1 "nonimmediate_operand" "xm,x") + (match_operand:V4DF 2 "nonimmediate_operand" " 1,xm")) + (parallel [(const_int 0) (const_int 4) + (const_int 2) (const_int 6)])))] + "TARGET_AVX + && (!MEM_P (operands[1]) || rtx_equal_p (operands[1], operands[2]))" + "@ + vmovddup\t{%1, %0|%0, %1} + vunpcklpd\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sselog") (set_attr "prefix" "vex") (set_attr "mode" "V4DF")]) (define_expand "vec_interleave_lowv2df" - [(set (match_operand:V2DF 0 "nonimmediate_operand" "") + [(set (match_operand:V2DF 0 "register_operand" "") (vec_select:V2DF (vec_concat:V4DF (match_operand:V2DF 1 "nonimmediate_operand" "") @@ -4614,24 +4659,46 @@ (parallel [(const_int 0) (const_int 2)])))] "TARGET_SSE2" - "ix86_fixup_binary_operands (UNKNOWN, V2DFmode, operands);") +{ + if (!ix86_vec_interleave_v2df_operator_ok (operands, 0)) + operands[1] = force_reg (V2DFmode, operands[1]); +}) (define_insn "*avx_interleave_lowv2df" - [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,o") + [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,x,o") (vec_select:V2DF (vec_concat:V4DF - (match_operand:V2DF 1 "nonimmediate_operand" " x,x,0") - (match_operand:V2DF 2 "nonimmediate_operand" " x,m,x")) + (match_operand:V2DF 1 "nonimmediate_operand" " x,m,x,0") + (match_operand:V2DF 2 "nonimmediate_operand" " x,1,m,x")) (parallel [(const_int 0) (const_int 2)])))] - "TARGET_AVX && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "TARGET_AVX && ix86_vec_interleave_v2df_operator_ok (operands, 0)" "@ vunpcklpd\t{%2, %1, %0|%0, %1, %2} + vmovddup\t{%1, %0|%0, %1} vmovhpd\t{%2, %1, %0|%0, %1, %2} vmovlpd\t{%2, %H0|%H0, %2}" - [(set_attr "type" "sselog,ssemov,ssemov") + [(set_attr "type" "sselog,sselog,ssemov,ssemov") (set_attr "prefix" "vex") - (set_attr "mode" "V2DF,V1DF,V1DF")]) + (set_attr "mode" "V2DF,V2DF,V1DF,V1DF")]) + +(define_insn "*sse3_interleave_lowv2df" + [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,x,o") + (vec_select:V2DF + (vec_concat:V4DF + (match_operand:V2DF 1 "nonimmediate_operand" " 0,m,0,0") + (match_operand:V2DF 2 "nonimmediate_operand" " x,1,m,x")) + (parallel [(const_int 0) + (const_int 2)])))] + "TARGET_SSE3 && ix86_vec_interleave_v2df_operator_ok (operands, 0)" + "@ + unpcklpd\t{%2, %0|%0, %2} + movddup\t{%1, %0|%0, %1} + movhpd\t{%2, %0|%0, %2} + movlpd\t{%2, %H0|%H0, %2}" + [(set_attr "type" "sselog,sselog,ssemov,ssemov") + (set_attr "prefix_data16" "*,*,1,1") + (set_attr "mode" "V2DF,V2DF,V1DF,V1DF")]) (define_insn "*sse2_interleave_lowv2df" [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,o") @@ -4641,7 +4708,7 @@ (match_operand:V2DF 2 "nonimmediate_operand" " x,m,x")) (parallel [(const_int 0) (const_int 2)])))] - "TARGET_SSE2 && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "TARGET_SSE2 && ix86_vec_interleave_v2df_operator_ok (operands, 0)" "@ unpcklpd\t{%2, %0|%0, %2} 
   movhpd\t{%2, %0|%0, %2}
@@ -4650,6 +4717,37 @@
    (set_attr "prefix_data16" "*,1,1")
    (set_attr "mode" "V2DF,V1DF,V1DF")])
 
+(define_split
+  [(set (match_operand:V2DF 0 "memory_operand" "")
+	(vec_select:V2DF
+	  (vec_concat:V4DF
+	    (match_operand:V2DF 1 "register_operand" "")
+	    (match_dup 1))
+	  (parallel [(const_int 0)
+		     (const_int 2)])))]
+  "TARGET_SSE3 && reload_completed"
+  [(const_int 0)]
+{
+  rtx low = gen_rtx_REG (DFmode, REGNO (operands[1]));
+  emit_move_insn (adjust_address (operands[0], DFmode, 0), low);
+  emit_move_insn (adjust_address (operands[0], DFmode, 8), low);
+  DONE;
+})
+
+(define_split
+  [(set (match_operand:V2DF 0 "register_operand" "")
+	(vec_select:V2DF
+	  (vec_concat:V4DF
+	    (match_operand:V2DF 1 "memory_operand" "")
+	    (match_dup 1))
+	  (parallel [(match_operand:SI 2 "const_0_to_1_operand" "")
+		     (match_operand:SI 3 "const_int_operand" "")])))]
+  "TARGET_SSE3 && INTVAL (operands[2]) + 2 == INTVAL (operands[3])"
+  [(set (match_dup 0) (vec_duplicate:V2DF (match_dup 1)))]
+{
+  operands[1] = adjust_address (operands[1], DFmode, INTVAL (operands[2]) * 8);
+})
+
 (define_expand "avx_shufpd256"
   [(match_operand:V4DF 0 "register_operand" "")
    (match_operand:V4DF 1 "register_operand" "")
@@ -7408,6 +7506,20 @@
   [(set_attr "type" "ssemov")
    (set_attr "mode" "V2SF,V4SF,V2SF")])
 
+(define_insn "*vec_dupv4si_avx"
+  [(set (match_operand:V4SI 0 "register_operand" "=x,x")
+	(vec_duplicate:V4SI
+	  (match_operand:SI 1 "register_operand" "x,m")))]
+  "TARGET_AVX"
+  "@
+   vpshufd\t{$0, %1, %0|%0, %1, 0}
+   vbroadcastss\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sselog1,ssemov")
+   (set_attr "length_immediate" "1,0")
+   (set_attr "prefix_extra" "0,1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "TI,V4SF")])
+
 (define_insn "*vec_dupv4si"
   [(set (match_operand:V4SI 0 "register_operand" "=Y2,x")
 	(vec_duplicate:V4SI
@@ -7417,19 +7529,31 @@
    %vpshufd\t{$0, %1, %0|%0, %1, 0}
    shufps\t{$0, %0, %0|%0, %0, 0}"
   [(set_attr "type" "sselog1")
-   (set_attr "prefix" "maybe_vex,orig")
    (set_attr "length_immediate" "1")
    (set_attr "mode" "TI,V4SF")])
 
 (define_insn "*vec_dupv2di_avx"
-  [(set (match_operand:V2DI 0 "register_operand" "=x")
+  [(set (match_operand:V2DI 0 "register_operand"     "=x,x")
 	(vec_duplicate:V2DI
-	  (match_operand:DI 1 "register_operand" "x")))]
+	  (match_operand:DI 1 "nonimmediate_operand" " x,m")))]
   "TARGET_AVX"
-  "vpunpcklqdq\t{%1, %1, %0|%0, %1, %1}"
+  "@
+   vpunpcklqdq\t{%1, %1, %0|%0, %1, %1}
+   vmovddup\t{%1, %0|%0, %1}"
   [(set_attr "type" "sselog1")
    (set_attr "prefix" "vex")
-   (set_attr "mode" "TI")])
+   (set_attr "mode" "TI,DF")])
+
+(define_insn "*vec_dupv2di_sse3"
+  [(set (match_operand:V2DI 0 "register_operand"     "=x,x")
+	(vec_duplicate:V2DI
+	  (match_operand:DI 1 "nonimmediate_operand" " 0,m")))]
+  "TARGET_SSE3"
+  "@
+   punpcklqdq\t%0, %0
+   movddup\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sselog1")
+   (set_attr "mode" "TI,DF")])
 
 (define_insn "*vec_dupv2di"
   [(set (match_operand:V2DI 0 "register_operand" "=Y2,x")
 	(vec_duplicate:V2DI
@@ -11838,6 +11962,108 @@
    (set_attr "prefix" "vex")
    (set_attr "mode" "OI")])
 
+(define_insn_and_split "vec_dup<mode>"
+  [(set (match_operand:AVX256MODE24P 0 "register_operand" "=x,x")
+	(vec_duplicate:AVX256MODE24P
+	  (match_operand:<avxscalarmode> 1 "nonimmediate_operand" "m,?x")))]
+  "TARGET_AVX"
+  "@
+   vbroadcasts<avxmodesuffixf2c>\t{%1, %0|%0, %1}
+   #"
+  "&& reload_completed && REG_P (operands[1])"
+  [(set (match_dup 2) (vec_duplicate:<avxhalfvecmode> (match_dup 1)))
+   (set (match_dup 0) (vec_concat:AVX256MODE24P (match_dup 2) (match_dup 2)))]
+{
+  operands[2] = gen_rtx_REG (<avxhalfvecmode>mode, REGNO (operands[0]));
+}
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
"1") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "avx_vbroadcastf128_" + [(set (match_operand:AVX256MODE 0 "register_operand" "=x,x,x") + (vec_concat:AVX256MODE + (match_operand: 1 "nonimmediate_operand" "m,0,?x") + (match_dup 1)))] + "TARGET_AVX" + "@ + vbroadcastf128\t{%1, %0|%0, %1} + vinsertf128\t{$1, %1, %0, %0|%0, %0, %1, 1} + vperm2f128\t{$0, %t1, %t1, %0|%0, %t1, %t1, 0}" + [(set_attr "type" "ssemov,sselog1,sselog1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "0,1,1") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF,V8SF,V8SF")]) + +;; Recognize broadcast as a vec_select as produced by builtin_vec_perm. +;; If it so happens that the input is in memory, use vbroadcast. +;; Otherwise use vpermilp (and in the case of 256-bit modes, vperm2f128). +(define_insn "*avx_vperm_broadcast_v4sf" + [(set (match_operand:V4SF 0 "register_operand" "=x,x,x") + (vec_select:V4SF + (match_operand:V4SF 1 "nonimmediate_operand" "m,o,x") + (match_parallel 2 "avx_vbroadcast_operand" + [(match_operand 3 "const_int_operand" "C,n,n")])))] + "TARGET_AVX" +{ + int elt = INTVAL (operands[3]); + switch (which_alternative) + { + case 0: + case 1: + operands[1] = adjust_address_nv (operands[1], SFmode, elt * 4); + return "vbroadcastss\t{%1, %0|%0, %1}"; + case 2: + operands[2] = GEN_INT (elt * 0x55); + return "vpermilps\t{%2, %1, %0|%0, %1, %2}"; + default: + gcc_unreachable (); + } +} + [(set_attr "type" "ssemov,ssemov,sselog1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "0,0,1") + (set_attr "prefix" "vex") + (set_attr "mode" "SF,SF,V4SF")]) + +(define_insn_and_split "*avx_vperm_broadcast_" + [(set (match_operand:AVX256MODEF2P 0 "register_operand" "=x,x,x") + (vec_select:AVX256MODEF2P + (match_operand:AVX256MODEF2P 1 "nonimmediate_operand" "m,o,?x") + (match_parallel 2 "avx_vbroadcast_operand" + [(match_operand 3 "const_int_operand" "C,n,n")])))] + "TARGET_AVX" + "#" + "&& reload_completed" + [(set (match_dup 0) (vec_duplicate:AVX256MODEF2P (match_dup 1)))] +{ + rtx op0 = operands[0], op1 = operands[1]; + int elt = INTVAL (operands[3]); + + if (REG_P (op1)) + { + int mask; + + /* Shuffle element we care about into all elements of the 128-bit lane. + The other lane gets shuffled too, but we don't care. */ + if (mode == V4DFmode) + mask = (elt & 1 ? 15 : 0); + else + mask = (elt & 3) * 0x55; + emit_insn (gen_avx_vpermil (op0, op1, GEN_INT (mask))); + + /* Shuffle the lane we care about into both lanes of the dest. 
+      mask = (elt / (<ssescalarnum> / 2)) * 0x11;
+      emit_insn (gen_avx_vperm2f128<mode>3 (op0, op0, op0, GEN_INT (mask)));
+      DONE;
+    }
+
+  operands[1] = adjust_address_nv (op1, <avxscalarmode>mode,
+				   elt * GET_MODE_SIZE (<avxscalarmode>mode));
+})
+
 (define_expand "avx_vpermil<mode>"
   [(set (match_operand:AVXMODEFDP 0 "register_operand" "")
 	(vec_select:AVXMODEFDP
@@ -11989,58 +12215,6 @@
    (set_attr "prefix" "vex")
    (set_attr "mode" "V8SF")])
 
-(define_insn "avx_vbroadcasts<avxmodesuffixf2c><avxmodesuffix>"
-  [(set (match_operand:AVXMODEF4P 0 "register_operand" "=x")
-	(vec_concat:AVXMODEF4P
-	  (vec_concat:<avxhalfvecmode>
-	    (match_operand:<avxscalarmode> 1 "memory_operand" "m")
-	    (match_dup 1))
-	  (vec_concat:<avxhalfvecmode>
-	    (match_dup 1)
-	    (match_dup 1))))]
-  "TARGET_AVX"
-  "vbroadcasts<avxmodesuffixf2c>\t{%1, %0|%0, %1}"
-  [(set_attr "type" "ssemov")
-   (set_attr "prefix_extra" "1")
-   (set_attr "prefix" "vex")
-   (set_attr "mode" "<avxscalarmode>")])
-
-(define_insn "avx_vbroadcastss256"
-  [(set (match_operand:V8SF 0 "register_operand" "=x")
-	(vec_concat:V8SF
-	  (vec_concat:V4SF
-	    (vec_concat:V2SF
-	      (match_operand:SF 1 "memory_operand" "m")
-	      (match_dup 1))
-	    (vec_concat:V2SF
-	      (match_dup 1)
-	      (match_dup 1)))
-	  (vec_concat:V4SF
-	    (vec_concat:V2SF
-	      (match_dup 1)
-	      (match_dup 1))
-	    (vec_concat:V2SF
-	      (match_dup 1)
-	      (match_dup 1)))))]
-  "TARGET_AVX"
-  "vbroadcastss\t{%1, %0|%0, %1}"
-  [(set_attr "type" "ssemov")
-   (set_attr "prefix_extra" "1")
-   (set_attr "prefix" "vex")
-   (set_attr "mode" "SF")])
-
-(define_insn "avx_vbroadcastf128_p<avxmodesuffixf2c>256"
-  [(set (match_operand:AVX256MODEF2P 0 "register_operand" "=x")
-	(vec_concat:AVX256MODEF2P
-	  (match_operand:<avxhalfvecmode> 1 "memory_operand" "m")
-	  (match_dup 1)))]
-  "TARGET_AVX"
-  "vbroadcastf128\t{%1, %0|%0, %1}"
-  [(set_attr "type" "ssemov")
-   (set_attr "prefix_extra" "1")
-   (set_attr "prefix" "vex")
-   (set_attr "mode" "V4SF")])
-
 (define_expand "avx_vinsertf128<mode>"
   [(match_operand:AVX256MODE 0 "register_operand" "")
    (match_operand:AVX256MODE 1 "register_operand" "")
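
[Illustrative appendix, plain C, not from the commit: a scalar model of
the interleave-based lowering that expand_vec_perm_broadcast_1 uses for
the V8HImode/V16QImode cases above.  Each self-interleave step doubles
the element width; choosing the low or high interleave (and adjusting
ELT, exactly like the "elt >= nelt2" logic in the patch) keeps the
wanted element inside the half being widened, and a final pshufd-style
word shuffle replicates it across the vector.

    #include <stdint.h>
    #include <string.h>

    void
    broadcast_byte (uint8_t dst[16], const uint8_t src[16], unsigned elt)
    {
      uint8_t v[16], t[16];
      unsigned nelt2 = 8;       /* d->nelt / 2, counted in current elements */
      unsigned size, i;

      memcpy (v, src, 16);
      for (size = 1; size < 4; size *= 2)     /* QI -> HI -> SI */
        {
          const uint8_t *half = v;            /* vec_interleave_low */
          if (elt >= nelt2)
            {
              half = v + 8;                   /* vec_interleave_high */
              elt -= nelt2;
            }
          nelt2 /= 2;
          for (i = 0; i < 8 / size; ++i)      /* duplicate each element */
            {
              memcpy (t + 2 * i * size, half + i * size, size);
              memcpy (t + (2 * i + 1) * size, half + i * size, size);
            }
          memcpy (v, t, 16);
        }
      for (i = 0; i < 4; ++i)                 /* pshufd: replicate word ELT */
        memcpy (dst + 4 * i, v + 4 * elt, 4);
    }
]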