GEN_INT (1), GEN_INT (5)));
break;
+ case E_V2SFmode:
+ sse_mode = V4SFmode;
+ double_sse_mode = V8SFmode;
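+ /* { 0, 4, 1, 5 } picks alternating low elements from the two
+ V4SF views of the operands, i.e. the unpcklps pattern.  */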
+ mask = gen_rtx_PARALLEL (VOIDmode,
+ gen_rtvec (4,
+ GEN_INT (0), GEN_INT (4),
+ GEN_INT (1), GEN_INT (5)));
+ break;
+
default:
gcc_unreachable ();
}
rtx insn = gen_rtx_SET (dest, op2);
emit_insn (insn);
+ /* Move bits 64:127 to bits 0:63. */
if (high_p)
{
- /* Move bits 64:127 to bits 0:63. */
- mask = gen_rtx_PARALLEL (VOIDmode,
- gen_rtvec (4, GEN_INT (2), GEN_INT (3),
- GEN_INT (0), GEN_INT (0)));
- dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
- op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
+ if (sse_mode == V4SFmode)
+ {
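+ /* DEST is V4SF: concatenate it with itself and select
+ { 2, 3, 4, 5 }, which moves the high 64 bits into the low
+ 64 bits while staying in the float domain; the upper two
+ result lanes are don't-care.  */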
+ mask = gen_rtx_PARALLEL (VOIDmode,
+ gen_rtvec (4, GEN_INT (2), GEN_INT (3),
+ GEN_INT (4), GEN_INT (5)));
+ op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
+ op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);
+ }
+ else
+ {
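+ /* View DEST as V4SI and select { 2, 3, 0, 1 }, i.e. a pshufd
+ that swaps the 64-bit halves.  */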
+ mask = gen_rtx_PARALLEL (VOIDmode,
+ gen_rtvec (4, GEN_INT (2), GEN_INT (3),
+ GEN_INT (0), GEN_INT (1)));
+ dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
+ op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
+ }
+
insn = gen_rtx_SET (dest, op1);
emit_insn (insn);
}
;
else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
;
- else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
+ else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
+ || GET_MODE_SIZE (vmode) == 8))
;
else
return false;
case E_V8SFmode:
case E_V2DFmode:
case E_V4SFmode:
+ case E_V4HImode:
case E_V8HImode:
case E_V8SImode:
case E_V32HImode:
vmode = V8HImode;
goto do_subreg;
+ case E_V2SImode:
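+ /* Each SI element spans two HI lanes, so set two mask bits
+ per element and retry as a V4HI blend.  */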
+ for (i = 0; i < 2; ++i)
+ mask |= (d->perm[i] >= 2 ? 3 : 0) << (i * 2);
+ vmode = V4HImode;
+ goto do_subreg;
+
case E_V4SImode:
for (i = 0; i < 4; ++i)
mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
vperm = force_reg (vmode, vperm);
- if (GET_MODE_SIZE (vmode) == 16)
+ if (GET_MODE_SIZE (vmode) == 8)
+ emit_insn (gen_mmx_pblendvb64 (target, op0, op1, vperm));
+ else if (GET_MODE_SIZE (vmode) == 16)
emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
else
emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
op1 = gen_lowpart (vmode, op1);
break;
+ case E_V8QImode:
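+ /* See if bytes move in pairs.  If not, pblendvb must be used.  */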
+ for (i = 0; i < 8; i += 2)
+ if (d->perm[i] + 1 != d->perm[i + 1])
+ goto use_pblendvb;
+
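+ /* Bytes move in pairs: one mask bit per resulting HI word.  */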
+ for (i = 0; i < 4; ++i)
+ mask |= (d->perm[i * 2] >= 8) << i;
+ vmode = V4HImode;
+ goto do_subreg;
+
case E_V32QImode:
/* See if bytes move in pairs. If not, vpblendvb must be used. */
for (i = 0; i < 32; i += 2)
}
else
{
- if (GET_MODE_SIZE (d->vmode) == 16)
+ if (GET_MODE_SIZE (d->vmode) == 8)
+ {
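+ /* 64-bit vectors are permuted with the 128-bit pshufb; the
+ selector is widened to V16QI below.  */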
+ if (!TARGET_SSSE3)
+ return false;
+ vmode = V8QImode;
+ }
+ else if (GET_MODE_SIZE (d->vmode) == 16)
{
if (!TARGET_SSSE3)
return false;
eltsz = GET_MODE_UNIT_SIZE (d->vmode);
if (!d->one_operand_p)
mask = 2 * nelt - 1;
- else if (vmode == V16QImode)
- mask = nelt - 1;
else if (vmode == V64QImode)
mask = nelt / 4 - 1;
- else
+ else if (vmode == V32QImode)
mask = nelt / 2 - 1;
+ else
+ mask = nelt - 1;
for (i = 0; i < nelt; ++i)
{
}
}
- vperm = gen_rtx_CONST_VECTOR (vmode,
- gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
- vperm = force_reg (vmode, vperm);
+ machine_mode vpmode = vmode;
+
+ if (vmode == V8QImode)
+ {
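+ /* pshufb always consumes a 128-bit selector; fill the unused
+ upper bytes with -1 so the corresponding (don't-care) result
+ bytes are zeroed.  */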
+ for (i = nelt; i < 16; ++i)
+ rperm[i] = constm1_rtx;
+ vpmode = V16QImode;
+ }
+
+ vperm = gen_rtx_CONST_VECTOR (vpmode,
+ gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm));
+ vperm = force_reg (vpmode, vperm);
target = d->target;
if (d->vmode != vmode)
op0 = gen_lowpart (vmode, d->op0);
if (d->one_operand_p)
{
- if (vmode == V16QImode)
+ if (vmode == V8QImode)
+ emit_insn (gen_mmx_pshufbv8qi3 (target, op0, vperm));
+ else if (vmode == V16QImode)
emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
else if (vmode == V32QImode)
emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
;
else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
;
- else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
+ else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 8
+ || GET_MODE_SIZE (vmode) == 16))
;
else
return false;
rtx_insn *seq;
bool ok, same_halves = false;
- if (GET_MODE_SIZE (d->vmode) == 16)
+ if (GET_MODE_SIZE (d->vmode) == 8
+ || GET_MODE_SIZE (d->vmode) == 16)
{
if (d->one_operand_p)
return false;
memset (remap, 0xff, sizeof (remap));
dremap = *d;
- if (GET_MODE_SIZE (d->vmode) == 16)
+ if (GET_MODE_SIZE (d->vmode) == 8)
+ {
+ unsigned HOST_WIDE_INT h1, h2, h3, h4;
+
+ /* Split the two input vectors into 4 halves. */
+ h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
+ h2 = h1 << nelt2;
+ h3 = h2 << nelt2;
+ h4 = h3 << nelt2;
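+ /* h1/h3 cover the low halves of the two inputs' element
+ indices, h2/h4 the high halves, as bitmasks over CONTENTS.  */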
+
+ /* If all of the used elements come from the low halves of the
+ two inputs, use interleave low; similarly for interleave high.  */
+ if ((contents & (h1 | h3)) == contents)
+ {
+ /* punpckl* */
+ for (i = 0; i < nelt2; ++i)
+ {
+ remap[i] = i * 2;
+ remap[i + nelt] = i * 2 + 1;
+ dremap.perm[i * 2] = i;
+ dremap.perm[i * 2 + 1] = i + nelt;
+ }
+ }
+ else if ((contents & (h2 | h4)) == contents)
+ {
+ /* punpckh* */
+ for (i = 0; i < nelt2; ++i)
+ {
+ remap[i + nelt2] = i * 2;
+ remap[i + nelt + nelt2] = i * 2 + 1;
+ dremap.perm[i * 2] = i + nelt2;
+ dremap.perm[i * 2 + 1] = i + nelt + nelt2;
+ }
+ }
+ else
+ return false;
+ }
+ else if (GET_MODE_SIZE (d->vmode) == 16)
{
unsigned HOST_WIDE_INT h1, h2, h3, h4;
}
/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
- and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
- with two "and" and "pack" or two "shift" and "pack" insns. We should
- have already failed all two instruction sequences. */
+ and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI
+ operands with two "and" and "pack" or two "shift" and "pack" insns.
+ We should have already failed all two instruction sequences. */
static bool
expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
gen_pack = gen_sse4_1_packusdw;
gen_shift = gen_lshrv4si3;
break;
+ case E_V8QImode:
+ /* No check as all instructions are SSE2. */
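+ /* AND with 0xff keeps the even byte of each word (shift right
+ by 8 keeps the odd byte); packuswb then packs the two V4HI
+ halves into the V8QI result.  */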
+ c = 0xff;
+ s = 8;
+ half_mode = V4HImode;
+ gen_and = gen_andv4hi3;
+ gen_pack = gen_mmx_packuswb;
+ gen_shift = gen_lshrv4hi3;
+ break;
case E_V16QImode:
/* No check as all instructions are SSE2. */
c = 0xff;
end_perm = true;
break;
default:
- /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
- general shuffles. */
+ /* Only V8QI, V8HI, V16QI, V16HI and V32QI modes
+ are more profitable than general shuffles. */
return false;
}
}
break;
+ case E_V8QImode:
case E_V16QImode:
return expand_vec_perm_even_odd_pack (d);
/* These are always implementable using standard shuffle patterns. */
gcc_unreachable ();
+ case E_V8QImode:
+ /* These can be implemented via interleave. We save one insn by
+ stopping once we have promoted to V2SImode and then using pshufd. */
+ if (d->testing_p)
+ return true;
+ do
+ {
+ rtx dest;
+ rtx (*gen) (rtx, rtx, rtx)
+ = vmode == V8QImode ? gen_mmx_punpcklbw
+ : gen_mmx_punpcklwd;
+
+ if (elt >= nelt2)
+ {
+ gen = vmode == V8QImode ? gen_mmx_punpckhbw
+ : gen_mmx_punpckhwd;
+ elt -= nelt2;
+ }
+ nelt2 /= 2;
+
+ dest = gen_reg_rtx (vmode);
+ emit_insn (gen (dest, op0, op0));
+ vmode = get_mode_wider_vector (vmode);
+ op0 = gen_lowpart (vmode, dest);
+ }
+ while (vmode != V2SImode);
+
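+ /* One V2SI lane now holds copies of the target element;
+ replicate that lane with a two-element select (pshufd).  */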
+ memset (perm2, elt, 2);
+ dest = gen_reg_rtx (V2SImode);
+ ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
+ gcc_assert (ok);
+ if (!d->testing_p)
+ emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
+ return true;
+
case E_V8HImode:
case E_V16QImode:
/* These can be implemented via interleave. We save one insn by
case E_V2SFmode:
case E_V2SImode:
case E_V4HImode:
+ case E_V8QImode:
if (!TARGET_MMX_WITH_SSE)
return false;
break;
(set_attr "prefix" "maybe_vex,orig")
(set_attr "mode" "V4SF")])
+(define_insn_and_split "*vec_interleave_lowv2sf"
+ [(set (match_operand:V2SF 0 "register_operand" "=x,v")
+ (vec_select:V2SF
+ (vec_concat:V4SF
+ (match_operand:V2SF 1 "register_operand" "0,v")
+ (match_operand:V2SF 2 "register_operand" "x,v"))
+ (parallel [(const_int 0) (const_int 2)])))]
+ "TARGET_MMX_WITH_SSE"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+ "ix86_split_mmx_punpck (operands, false); DONE;"
+ [(set_attr "isa" "noavx,avx")
+ (set_attr "type" "sselog")
+ (set_attr "prefix" "orig,maybe_evex")
+ (set_attr "mode" "V4SF")])
+
+(define_insn_and_split "*vec_interleave_highv2sf"
+ [(set (match_operand:V2SF 0 "register_operand" "=x,v")
+ (vec_select:V2SF
+ (vec_concat:V4SF
+ (match_operand:V2SF 1 "register_operand" "0,v")
+ (match_operand:V2SF 2 "register_operand" "x,v"))
+ (parallel [(const_int 1) (const_int 3)])))]
+ "TARGET_MMX_WITH_SSE"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+ "ix86_split_mmx_punpck (operands, true); DONE;"
+ [(set_attr "isa" "noavx,avx")
+ (set_attr "type" "sselog")
+ (set_attr "prefix" "orig,vex")
+ (set_attr "mode" "V4SF")])
+
(define_insn "*vec_dupv2sf"
[(set (match_operand:V2SF 0 "register_operand" "=y,Yv,x")
(vec_duplicate:V2SF
pack<s_trunsuffix>swb\t{%2, %0|%0, %2}
#
#"
- "TARGET_SSE2 && reload_completed
+ "&& reload_completed
&& SSE_REGNO_P (REGNO (operands[0]))"
[(const_int 0)]
"ix86_split_mmx_pack (operands, <any_s_truncate:CODE>); DONE;"
packssdw\t{%2, %0|%0, %2}
#
#"
- "TARGET_SSE2 && reload_completed
+ "&& reload_completed
&& SSE_REGNO_P (REGNO (operands[0]))"
[(const_int 0)]
"ix86_split_mmx_pack (operands, SS_TRUNCATE); DONE;"
punpckhbw\t{%2, %0|%0, %2}
#
#"
- "TARGET_SSE2 && reload_completed
+ "&& reload_completed
&& SSE_REGNO_P (REGNO (operands[0]))"
[(const_int 0)]
"ix86_split_mmx_punpck (operands, true); DONE;"
punpcklbw\t{%2, %0|%0, %k2}
#
#"
- "TARGET_SSE2 && reload_completed
+ "&& reload_completed
&& SSE_REGNO_P (REGNO (operands[0]))"
[(const_int 0)]
"ix86_split_mmx_punpck (operands, false); DONE;"
punpckhwd\t{%2, %0|%0, %2}
#
#"
- "TARGET_SSE2 && reload_completed
+ "&& reload_completed
&& SSE_REGNO_P (REGNO (operands[0]))"
[(const_int 0)]
"ix86_split_mmx_punpck (operands, true); DONE;"
punpcklwd\t{%2, %0|%0, %k2}
#
#"
- "TARGET_SSE2 && reload_completed
+ "&& reload_completed
&& SSE_REGNO_P (REGNO (operands[0]))"
[(const_int 0)]
"ix86_split_mmx_punpck (operands, false); DONE;"
punpckhdq\t{%2, %0|%0, %2}
#
#"
- "TARGET_SSE2 && reload_completed
+ "&& reload_completed
&& SSE_REGNO_P (REGNO (operands[0]))"
[(const_int 0)]
"ix86_split_mmx_punpck (operands, true); DONE;"
punpckldq\t{%2, %0|%0, %k2}
#
#"
- "TARGET_SSE2 && reload_completed
+ "&& reload_completed
&& SSE_REGNO_P (REGNO (operands[0]))"
[(const_int 0)]
"ix86_split_mmx_punpck (operands, false); DONE;"
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "TI")])
+(define_insn "mmx_pshufbv8qi3"
+ [(set (match_operand:V8QI 0 "register_operand" "=x,Yw")
+ (unspec:V8QI
+ [(match_operand:V8QI 1 "register_operand" "0,Yw")
+ (match_operand:V16QI 2 "vector_operand" "xBm,Ywm")]
+ UNSPEC_PSHUFB))]
+ "TARGET_SSSE3 && TARGET_MMX_WITH_SSE"
+ "@
+ pshufb\t{%2, %0|%0, %2}
+ vpshufb\t{%2, %1, %0|%0, %1, %2}"
+ [(set_attr "isa" "noavx,avx")
+ (set_attr "type" "sselog1")
+ (set_attr "prefix_data16" "1,*")
+ (set_attr "prefix_extra" "1")
+ (set_attr "prefix" "orig,maybe_evex")
+ (set_attr "btver2_decode" "vector")
+ (set_attr "mode" "TI")])
+
(define_expand "mmx_pshufw"
[(match_operand:V4HI 0 "register_operand")
(match_operand:V4HI 1 "register_mmxmem_operand")
(set_attr "length_immediate" "1")
(set_attr "mode" "TI")])
+(define_insn "*mmx_pblendw"
+ [(set (match_operand:V4HI 0 "register_operand" "=Yr,*x,x")
+ (vec_merge:V4HI
+ (match_operand:V4HI 2 "register_operand" "Yr,*x,x")
+ (match_operand:V4HI 1 "register_operand" "0,0,x")
+ (match_operand:SI 3 "const_0_to_15_operand" "n,n,n")))]
+ "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
+ "@
+ pblendw\t{%3, %2, %0|%0, %2, %3}
+ pblendw\t{%3, %2, %0|%0, %2, %3}
+ vpblendw\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "isa" "noavx,noavx,avx")
+ (set_attr "type" "ssemov")
+ (set_attr "prefix_extra" "1")
+ (set_attr "length_immediate" "1")
+ (set_attr "prefix" "orig,orig,vex")
+ (set_attr "mode" "TI")])
+
;; Optimize a V2SImode load from memory, element swap, and store back
;; to the same memory location into a DImode rotate of that memory by 32.
(define_split