From 8d7dae0eb366a88a1baba1857ecc54c09e4a520e Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 4 Jun 2021 17:37:15 +0200 Subject: [PATCH] i386: Add init pattern for V2HI vectors [PR100637] MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit 2021-06-03 Uroš Bizjak gcc/ PR target/100637 * config/i386/i386-expand.c (ix86_expand_vector_init_duplicate): Handle V2HI mode. (ix86_expand_vector_init_general): Ditto. Use SImode instead of word_mode for logic operations when GET_MODE_SIZE (mode) < UNITS_PER_WORD. (expand_vec_perm_even_odd_1): Assert that V2HI mode should be implemented by expand_vec_perm_1. (expand_vec_perm_broadcast_1): Assert that V2HI and V4HI modes should be implemented using standard shuffle patterns. (ix86_vectorize_vec_perm_const): Handle V2HImode. Add V4HI and V2HI modes to modes, implementable with shuffle for one operand. * config/i386/mmx.md (*punpckwd): New insn_and_split pattern. (*pshufw_1): New insn pattern. (*vec_dupv2hi): Ditto. (vec_initv2hihi): New expander. gcc/testsuite/ PR target/100637 * gcc.dg/vect/slp-perm-9.c (dg-final): Adjust dumps for vect32 targets. --- gcc/config/i386/i386-expand.c | 45 ++++++++++++++----- gcc/config/i386/mmx.md | 82 ++++++++++++++++++++++++++++++++++ gcc/testsuite/gcc.dg/vect/slp-perm-9.c | 8 ++-- 3 files changed, 121 insertions(+), 14 deletions(-) diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 68bb5ab..804cb59 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -13723,6 +13723,19 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode, } goto widen; + case E_V2HImode: + if (TARGET_SSE2) + { + rtx x; + + val = gen_lowpart (SImode, val); + x = gen_rtx_TRUNCATE (HImode, val); + x = gen_rtx_VEC_DUPLICATE (mode, x); + emit_insn (gen_rtx_SET (target, x)); + return true; + } + return false; + case E_V8QImode: if (!mmx_ok) return false; @@ -14524,6 +14537,8 @@ quarter: case E_V4HImode: case E_V8QImode: + + case E_V2HImode: break; default: @@ -14532,12 +14547,14 @@ quarter: { int i, j, n_elts, n_words, n_elt_per_word; - machine_mode inner_mode; + machine_mode tmp_mode, inner_mode; rtx words[4], shift; + tmp_mode = (GET_MODE_SIZE (mode) < UNITS_PER_WORD) ? SImode : word_mode; + inner_mode = GET_MODE_INNER (mode); n_elts = GET_MODE_NUNITS (mode); - n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD; + n_words = GET_MODE_SIZE (mode) / GET_MODE_SIZE (tmp_mode); n_elt_per_word = n_elts / n_words; shift = GEN_INT (GET_MODE_BITSIZE (inner_mode)); @@ -14548,15 +14565,15 @@ quarter: for (j = 0; j < n_elt_per_word; ++j) { rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1); - elt = convert_modes (word_mode, inner_mode, elt, true); + elt = convert_modes (tmp_mode, inner_mode, elt, true); if (j == 0) word = elt; else { - word = expand_simple_binop (word_mode, ASHIFT, word, shift, + word = expand_simple_binop (tmp_mode, ASHIFT, word, shift, word, 1, OPTAB_LIB_WIDEN); - word = expand_simple_binop (word_mode, IOR, word, elt, + word = expand_simple_binop (tmp_mode, IOR, word, elt, word, 1, OPTAB_LIB_WIDEN); } } @@ -14570,14 +14587,14 @@ quarter: { rtx tmp = gen_reg_rtx (mode); emit_clobber (tmp); - emit_move_insn (gen_lowpart (word_mode, tmp), words[0]); - emit_move_insn (gen_highpart (word_mode, tmp), words[1]); + emit_move_insn (gen_lowpart (tmp_mode, tmp), words[0]); + emit_move_insn (gen_highpart (tmp_mode, tmp), words[1]); emit_move_insn (target, tmp); } else if (n_words == 4) { rtx tmp = gen_reg_rtx (V4SImode); - gcc_assert (word_mode == SImode); + gcc_assert (tmp_mode == SImode); vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words)); ix86_expand_vector_init_general (false, V4SImode, tmp, vals); emit_move_insn (target, gen_lowpart (mode, tmp)); @@ -19548,6 +19565,7 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd) case E_V2DImode: case E_V2SImode: case E_V4SImode: + case E_V2HImode: /* These are always directly implementable by expand_vec_perm_1. */ gcc_unreachable (); @@ -19758,6 +19776,8 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d) case E_V2DImode: case E_V2SImode: case E_V4SImode: + case E_V2HImode: + case E_V4HImode: /* These are always implementable using standard shuffle patterns. */ gcc_unreachable (); @@ -20267,6 +20287,10 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, if (!TARGET_MMX_WITH_SSE) return false; break; + case E_V2HImode: + if (!TARGET_SSE2) + return false; + break; case E_V2DImode: case E_V2DFmode: if (!TARGET_SSE) @@ -20298,10 +20322,11 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, /* Check whether the mask can be applied to the vector type. */ d.one_operand_p = (which != 3); - /* Implementable with shufps or pshufd. */ + /* Implementable with shufps, pshufd or pshuflw. */ if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V2SFmode - || d.vmode == V4SImode || d.vmode == V2SImode)) + || d.vmode == V4SImode || d.vmode == V2SImode + || d.vmode == V4HImode || d.vmode == V2HImode)) return true; /* Otherwise we have to go through the motions and see if we can diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 914e5e9..c3fd280 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -3292,6 +3292,88 @@ DONE; }) +(define_insn_and_split "*punpckwd" + [(set (match_operand:V2HI 0 "register_operand" "=x,Yw") + (vec_select:V2HI + (vec_concat:V4HI + (match_operand:V2HI 1 "register_operand" "0,Yw") + (match_operand:V2HI 2 "register_operand" "x,Yw")) + (parallel [(match_operand 3 "const_0_to_3_operand") + (match_operand 4 "const_0_to_3_operand")])))] + "TARGET_SSE2" + "#" + "&& reload_completed" + [(set (match_dup 5) + (vec_select:V4HI + (match_dup 5) + (parallel [(match_dup 3) (match_dup 4) + (const_int 0) (const_int 0)])))] +{ + rtx dest = lowpart_subreg (V8HImode, operands[0], V2HImode); + rtx op1 = lowpart_subreg (V8HImode, operands[1], V2HImode); + rtx op2 = lowpart_subreg (V8HImode, operands[2], V2HImode); + + emit_insn (gen_vec_interleave_lowv8hi (dest, op1, op2)); + + static const int map[4] = { 0, 2, 1, 3 }; + + int sel0 = map[INTVAL (operands[3])]; + int sel1 = map[INTVAL (operands[4])]; + + if (sel0 == 0 && sel1 == 1) + DONE; + + operands[3] = GEN_INT (sel0); + operands[4] = GEN_INT (sel1); + + operands[5] = lowpart_subreg (V4HImode, dest, V8HImode); +} + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sselog") + (set_attr "mode" "TI")]) + +(define_insn "*pshufw_1" + [(set (match_operand:V2HI 0 "register_operand" "=Yw") + (vec_select:V2HI + (match_operand:V2HI 1 "register_operand" "Yw") + (parallel [(match_operand 2 "const_0_to_1_operand") + (match_operand 3 "const_0_to_1_operand")])))] + "TARGET_SSE2" +{ + int mask = 0; + mask |= INTVAL (operands[2]) << 0; + mask |= INTVAL (operands[3]) << 2; + mask |= 2 << 4; + mask |= 3 << 6; + operands[2] = GEN_INT (mask); + + return "%vpshuflw\t{%2, %1, %0|%0, %1, %2}"; +} + [(set_attr "type" "sselog1") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +(define_insn "*vec_dupv2hi" + [(set (match_operand:V2HI 0 "register_operand" "=Yw") + (vec_duplicate:V2HI + (truncate:HI + (match_operand:SI 1 "register_operand" "Yw"))))] + "TARGET_SSE2" + "%vpshuflw\t{$0, %1, %0|%0, %1, 0}" + [(set_attr "type" "sselog1") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +(define_expand "vec_initv2hihi" + [(match_operand:V2HI 0 "register_operand") + (match_operand 1)] + "TARGET_SSE2" +{ + ix86_expand_vector_init (false, operands[0], + operands[1]); + DONE; +}) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Miscellaneous diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-9.c b/gcc/testsuite/gcc.dg/vect/slp-perm-9.c index ab75f44..873eddf 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-perm-9.c +++ b/gcc/testsuite/gcc.dg/vect/slp-perm-9.c @@ -57,13 +57,13 @@ int main (int argc, const char* argv[]) return 0; } -/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 2 "vect" { target { ! { vect_perm_short || vect_load_lanes } } } } } */ -/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_perm_short || vect_load_lanes } } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 2 "vect" { target { ! { { vect_perm_short || vect32 } || vect_load_lanes } } } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { { vect_perm_short || vect32 } || vect_load_lanes } } } } */ /* We don't try permutes with a group size of 3 for variable-length vectors. */ /* { dg-final { scan-tree-dump-times "permutation requires at least three vectors" 1 "vect" { target { vect_perm_short && { { ! vect_perm3_short } && { ! vect_partial_vectors_usage_1 } } } xfail vect_variable_length } } } */ /* Try to vectorize the epilogue using partial vectors. */ /* { dg-final { scan-tree-dump-times "permutation requires at least three vectors" 2 "vect" { target { vect_perm_short && { { ! vect_perm3_short } && vect_partial_vectors_usage_1 } } xfail vect_variable_length } } } */ /* { dg-final { scan-tree-dump-not "permutation requires at least three vectors" "vect" { target vect_perm3_short } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { { ! vect_perm3_short } || vect_load_lanes } } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm3_short && { ! vect_load_lanes } } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { { ! { vect_perm3_short || vect32 } } || vect_load_lanes } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { { vect_perm3_short || vect32 } && { ! vect_load_lanes } } } } } */ -- 2.7.4