From 829c4bea06600ea4201462f91ce6d76ca21fdb35 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Thu, 13 May 2021 12:14:14 +0200 Subject: [PATCH] ix86: Support V{2, 4}DImode arithmetic right shifts for SSE2+ [PR98856] As mentioned in the PR, we don't support arithmetic right shifts in V2DImode or V4DImode on x86 without -mavx512vl or -mxop. The ISAs indeed don't have {,v}psraq instructions until AVX512VL, but we actually can emulate them quite easily. One case is arithmetic >> 63, we can just emit {,v}pxor; {,v}pcmpgtq for that for SSE4.2+, or for SSE2 psrad $31; pshufd $0xf5. Then arithmetic >> by constant > 32, that can be done with {,v}psrad $31 and {,v}psrad $(cst-32) and two operand permutation, arithmetic >> 32 can be done as {,v}psrad $31 and permutation of that and the original operand. Arithmetic >> by constant < 32 can be done as {,v}psrad $cst and {,v}psrlq $cst and two operand permutation. And arithmetic >> by variable scalar amount can be done as arithmetic >> 63, logical >> by the amount, << by (64 - amount) of the >> 63 result (note that a vector << 64 results in 0), and ORing the two together. I had to improve the permutation generation so that it actually handles the needed permutations (or handles them better). 2021-05-13 Jakub Jelinek PR tree-optimization/98856 * config/i386/i386.c (ix86_shift_rotate_cost): Add CODE argument. Expect V2DI and V4DI arithmetic right shifts to be emulated. (ix86_rtx_costs, ix86_add_stmt_cost): Adjust ix86_shift_rotate_cost callers. * config/i386/i386-expand.c (expand_vec_perm_2perm_interleave, expand_vec_perm_2perm_pblendv): New functions. (ix86_expand_vec_perm_const_1): Use them. * config/i386/sse.md (ashr<mode>3<mask_name>): Rename to ... (<mask_codefor>ashr<mode>3<mask_name>): ... this. (ashr<mode>3): New define_expand with VI248_AVX512BW iterator. (ashrv4di3): New define_expand. (ashrv2di3): Change condition to TARGET_SSE2, handle !TARGET_XOP and !TARGET_AVX512VL expansion. * gcc.target/i386/sse2-psraq-1.c: New test. * gcc.target/i386/sse4_2-psraq-1.c: New test. 
* gcc.target/i386/avx-psraq-1.c: New test. * gcc.target/i386/avx2-psraq-1.c: New test. * gcc.target/i386/avx-pr82370.c: Adjust expected number of vpsrad instructions. * gcc.target/i386/avx2-pr82370.c: Likewise. * gcc.target/i386/avx512f-pr82370.c: Likewise. * gcc.target/i386/avx512bw-pr82370.c: Likewise. * gcc.dg/torture/vshuf-4.inc: Add two further permutations. * gcc.dg/torture/vshuf-8.inc: Likewise. --- gcc/config/i386/i386-expand.c | 248 +++++++++++++++++++++++ gcc/config/i386/i386.c | 29 ++- gcc/config/i386/sse.md | 246 +++++++++++++++++++++- gcc/testsuite/gcc.dg/torture/vshuf-4.inc | 4 +- gcc/testsuite/gcc.dg/torture/vshuf-8.inc | 4 +- gcc/testsuite/gcc.target/i386/avx-pr82370.c | 2 +- gcc/testsuite/gcc.target/i386/avx-psraq-1.c | 13 ++ gcc/testsuite/gcc.target/i386/avx2-pr82370.c | 4 +- gcc/testsuite/gcc.target/i386/avx2-psraq-1.c | 51 +++++ gcc/testsuite/gcc.target/i386/avx512bw-pr82370.c | 4 +- gcc/testsuite/gcc.target/i386/avx512f-pr82370.c | 4 +- gcc/testsuite/gcc.target/i386/sse2-psraq-1.c | 53 +++++ gcc/testsuite/gcc.target/i386/sse4_2-psraq-1.c | 13 ++ 13 files changed, 660 insertions(+), 15 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/avx-psraq-1.c create mode 100644 gcc/testsuite/gcc.target/i386/avx2-psraq-1.c create mode 100644 gcc/testsuite/gcc.target/i386/sse2-psraq-1.c create mode 100644 gcc/testsuite/gcc.target/i386/sse4_2-psraq-1.c diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 92488b8..0fa8d45 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -18662,6 +18662,242 @@ expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d) return true; } +/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement + a two vector permutation using two single vector permutations and + {,v}{,p}unpckl{ps,pd,bw,wd,dq}. If two_insn, succeed only if one + of dfirst or dsecond is identity permutation. 
*/ + +static bool +expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn) +{ + unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt; + struct expand_vec_perm_d dfirst, dsecond, dfinal; + bool ident1 = true, ident2 = true; + + if (d->one_operand_p) + return false; + + if (GET_MODE_SIZE (d->vmode) == 16) + { + if (!TARGET_SSE) + return false; + if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2) + return false; + } + else if (GET_MODE_SIZE (d->vmode) == 32) + { + if (!TARGET_AVX) + return false; + if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2) + return false; + lane = nelt2; + } + else + return false; + + for (i = 1; i < nelt; i++) + if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1))) + return false; + + dfirst = *d; + dsecond = *d; + dfinal = *d; + dfirst.op1 = dfirst.op0; + dfirst.one_operand_p = true; + dsecond.op0 = dsecond.op1; + dsecond.one_operand_p = true; + + for (i = 0; i < nelt; i++) + if (d->perm[i] >= nelt) + { + dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt; + if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0)) + ident2 = false; + dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)] + = d->perm[i] - nelt; + } + else + { + dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i]; + if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0)) + ident1 = false; + dfirst.perm[i / 2 + (i >= lane ? 
lane : lane / 2)] = d->perm[i]; + } + + if (two_insn && !ident1 && !ident2) + return false; + + if (!d->testing_p) + { + if (!ident1) + dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode); + if (!ident2) + dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode); + if (d->perm[0] >= nelt) + std::swap (dfinal.op0, dfinal.op1); + } + + bool ok; + rtx_insn *seq1 = NULL, *seq2 = NULL; + + if (!ident1) + { + start_sequence (); + ok = expand_vec_perm_1 (&dfirst); + seq1 = get_insns (); + end_sequence (); + + if (!ok) + return false; + } + + if (!ident2) + { + start_sequence (); + ok = expand_vec_perm_1 (&dsecond); + seq2 = get_insns (); + end_sequence (); + + if (!ok) + return false; + } + + if (d->testing_p) + return true; + + for (i = 0; i < nelt; i++) + { + dfinal.perm[i] = i / 2; + if (i >= lane) + dfinal.perm[i] += lane / 2; + if ((i & 1) != 0) + dfinal.perm[i] += nelt; + } + emit_insn (seq1); + emit_insn (seq2); + ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1, + dfinal.perm, dfinal.nelt, false); + gcc_assert (ok); + return true; +} + +/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify + the permutation using two single vector permutations and the SSE4_1 pblendv + instruction. If two_insn, succeed only if one of dfirst or dsecond is + identity permutation. */ + +static bool +expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn) +{ + unsigned i, nelt = d->nelt; + struct expand_vec_perm_d dfirst, dsecond, dfinal; + machine_mode vmode = d->vmode; + bool ident1 = true, ident2 = true; + + /* Use the same checks as in expand_vec_perm_blend. 
*/ + if (d->one_operand_p) + return false; + if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) + ; + else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) + ; + else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16) + ; + else + return false; + + dfirst = *d; + dsecond = *d; + dfinal = *d; + dfirst.op1 = dfirst.op0; + dfirst.one_operand_p = true; + dsecond.op0 = dsecond.op1; + dsecond.one_operand_p = true; + + for (i = 0; i < nelt; ++i) + if (d->perm[i] >= nelt) + { + dfirst.perm[i] = 0xff; + dsecond.perm[i] = d->perm[i] - nelt; + if (d->perm[i] != i + nelt) + ident2 = false; + } + else + { + dsecond.perm[i] = 0xff; + dfirst.perm[i] = d->perm[i]; + if (d->perm[i] != i) + ident1 = false; + } + + if (two_insn && !ident1 && !ident2) + return false; + + /* For now. Ideally treat 0xff as a wildcard. */ + for (i = 0; i < nelt; ++i) + if (dfirst.perm[i] == 0xff) + { + if (GET_MODE_SIZE (vmode) == 32 + && dfirst.perm[i ^ (nelt / 2)] != 0xff) + dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2); + else + dfirst.perm[i] = i; + } + else + { + if (GET_MODE_SIZE (vmode) == 32 + && dsecond.perm[i ^ (nelt / 2)] != 0xff) + dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2); + else + dsecond.perm[i] = i; + } + + if (!d->testing_p) + { + if (!ident1) + dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode); + if (!ident2) + dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode); + } + + bool ok; + rtx_insn *seq1 = NULL, *seq2 = NULL; + + if (!ident1) + { + start_sequence (); + ok = expand_vec_perm_1 (&dfirst); + seq1 = get_insns (); + end_sequence (); + + if (!ok) + return false; + } + + if (!ident2) + { + start_sequence (); + ok = expand_vec_perm_1 (&dsecond); + seq2 = get_insns (); + end_sequence (); + + if (!ok) + return false; + } + + if (d->testing_p) + return true; + + for (i = 0; i < nelt; ++i) + dfinal.perm[i] = (d->perm[i] >= nelt ? 
i + nelt : i); + + emit_insn (seq1); + emit_insn (seq2); + ok = expand_vec_perm_blend (&dfinal); + gcc_assert (ok); + return true; +} + /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF permutation using two vperm2f128, followed by a vshufpd insn blending the two vectors together. */ @@ -19773,6 +20009,12 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) if (expand_vec_perm_pblendv (d)) return true; + if (expand_vec_perm_2perm_interleave (d, true)) + return true; + + if (expand_vec_perm_2perm_pblendv (d, true)) + return true; + /* Try sequences of three instructions. */ if (expand_vec_perm_even_odd_pack (d)) @@ -19790,6 +20032,12 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) if (expand_vec_perm_vperm2f128_vblend (d)) return true; + if (expand_vec_perm_2perm_interleave (d, false)) + return true; + + if (expand_vec_perm_2perm_pblendv (d, false)) + return true; + /* Try sequences of four instructions. */ if (expand_vec_perm_even_odd_trunc (d)) diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 915f89f..6a1f574 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -19732,6 +19732,7 @@ ix86_division_cost (const struct processor_costs *cost, static int ix86_shift_rotate_cost (const struct processor_costs *cost, + enum rtx_code code, enum machine_mode mode, bool constant_op1, HOST_WIDE_INT op1_val, bool speed, @@ -19770,6 +19771,19 @@ ix86_shift_rotate_cost (const struct processor_costs *cost, count = 7; return ix86_vec_cost (mode, cost->sse_op * count); } + /* V*DImode arithmetic right shift is emulated. 
*/ + else if (code == ASHIFTRT + && (mode == V2DImode || mode == V4DImode) + && !TARGET_XOP + && !TARGET_AVX512VL) + { + int count = 4; + if (constant_op1 && op1_val == 63 && TARGET_SSE4_2) + count = 2; + else if (constant_op1) + count = 3; + return ix86_vec_cost (mode, cost->sse_op * count); + } else return ix86_vec_cost (mode, cost->sse_op); } @@ -19939,13 +19953,15 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, case LSHIFTRT: case ROTATERT: bool skip_op0, skip_op1; - *total = ix86_shift_rotate_cost (cost, mode, CONSTANT_P (XEXP (x, 1)), + *total = ix86_shift_rotate_cost (cost, code, mode, + CONSTANT_P (XEXP (x, 1)), CONST_INT_P (XEXP (x, 1)) ? INTVAL (XEXP (x, 1)) : -1, speed, GET_CODE (XEXP (x, 1)) == AND, SUBREG_P (XEXP (x, 1)) - && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND, + && GET_CODE (XEXP (XEXP (x, 1), + 0)) == AND, &skip_op0, &skip_op1); if (skip_op0 || skip_op1) { @@ -22383,11 +22399,16 @@ ix86_add_stmt_cost (class vec_info *vinfo, void *data, int count, case LROTATE_EXPR: case RROTATE_EXPR: { + tree op1 = gimple_assign_rhs1 (stmt_info->stmt); tree op2 = gimple_assign_rhs2 (stmt_info->stmt); stmt_cost = ix86_shift_rotate_cost - (ix86_cost, mode, + (ix86_cost, + (subcode == RSHIFT_EXPR + && !TYPE_UNSIGNED (TREE_TYPE (op1))) + ? ASHIFTRT : LSHIFTRT, mode, TREE_CODE (op2) == INTEGER_CST, - cst_and_fits_in_hwi (op2) ? int_cst_value (op2) : -1, + cst_and_fits_in_hwi (op2) + ? 
int_cst_value (op2) : -1, true, false, false, NULL, NULL); } break; diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 4072d0c..62f4e15 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -12468,7 +12468,7 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "")]) -(define_insn "ashr3" +(define_insn "ashr3" [(set (match_operand:VI248_AVX512BW_AVX512VL 0 "register_operand" "=v,v") (ashiftrt:VI248_AVX512BW_AVX512VL (match_operand:VI248_AVX512BW_AVX512VL 1 "nonimmediate_operand" "v,vm") @@ -12482,6 +12482,126 @@ (const_string "0"))) (set_attr "mode" "")]) +(define_expand "ashr3" + [(set (match_operand:VI248_AVX512BW 0 "register_operand") + (ashiftrt:VI248_AVX512BW + (match_operand:VI248_AVX512BW 1 "nonimmediate_operand") + (match_operand:DI 2 "nonmemory_operand")))] + "TARGET_AVX512F") + +(define_expand "ashrv4di3" + [(set (match_operand:V4DI 0 "register_operand") + (ashiftrt:V4DI + (match_operand:V4DI 1 "nonimmediate_operand") + (match_operand:DI 2 "nonmemory_operand")))] + "TARGET_AVX2" +{ + if (!TARGET_AVX512VL) + { + if (CONST_INT_P (operands[2]) && UINTVAL (operands[2]) >= 63) + { + rtx zero = force_reg (V4DImode, CONST0_RTX (V4DImode)); + emit_insn (gen_avx2_gtv4di3 (operands[0], zero, operands[1])); + DONE; + } + if (operands[2] == const0_rtx) + { + emit_move_insn (operands[0], operands[1]); + DONE; + } + operands[1] = force_reg (V4DImode, operands[1]); + if (CONST_INT_P (operands[2])) + { + vec_perm_builder sel (8, 8, 1); + sel.quick_grow (8); + rtx arg0, arg1; + rtx op1 = lowpart_subreg (V8SImode, operands[1], V4DImode); + rtx target = gen_reg_rtx (V8SImode); + if (INTVAL (operands[2]) > 32) + { + arg0 = gen_reg_rtx (V8SImode); + arg1 = gen_reg_rtx (V8SImode); + emit_insn (gen_ashrv8si3 (arg1, op1, GEN_INT (31))); + emit_insn (gen_ashrv8si3 (arg0, op1, + GEN_INT (INTVAL (operands[2]) - 32))); + sel[0] = 1; + sel[1] = 9; + sel[2] = 3; + sel[3] = 11; + sel[4] = 5; + sel[5] = 13; + sel[6] = 7; + sel[7] = 15; + } + else if (INTVAL 
(operands[2]) == 32) + { + arg0 = op1; + arg1 = gen_reg_rtx (V8SImode); + emit_insn (gen_ashrv8si3 (arg1, op1, GEN_INT (31))); + sel[0] = 1; + sel[1] = 9; + sel[2] = 3; + sel[3] = 11; + sel[4] = 5; + sel[5] = 13; + sel[6] = 7; + sel[7] = 15; + } + else + { + arg0 = gen_reg_rtx (V4DImode); + arg1 = gen_reg_rtx (V8SImode); + emit_insn (gen_lshrv4di3 (arg0, operands[1], operands[2])); + emit_insn (gen_ashrv8si3 (arg1, op1, operands[2])); + arg0 = lowpart_subreg (V8SImode, arg0, V4DImode); + sel[0] = 0; + sel[1] = 9; + sel[2] = 2; + sel[3] = 11; + sel[4] = 4; + sel[5] = 13; + sel[6] = 6; + sel[7] = 15; + } + vec_perm_indices indices (sel, 2, 8); + bool ok = targetm.vectorize.vec_perm_const (V8SImode, target, + arg0, arg1, indices); + gcc_assert (ok); + emit_move_insn (operands[0], + lowpart_subreg (V4DImode, target, V8SImode)); + DONE; + } + + rtx zero = force_reg (V4DImode, CONST0_RTX (V4DImode)); + rtx zero_or_all_ones = gen_reg_rtx (V4DImode); + emit_insn (gen_avx2_gtv4di3 (zero_or_all_ones, zero, operands[1])); + rtx lshr_res = gen_reg_rtx (V4DImode); + emit_insn (gen_lshrv4di3 (lshr_res, operands[1], operands[2])); + rtx ashl_res = gen_reg_rtx (V4DImode); + rtx amount; + if (TARGET_64BIT) + { + amount = gen_reg_rtx (DImode); + emit_insn (gen_subdi3 (amount, force_reg (DImode, GEN_INT (64)), + operands[2])); + } + else + { + rtx temp = gen_reg_rtx (SImode); + emit_insn (gen_subsi3 (temp, force_reg (SImode, GEN_INT (64)), + lowpart_subreg (SImode, operands[2], + DImode))); + amount = gen_reg_rtx (V4SImode); + emit_insn (gen_vec_setv4si_0 (amount, CONST0_RTX (V4SImode), + temp)); + } + amount = lowpart_subreg (DImode, amount, GET_MODE (amount)); + emit_insn (gen_ashlv4di3 (ashl_res, zero_or_all_ones, amount)); + emit_insn (gen_iorv4di3 (operands[0], lshr_res, ashl_res)); + DONE; + } +}) + (define_insn "3" [(set (match_operand:VI248_AVX512BW_2 0 "register_operand" "=v,v") (any_lshift:VI248_AVX512BW_2 @@ -20329,10 +20449,132 @@ (ashiftrt:V2DI (match_operand:V2DI 1 
"register_operand") (match_operand:DI 2 "nonmemory_operand")))] - "TARGET_XOP || TARGET_AVX512VL" + "TARGET_SSE2" { if (!TARGET_AVX512VL) { + if (TARGET_SSE4_2 + && CONST_INT_P (operands[2]) + && UINTVAL (operands[2]) >= 63) + { + rtx zero = force_reg (V2DImode, CONST0_RTX (V2DImode)); + emit_insn (gen_sse4_2_gtv2di3 (operands[0], zero, operands[1])); + DONE; + } + if (operands[2] == const0_rtx) + { + emit_move_insn (operands[0], operands[1]); + DONE; + } + if (CONST_INT_P (operands[2]) + && (!TARGET_XOP || UINTVAL (operands[2]) >= 63)) + { + vec_perm_builder sel (4, 4, 1); + sel.quick_grow (4); + rtx arg0, arg1; + rtx op1 = lowpart_subreg (V4SImode, operands[1], V2DImode); + rtx target = gen_reg_rtx (V4SImode); + if (UINTVAL (operands[2]) >= 63) + { + arg0 = arg1 = gen_reg_rtx (V4SImode); + emit_insn (gen_ashrv4si3 (arg0, op1, GEN_INT (31))); + sel[0] = 1; + sel[1] = 1; + sel[2] = 3; + sel[3] = 3; + } + else if (INTVAL (operands[2]) > 32) + { + arg0 = gen_reg_rtx (V4SImode); + arg1 = gen_reg_rtx (V4SImode); + emit_insn (gen_ashrv4si3 (arg1, op1, GEN_INT (31))); + emit_insn (gen_ashrv4si3 (arg0, op1, + GEN_INT (INTVAL (operands[2]) - 32))); + sel[0] = 1; + sel[1] = 5; + sel[2] = 3; + sel[3] = 7; + } + else if (INTVAL (operands[2]) == 32) + { + arg0 = op1; + arg1 = gen_reg_rtx (V4SImode); + emit_insn (gen_ashrv4si3 (arg1, op1, GEN_INT (31))); + sel[0] = 1; + sel[1] = 5; + sel[2] = 3; + sel[3] = 7; + } + else + { + arg0 = gen_reg_rtx (V2DImode); + arg1 = gen_reg_rtx (V4SImode); + emit_insn (gen_lshrv2di3 (arg0, operands[1], operands[2])); + emit_insn (gen_ashrv4si3 (arg1, op1, operands[2])); + arg0 = lowpart_subreg (V4SImode, arg0, V2DImode); + sel[0] = 0; + sel[1] = 5; + sel[2] = 2; + sel[3] = 7; + } + vec_perm_indices indices (sel, arg0 != arg1 ? 
2 : 1, 4); + bool ok = targetm.vectorize.vec_perm_const (V4SImode, target, + arg0, arg1, indices); + gcc_assert (ok); + emit_move_insn (operands[0], + lowpart_subreg (V2DImode, target, V4SImode)); + DONE; + } + if (!TARGET_XOP) + { + rtx zero = force_reg (V2DImode, CONST0_RTX (V2DImode)); + rtx zero_or_all_ones; + if (TARGET_SSE4_2) + { + zero_or_all_ones = gen_reg_rtx (V2DImode); + emit_insn (gen_sse4_2_gtv2di3 (zero_or_all_ones, zero, + operands[1])); + } + else + { + rtx temp = gen_reg_rtx (V4SImode); + emit_insn (gen_ashrv4si3 (temp, lowpart_subreg (V4SImode, + operands[1], + V2DImode), + GEN_INT (31))); + zero_or_all_ones = gen_reg_rtx (V4SImode); + emit_insn (gen_sse2_pshufd_1 (zero_or_all_ones, temp, + const1_rtx, const1_rtx, + GEN_INT (3), GEN_INT (3))); + zero_or_all_ones = lowpart_subreg (V2DImode, zero_or_all_ones, + V4SImode); + } + rtx lshr_res = gen_reg_rtx (V2DImode); + emit_insn (gen_lshrv2di3 (lshr_res, operands[1], operands[2])); + rtx ashl_res = gen_reg_rtx (V2DImode); + rtx amount; + if (TARGET_64BIT) + { + amount = gen_reg_rtx (DImode); + emit_insn (gen_subdi3 (amount, force_reg (DImode, GEN_INT (64)), + operands[2])); + } + else + { + rtx temp = gen_reg_rtx (SImode); + emit_insn (gen_subsi3 (temp, force_reg (SImode, GEN_INT (64)), + lowpart_subreg (SImode, operands[2], + DImode))); + amount = gen_reg_rtx (V4SImode); + emit_insn (gen_vec_setv4si_0 (amount, CONST0_RTX (V4SImode), + temp)); + } + amount = lowpart_subreg (DImode, amount, GET_MODE (amount)); + emit_insn (gen_ashlv2di3 (ashl_res, zero_or_all_ones, amount)); + emit_insn (gen_iorv2di3 (operands[0], lshr_res, ashl_res)); + DONE; + } + rtx reg = gen_reg_rtx (V2DImode); rtx par; bool negate = false; diff --git a/gcc/testsuite/gcc.dg/torture/vshuf-4.inc b/gcc/testsuite/gcc.dg/torture/vshuf-4.inc index d041b33..fb35df8 100644 --- a/gcc/testsuite/gcc.dg/torture/vshuf-4.inc +++ b/gcc/testsuite/gcc.dg/torture/vshuf-4.inc @@ -25,7 +25,9 @@ T (21, 2, 6, 3, 7) \ T (22, 1, 2, 3, 0) \ T (23, 2, 1, 
0, 3) \ T (24, 2, 5, 6, 3) \ -T (25, 0, 1, 4, 5) +T (25, 0, 1, 4, 5) \ +T (26, 1, 5, 3, 7) \ +T (27, 0, 5, 2, 7) #define EXPTESTS \ T (116, 1, 2, 4, 3) \ T (117, 7, 3, 3, 0) \ diff --git a/gcc/testsuite/gcc.dg/torture/vshuf-8.inc b/gcc/testsuite/gcc.dg/torture/vshuf-8.inc index de358f3..d628039 100644 --- a/gcc/testsuite/gcc.dg/torture/vshuf-8.inc +++ b/gcc/testsuite/gcc.dg/torture/vshuf-8.inc @@ -27,7 +27,9 @@ T (23, 6, 5, 4, 3, 2, 1, 0, 7) \ T (24, 0, 1, 2, 3, 8, 9, 10, 11) \ T (25, 0, 1, 2, 3, 12, 13, 14, 15) \ T (26, 0, 1, 8, 9, 10, 11, 12, 13) \ -T (27, 0, 8, 9, 10, 11, 12, 13, 14) +T (27, 0, 8, 9, 10, 11, 12, 13, 14) \ +T (28, 1, 9, 3, 11, 5, 13, 7, 15) \ +T (29, 0, 9, 2, 11, 4, 13, 6, 15) #define EXPTESTS \ T (116, 9, 3, 9, 4, 7, 0, 0, 6) \ T (117, 4, 14, 12, 8, 9, 6, 0, 10) \ diff --git a/gcc/testsuite/gcc.target/i386/avx-pr82370.c b/gcc/testsuite/gcc.target/i386/avx-pr82370.c index 4dc8a5b..dc12dbf 100644 --- a/gcc/testsuite/gcc.target/i386/avx-pr82370.c +++ b/gcc/testsuite/gcc.target/i386/avx-pr82370.c @@ -4,7 +4,7 @@ /* { dg-final { scan-assembler-times "vpslld\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 3 } } */ /* { dg-final { scan-assembler-times "vpsllq\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 3 } } */ /* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 3 } } */ -/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 3 } } */ +/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 6 } } */ /* { dg-final { scan-assembler-times "vpsraq\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 0 } } */ /* { dg-final { scan-assembler-times "vpsraw\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 3 } } */ /* { dg-final { scan-assembler-times "vpsrld\[ \t]\+\\\$5, %xmm\[0-9]\+, %xmm\[0-9]\+" 3 } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx-psraq-1.c b/gcc/testsuite/gcc.target/i386/avx-psraq-1.c new file mode 100644 index 0000000..2722088 --- /dev/null +++ 
b/gcc/testsuite/gcc.target/i386/avx-psraq-1.c @@ -0,0 +1,13 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx -mno-avx2" } */ +/* { dg-require-effective-target avx } */ + +#ifndef CHECK_H +#define CHECK_H "avx-check.h" +#endif + +#ifndef TEST +#define TEST avx_test +#endif + +#include "sse2-psraq-1.c" diff --git a/gcc/testsuite/gcc.target/i386/avx2-pr82370.c b/gcc/testsuite/gcc.target/i386/avx2-pr82370.c index 6609ebb..df3dfd8 100644 --- a/gcc/testsuite/gcc.target/i386/avx2-pr82370.c +++ b/gcc/testsuite/gcc.target/i386/avx2-pr82370.c @@ -4,7 +4,7 @@ /* { dg-final { scan-assembler-times "vpslld\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */ /* { dg-final { scan-assembler-times "vpsllq\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */ /* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */ -/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */ +/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 2 } } */ /* { dg-final { scan-assembler-times "vpsraq\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 0 } } */ /* { dg-final { scan-assembler-times "vpsraw\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */ /* { dg-final { scan-assembler-times "vpsrld\[ \t]\+\\\$5, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */ @@ -13,7 +13,7 @@ /* { dg-final { scan-assembler-times "vpslld\[ \t]\+\\\$7, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */ /* { dg-final { scan-assembler-times "vpsllq\[ \t]\+\\\$7, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */ /* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$7, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */ -/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */ +/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 2 } } */ /* { dg-final { scan-assembler-times "vpsraq\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 0 } } */ /* { dg-final { scan-assembler-times "vpsraw\[ \t]\+\\\$3, %ymm\[0-9]\+, 
%ymm\[0-9]\+" 1 } } */ /* { dg-final { scan-assembler-times "vpsrld\[ \t]\+\\\$5, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx2-psraq-1.c b/gcc/testsuite/gcc.target/i386/avx2-psraq-1.c new file mode 100644 index 0000000..e9051bf --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx2-psraq-1.c @@ -0,0 +1,51 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx2 -mno-avx512f" } */ +/* { dg-require-effective-target avx2 } */ + +#ifndef CHECK_H +#define CHECK_H "avx2-check.h" +#endif + +#ifndef TEST +#define TEST avx2_test +#endif + +#include CHECK_H + +typedef long long V __attribute__((vector_size (32))); + +#define TESTN(N) \ +static V \ +__attribute__((noipa)) \ +test##N (V x) \ +{ \ + return x >> N; \ +} + +#define TESTS TESTN (63) TESTN (49) TESTN (32) TESTN (31) TESTN (18) +TESTS + +struct +{ + int n; + V (*fn) (V); +} tests[] = { +#undef TESTN +#define TESTN(N) { N, test##N }, + TESTS +}; + +static void +TEST (void) +{ + V a = (V) { 0xdeadbeefcafebabeULL, 0x123456789abcdef0ULL, + 0x173a74be8a95134cULL, 0x817bae35ac0ebf12ULL }; + int i; + for (i = 0; i < (int) (sizeof (tests) / sizeof (tests[0])); i++) /* tests[] has no zero sentinel; bound by its size. */ + { + V c = tests[i].fn (a); + if (c[0] != a[0] >> tests[i].n || c[1] != a[1] >> tests[i].n + || c[2] != a[2] >> tests[i].n || c[3] != a[3] >> tests[i].n) + abort (); + } +} diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr82370.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr82370.c index 174f499..12c3b27 100644 --- a/gcc/testsuite/gcc.target/i386/avx512bw-pr82370.c +++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr82370.c @@ -4,7 +4,7 @@ /* { dg-final { scan-assembler-times "vpslld\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */ /* { dg-final { scan-assembler-times "vpsllq\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */ /* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */ -/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */ +/* { dg-final { scan-assembler-times "vpsrad\[ 
\t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 2 } } */ /* { dg-final { scan-assembler-times "vpsraq\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 0 } } */ /* { dg-final { scan-assembler-times "vpsraw\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */ /* { dg-final { scan-assembler-times "vpsrld\[ \t]\+\\\$5, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */ @@ -13,7 +13,7 @@ /* { dg-final { scan-assembler-times "vpslld\[ \t]\+\\\$7, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */ /* { dg-final { scan-assembler-times "vpsllq\[ \t]\+\\\$7, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */ /* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$7, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */ -/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */ +/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 2 } } */ /* { dg-final { scan-assembler-times "vpsraq\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 0 } } */ /* { dg-final { scan-assembler-times "vpsraw\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */ /* { dg-final { scan-assembler-times "vpsrld\[ \t]\+\\\$5, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr82370.c b/gcc/testsuite/gcc.target/i386/avx512f-pr82370.c index 20ad8dc..b179f9b 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-pr82370.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-pr82370.c @@ -4,7 +4,7 @@ /* { dg-final { scan-assembler-times "vpslld\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */ /* { dg-final { scan-assembler-times "vpsllq\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */ /* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */ -/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */ +/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 2 } } */ /* { dg-final { scan-assembler-times "vpsraq\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 0 } } */ /* { dg-final { 
scan-assembler-times "vpsraw\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */ /* { dg-final { scan-assembler-times "vpsrld\[ \t]\+\\\$5, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */ @@ -13,7 +13,7 @@ /* { dg-final { scan-assembler-times "vpslld\[ \t]\+\\\$7, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */ /* { dg-final { scan-assembler-times "vpsllq\[ \t]\+\\\$7, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */ /* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$7, %ymm\[0-9]\+, %ymm\[0-9]\+" 3 } } */ -/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */ +/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 2 } } */ /* { dg-final { scan-assembler-times "vpsraq\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 0 } } */ /* { dg-final { scan-assembler-times "vpsraw\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 3 } } */ /* { dg-final { scan-assembler-times "vpsrld\[ \t]\+\\\$5, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */ diff --git a/gcc/testsuite/gcc.target/i386/sse2-psraq-1.c b/gcc/testsuite/gcc.target/i386/sse2-psraq-1.c new file mode 100644 index 0000000..9a08ee4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-psraq-1.c @@ -0,0 +1,53 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -msse2 -mno-sse3" } */ +/* { dg-require-effective-target sse2 } */ + +#ifndef CHECK_H +#define CHECK_H "sse2-check.h" +#endif + +#ifndef TEST +#define TEST sse2_test +#endif + +#include CHECK_H + +typedef long long V __attribute__((vector_size (16))); + +#define TESTN(N) \ +static V \ +__attribute__((noipa)) \ +test##N (V x) \ +{ \ + return x >> N; \ +} + +#define TESTS TESTN (63) TESTN (49) TESTN (32) TESTN (31) TESTN (18) +TESTS + +struct +{ + int n; + V (*fn) (V); +} tests[] = { +#undef TESTN +#define TESTN(N) { N, test##N }, + TESTS +}; + +static void +TEST (void) +{ + V a = (V) { 0xdeadbeefcafebabeULL, 0x123456789abcdef0ULL }; + V b = (V) { 0x173a74be8a95134cULL, 0x817bae35ac0ebf12ULL }; + int i; + for (i = 0; i < (int) (sizeof (tests) / sizeof (tests[0])); i++) /* tests[] has no zero sentinel; bound by its size. */ + { + V c = 
tests[i].fn (a); + if (c[0] != a[0] >> tests[i].n || c[1] != a[1] >> tests[i].n) + abort (); + c = tests[i].fn (b); + if (c[0] != b[0] >> tests[i].n || c[1] != b[1] >> tests[i].n) + abort (); + } +} diff --git a/gcc/testsuite/gcc.target/i386/sse4_2-psraq-1.c b/gcc/testsuite/gcc.target/i386/sse4_2-psraq-1.c new file mode 100644 index 0000000..947b623 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse4_2-psraq-1.c @@ -0,0 +1,13 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -msse4.2 -mno-avx" } */ +/* { dg-require-effective-target sse4 } */ + +#ifndef CHECK_H +#define CHECK_H "sse4_2-check.h" +#endif + +#ifndef TEST +#define TEST sse4_2_test +#endif + +#include "sse2-psraq-1.c" -- 2.7.4