From 829c4bea06600ea4201462f91ce6d76ca21fdb35 Mon Sep 17 00:00:00 2001
From: Jakub Jelinek <jakub@redhat.com>
Date: Thu, 13 May 2021 12:14:14 +0200
Subject: [PATCH] ix86: Support V{2, 4}DImode arithmetic right shifts for SSE2+
 [PR98856]

As mentioned in the PR, we don't support arithmetic right shifts in V2DImode
or V4DImode on x86 without -mavx512vl or -mxop.  The ISAs indeed don't have
{,v}psraq instructions until AVX512VL, but we actually can emulate it quite
easily.
One case is arithmetic >> 63, we can just emit {,v}pxor; {,v}pcmpgt for
that for SSE4.2+, or for SSE2 psrad $31; pshufd $0xf5.
Then arithmetic >> by constant > 32, that can be done with {,v}psrad $31
and {,v}psrad $(cst-32) and two operand permutation,
arithmetic >> 32 can be done as {,v}psrad $31 and permutation of that
and the original operand.  Arithmetic >> by constant < 32 can be done
as {,v}psrad $cst and {,v}psrlq $cst and two operand permutation.
And arithmetic >> by variable scalar amount can be done as
arithmetic >> 63, logical >> by the amount, << of the >> 63 result by
(64 - amount) (note that the vector << 64 results in 0) and oring together.

I had to improve the permutation generation so that it actually handles
the needed permutations (or handles them better).

2021-05-13  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/98856
	* config/i386/i386.c (ix86_shift_rotate_cost): Add CODE argument.
	Expect V2DI and V4DI arithmetic right shifts to be emulated.
	(ix86_rtx_costs, ix86_add_stmt_cost): Adjust ix86_shift_rotate_cost
	caller.
	* config/i386/i386-expand.c (expand_vec_perm_2perm_interleave,
	expand_vec_perm_2perm_pblendv): New functions.
	(ix86_expand_vec_perm_const_1): Use them.
	* config/i386/sse.md (ashr<mode>3<mask_name>): Rename to ...
	(<mask_codefor>ashr<mode>3<mask_name>): ... this.
	(ashr<mode>3): New define_expand with VI248_AVX512BW iterator.
	(ashrv4di3): New define_expand.
	(ashrv2di3): Change condition to TARGET_SSE2, handle !TARGET_XOP
	and !TARGET_AVX512VL expansion.

	* gcc.target/i386/sse2-psraq-1.c: New test.
	* gcc.target/i386/sse4_2-psraq-1.c: New test.
	* gcc.target/i386/avx-psraq-1.c: New test.
	* gcc.target/i386/avx2-psraq-1.c: New test.
	* gcc.target/i386/avx-pr82370.c: Adjust expected number of vpsrad
	instructions.
	* gcc.target/i386/avx2-pr82370.c: Likewise.
	* gcc.target/i386/avx512f-pr82370.c: Likewise.
	* gcc.target/i386/avx512bw-pr82370.c: Likewise.
	* gcc.dg/torture/vshuf-4.inc: Add two further permutations.
	* gcc.dg/torture/vshuf-8.inc: Likewise.
---
 gcc/config/i386/i386-expand.c                    | 248 +++++++++++++++++++++++
 gcc/config/i386/i386.c                           |  29 ++-
 gcc/config/i386/sse.md                           | 246 +++++++++++++++++++++-
 gcc/testsuite/gcc.dg/torture/vshuf-4.inc         |   4 +-
 gcc/testsuite/gcc.dg/torture/vshuf-8.inc         |   4 +-
 gcc/testsuite/gcc.target/i386/avx-pr82370.c      |   2 +-
 gcc/testsuite/gcc.target/i386/avx-psraq-1.c      |  13 ++
 gcc/testsuite/gcc.target/i386/avx2-pr82370.c     |   4 +-
 gcc/testsuite/gcc.target/i386/avx2-psraq-1.c     |  51 +++++
 gcc/testsuite/gcc.target/i386/avx512bw-pr82370.c |   4 +-
 gcc/testsuite/gcc.target/i386/avx512f-pr82370.c  |   4 +-
 gcc/testsuite/gcc.target/i386/sse2-psraq-1.c     |  53 +++++
 gcc/testsuite/gcc.target/i386/sse4_2-psraq-1.c   |  13 ++
 13 files changed, 660 insertions(+), 15 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx-psraq-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx2-psraq-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-psraq-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse4_2-psraq-1.c
diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 92488b8..0fa8d45 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -18662,6 +18662,242 @@ expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement a two
+   vector permutation that alternates operands element by element using two
+   single vector permutations and {,v}{,p}unpckl{ps,pd,bw,wd,dq}.  If
+   TWO_INSN, succeed only if dfirst or dsecond is identity permutation.  */
+
+static bool
+expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn)
+{
+  unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt;
+  struct expand_vec_perm_d dfirst, dsecond, dfinal;
+  bool ident1 = true, ident2 = true;
+
+  if (d->one_operand_p)
+    return false;
+
+  if (GET_MODE_SIZE (d->vmode) == 16)
+    {
+      if (!TARGET_SSE)
+	return false;
+      if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2)
+	return false;
+    }
+  else if (GET_MODE_SIZE (d->vmode) == 32)
+    {
+      if (!TARGET_AVX)
+	return false;
+      if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)
+	return false;
+      lane = nelt2;	/* Index each half of the vector separately.  */
+    }
+  else
+    return false;
+
+  for (i = 1; i < nelt; i++)	/* Source operand must alternate.  */
+    if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1)))
+      return false;
+
+  dfirst = *d;
+  dsecond = *d;
+  dfinal = *d;
+  dfirst.op1 = dfirst.op0;
+  dfirst.one_operand_p = true;
+  dsecond.op0 = dsecond.op1;
+  dsecond.one_operand_p = true;
+
+  for (i = 0; i < nelt; i++)	/* Split D into one perm per operand.  */
+    if (d->perm[i] >= nelt)
+      {
+	dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt;
+	if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0))
+	  ident2 = false;
+	dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)]
+	  = d->perm[i] - nelt;	/* Slot the final interleave ignores.  */
+      }
+    else
+      {
+	dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i];
+	if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0))
+	  ident1 = false;
+	dfirst.perm[i / 2 + (i >= lane ? lane : lane / 2)] = d->perm[i];
+      }
+
+  if (two_insn && !ident1 && !ident2)
+    return false;
+
+  if (!d->testing_p)
+    {
+      if (!ident1)
+	dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
+      if (!ident2)
+	dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
+      if (d->perm[0] >= nelt)
+	std::swap (dfinal.op0, dfinal.op1);	/* Element 0 is from op1.  */
+    }
+
+  bool ok;
+  rtx_insn *seq1 = NULL, *seq2 = NULL;
+
+  if (!ident1)
+    {
+      start_sequence ();
+      ok = expand_vec_perm_1 (&dfirst);
+      seq1 = get_insns ();
+      end_sequence ();
+
+      if (!ok)
+	return false;
+    }
+
+  if (!ident2)
+    {
+      start_sequence ();
+      ok = expand_vec_perm_1 (&dsecond);
+      seq2 = get_insns ();
+      end_sequence ();
+
+      if (!ok)
+	return false;
+    }
+
+  if (d->testing_p)
+    return true;	/* Only testing: the sequences are not emitted.  */
+
+  for (i = 0; i < nelt; i++)	/* Build the interleave-low selector.  */
+    {
+      dfinal.perm[i] = i / 2;
+      if (i >= lane)
+	dfinal.perm[i] += lane / 2;
+      if ((i & 1) != 0)
+	dfinal.perm[i] += nelt;
+    }
+  emit_insn (seq1);	/* NULL when dfirst is identity.  */
+  emit_insn (seq2);	/* NULL when dsecond is identity.  */
+  ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1,
+			       dfinal.perm, dfinal.nelt, false);
+  gcc_assert (ok);
+  return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement a two
+   vector permutation by permuting each operand separately and then merging
+   the results elementwise with expand_vec_perm_blend (SSE4_1 pblendv).  If
+   TWO_INSN, succeed only if dfirst or dsecond is identity permutation.  */
+
+static bool
+expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
+{
+  unsigned i, nelt = d->nelt;
+  struct expand_vec_perm_d dfirst, dsecond, dfinal;
+  machine_mode vmode = d->vmode;
+  bool ident1 = true, ident2 = true;
+
+  /* Use the same checks as in expand_vec_perm_blend.  */
+  if (d->one_operand_p)
+    return false;
+  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
+    ;
+  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
+    ;
+  else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
+    ;
+  else
+    return false;
+
+  dfirst = *d;
+  dsecond = *d;
+  dfinal = *d;
+  dfirst.op1 = dfirst.op0;
+  dfirst.one_operand_p = true;
+  dsecond.op0 = dsecond.op1;
+  dsecond.one_operand_p = true;
+
+  for (i = 0; i < nelt; ++i)	/* Split D into one perm per operand.  */
+    if (d->perm[i] >= nelt)
+      {
+	dfirst.perm[i] = 0xff;	/* Placeholder: element is from op1.  */
+	dsecond.perm[i] = d->perm[i] - nelt;
+	if (d->perm[i] != i + nelt)
+	  ident2 = false;
+      }
+    else
+      {
+	dsecond.perm[i] = 0xff;	/* Placeholder: element is from op0.  */
+	dfirst.perm[i] = d->perm[i];
+	if (d->perm[i] != i)
+	  ident1 = false;
+      }
+
+  if (two_insn && !ident1 && !ident2)
+    return false;
+
+  /* For now give 0xff slots real indices.  Ideally make 0xff a wildcard.  */
+  for (i = 0; i < nelt; ++i)
+    if (dfirst.perm[i] == 0xff)
+      {
+	if (GET_MODE_SIZE (vmode) == 32
+	    && dfirst.perm[i ^ (nelt / 2)] != 0xff)
+	  dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2);
+	else
+	  dfirst.perm[i] = i;	/* Leave the element in place.  */
+      }
+    else
+      {
+	if (GET_MODE_SIZE (vmode) == 32
+	    && dsecond.perm[i ^ (nelt / 2)] != 0xff)
+	  dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2);
+	else
+	  dsecond.perm[i] = i;	/* Leave the element in place.  */
+      }
+
+  if (!d->testing_p)
+    {
+      if (!ident1)
+	dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
+      if (!ident2)
+	dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
+    }
+
+  bool ok;
+  rtx_insn *seq1 = NULL, *seq2 = NULL;
+
+  if (!ident1)
+    {
+      start_sequence ();
+      ok = expand_vec_perm_1 (&dfirst);
+      seq1 = get_insns ();
+      end_sequence ();
+
+      if (!ok)
+	return false;
+    }
+
+  if (!ident2)
+    {
+      start_sequence ();
+      ok = expand_vec_perm_1 (&dsecond);
+      seq2 = get_insns ();
+      end_sequence ();
+
+      if (!ok)
+	return false;
+    }
+
+  if (d->testing_p)
+    return true;	/* Only testing: the sequences are not emitted.  */
+
+  for (i = 0; i < nelt; ++i)	/* Element I comes from op0 or op1 slot I.  */
+    dfinal.perm[i] = (d->perm[i] >= nelt ? i + nelt : i);
+
+  emit_insn (seq1);	/* NULL when dfirst is identity.  */
+  emit_insn (seq2);	/* NULL when dsecond is identity.  */
+  ok = expand_vec_perm_blend (&dfinal);
+  gcc_assert (ok);
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
    permutation using two vperm2f128, followed by a vshufpd insn blending
    the two vectors together.  */
@@ -19773,6 +20009,12 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
   if (expand_vec_perm_pblendv (d))
     return true;
 
+  if (expand_vec_perm_2perm_interleave (d, true))
+    return true;
+
+  if (expand_vec_perm_2perm_pblendv (d, true))
+    return true;
+
   /* Try sequences of three instructions.  */
 
   if (expand_vec_perm_even_odd_pack (d))
@@ -19790,6 +20032,12 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
   if (expand_vec_perm_vperm2f128_vblend (d))
     return true;
 
+  if (expand_vec_perm_2perm_interleave (d, false))
+    return true;
+
+  if (expand_vec_perm_2perm_pblendv (d, false))
+    return true;
+
   /* Try sequences of four instructions.  */
 
   if (expand_vec_perm_even_odd_trunc (d))
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 915f89f..6a1f574 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -19732,6 +19732,7 @@ ix86_division_cost (const struct processor_costs *cost,
 
 static int
 ix86_shift_rotate_cost (const struct processor_costs *cost,
+			enum rtx_code code,
 			enum machine_mode mode, bool constant_op1,
 			HOST_WIDE_INT op1_val,
 			bool speed,
@@ -19770,6 +19771,19 @@ ix86_shift_rotate_cost (const struct processor_costs *cost,
 	    count = 7;
 	  return ix86_vec_cost (mode, cost->sse_op * count);
 	}
+      /* V*DImode arithmetic right shift is emulated.  */
+      else if (code == ASHIFTRT
+	       && (mode == V2DImode || mode == V4DImode)
+	       && !TARGET_XOP
+	       && !TARGET_AVX512VL)
+	{
+	  int count = 4;
+	  if (constant_op1 && op1_val == 63 && TARGET_SSE4_2)
+	    count = 2;
+	  else if (constant_op1)
+	    count = 3;
+	  return ix86_vec_cost (mode, cost->sse_op * count);
+	}
       else
 	return ix86_vec_cost (mode, cost->sse_op);
     }
@@ -19939,13 +19953,15 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
     case LSHIFTRT:
     case ROTATERT:
       bool skip_op0, skip_op1;
-      *total = ix86_shift_rotate_cost (cost, mode, CONSTANT_P (XEXP (x, 1)),
+      *total = ix86_shift_rotate_cost (cost, code, mode,
+				       CONSTANT_P (XEXP (x, 1)),
 				       CONST_INT_P (XEXP (x, 1))
 					 ? INTVAL (XEXP (x, 1)) : -1,
 				       speed,
 				       GET_CODE (XEXP (x, 1)) == AND,
 				       SUBREG_P (XEXP (x, 1))
-				       && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND,
+				       && GET_CODE (XEXP (XEXP (x, 1),
+							  0)) == AND,
 				       &skip_op0, &skip_op1);
       if (skip_op0 || skip_op1)
 	{
@@ -22383,11 +22399,16 @@ ix86_add_stmt_cost (class vec_info *vinfo, void *data, int count,
 	case LROTATE_EXPR:
 	case RROTATE_EXPR:
 	  {
+	    tree op1 = gimple_assign_rhs1 (stmt_info->stmt);
 	    tree op2 = gimple_assign_rhs2 (stmt_info->stmt);
 	    stmt_cost = ix86_shift_rotate_cost
-			   (ix86_cost, mode,
+			   (ix86_cost,
+			    (subcode == RSHIFT_EXPR
+			     && !TYPE_UNSIGNED (TREE_TYPE (op1)))
+			    ? ASHIFTRT : LSHIFTRT, mode,
 		            TREE_CODE (op2) == INTEGER_CST,
-			    cst_and_fits_in_hwi (op2) ? int_cst_value (op2) : -1,
+			    cst_and_fits_in_hwi (op2)
+			    ? int_cst_value (op2) : -1,
 		            true, false, false, NULL, NULL);
 	  }
 	  break;
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 4072d0c..62f4e15 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -12468,7 +12468,7 @@
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "<sseinsnmode>")])
 
-(define_insn "ashr<mode>3<mask_name>"
+(define_insn "<mask_codefor>ashr<mode>3<mask_name>"
   [(set (match_operand:VI248_AVX512BW_AVX512VL 0 "register_operand" "=v,v")
 	(ashiftrt:VI248_AVX512BW_AVX512VL
 	  (match_operand:VI248_AVX512BW_AVX512VL 1 "nonimmediate_operand" "v,vm")
@@ -12482,6 +12482,126 @@
        (const_string "0")))
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_expand "ashr<mode>3"
+  [(set (match_operand:VI248_AVX512BW 0 "register_operand")
+	(ashiftrt:VI248_AVX512BW
+	  (match_operand:VI248_AVX512BW 1 "nonimmediate_operand")
+	  (match_operand:DI 2 "nonmemory_operand")))]
+  "TARGET_AVX512F")	; No preparation statements; the template is used as is.
+
+(define_expand "ashrv4di3"
+  [(set (match_operand:V4DI 0 "register_operand")
+	(ashiftrt:V4DI
+	  (match_operand:V4DI 1 "nonimmediate_operand")
+	  (match_operand:DI 2 "nonmemory_operand")))]
+  "TARGET_AVX2"
+{
+  if (!TARGET_AVX512VL)	/* Emulate; otherwise the template is used.  */
+    {
+      if (CONST_INT_P (operands[2]) && UINTVAL (operands[2]) >= 63)
+	{
+	  rtx zero = force_reg (V4DImode, CONST0_RTX (V4DImode));
+	  emit_insn (gen_avx2_gtv4di3 (operands[0], zero, operands[1]));
+	  DONE;	/* 0 > x is all ones exactly when x is negative.  */
+	}
+      if (operands[2] == const0_rtx)
+	{
+	  emit_move_insn (operands[0], operands[1]);	/* Shift by 0.  */
+	  DONE;
+	}
+      operands[1] = force_reg (V4DImode, operands[1]);
+      if (CONST_INT_P (operands[2]))
+	{
+	  vec_perm_builder sel (8, 8, 1);
+	  sel.quick_grow (8);
+	  rtx arg0, arg1;
+	  rtx op1 = lowpart_subreg (V8SImode, operands[1], V4DImode);
+	  rtx target = gen_reg_rtx (V8SImode);
+	  if (INTVAL (operands[2]) > 32)
+	    {
+	      arg0 = gen_reg_rtx (V8SImode);
+	      arg1 = gen_reg_rtx (V8SImode);
+	      emit_insn (gen_ashrv8si3 (arg1, op1, GEN_INT (31)));
+	      emit_insn (gen_ashrv8si3 (arg0, op1,
+					GEN_INT (INTVAL (operands[2]) - 32)));
+	      sel[0] = 1;	/* Low dword: high dword >> (count - 32).  */
+	      sel[1] = 9;	/* High dword: sign copy from arg1.  */
+	      sel[2] = 3;
+	      sel[3] = 11;
+	      sel[4] = 5;
+	      sel[5] = 13;
+	      sel[6] = 7;
+	      sel[7] = 15;
+	    }
+	  else if (INTVAL (operands[2]) == 32)
+	    {
+	      arg0 = op1;
+	      arg1 = gen_reg_rtx (V8SImode);
+	      emit_insn (gen_ashrv8si3 (arg1, op1, GEN_INT (31)));
+	      sel[0] = 1;	/* Low dword: original high dword.  */
+	      sel[1] = 9;	/* High dword: sign copy from arg1.  */
+	      sel[2] = 3;
+	      sel[3] = 11;
+	      sel[4] = 5;
+	      sel[5] = 13;
+	      sel[6] = 7;
+	      sel[7] = 15;
+	    }
+	  else
+	    {
+	      arg0 = gen_reg_rtx (V4DImode);
+	      arg1 = gen_reg_rtx (V8SImode);
+	      emit_insn (gen_lshrv4di3 (arg0, operands[1], operands[2]));
+	      emit_insn (gen_ashrv8si3 (arg1, op1, operands[2]));
+	      arg0 = lowpart_subreg (V8SImode, arg0, V4DImode);
+	      sel[0] = 0;	/* Low dword: from the 64-bit logical shift.  */
+	      sel[1] = 9;	/* High dword: from the 32-bit ashr.  */
+	      sel[2] = 2;
+	      sel[3] = 11;
+	      sel[4] = 4;
+	      sel[5] = 13;
+	      sel[6] = 6;
+	      sel[7] = 15;
+	    }
+	  vec_perm_indices indices (sel, 2, 8);
+	  bool ok = targetm.vectorize.vec_perm_const (V8SImode, target,
+						      arg0, arg1, indices);
+	  gcc_assert (ok);
+	  emit_move_insn (operands[0],
+			  lowpart_subreg (V4DImode, target, V8SImode));
+	  DONE;
+	}
+
+      rtx zero = force_reg (V4DImode, CONST0_RTX (V4DImode));
+      rtx zero_or_all_ones = gen_reg_rtx (V4DImode);
+      emit_insn (gen_avx2_gtv4di3 (zero_or_all_ones, zero, operands[1]));
+      rtx lshr_res = gen_reg_rtx (V4DImode);
+      emit_insn (gen_lshrv4di3 (lshr_res, operands[1], operands[2]));
+      rtx ashl_res = gen_reg_rtx (V4DImode);
+      rtx amount;	/* 64 - operands[2], the left shift count.  */
+      if (TARGET_64BIT)
+	{
+	  amount = gen_reg_rtx (DImode);
+	  emit_insn (gen_subdi3 (amount, force_reg (DImode, GEN_INT (64)),
+				 operands[2]));
+	}
+      else	/* !TARGET_64BIT: subtract in SImode.  */
+	{
+	  rtx temp = gen_reg_rtx (SImode);
+	  emit_insn (gen_subsi3 (temp, force_reg (SImode, GEN_INT (64)),
+				 lowpart_subreg (SImode, operands[2],
+						 DImode)));
+	  amount = gen_reg_rtx (V4SImode);
+	  emit_insn (gen_vec_setv4si_0 (amount, CONST0_RTX (V4SImode),
+					temp));
+	}
+      amount = lowpart_subreg (DImode, amount, GET_MODE (amount));
+      emit_insn (gen_ashlv4di3 (ashl_res, zero_or_all_ones, amount));
+      emit_insn (gen_iorv4di3 (operands[0], lshr_res, ashl_res));
+      DONE;	/* Result is the logical shift ORed with the sign bits.  */
+    }
+})
+
 (define_insn "<mask_codefor><insn><mode>3<mask_name>"
   [(set (match_operand:VI248_AVX512BW_2 0 "register_operand" "=v,v")
 	(any_lshift:VI248_AVX512BW_2
@@ -20329,10 +20449,132 @@
 	(ashiftrt:V2DI
 	  (match_operand:V2DI 1 "register_operand")
 	  (match_operand:DI 2 "nonmemory_operand")))]
-  "TARGET_XOP || TARGET_AVX512VL"
+  "TARGET_SSE2"
 {
   if (!TARGET_AVX512VL)
     {
+      if (TARGET_SSE4_2
+	  && CONST_INT_P (operands[2])
+	  && UINTVAL (operands[2]) >= 63)
+	{
+	  rtx zero = force_reg (V2DImode, CONST0_RTX (V2DImode));
+	  emit_insn (gen_sse4_2_gtv2di3 (operands[0], zero, operands[1]));
+	  DONE;
+	}
+      if (operands[2] == const0_rtx)
+	{
+	  emit_move_insn (operands[0], operands[1]);
+	  DONE;
+	}
+      if (CONST_INT_P (operands[2])
+	  && (!TARGET_XOP || UINTVAL (operands[2]) >= 63))
+	{
+	  vec_perm_builder sel (4, 4, 1);
+	  sel.quick_grow (4);
+	  rtx arg0, arg1;
+	  rtx op1 = lowpart_subreg (V4SImode, operands[1], V2DImode);
+	  rtx target = gen_reg_rtx (V4SImode);
+	  if (UINTVAL (operands[2]) >= 63)
+	    {
+	      arg0 = arg1 = gen_reg_rtx (V4SImode);
+	      emit_insn (gen_ashrv4si3 (arg0, op1, GEN_INT (31)));
+	      sel[0] = 1;
+	      sel[1] = 1;
+	      sel[2] = 3;
+	      sel[3] = 3;
+	    }
+	  else if (INTVAL (operands[2]) > 32)
+	    {
+	      arg0 = gen_reg_rtx (V4SImode);
+	      arg1 = gen_reg_rtx (V4SImode);
+	      emit_insn (gen_ashrv4si3 (arg1, op1, GEN_INT (31)));
+	      emit_insn (gen_ashrv4si3 (arg0, op1,
+					GEN_INT (INTVAL (operands[2]) - 32)));
+	      sel[0] = 1;
+	      sel[1] = 5;
+	      sel[2] = 3;
+	      sel[3] = 7;
+	    }
+	  else if (INTVAL (operands[2]) == 32)
+	    {
+	      arg0 = op1;
+	      arg1 = gen_reg_rtx (V4SImode);
+	      emit_insn (gen_ashrv4si3 (arg1, op1, GEN_INT (31)));
+	      sel[0] = 1;
+	      sel[1] = 5;
+	      sel[2] = 3;
+	      sel[3] = 7;
+	    }
+	  else
+	    {
+	      arg0 = gen_reg_rtx (V2DImode);
+	      arg1 = gen_reg_rtx (V4SImode);
+	      emit_insn (gen_lshrv2di3 (arg0, operands[1], operands[2]));
+	      emit_insn (gen_ashrv4si3 (arg1, op1, operands[2]));
+	      arg0 = lowpart_subreg (V4SImode, arg0, V2DImode);
+	      sel[0] = 0;
+	      sel[1] = 5;
+	      sel[2] = 2;
+	      sel[3] = 7;
+	    }
+	  vec_perm_indices indices (sel, arg0 != arg1 ? 2 : 1, 4);
+	  bool ok = targetm.vectorize.vec_perm_const (V4SImode, target,
+						      arg0, arg1, indices);
+	  gcc_assert (ok);
+	  emit_move_insn (operands[0],
+			  lowpart_subreg (V2DImode, target, V4SImode));
+	  DONE;
+	}
+      if (!TARGET_XOP)
+	{
+	  rtx zero = force_reg (V2DImode, CONST0_RTX (V2DImode));
+	  rtx zero_or_all_ones;
+	  if (TARGET_SSE4_2)
+	    {
+	      zero_or_all_ones = gen_reg_rtx (V2DImode);
+	      emit_insn (gen_sse4_2_gtv2di3 (zero_or_all_ones, zero,
+					     operands[1]));
+	    }
+	  else
+	    {
+	      rtx temp = gen_reg_rtx (V4SImode);
+	      emit_insn (gen_ashrv4si3 (temp, lowpart_subreg (V4SImode,
+							      operands[1],
+							      V2DImode),
+					GEN_INT (31)));
+	      zero_or_all_ones = gen_reg_rtx (V4SImode);
+	      emit_insn (gen_sse2_pshufd_1 (zero_or_all_ones, temp,
+					    const1_rtx, const1_rtx,
+					    GEN_INT (3), GEN_INT (3)));
+	      zero_or_all_ones = lowpart_subreg (V2DImode, zero_or_all_ones,
+						 V4SImode);
+	    }
+	  rtx lshr_res = gen_reg_rtx (V2DImode);
+	  emit_insn (gen_lshrv2di3 (lshr_res, operands[1], operands[2]));
+	  rtx ashl_res = gen_reg_rtx (V2DImode);
+	  rtx amount;
+	  if (TARGET_64BIT)
+	    {
+	      amount = gen_reg_rtx (DImode);
+	      emit_insn (gen_subdi3 (amount, force_reg (DImode, GEN_INT (64)),
+				     operands[2]));
+	    }
+	  else
+	    {
+	      rtx temp = gen_reg_rtx (SImode);
+	      emit_insn (gen_subsi3 (temp, force_reg (SImode, GEN_INT (64)),
+				     lowpart_subreg (SImode, operands[2],
+						     DImode)));
+	      amount = gen_reg_rtx (V4SImode);
+	      emit_insn (gen_vec_setv4si_0 (amount, CONST0_RTX (V4SImode),
+					    temp));
+	    }
+	  amount = lowpart_subreg (DImode, amount, GET_MODE (amount));
+	  emit_insn (gen_ashlv2di3 (ashl_res, zero_or_all_ones, amount));
+	  emit_insn (gen_iorv2di3 (operands[0], lshr_res, ashl_res));
+	  DONE;
+	}
+
       rtx reg = gen_reg_rtx (V2DImode);
       rtx par;
       bool negate = false;
diff --git a/gcc/testsuite/gcc.dg/torture/vshuf-4.inc b/gcc/testsuite/gcc.dg/torture/vshuf-4.inc
index d041b33..fb35df8 100644
--- a/gcc/testsuite/gcc.dg/torture/vshuf-4.inc
+++ b/gcc/testsuite/gcc.dg/torture/vshuf-4.inc
@@ -25,7 +25,9 @@ T (21,	2, 6, 3, 7) \
 T (22,	1, 2, 3, 0) \
 T (23,	2, 1, 0, 3) \
 T (24,	2, 5, 6, 3) \
-T (25,	0, 1, 4, 5)
+T (25,	0, 1, 4, 5) \
+T (26,	1, 5, 3, 7) \
+T (27,	0, 5, 2, 7)
 #define EXPTESTS \
 T (116,	1, 2, 4, 3) \
 T (117,	7, 3, 3, 0) \
diff --git a/gcc/testsuite/gcc.dg/torture/vshuf-8.inc b/gcc/testsuite/gcc.dg/torture/vshuf-8.inc
index de358f3..d628039 100644
--- a/gcc/testsuite/gcc.dg/torture/vshuf-8.inc
+++ b/gcc/testsuite/gcc.dg/torture/vshuf-8.inc
@@ -27,7 +27,9 @@ T (23,	6, 5, 4, 3, 2, 1, 0, 7) \
 T (24,	0, 1, 2, 3, 8, 9, 10, 11) \
 T (25,	0, 1, 2, 3, 12, 13, 14, 15) \
 T (26,	0, 1, 8, 9, 10, 11, 12, 13) \
-T (27,	0, 8, 9, 10, 11, 12, 13, 14)
+T (27,	0, 8, 9, 10, 11, 12, 13, 14) \
+T (28,	1, 9, 3, 11, 5, 13, 7, 15) \
+T (29,	0, 9, 2, 11, 4, 13, 6, 15)
 #define EXPTESTS \
 T (116,	9, 3, 9, 4, 7, 0, 0, 6) \
 T (117,	4, 14, 12, 8, 9, 6, 0, 10) \
diff --git a/gcc/testsuite/gcc.target/i386/avx-pr82370.c b/gcc/testsuite/gcc.target/i386/avx-pr82370.c
index 4dc8a5b..dc12dbf 100644
--- a/gcc/testsuite/gcc.target/i386/avx-pr82370.c
+++ b/gcc/testsuite/gcc.target/i386/avx-pr82370.c
@@ -4,7 +4,7 @@
 /* { dg-final { scan-assembler-times "vpslld\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 3 } } */
 /* { dg-final { scan-assembler-times "vpsllq\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 3 } } */
 /* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 3 } } */
-/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 3 } } */
+/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 6 } } */
 /* { dg-final { scan-assembler-times "vpsraq\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 0 } } */
 /* { dg-final { scan-assembler-times "vpsraw\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 3 } } */
 /* { dg-final { scan-assembler-times "vpsrld\[ \t]\+\\\$5, %xmm\[0-9]\+, %xmm\[0-9]\+" 3 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx-psraq-1.c b/gcc/testsuite/gcc.target/i386/avx-psraq-1.c
new file mode 100644
index 0000000..2722088
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx-psraq-1.c
@@ -0,0 +1,13 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -mno-avx2" } */
+/* { dg-require-effective-target avx } */
+
+#ifndef CHECK_H
+#define CHECK_H "avx-check.h"
+#endif
+
+#ifndef TEST
+#define TEST avx_test
+#endif
+
+#include "sse2-psraq-1.c"
diff --git a/gcc/testsuite/gcc.target/i386/avx2-pr82370.c b/gcc/testsuite/gcc.target/i386/avx2-pr82370.c
index 6609ebb..df3dfd8 100644
--- a/gcc/testsuite/gcc.target/i386/avx2-pr82370.c
+++ b/gcc/testsuite/gcc.target/i386/avx2-pr82370.c
@@ -4,7 +4,7 @@
 /* { dg-final { scan-assembler-times "vpslld\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
 /* { dg-final { scan-assembler-times "vpsllq\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
 /* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
-/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
+/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 2 } } */
 /* { dg-final { scan-assembler-times "vpsraq\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 0 } } */
 /* { dg-final { scan-assembler-times "vpsraw\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
 /* { dg-final { scan-assembler-times "vpsrld\[ \t]\+\\\$5, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
@@ -13,7 +13,7 @@
 /* { dg-final { scan-assembler-times "vpslld\[ \t]\+\\\$7, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
 /* { dg-final { scan-assembler-times "vpsllq\[ \t]\+\\\$7, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
 /* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$7, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
-/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
+/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 2 } } */
 /* { dg-final { scan-assembler-times "vpsraq\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 0 } } */
 /* { dg-final { scan-assembler-times "vpsraw\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
 /* { dg-final { scan-assembler-times "vpsrld\[ \t]\+\\\$5, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx2-psraq-1.c b/gcc/testsuite/gcc.target/i386/avx2-psraq-1.c
new file mode 100644
index 0000000..e9051bf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-psraq-1.c
@@ -0,0 +1,51 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx2 -mno-avx512f" } */
+/* { dg-require-effective-target avx2 } */
+
+#ifndef CHECK_H
+#define CHECK_H "avx2-check.h"
+#endif
+
+#ifndef TEST
+#define TEST avx2_test
+#endif
+
+#include CHECK_H
+
+typedef long long V __attribute__((vector_size (32)));
+
+#define TESTN(N) \
+static V			\
+__attribute__((noipa))		\
+test##N (V x)			\
+{				\
+  return x >> N;		\
+}
+
+#define TESTS TESTN (63) TESTN (49) TESTN (32) TESTN (31) TESTN (18)
+TESTS
+
+struct
+{
+  int n;
+  V (*fn) (V);
+} tests[] = {
+#undef TESTN
+#define TESTN(N) { N, test##N },
+  TESTS
+};
+
+static void
+TEST (void)
+{
+  V a = (V) { 0xdeadbeefcafebabeULL, 0x123456789abcdef0ULL,
+	      0x173a74be8a95134cULL, 0x817bae35ac0ebf12ULL };
+  /* Check each testN against scalar >>; tests[] has no 0 terminator.  */
+  for (unsigned i = 0; i < sizeof (tests) / sizeof (tests[0]); i++)
+    {
+      V c = tests[i].fn (a);
+      if (c[0] != a[0] >> tests[i].n || c[1] != a[1] >> tests[i].n
+	  || c[2] != a[2] >> tests[i].n || c[3] != a[3] >> tests[i].n)
+	abort ();
+    }
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr82370.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr82370.c
index 174f499..12c3b27 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-pr82370.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr82370.c
@@ -4,7 +4,7 @@
 /* { dg-final { scan-assembler-times "vpslld\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
 /* { dg-final { scan-assembler-times "vpsllq\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
 /* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
-/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
+/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 2 } } */
 /* { dg-final { scan-assembler-times "vpsraq\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 0 } } */
 /* { dg-final { scan-assembler-times "vpsraw\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
 /* { dg-final { scan-assembler-times "vpsrld\[ \t]\+\\\$5, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
@@ -13,7 +13,7 @@
 /* { dg-final { scan-assembler-times "vpslld\[ \t]\+\\\$7, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
 /* { dg-final { scan-assembler-times "vpsllq\[ \t]\+\\\$7, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
 /* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$7, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
-/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
+/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 2 } } */
 /* { dg-final { scan-assembler-times "vpsraq\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 0 } } */
 /* { dg-final { scan-assembler-times "vpsraw\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
 /* { dg-final { scan-assembler-times "vpsrld\[ \t]\+\\\$5, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr82370.c b/gcc/testsuite/gcc.target/i386/avx512f-pr82370.c
index 20ad8dc..b179f9b 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-pr82370.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr82370.c
@@ -4,7 +4,7 @@
 /* { dg-final { scan-assembler-times "vpslld\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
 /* { dg-final { scan-assembler-times "vpsllq\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
 /* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
-/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
+/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 2 } } */
 /* { dg-final { scan-assembler-times "vpsraq\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 0 } } */
 /* { dg-final { scan-assembler-times "vpsraw\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
 /* { dg-final { scan-assembler-times "vpsrld\[ \t]\+\\\$5, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
@@ -13,7 +13,7 @@
 /* { dg-final { scan-assembler-times "vpslld\[ \t]\+\\\$7, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
 /* { dg-final { scan-assembler-times "vpsllq\[ \t]\+\\\$7, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
 /* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$7, %ymm\[0-9]\+, %ymm\[0-9]\+" 3 } } */
-/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
+/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 2 } } */
 /* { dg-final { scan-assembler-times "vpsraq\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 0 } } */
 /* { dg-final { scan-assembler-times "vpsraw\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 3 } } */
 /* { dg-final { scan-assembler-times "vpsrld\[ \t]\+\\\$5, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-psraq-1.c b/gcc/testsuite/gcc.target/i386/sse2-psraq-1.c
new file mode 100644
index 0000000..9a08ee4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-psraq-1.c
@@ -0,0 +1,53 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -msse2 -mno-sse3" } */
+/* { dg-require-effective-target sse2 } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse2-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse2_test
+#endif
+
+#include CHECK_H
+
+typedef long long V __attribute__((vector_size (16)));
+
+#define TESTN(N) \
+static V			\
+__attribute__((noipa))		\
+test##N (V x)			\
+{				\
+  return x >> N;		\
+}
+
+#define TESTS TESTN (63) TESTN (49) TESTN (32) TESTN (31) TESTN (18)
+TESTS
+
+struct
+{
+  int n;
+  V (*fn) (V);
+} tests[] = {
+#undef TESTN
+#define TESTN(N) { N, test##N },
+  TESTS
+};
+
+static void
+TEST (void)
+{
+  V a = (V) { 0xdeadbeefcafebabeULL, 0x123456789abcdef0ULL };
+  V b = (V) { 0x173a74be8a95134cULL, 0x817bae35ac0ebf12ULL };
+  /* Check each testN against scalar >>; tests[] has no 0 terminator.  */
+  for (unsigned i = 0; i < sizeof (tests) / sizeof (tests[0]); i++)
+    {
+      V c = tests[i].fn (a);
+      if (c[0] != a[0] >> tests[i].n || c[1] != a[1] >> tests[i].n)
+	abort ();
+      c = tests[i].fn (b);
+      if (c[0] != b[0] >> tests[i].n || c[1] != b[1] >> tests[i].n)
+	abort ();
+    }
+}
diff --git a/gcc/testsuite/gcc.target/i386/sse4_2-psraq-1.c b/gcc/testsuite/gcc.target/i386/sse4_2-psraq-1.c
new file mode 100644
index 0000000..947b623
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4_2-psraq-1.c
@@ -0,0 +1,13 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -msse4.2 -mno-avx" } */
+/* { dg-require-effective-target sse4 } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse4_2-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_2_test
+#endif
+
+#include "sse2-psraq-1.c"
-- 
2.7.4