}
// Simplify the operands using demanded-bits information.
- if (SimplifyDemandedBits(SDValue(N, 0)))
+ if (!VT.isVector() &&
+ SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
// (trunc adde(X, Y, Carry)) -> (adde trunc(X), trunc(Y), Carry)
// Simplify the input, using demanded bit information, and compute the known
// zero/one bits live out.
unsigned OperandBitWidth = Src.getScalarValueSizeInBits();
APInt TruncMask = DemandedBits.zext(OperandBitWidth);
- if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, Known, TLO,
- Depth + 1))
+ if (SimplifyDemandedBits(Src, TruncMask, Known, TLO, Depth + 1))
return true;
Known = Known.trunc(BitWidth);
// Do not turn (vt1 truncate (vt2 srl)) into (vt1 srl) if vt1 is
// undesirable.
break;
- const APInt *ShAmtC =
- TLO.DAG.getValidShiftAmountConstant(Src, DemandedElts);
- if (!ShAmtC)
+ SDValue ShAmt = Src.getOperand(1);
+ auto *ShAmtC = dyn_cast<ConstantSDNode>(ShAmt);
+ if (!ShAmtC || ShAmtC->getAPIntValue().uge(BitWidth))
break;
uint64_t ShVal = ShAmtC->getZExtValue();
if (!(HighBits & DemandedBits)) {
// None of the shifted in bits are needed. Add a truncate of the
// shift input, then shift it.
- SDValue ShAmt = Src.getOperand(1);
if (TLO.LegalTypes())
ShAmt = TLO.DAG.getConstant(ShVal, dl, getShiftAmountTy(VT, DL));
SDValue NewTrunc =
static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
return N->getOpcode() == ISD::SIGN_EXTEND ||
- N->getOpcode() == ISD::ANY_EXTEND ||
isExtendedBUILD_VECTOR(N, DAG, true);
}
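For context (not part of this patch), a minimal LLVM IR sketch of the pattern the AArch64 "amull"-style tests below exercise: a widening multiply of zero-extended operands whose high half is then masked off, so the low bits are the same whether the backend selects smull or umull before applying the mask. The function and value names are illustrative only; the snippet can be fed to llc -mtriple=aarch64.

; Illustrative sketch only - mirrors the masked widening-multiply tests updated below.
define <8 x i16> @amull_sketch(<8 x i8> %a, <8 x i8> %b) {
  %ea = zext <8 x i8> %a to <8 x i16>   ; widen both operands
  %eb = zext <8 x i8> %b to <8 x i16>
  %mul = mul <8 x i16> %ea, %eb         ; widening multiply
  ; keep only the low byte of each lane, so the sign/zero extension choice is dead
  %res = and <8 x i16> %mul, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  ret <8 x i16> %res
}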
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
; CHECK-NEXT: bic v0.8h, #255, lsl #8
; CHECK-NEXT: ret
%tmp1 = load <8 x i8>, <8 x i8>* %A
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
+; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
+; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: smlal v0.8h, v1.8b, v2.8b
+; CHECK-NEXT: umlal v0.8h, v1.8b, v2.8b
; CHECK-NEXT: bic v0.8h, #255, lsl #8
; CHECK-NEXT: ret
%tmp1 = load <8 x i16>, <8 x i16>* %A
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: smlal v0.4s, v1.4h, v2.4h
+; CHECK-NEXT: umlal v0.4s, v1.4h, v2.4h
; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: smlal v0.2d, v1.2s, v2.2s
+; CHECK-NEXT: umlal v0.2d, v1.2s, v2.2s
; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: smlsl v0.8h, v1.8b, v2.8b
+; CHECK-NEXT: umlsl v0.8h, v1.8b, v2.8b
; CHECK-NEXT: bic v0.8h, #255, lsl #8
; CHECK-NEXT: ret
%tmp1 = load <8 x i16>, <8 x i16>* %A
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: smlsl v0.4s, v1.4h, v2.4h
+; CHECK-NEXT: umlsl v0.4s, v1.4h, v2.4h
; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: smlsl v0.2d, v1.2s, v2.2s
+; CHECK-NEXT: umlsl v0.2d, v1.2s, v2.2s
; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
; CHECK-LABEL: amull_extvec_v8i8_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.8b, #12
-; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
; CHECK-NEXT: bic v0.8h, #255, lsl #8
; CHECK-NEXT: ret
%tmp3 = zext <8 x i8> %arg to <8 x i16>
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #1234
; CHECK-NEXT: dup v1.4h, w8
-; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
+; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #1234
; CHECK-NEXT: dup v1.2s, w8
-; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
+; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
define <16 x i16> @amull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
; CHECK-LABEL: amull2_i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: smull v2.8h, v0.8b, v1.8b
-; CHECK-NEXT: smull2 v1.8h, v0.16b, v1.16b
+; CHECK-NEXT: umull v2.8h, v0.8b, v1.8b
+; CHECK-NEXT: umull2 v1.8h, v0.16b, v1.16b
; CHECK-NEXT: bic v2.8h, #255, lsl #8
; CHECK-NEXT: bic v1.8h, #255, lsl #8
; CHECK-NEXT: mov v0.16b, v2.16b
define <8 x i32> @amull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
; CHECK-LABEL: amull2_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: smull v2.4s, v0.4h, v1.4h
-; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.8h
+; CHECK-NEXT: umull v2.4s, v0.4h, v1.4h
+; CHECK-NEXT: umull2 v0.4s, v0.8h, v1.8h
; CHECK-NEXT: movi v3.2d, #0x00ffff0000ffff
; CHECK-NEXT: and v1.16b, v0.16b, v3.16b
; CHECK-NEXT: and v0.16b, v2.16b, v3.16b
define <4 x i64> @amull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
; CHECK-LABEL: amull2_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: smull v2.2d, v0.2s, v1.2s
-; CHECK-NEXT: smull2 v0.2d, v0.4s, v1.4s
+; CHECK-NEXT: umull v2.2d, v0.2s, v1.2s
+; CHECK-NEXT: umull2 v0.2d, v0.4s, v1.4s
; CHECK-NEXT: movi v3.2d, #0x000000ffffffff
; CHECK-NEXT: and v1.16b, v0.16b, v3.16b
; CHECK-NEXT: and v0.16b, v2.16b, v3.16b
; CHECK-LABEL: mlai16_trunc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h
-; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h
+; CHECK-NEXT: saddw v0.4s, v0.4s, v2.4h
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: ret
entry:
define <4 x i32> @mlai16_and(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
; CHECK-LABEL: mlai16_and:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h
+; CHECK-NEXT: umull v0.4s, v1.4h, v0.4h
; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h
; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ldr d1, [x1, #16]
; CHECK-NEXT: ldr d2, [x2, #16]
; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h
-; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h
+; CHECK-NEXT: saddw v0.4s, v0.4s, v2.4h
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: str d0, [x0, #16]
; CHECK-NEXT: ret
define <4 x i32> @addmuli16_and(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
; CHECK-LABEL: addmuli16_and:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: smull v1.4s, v1.4h, v2.4h
-; CHECK-NEXT: smlal v1.4s, v0.4h, v2.4h
+; CHECK-NEXT: umull v1.4s, v1.4h, v2.4h
+; CHECK-NEXT: umlal v1.4s, v0.4h, v2.4h
; CHECK-NEXT: movi v0.2d, #0x00ffff0000ffff
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
; CHECK-NEXT: ret
; CHECK-LABEL: mlai32_trunc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: smull v0.2d, v1.2s, v0.2s
-; CHECK-NEXT: uaddw v0.2d, v0.2d, v2.2s
+; CHECK-NEXT: saddw v0.2d, v0.2d, v2.2s
; CHECK-NEXT: xtn v0.2s, v0.2d
; CHECK-NEXT: ret
entry:
define <2 x i64> @mlai32_and(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2) {
; CHECK-LABEL: mlai32_and:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: smull v0.2d, v1.2s, v0.2s
+; CHECK-NEXT: umull v0.2d, v1.2s, v0.2s
; CHECK-NEXT: uaddw v0.2d, v0.2d, v2.2s
; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ldr d1, [x1, #32]
; CHECK-NEXT: ldr d2, [x2, #32]
; CHECK-NEXT: smull v0.2d, v1.2s, v0.2s
-; CHECK-NEXT: uaddw v0.2d, v0.2d, v2.2s
+; CHECK-NEXT: saddw v0.2d, v0.2d, v2.2s
; CHECK-NEXT: xtn v0.2s, v0.2d
; CHECK-NEXT: str d0, [x0, #32]
; CHECK-NEXT: ret
define <2 x i64> @addmuli32_and(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2) {
; CHECK-LABEL: addmuli32_and:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: smull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT: smlal v1.2d, v0.2s, v2.2s
+; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
+; CHECK-NEXT: umlal v1.2d, v0.2s, v2.2s
; CHECK-NEXT: movi v0.2d, #0x000000ffffffff
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
; CHECK-NEXT: ret
; CHECK-NEXT: str d1, [x1, #16]
; CHECK-NEXT: ldr d1, [x2, #16]
; CHECK-NEXT: smlal v0.4s, v1.4h, v2.4h
-; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h
+; CHECK-NEXT: saddw v0.4s, v0.4s, v2.4h
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: str d0, [x0, #16]
; CHECK-NEXT: ret
; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; VI-NEXT: s_or_b32 s0, s1, 4
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_and_b32 s0, s0, 0xff
+; VI-NEXT: v_or_b32_e32 v2, s0, v0
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: flat_store_short v[0:1], v2
define arm_aapcs_vfpcc <4 x i16> @mla_args(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
; CHECK-LABEL: mla_args:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmull.u16 q8, d1, d0
-; CHECK-NEXT: vaddw.u16 q8, q8, d2
+; CHECK-NEXT: vmull.s16 q8, d1, d0
+; CHECK-NEXT: vaddw.s16 q8, q8, d2
; CHECK-NEXT: vmovn.i32 d0, q8
; CHECK-NEXT: bx lr
entry:
; CHECK-NEXT: vldr d16, [r0, #16]
; CHECK-NEXT: vldr d17, [r1, #16]
; CHECK-NEXT: vldr d18, [r2, #16]
-; CHECK-NEXT: vmull.u16 q8, d17, d16
-; CHECK-NEXT: vaddw.u16 q8, q8, d18
+; CHECK-NEXT: vmull.s16 q8, d17, d16
+; CHECK-NEXT: vaddw.s16 q8, q8, d18
; CHECK-NEXT: vmovn.i32 d16, q8
; CHECK-NEXT: vstr d16, [r0, #16]
; CHECK-NEXT: bx lr
define arm_aapcs_vfpcc <4 x i16> @addmul_args(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
; CHECK-LABEL: addmul_args:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmull.u16 q8, d1, d2
-; CHECK-NEXT: vmlal.u16 q8, d0, d2
+; CHECK-NEXT: vmull.s16 q8, d1, d2
+; CHECK-NEXT: vmlal.s16 q8, d0, d2
; CHECK-NEXT: vmovn.i32 d0, q8
; CHECK-NEXT: bx lr
entry:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldr d16, [r2, #16]
; CHECK-NEXT: vldr d17, [r1, #16]
-; CHECK-NEXT: vmull.u16 q9, d17, d16
+; CHECK-NEXT: vmull.s16 q9, d17, d16
; CHECK-NEXT: vldr d17, [r0, #16]
-; CHECK-NEXT: vmlal.u16 q9, d17, d16
+; CHECK-NEXT: vmlal.s16 q9, d17, d16
; CHECK-NEXT: vmovn.i32 d16, q9
; CHECK-NEXT: vstr d16, [r0, #16]
; CHECK-NEXT: bx lr
; CHECK-NEXT: vldr d18, [r2, #16]
; CHECK-NEXT: vld1.16 {d16}, [r3:64]
; CHECK-NEXT: vmovl.u16 q8, d16
-; CHECK-NEXT: vaddw.u16 q10, q8, d18
+; CHECK-NEXT: vaddw.s16 q10, q8, d18
; CHECK-NEXT: vmovn.i32 d19, q10
; CHECK-NEXT: vldr d20, [r0, #16]
; CHECK-NEXT: vstr d19, [r0, #16]
; CHECK-NEXT: vmovn.i32 d16, q11
; CHECK-NEXT: vstr d16, [r1, #16]
; CHECK-NEXT: vldr d16, [r2, #16]
-; CHECK-NEXT: vmlal.u16 q11, d16, d20
+; CHECK-NEXT: vmlal.s16 q11, d16, d20
; CHECK-NEXT: vmovn.i32 d16, q11
; CHECK-NEXT: vstr d16, [r0, #16]
; CHECK-NEXT: bx lr
define void @func2(i16* %a, i16* %b, i16* %c) {
; CHECK-LABEL: func2:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldr d16, [r1, #16]
-; CHECK-NEXT: add r3, r0, #16
-; CHECK-NEXT: vldr d17, [r2, #16]
-; CHECK-NEXT: vaddl.u16 q9, d17, d16
-; CHECK-NEXT: vmovn.i32 d18, q9
-; CHECK-NEXT: vld1.16 {d19}, [r3:64]
-; CHECK-NEXT: vstr d18, [r0, #16]
+; CHECK-NEXT: add r3, r1, #16
; CHECK-NEXT: vldr d18, [r2, #16]
-; CHECK-NEXT: vmull.s16 q10, d17, d18
-; CHECK-NEXT: vmovl.s16 q11, d18
+; CHECK-NEXT: vld1.16 {d16}, [r3:64]
; CHECK-NEXT: vmovl.u16 q8, d16
+; CHECK-NEXT: vaddw.s16 q10, q8, d18
+; CHECK-NEXT: vmovn.i32 d19, q10
+; CHECK-NEXT: vldr d20, [r0, #16]
+; CHECK-NEXT: vstr d19, [r0, #16]
+; CHECK-NEXT: vldr d19, [r2, #16]
+; CHECK-NEXT: vmull.s16 q11, d18, d19
; CHECK-NEXT: vmovl.s16 q9, d19
-; CHECK-NEXT: vmla.i32 q10, q8, q11
-; CHECK-NEXT: vmovn.i32 d16, q10
+; CHECK-NEXT: vmla.i32 q11, q8, q9
+; CHECK-NEXT: vmovn.i32 d16, q11
; CHECK-NEXT: vstr d16, [r1, #16]
-; CHECK-NEXT: add r1, r2, #16
-; CHECK-NEXT: vld1.16 {d16}, [r1:64]
-; CHECK-NEXT: vmovl.u16 q8, d16
-; CHECK-NEXT: vmla.i32 q10, q8, q9
-; CHECK-NEXT: vadd.i32 q8, q10, q9
+; CHECK-NEXT: vldr d16, [r2, #16]
+; CHECK-NEXT: vmlal.s16 q11, d16, d20
+; CHECK-NEXT: vaddw.s16 q8, q11, d20
; CHECK-NEXT: vmovn.i32 d16, q8
; CHECK-NEXT: vstr d16, [r0, #16]
; CHECK-NEXT: bx lr
; CHECK-NEXT: vldrht.s32 q3, [r1], #8
; CHECK-NEXT: vmul.i32 q2, q3, q2
; CHECK-NEXT: vqshrnb.s32 q2, q2, #15
+; CHECK-NEXT: vmovlb.s16 q2, q2
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrht.32 q2, [r2], #8
; CHECK-NEXT: le lr, .LBB8_2
; CHECK-NEXT: vldrbt.s16 q6, [r1], #8
; CHECK-NEXT: vmul.i16 q5, q6, q5
; CHECK-NEXT: vqshrnb.s16 q5, q5, #7
+; CHECK-NEXT: vmovlb.s8 q5, q5
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrbt.16 q5, [r2], #8
; CHECK-NEXT: le lr, .LBB17_2
; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
; CHECK-NEXT: vmov.u16 r1, q1[4]
; CHECK-NEXT: vmullb.s16 q2, q3, q2
-; CHECK-NEXT: vshr.u32 q3, q2, #16
+; CHECK-NEXT: vshr.s32 q3, q2, #16
; CHECK-NEXT: vmov r0, s12
; CHECK-NEXT: vmov.16 q2[0], r0
; CHECK-NEXT: vmov r0, s13
; CHECK-NEXT: vmov.u16 r1, q0[5]
; CHECK-NEXT: vmov q1[3], q1[1], r1, r0
; CHECK-NEXT: vmullb.s16 q0, q1, q3
-; CHECK-NEXT: vshr.u32 q0, q0, #16
+; CHECK-NEXT: vshr.s32 q0, q0, #16
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmov.16 q2[4], r0
; CHECK-NEXT: vmov r0, s1
; CHECK-NEXT: vmov.u8 r0, q0[7]
; CHECK-NEXT: vmov.16 q3[7], r0
; CHECK-NEXT: vmullb.s8 q2, q3, q2
-; CHECK-NEXT: vshr.u16 q3, q2, #8
+; CHECK-NEXT: vshr.s16 q3, q2, #8
; CHECK-NEXT: vmov.u16 r0, q3[0]
; CHECK-NEXT: vmov.8 q2[0], r0
; CHECK-NEXT: vmov.u16 r0, q3[1]
; CHECK-NEXT: vmov.u8 r0, q0[15]
; CHECK-NEXT: vmov.16 q1[7], r0
; CHECK-NEXT: vmullb.s8 q0, q1, q3
-; CHECK-NEXT: vshr.u16 q0, q0, #8
+; CHECK-NEXT: vshr.s16 q0, q0, #8
; CHECK-NEXT: vmov.u16 r0, q0[0]
; CHECK-NEXT: vmov.8 q2[8], r0
; CHECK-NEXT: vmov.u16 r0, q0[1]
;
; AVX2-FAST-LABEL: combine_vec_ashr_trunc_ashr:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <1,3,5,7,u,u,u,u>
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT: vzeroupper
define <4 x double> @signbits_sext_shuffle_sitofp(<4 x i32> %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: signbits_sext_shuffle_sitofp:
; X86: # %bb.0:
-; X86-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
-; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; X86-NEXT: vpmovsxdq %xmm0, %xmm1
+; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X86-NEXT: vpmovsxdq %xmm0, %xmm0
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
;
; X64-AVX1-LABEL: signbits_sext_shuffle_sitofp:
; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; X64-AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
;
; X64-AVX2-LABEL: signbits_sext_shuffle_sitofp:
; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; X64-AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
define <16 x i16> @trunc_v16i32_v16i16_sign(<16 x i32>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v16i32_v16i16_sign:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
-; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
+; CHECK-NEXT: vpsrad $16, 32(%rdi), %ymm0
+; CHECK-NEXT: vpsrad $16, (%rdi), %ymm1
+; CHECK-NEXT: vpackssdw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; CHECK-NEXT: retq
%a = load <16 x i32>, <16 x i32>* %x
%b = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
}
define <32 x i8> @trunc_v32i16_v32i8_sign(<32 x i16>* %x) nounwind "min-legal-vector-width"="256" {
-; CHECK-AVX512-LABEL: trunc_v32i16_v32i8_sign:
-; CHECK-AVX512: # %bb.0:
-; CHECK-AVX512-NEXT: vpsrlw $8, 32(%rdi), %ymm0
-; CHECK-AVX512-NEXT: vpsrlw $8, (%rdi), %ymm1
-; CHECK-AVX512-NEXT: vpackuswb %ymm0, %ymm1, %ymm0
-; CHECK-AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; CHECK-AVX512-NEXT: retq
-;
-; CHECK-VBMI-LABEL: trunc_v32i16_v32i8_sign:
-; CHECK-VBMI: # %bb.0:
-; CHECK-VBMI-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63]
-; CHECK-VBMI-NEXT: vpermi2b 32(%rdi), %ymm1, %ymm0
-; CHECK-VBMI-NEXT: retq
+; CHECK-LABEL: trunc_v32i16_v32i8_sign:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsraw $8, 32(%rdi), %ymm0
+; CHECK-NEXT: vpsraw $8, (%rdi), %ymm1
+; CHECK-NEXT: vpacksswb %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; CHECK-NEXT: retq
%a = load <32 x i16>, <32 x i16>* %x
%b = ashr <32 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%c = trunc <32 x i16> %b to <32 x i8>
;
; AVX2-FAST-LABEL: trunc8i64_8i32_ashr:
; AVX2-FAST: # %bb.0: # %entry
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,7]
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
;
; AVX512-LABEL: trunc8i64_8i32_ashr:
; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0
+; AVX512-NEXT: vpsraq $32, %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: retq
entry:
}
define <8 x i16> @trunc8i32_8i16_ashr(<8 x i32> %a) {
-; SSE2-LABEL: trunc8i32_8i16_ashr:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc8i32_8i16_ashr:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: psrad $16, %xmm1
-; SSSE3-NEXT: psrad $16, %xmm0
-; SSSE3-NEXT: packssdw %xmm1, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc8i32_8i16_ashr:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: packusdw %xmm1, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: trunc8i32_8i16_ashr:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX1-LABEL: trunc8i32_8i16_ashr:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc8i32_8i16_ashr:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc8i32_8i16_ashr:
; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrad $16, %ymm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT: vzeroupper
;
; AVX512VL-LABEL: trunc8i32_8i16_ashr:
; AVX512VL: # %bb.0: # %entry
-; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrad $16, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc8i32_8i16_ashr:
; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX512BW-NEXT: vpsrad $16, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
;
; AVX512BWVL-LABEL: trunc8i32_8i16_ashr:
; AVX512BWVL: # %bb.0: # %entry
-; AVX512BWVL-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpsrad $16, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
}
define void @trunc16i32_16i16_ashr(<16 x i32> %a) {
-; SSE2-LABEL: trunc16i32_16i16_ashr:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
-; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: psrad $16, %xmm2
-; SSE2-NEXT: packssdw %xmm3, %xmm2
-; SSE2-NEXT: movdqu %xmm2, (%rax)
-; SSE2-NEXT: movdqu %xmm0, (%rax)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc16i32_16i16_ashr:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: psrad $16, %xmm1
-; SSSE3-NEXT: psrad $16, %xmm0
-; SSSE3-NEXT: packssdw %xmm1, %xmm0
-; SSSE3-NEXT: psrad $16, %xmm3
-; SSSE3-NEXT: psrad $16, %xmm2
-; SSSE3-NEXT: packssdw %xmm3, %xmm2
-; SSSE3-NEXT: movdqu %xmm2, (%rax)
-; SSSE3-NEXT: movdqu %xmm0, (%rax)
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc16i32_16i16_ashr:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: psrld $16, %xmm3
-; SSE41-NEXT: psrld $16, %xmm2
-; SSE41-NEXT: packusdw %xmm3, %xmm2
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: packusdw %xmm1, %xmm0
-; SSE41-NEXT: movdqu %xmm2, (%rax)
-; SSE41-NEXT: movdqu %xmm0, (%rax)
-; SSE41-NEXT: retq
+; SSE-LABEL: trunc16i32_16i16_ashr:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: psrad $16, %xmm3
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: packssdw %xmm3, %xmm2
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: movdqu %xmm2, (%rax)
+; SSE-NEXT: movdqu %xmm0, (%rax)
+; SSE-NEXT: retq
;
; AVX1-LABEL: trunc16i32_16i16_ashr:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
-; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm1, (%rax)
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
;
; AVX2-LABEL: trunc16i32_16i16_ashr:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
-; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
-; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrad $16, %ymm1, %ymm1
+; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0
+; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
}
define void @trunc16i32_16i8_ashr(<16 x i32> %a) {
-; SSE2-LABEL: trunc16i32_16i8_ashr:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: psrld $24, %xmm1
-; SSE2-NEXT: psrld $24, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
-; SSE2-NEXT: psrld $24, %xmm3
-; SSE2-NEXT: psrld $24, %xmm2
-; SSE2-NEXT: packuswb %xmm3, %xmm2
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: movdqu %xmm0, (%rax)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc16i32_16i8_ashr:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: psrld $24, %xmm1
-; SSSE3-NEXT: psrld $24, %xmm0
-; SSSE3-NEXT: packuswb %xmm1, %xmm0
-; SSSE3-NEXT: psrld $24, %xmm3
-; SSSE3-NEXT: psrld $24, %xmm2
-; SSSE3-NEXT: packuswb %xmm3, %xmm2
-; SSSE3-NEXT: packuswb %xmm2, %xmm0
-; SSSE3-NEXT: movdqu %xmm0, (%rax)
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc16i32_16i8_ashr:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: psrld $24, %xmm1
-; SSE41-NEXT: psrld $24, %xmm0
-; SSE41-NEXT: packusdw %xmm1, %xmm0
-; SSE41-NEXT: psrld $24, %xmm3
-; SSE41-NEXT: psrld $24, %xmm2
-; SSE41-NEXT: packusdw %xmm3, %xmm2
-; SSE41-NEXT: packuswb %xmm2, %xmm0
-; SSE41-NEXT: movdqu %xmm0, (%rax)
-; SSE41-NEXT: retq
+; SSE-LABEL: trunc16i32_16i8_ashr:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: psrad $24, %xmm1
+; SSE-NEXT: psrad $24, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: psrad $24, %xmm3
+; SSE-NEXT: psrad $24, %xmm2
+; SSE-NEXT: packssdw %xmm3, %xmm2
+; SSE-NEXT: packsswb %xmm2, %xmm0
+; SSE-NEXT: movdqu %xmm0, (%rax)
+; SSE-NEXT: retq
;
; AVX1-LABEL: trunc16i32_16i8_ashr:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $24, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $24, %xmm1, %xmm1
-; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc16i32_16i8_ashr:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpsrld $24, %ymm1, %ymm1
-; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0
-; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrad $24, %ymm1, %ymm1
+; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0
+; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
define void @trunc16i16_16i8_ashr(<16 x i16> %a) {
; SSE-LABEL: trunc16i16_16i8_ashr:
; SSE: # %bb.0: # %entry
-; SSE-NEXT: psrlw $8, %xmm1
-; SSE-NEXT: psrlw $8, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: psraw $8, %xmm1
+; SSE-NEXT: psraw $8, %xmm0
+; SSE-NEXT: packsswb %xmm1, %xmm0
; SSE-NEXT: movdqu %xmm0, (%rax)
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc16i16_16i8_ashr:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc16i16_16i8_ashr:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT: vpsraw $8, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: trunc16i16_16i8_ashr:
; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512BW-NEXT: vpsraw $8, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqu %xmm0, (%rax)
; AVX512BW-NEXT: vzeroupper