This is my first LLVM patch, so please tell me if there are any process issues.
The main observation for this patch is that we can lower UMIN/UMAX with v8i16 by using unsigned saturated subtractions in a clever way. Previously this operation was lowered by flipping the signbit of both inputs and the output, which turns the unsigned minimum/maximum into a signed one.
We could use this trick in reverse for lowering SMIN/SMAX with v16i8 instead. In terms of latency/throughput this is the same, but it needs one large move instruction. It's just that the signbit flipping has an increased chance of being optimized further. This is particularly apparent in the "reduce" test cases. However, due to the slight regression in the single-use case, this patch no longer proposes this.
Unfortunately this argument also applies in reverse to the new lowering of UMIN/UMAX with v8i16, which regresses the "horizontal-reduce-umax", "horizontal-reduce-umin", "vector-reduce-umin" and "vector-reduce-umax" test cases a bit with this patch. Maybe some extra casework would be possible to avoid this. However, independent of that, I believe that the benefits in the common case of just 1 to 3 chained min/max instructions outweigh the downsides in that specific case.
Patch by: @TomHender (Tom Hender) ActuallyaDeviloper
Differential Revision: https://reviews.llvm.org/D87236
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
- // For pre-SSE41, we can perform UMIN/UMAX v8i16 by flipping the signbit,
- // using the SMIN/SMAX instructions and flipping the signbit back.
+ // For pre-SSE41, we can perform UMIN/UMAX v8i16 by using psubusw.
if (VT == MVT::v8i16) {
assert((Opcode == ISD::UMIN || Opcode == ISD::UMAX) &&
"Unexpected MIN/MAX opcode");
- SDValue Sign = DAG.getConstant(APInt::getSignedMinValue(16), DL, VT);
- N0 = DAG.getNode(ISD::XOR, DL, VT, N0, Sign);
- N1 = DAG.getNode(ISD::XOR, DL, VT, N1, Sign);
- Opcode = (Opcode == ISD::UMIN ? ISD::SMIN : ISD::SMAX);
- SDValue Result = DAG.getNode(Opcode, DL, VT, N0, N1);
- return DAG.getNode(ISD::XOR, DL, VT, Result, Sign);
+ if (Opcode == ISD::UMIN)
+ return DAG.getNode(ISD::SUB, DL, VT, N0,
+ DAG.getNode(ISD::USUBSAT, DL, VT, N0, N1));
+ return DAG.getNode(ISD::ADD, DL, VT,
+ DAG.getNode(ISD::USUBSAT, DL, VT, N1, N0), N0);
}
// Else, expand to a compare/select.
{ ISD::SSUBSAT, MVT::v16i8, 1 },
{ ISD::UADDSAT, MVT::v8i16, 1 },
{ ISD::UADDSAT, MVT::v16i8, 1 },
+ { ISD::UMAX, MVT::v8i16, 2 },
{ ISD::UMAX, MVT::v16i8, 1 },
+ { ISD::UMIN, MVT::v8i16, 2 },
{ ISD::UMIN, MVT::v16i8, 1 },
{ ISD::USUBSAT, MVT::v8i16, 1 },
{ ISD::USUBSAT, MVT::v16i8, 1 },
; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.umax.v32i8(<32 x i8> undef, <32 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.umax.v32i8(<32 x i8> undef, <32 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.umin.v32i8(<32 x i8> undef, <32 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.umin.v32i8(<32 x i8> undef, <32 x i8> undef)
; X86-SSE2-LABEL: test_reduce_v8i16:
; X86-SSE2: ## %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; X86-SSE2-NEXT: pxor %xmm2, %xmm0
-; X86-SSE2-NEXT: pxor %xmm2, %xmm1
-; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT: psubusw %xmm0, %xmm1
+; X86-SSE2-NEXT: paddw %xmm0, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE2-NEXT: psubusw %xmm1, %xmm0
+; X86-SSE2-NEXT: paddw %xmm1, %xmm0
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psrld $16, %xmm1
-; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT: psubusw %xmm0, %xmm1
+; X86-SSE2-NEXT: paddw %xmm0, %xmm1
; X86-SSE2-NEXT: movd %xmm1, %eax
-; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000
; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax
; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: test_reduce_v8i16:
; X64-SSE2: ## %bb.0:
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; X64-SSE2-NEXT: pxor %xmm2, %xmm0
-; X64-SSE2-NEXT: pxor %xmm2, %xmm1
-; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT: psubusw %xmm0, %xmm1
+; X64-SSE2-NEXT: paddw %xmm0, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE2-NEXT: psubusw %xmm1, %xmm0
+; X64-SSE2-NEXT: paddw %xmm1, %xmm0
; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
; X64-SSE2-NEXT: psrld $16, %xmm1
-; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT: psubusw %xmm0, %xmm1
+; X64-SSE2-NEXT: paddw %xmm0, %xmm1
; X64-SSE2-NEXT: movd %xmm1, %eax
-; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000
; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax
; X64-SSE2-NEXT: retq
;
define i16 @test_reduce_v16i16(<16 x i16> %a0) {
; X86-SSE2-LABEL: test_reduce_v16i16:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; X86-SSE2-NEXT: pxor %xmm2, %xmm1
-; X86-SSE2-NEXT: pxor %xmm2, %xmm0
-; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT: psrld $16, %xmm1
-; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; X86-SSE2-NEXT: movd %xmm1, %eax
-; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000
+; X86-SSE2-NEXT: psubusw %xmm0, %xmm1
+; X86-SSE2-NEXT: paddw %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X86-SSE2-NEXT: psubusw %xmm1, %xmm0
+; X86-SSE2-NEXT: paddw %xmm1, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT: psubusw %xmm0, %xmm1
+; X86-SSE2-NEXT: paddw %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: psubusw %xmm1, %xmm0
+; X86-SSE2-NEXT: paddw %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax
; X86-SSE2-NEXT: retl
;
;
; X64-SSE2-LABEL: test_reduce_v16i16:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; X64-SSE2-NEXT: pxor %xmm2, %xmm1
-; X64-SSE2-NEXT: pxor %xmm2, %xmm0
-; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE2-NEXT: psrld $16, %xmm1
-; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; X64-SSE2-NEXT: movd %xmm1, %eax
-; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000
+; X64-SSE2-NEXT: psubusw %xmm0, %xmm1
+; X64-SSE2-NEXT: paddw %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X64-SSE2-NEXT: psubusw %xmm1, %xmm0
+; X64-SSE2-NEXT: paddw %xmm1, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X64-SSE2-NEXT: psubusw %xmm0, %xmm1
+; X64-SSE2-NEXT: paddw %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: psubusw %xmm1, %xmm0
+; X64-SSE2-NEXT: paddw %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax
; X64-SSE2-NEXT: retq
;
define i16 @test_reduce_v32i16(<32 x i16> %a0) {
; X86-SSE2-LABEL: test_reduce_v32i16:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; X86-SSE2-NEXT: pxor %xmm4, %xmm3
-; X86-SSE2-NEXT: pxor %xmm4, %xmm1
-; X86-SSE2-NEXT: pmaxsw %xmm3, %xmm1
-; X86-SSE2-NEXT: pxor %xmm4, %xmm2
-; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm2
-; X86-SSE2-NEXT: pxor %xmm4, %xmm0
-; X86-SSE2-NEXT: pmaxsw %xmm2, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT: psrld $16, %xmm1
-; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; X86-SSE2-NEXT: movd %xmm1, %eax
-; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000
+; X86-SSE2-NEXT: psubusw %xmm0, %xmm2
+; X86-SSE2-NEXT: paddw %xmm0, %xmm2
+; X86-SSE2-NEXT: psubusw %xmm1, %xmm3
+; X86-SSE2-NEXT: paddw %xmm1, %xmm3
+; X86-SSE2-NEXT: psubusw %xmm2, %xmm3
+; X86-SSE2-NEXT: paddw %xmm2, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; X86-SSE2-NEXT: psubusw %xmm3, %xmm0
+; X86-SSE2-NEXT: paddw %xmm3, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT: psubusw %xmm0, %xmm1
+; X86-SSE2-NEXT: paddw %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: psubusw %xmm1, %xmm0
+; X86-SSE2-NEXT: paddw %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax
; X86-SSE2-NEXT: retl
;
;
; X64-SSE2-LABEL: test_reduce_v32i16:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; X64-SSE2-NEXT: pxor %xmm4, %xmm3
-; X64-SSE2-NEXT: pxor %xmm4, %xmm1
-; X64-SSE2-NEXT: pmaxsw %xmm3, %xmm1
-; X64-SSE2-NEXT: pxor %xmm4, %xmm2
-; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm2
-; X64-SSE2-NEXT: pxor %xmm4, %xmm0
-; X64-SSE2-NEXT: pmaxsw %xmm2, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE2-NEXT: psrld $16, %xmm1
-; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; X64-SSE2-NEXT: movd %xmm1, %eax
-; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000
+; X64-SSE2-NEXT: psubusw %xmm0, %xmm2
+; X64-SSE2-NEXT: paddw %xmm0, %xmm2
+; X64-SSE2-NEXT: psubusw %xmm1, %xmm3
+; X64-SSE2-NEXT: paddw %xmm1, %xmm3
+; X64-SSE2-NEXT: psubusw %xmm2, %xmm3
+; X64-SSE2-NEXT: paddw %xmm2, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; X64-SSE2-NEXT: psubusw %xmm3, %xmm0
+; X64-SSE2-NEXT: paddw %xmm3, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X64-SSE2-NEXT: psubusw %xmm0, %xmm1
+; X64-SSE2-NEXT: paddw %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: psubusw %xmm1, %xmm0
+; X64-SSE2-NEXT: paddw %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax
; X64-SSE2-NEXT: retq
;
; X86-SSE2-LABEL: test_reduce_v16i16_v8i16:
; X86-SSE2: ## %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; X86-SSE2-NEXT: pxor %xmm2, %xmm0
-; X86-SSE2-NEXT: pxor %xmm2, %xmm1
-; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT: psubusw %xmm0, %xmm1
+; X86-SSE2-NEXT: paddw %xmm0, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE2-NEXT: psubusw %xmm1, %xmm0
+; X86-SSE2-NEXT: paddw %xmm1, %xmm0
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psrld $16, %xmm1
-; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT: psubusw %xmm0, %xmm1
+; X86-SSE2-NEXT: paddw %xmm0, %xmm1
; X86-SSE2-NEXT: movd %xmm1, %eax
-; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000
; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax
; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: test_reduce_v16i16_v8i16:
; X64-SSE2: ## %bb.0:
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; X64-SSE2-NEXT: pxor %xmm2, %xmm0
-; X64-SSE2-NEXT: pxor %xmm2, %xmm1
-; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT: psubusw %xmm0, %xmm1
+; X64-SSE2-NEXT: paddw %xmm0, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE2-NEXT: psubusw %xmm1, %xmm0
+; X64-SSE2-NEXT: paddw %xmm1, %xmm0
; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
; X64-SSE2-NEXT: psrld $16, %xmm1
-; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT: psubusw %xmm0, %xmm1
+; X64-SSE2-NEXT: paddw %xmm0, %xmm1
; X64-SSE2-NEXT: movd %xmm1, %eax
-; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000
; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax
; X64-SSE2-NEXT: retq
;
; X86-SSE2-LABEL: test_reduce_v32i16_v8i16:
; X86-SSE2: ## %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; X86-SSE2-NEXT: pxor %xmm2, %xmm0
-; X86-SSE2-NEXT: pxor %xmm2, %xmm1
-; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT: psubusw %xmm0, %xmm1
+; X86-SSE2-NEXT: paddw %xmm0, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE2-NEXT: psubusw %xmm1, %xmm0
+; X86-SSE2-NEXT: paddw %xmm1, %xmm0
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psrld $16, %xmm1
-; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT: psubusw %xmm0, %xmm1
+; X86-SSE2-NEXT: paddw %xmm0, %xmm1
; X86-SSE2-NEXT: movd %xmm1, %eax
-; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000
; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax
; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: test_reduce_v32i16_v8i16:
; X64-SSE2: ## %bb.0:
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; X64-SSE2-NEXT: pxor %xmm2, %xmm0
-; X64-SSE2-NEXT: pxor %xmm2, %xmm1
-; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT: psubusw %xmm0, %xmm1
+; X64-SSE2-NEXT: paddw %xmm0, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE2-NEXT: psubusw %xmm1, %xmm0
+; X64-SSE2-NEXT: paddw %xmm1, %xmm0
; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
; X64-SSE2-NEXT: psrld $16, %xmm1
-; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT: psubusw %xmm0, %xmm1
+; X64-SSE2-NEXT: paddw %xmm0, %xmm1
; X64-SSE2-NEXT: movd %xmm1, %eax
-; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000
; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax
; X64-SSE2-NEXT: retq
;
; X86-SSE2-LABEL: test_reduce_v8i16:
; X86-SSE2: ## %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; X86-SSE2-NEXT: pxor %xmm2, %xmm0
-; X86-SSE2-NEXT: pxor %xmm2, %xmm1
-; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X86-SSE2-NEXT: psubw %xmm2, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X86-SSE2-NEXT: psubw %xmm2, %xmm0
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psrld $16, %xmm1
-; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
-; X86-SSE2-NEXT: movd %xmm1, %eax
-; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X86-SSE2-NEXT: psubw %xmm2, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax
; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: test_reduce_v8i16:
; X64-SSE2: ## %bb.0:
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; X64-SSE2-NEXT: pxor %xmm2, %xmm0
-; X64-SSE2-NEXT: pxor %xmm2, %xmm1
-; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X64-SSE2-NEXT: psubw %xmm2, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X64-SSE2-NEXT: psubw %xmm2, %xmm0
; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
; X64-SSE2-NEXT: psrld $16, %xmm1
-; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
-; X64-SSE2-NEXT: movd %xmm1, %eax
-; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X64-SSE2-NEXT: psubw %xmm2, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax
; X64-SSE2-NEXT: retq
;
define i16 @test_reduce_v16i16(<16 x i16> %a0) {
; X86-SSE2-LABEL: test_reduce_v16i16:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; X86-SSE2-NEXT: pxor %xmm2, %xmm1
-; X86-SSE2-NEXT: pxor %xmm2, %xmm0
-; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X86-SSE2-NEXT: psubw %xmm2, %xmm0
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X86-SSE2-NEXT: psubw %xmm2, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X86-SSE2-NEXT: psubw %xmm2, %xmm0
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psrld $16, %xmm1
-; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
-; X86-SSE2-NEXT: movd %xmm1, %eax
-; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X86-SSE2-NEXT: psubw %xmm2, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax
; X86-SSE2-NEXT: retl
;
;
; X64-SSE2-LABEL: test_reduce_v16i16:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; X64-SSE2-NEXT: pxor %xmm2, %xmm1
-; X64-SSE2-NEXT: pxor %xmm2, %xmm0
-; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X64-SSE2-NEXT: psubw %xmm2, %xmm0
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X64-SSE2-NEXT: psubw %xmm2, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X64-SSE2-NEXT: psubw %xmm2, %xmm0
; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
; X64-SSE2-NEXT: psrld $16, %xmm1
-; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
-; X64-SSE2-NEXT: movd %xmm1, %eax
-; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X64-SSE2-NEXT: psubw %xmm2, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax
; X64-SSE2-NEXT: retq
;
define i16 @test_reduce_v32i16(<32 x i16> %a0) {
; X86-SSE2-LABEL: test_reduce_v32i16:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; X86-SSE2-NEXT: pxor %xmm4, %xmm3
-; X86-SSE2-NEXT: pxor %xmm4, %xmm1
-; X86-SSE2-NEXT: pminsw %xmm3, %xmm1
-; X86-SSE2-NEXT: pxor %xmm4, %xmm2
-; X86-SSE2-NEXT: pminsw %xmm1, %xmm2
-; X86-SSE2-NEXT: pxor %xmm4, %xmm0
-; X86-SSE2-NEXT: pminsw %xmm2, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT: psubusw %xmm3, %xmm4
+; X86-SSE2-NEXT: psubw %xmm4, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: psubusw %xmm2, %xmm3
+; X86-SSE2-NEXT: psubw %xmm3, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X86-SSE2-NEXT: psubw %xmm2, %xmm0
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X86-SSE2-NEXT: psubw %xmm2, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X86-SSE2-NEXT: psubw %xmm2, %xmm0
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psrld $16, %xmm1
-; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
-; X86-SSE2-NEXT: movd %xmm1, %eax
-; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X86-SSE2-NEXT: psubw %xmm2, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax
; X86-SSE2-NEXT: retl
;
;
; X64-SSE2-LABEL: test_reduce_v32i16:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; X64-SSE2-NEXT: pxor %xmm4, %xmm3
-; X64-SSE2-NEXT: pxor %xmm4, %xmm1
-; X64-SSE2-NEXT: pminsw %xmm3, %xmm1
-; X64-SSE2-NEXT: pxor %xmm4, %xmm2
-; X64-SSE2-NEXT: pminsw %xmm1, %xmm2
-; X64-SSE2-NEXT: pxor %xmm4, %xmm0
-; X64-SSE2-NEXT: pminsw %xmm2, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT: psubusw %xmm3, %xmm4
+; X64-SSE2-NEXT: psubw %xmm4, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: psubusw %xmm2, %xmm3
+; X64-SSE2-NEXT: psubw %xmm3, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X64-SSE2-NEXT: psubw %xmm2, %xmm0
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X64-SSE2-NEXT: psubw %xmm2, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X64-SSE2-NEXT: psubw %xmm2, %xmm0
; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
; X64-SSE2-NEXT: psrld $16, %xmm1
-; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
-; X64-SSE2-NEXT: movd %xmm1, %eax
-; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X64-SSE2-NEXT: psubw %xmm2, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax
; X64-SSE2-NEXT: retq
;
; X86-SSE2-LABEL: test_reduce_v16i16_v8i16:
; X86-SSE2: ## %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; X86-SSE2-NEXT: pxor %xmm2, %xmm0
-; X86-SSE2-NEXT: pxor %xmm2, %xmm1
-; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X86-SSE2-NEXT: psubw %xmm2, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X86-SSE2-NEXT: psubw %xmm2, %xmm0
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psrld $16, %xmm1
-; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
-; X86-SSE2-NEXT: movd %xmm1, %eax
-; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X86-SSE2-NEXT: psubw %xmm2, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax
; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: test_reduce_v16i16_v8i16:
; X64-SSE2: ## %bb.0:
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; X64-SSE2-NEXT: pxor %xmm2, %xmm0
-; X64-SSE2-NEXT: pxor %xmm2, %xmm1
-; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X64-SSE2-NEXT: psubw %xmm2, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X64-SSE2-NEXT: psubw %xmm2, %xmm0
; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
; X64-SSE2-NEXT: psrld $16, %xmm1
-; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
-; X64-SSE2-NEXT: movd %xmm1, %eax
-; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X64-SSE2-NEXT: psubw %xmm2, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax
; X64-SSE2-NEXT: retq
;
; X86-SSE2-LABEL: test_reduce_v32i16_v8i16:
; X86-SSE2: ## %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; X86-SSE2-NEXT: pxor %xmm2, %xmm0
-; X86-SSE2-NEXT: pxor %xmm2, %xmm1
-; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X86-SSE2-NEXT: psubw %xmm2, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X86-SSE2-NEXT: psubw %xmm2, %xmm0
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psrld $16, %xmm1
-; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
-; X86-SSE2-NEXT: movd %xmm1, %eax
-; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X86-SSE2-NEXT: psubw %xmm2, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax
; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: test_reduce_v32i16_v8i16:
; X64-SSE2: ## %bb.0:
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; X64-SSE2-NEXT: pxor %xmm2, %xmm0
-; X64-SSE2-NEXT: pxor %xmm2, %xmm1
-; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X64-SSE2-NEXT: psubw %xmm2, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X64-SSE2-NEXT: psubw %xmm2, %xmm0
; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
; X64-SSE2-NEXT: psrld $16, %xmm1
-; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
-; X64-SSE2-NEXT: movd %xmm1, %eax
-; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: psubusw %xmm1, %xmm2
+; X64-SSE2-NEXT: psubw %xmm2, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax
; X64-SSE2-NEXT: retq
;
; SSE-LABEL: reassociate_umax_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: paddw %xmm1, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE-NEXT: pxor %xmm1, %xmm2
-; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: pmaxsw %xmm2, %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm3
-; SSE-NEXT: pmaxsw %xmm3, %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm0
+; SSE-NEXT: psubusw %xmm2, %xmm0
+; SSE-NEXT: paddw %xmm2, %xmm0
+; SSE-NEXT: psubusw %xmm3, %xmm0
+; SSE-NEXT: paddw %xmm3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_umax_v8i16:
; SSE-LABEL: reassociate_umin_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: paddw %xmm1, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE-NEXT: pxor %xmm1, %xmm2
-; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: pminsw %xmm2, %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm3
-; SSE-NEXT: pminsw %xmm3, %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: psubusw %xmm0, %xmm1
+; SSE-NEXT: psubw %xmm1, %xmm2
+; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: psubusw %xmm2, %xmm0
+; SSE-NEXT: psubw %xmm0, %xmm3
+; SSE-NEXT: movdqa %xmm3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_umin_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: paddw %xmm2, %xmm0
; SSE-NEXT: paddw %xmm3, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE-NEXT: pxor %xmm2, %xmm5
-; SSE-NEXT: pxor %xmm2, %xmm1
-; SSE-NEXT: pmaxsw %xmm5, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm4
-; SSE-NEXT: pxor %xmm2, %xmm0
-; SSE-NEXT: pmaxsw %xmm4, %xmm0
-; SSE-NEXT: pxor %xmm2, %xmm6
-; SSE-NEXT: pmaxsw %xmm6, %xmm0
-; SSE-NEXT: pxor %xmm2, %xmm0
-; SSE-NEXT: pxor %xmm2, %xmm7
-; SSE-NEXT: pmaxsw %xmm7, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm1
+; SSE-NEXT: psubusw %xmm5, %xmm1
+; SSE-NEXT: paddw %xmm5, %xmm1
+; SSE-NEXT: psubusw %xmm4, %xmm0
+; SSE-NEXT: paddw %xmm4, %xmm0
+; SSE-NEXT: psubusw %xmm6, %xmm0
+; SSE-NEXT: paddw %xmm6, %xmm0
+; SSE-NEXT: psubusw %xmm7, %xmm1
+; SSE-NEXT: paddw %xmm7, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_umax_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: paddw %xmm2, %xmm0
; SSE-NEXT: paddw %xmm3, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE-NEXT: pxor %xmm2, %xmm5
-; SSE-NEXT: pxor %xmm2, %xmm1
-; SSE-NEXT: pminsw %xmm5, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm4
-; SSE-NEXT: pxor %xmm2, %xmm0
-; SSE-NEXT: pminsw %xmm4, %xmm0
-; SSE-NEXT: pxor %xmm2, %xmm6
-; SSE-NEXT: pminsw %xmm6, %xmm0
-; SSE-NEXT: pxor %xmm2, %xmm0
-; SSE-NEXT: pxor %xmm2, %xmm7
-; SSE-NEXT: pminsw %xmm7, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm5, %xmm2
+; SSE-NEXT: psubusw %xmm1, %xmm2
+; SSE-NEXT: psubw %xmm2, %xmm5
+; SSE-NEXT: movdqa %xmm4, %xmm1
+; SSE-NEXT: psubusw %xmm0, %xmm1
+; SSE-NEXT: psubw %xmm1, %xmm4
+; SSE-NEXT: movdqa %xmm6, %xmm0
+; SSE-NEXT: psubusw %xmm4, %xmm0
+; SSE-NEXT: psubw %xmm0, %xmm6
+; SSE-NEXT: movdqa %xmm7, %xmm0
+; SSE-NEXT: psubusw %xmm5, %xmm0
+; SSE-NEXT: psubw %xmm0, %xmm7
+; SSE-NEXT: movdqa %xmm6, %xmm0
+; SSE-NEXT: movdqa %xmm7, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_umin_v16i16:
define <32 x i16> @reassociate_umax_v32i16(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, <32 x i16> %x3) {
; SSE-LABEL: reassociate_umax_v32i16:
; SSE: # %bb.0:
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm15
; SSE-NEXT: paddw %xmm4, %xmm0
; SSE-NEXT: paddw %xmm5, %xmm1
; SSE-NEXT: paddw %xmm6, %xmm2
; SSE-NEXT: paddw %xmm7, %xmm3
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5
-; SSE-NEXT: pxor %xmm4, %xmm5
-; SSE-NEXT: pmaxsw %xmm3, %xmm5
-; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: pmaxsw %xmm2, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm1
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2
-; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: pmaxsw %xmm1, %xmm2
-; SSE-NEXT: pxor %xmm4, %xmm0
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: pxor %xmm4, %xmm1
-; SSE-NEXT: pmaxsw %xmm0, %xmm1
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0
-; SSE-NEXT: pxor %xmm4, %xmm0
-; SSE-NEXT: pmaxsw %xmm1, %xmm0
-; SSE-NEXT: pxor %xmm4, %xmm0
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: pxor %xmm4, %xmm1
-; SSE-NEXT: pmaxsw %xmm2, %xmm1
-; SSE-NEXT: pxor %xmm4, %xmm1
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2
-; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: pmaxsw %xmm3, %xmm2
-; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: pmaxsw %xmm5, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm3
+; SSE-NEXT: psubusw %xmm15, %xmm3
+; SSE-NEXT: paddw %xmm15, %xmm3
+; SSE-NEXT: psubusw %xmm14, %xmm2
+; SSE-NEXT: paddw %xmm14, %xmm2
+; SSE-NEXT: psubusw %xmm13, %xmm1
+; SSE-NEXT: paddw %xmm13, %xmm1
+; SSE-NEXT: psubusw %xmm12, %xmm0
+; SSE-NEXT: paddw %xmm12, %xmm0
+; SSE-NEXT: psubusw %xmm11, %xmm0
+; SSE-NEXT: paddw %xmm11, %xmm0
+; SSE-NEXT: psubusw %xmm10, %xmm1
+; SSE-NEXT: paddw %xmm10, %xmm1
+; SSE-NEXT: psubusw %xmm9, %xmm2
+; SSE-NEXT: paddw %xmm9, %xmm2
+; SSE-NEXT: psubusw %xmm8, %xmm3
+; SSE-NEXT: paddw %xmm8, %xmm3
; SSE-NEXT: retq
;
; AVX2-LABEL: reassociate_umax_v32i16:
define <32 x i16> @reassociate_umin_v32i16(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, <32 x i16> %x3) {
; SSE-LABEL: reassociate_umin_v32i16:
; SSE: # %bb.0:
-; SSE-NEXT: paddw %xmm4, %xmm0
-; SSE-NEXT: paddw %xmm5, %xmm1
-; SSE-NEXT: paddw %xmm6, %xmm2
-; SSE-NEXT: paddw %xmm7, %xmm3
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5
-; SSE-NEXT: pxor %xmm4, %xmm5
-; SSE-NEXT: pminsw %xmm3, %xmm5
-; SSE-NEXT: pxor %xmm4, %xmm2
+; SSE-NEXT: movdqa %xmm3, %xmm8
+; SSE-NEXT: movdqa %xmm2, %xmm9
+; SSE-NEXT: movdqa %xmm1, %xmm10
+; SSE-NEXT: movdqa %xmm0, %xmm11
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: pminsw %xmm2, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm1
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2
-; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: pminsw %xmm1, %xmm2
-; SSE-NEXT: pxor %xmm4, %xmm0
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: pxor %xmm4, %xmm1
-; SSE-NEXT: pminsw %xmm0, %xmm1
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0
-; SSE-NEXT: pxor %xmm4, %xmm0
-; SSE-NEXT: pminsw %xmm1, %xmm0
-; SSE-NEXT: pxor %xmm4, %xmm0
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: pxor %xmm4, %xmm1
-; SSE-NEXT: pminsw %xmm2, %xmm1
-; SSE-NEXT: pxor %xmm4, %xmm1
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2
-; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: pminsw %xmm3, %xmm2
-; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: pminsw %xmm5, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm3
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm15
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12
+; SSE-NEXT: paddw %xmm4, %xmm11
+; SSE-NEXT: paddw %xmm5, %xmm10
+; SSE-NEXT: paddw %xmm6, %xmm9
+; SSE-NEXT: paddw %xmm7, %xmm8
+; SSE-NEXT: movdqa %xmm12, %xmm4
+; SSE-NEXT: psubusw %xmm8, %xmm4
+; SSE-NEXT: psubw %xmm4, %xmm12
+; SSE-NEXT: movdqa %xmm13, %xmm4
+; SSE-NEXT: psubusw %xmm9, %xmm4
+; SSE-NEXT: psubw %xmm4, %xmm13
+; SSE-NEXT: movdqa %xmm15, %xmm4
+; SSE-NEXT: psubusw %xmm10, %xmm4
+; SSE-NEXT: psubw %xmm4, %xmm15
+; SSE-NEXT: movdqa %xmm14, %xmm4
+; SSE-NEXT: psubusw %xmm11, %xmm4
+; SSE-NEXT: psubw %xmm4, %xmm14
+; SSE-NEXT: movdqa %xmm0, %xmm4
+; SSE-NEXT: psubusw %xmm14, %xmm4
+; SSE-NEXT: psubw %xmm4, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: psubusw %xmm15, %xmm4
+; SSE-NEXT: psubw %xmm4, %xmm1
+; SSE-NEXT: movdqa %xmm2, %xmm4
+; SSE-NEXT: psubusw %xmm13, %xmm4
+; SSE-NEXT: psubw %xmm4, %xmm2
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: psubusw %xmm12, %xmm4
+; SSE-NEXT: psubw %xmm4, %xmm3
; SSE-NEXT: retq
;
; AVX2-LABEL: reassociate_umin_v32i16:
; SSE2-LABEL: truncstore_v32i16_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm6, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [33023,33023,33023,33023,33023,33023,33023,33023]
-; SSE2-NEXT: pminsw %xmm8, %xmm1
-; SSE2-NEXT: pxor %xmm6, %xmm1
-; SSE2-NEXT: pxor %xmm6, %xmm0
-; SSE2-NEXT: pminsw %xmm8, %xmm0
-; SSE2-NEXT: pxor %xmm6, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psubusw %xmm8, %xmm6
+; SSE2-NEXT: psubw %xmm6, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: psubusw %xmm8, %xmm6
+; SSE2-NEXT: psubw %xmm6, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: pcmpeqb %xmm7, %xmm4
; SSE2-NEXT: pmovmskb %xmm4, %ecx
; SSE2-NEXT: # %bb.23: # %cond.store21
; SSE2-NEXT: movb %ch, 11(%rdi)
; SSE2-NEXT: .LBB15_24: # %else22
-; SSE2-NEXT: pxor %xmm6, %xmm3
-; SSE2-NEXT: pxor %xmm6, %xmm2
; SSE2-NEXT: testl $4096, %eax # imm = 0x1000
; SSE2-NEXT: pextrw $6, %xmm0, %ecx
; SSE2-NEXT: je .LBB15_26
; SSE2-NEXT: # %bb.25: # %cond.store23
; SSE2-NEXT: movb %cl, 12(%rdi)
; SSE2-NEXT: .LBB15_26: # %else24
-; SSE2-NEXT: pminsw %xmm8, %xmm3
-; SSE2-NEXT: pminsw %xmm8, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: psubusw %xmm8, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psubusw %xmm8, %xmm4
; SSE2-NEXT: testl $8192, %eax # imm = 0x2000
; SSE2-NEXT: je .LBB15_28
; SSE2-NEXT: # %bb.27: # %cond.store25
; SSE2-NEXT: movb %ch, 13(%rdi)
; SSE2-NEXT: .LBB15_28: # %else26
-; SSE2-NEXT: pxor %xmm6, %xmm3
-; SSE2-NEXT: pxor %xmm6, %xmm2
+; SSE2-NEXT: psubw %xmm1, %xmm3
+; SSE2-NEXT: psubw %xmm4, %xmm2
; SSE2-NEXT: testl $16384, %eax # imm = 0x4000
; SSE2-NEXT: pextrw $7, %xmm0, %ecx
; SSE2-NEXT: je .LBB15_30
; SSE2-LABEL: truncstore_v16i16_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [33023,33023,33023,33023,33023,33023,33023,33023]
-; SSE2-NEXT: pminsw %xmm5, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pminsw %xmm5, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psubusw %xmm4, %xmm5
+; SSE2-NEXT: psubw %xmm5, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psubusw %xmm4, %xmm5
+; SSE2-NEXT: psubw %xmm5, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: pcmpeqb %xmm2, %xmm3
; SSE2-NEXT: pmovmskb %xmm3, %eax
; SSE2-LABEL: truncstore_v8i16_v8i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm3, %xmm0
-; SSE2-NEXT: pminsw {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pxor %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psubusw {{.*}}(%rip), %xmm3
+; SSE2-NEXT: psubw %xmm3, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: pcmpeqw %xmm1, %xmm2
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-LABEL: vec128_i16_unsigned_reg_reg:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
-; SSE2-NEXT: por {{.*}}(%rip), %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pminsw %xmm1, %xmm5
-; SSE2-NEXT: pxor %xmm3, %xmm5
-; SSE2-NEXT: pmaxsw %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm2
-; SSE2-NEXT: psubw %xmm5, %xmm2
+; SSE2-NEXT: pxor %xmm0, %xmm3
+; SSE2-NEXT: pcmpgtw %xmm2, %xmm3
+; SSE2-NEXT: por {{.*}}(%rip), %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psubusw %xmm1, %xmm2
+; SSE2-NEXT: psubusw %xmm0, %xmm1
+; SSE2-NEXT: psubw %xmm0, %xmm2
+; SSE2-NEXT: paddw %xmm0, %xmm2
+; SSE2-NEXT: paddw %xmm1, %xmm2
; SSE2-NEXT: psrlw $1, %xmm2
-; SSE2-NEXT: pmullw %xmm4, %xmm2
+; SSE2-NEXT: pmullw %xmm3, %xmm2
; SSE2-NEXT: paddw %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
define <8 x i16> @unsigned_sat_constant_v8i16_using_min(<8 x i16> %x) {
; SSE2-LABEL: unsigned_sat_constant_v8i16_using_min:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: pminsw {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psubusw {{.*}}(%rip), %xmm1
+; SSE2-NEXT: psubw %xmm1, %xmm0
; SSE2-NEXT: paddw {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
define <8 x i16> @unsigned_sat_variable_v8i16_using_min(<8 x i16> %x, <8 x i16> %y) {
; SSE2-LABEL: unsigned_sat_variable_v8i16_using_min:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32767,32767,32767,32767,32767,32767,32767,32767]
-; SSE2-NEXT: pxor %xmm1, %xmm3
-; SSE2-NEXT: pminsw %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psubusw %xmm2, %xmm3
+; SSE2-NEXT: psubw %xmm3, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: retq
;
define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE-LABEL: test_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE-NEXT: pxor %xmm2, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm0
-; SSE-NEXT: pmaxsw %xmm1, %xmm0
-; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: psubusw %xmm0, %xmm1
+; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8i16:
define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE-LABEL: test_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE-NEXT: pxor %xmm2, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm0
-; SSE-NEXT: pminsw %xmm1, %xmm0
-; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: psubusw %xmm1, %xmm2
+; SSE-NEXT: psubw %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8i16:
define <8 x i16> @max_gt_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: max_gt_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubusw %xmm0, %xmm1
+; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_gt_v8i16:
define <16 x i16> @max_gt_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: max_gt_v16i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pmaxsw %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pmaxsw %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: psubusw %xmm0, %xmm2
+; SSE2-NEXT: paddw %xmm2, %xmm0
+; SSE2-NEXT: psubusw %xmm1, %xmm3
+; SSE2-NEXT: paddw %xmm3, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_gt_v16i16:
define <8 x i16> @max_ge_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: max_ge_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubusw %xmm0, %xmm1
+; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_ge_v8i16:
define <16 x i16> @max_ge_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: max_ge_v16i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pmaxsw %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pmaxsw %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: psubusw %xmm0, %xmm2
+; SSE2-NEXT: paddw %xmm2, %xmm0
+; SSE2-NEXT: psubusw %xmm1, %xmm3
+; SSE2-NEXT: paddw %xmm3, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_ge_v16i16:
define <8 x i16> @min_lt_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: min_lt_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pminsw %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psubusw %xmm1, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_lt_v8i16:
define <16 x i16> @min_lt_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: min_lt_v16i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pminsw %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pminsw %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psubusw %xmm2, %xmm4
+; SSE2-NEXT: psubw %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psubusw %xmm3, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_lt_v16i16:
define <8 x i16> @min_le_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: min_le_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pminsw %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psubusw %xmm1, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_le_v8i16:
define <16 x i16> @min_le_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: min_le_v16i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pminsw %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pminsw %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psubusw %xmm2, %xmm4
+; SSE2-NEXT: psubw %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psubusw %xmm3, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_le_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; SSE2-NEXT: psubusw %xmm0, %xmm1
+; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
; SSE2-LABEL: test_v4i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; SSE2-NEXT: psubusw %xmm0, %xmm1
+; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; SSE2-NEXT: psubusw %xmm1, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
; SSE2-LABEL: test_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; SSE2-NEXT: psubusw %xmm0, %xmm1
+; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; SSE2-NEXT: psubusw %xmm1, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; SSE2-NEXT: psubusw %xmm0, %xmm1
+; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
define i16 @test_v16i16(<16 x i16> %a0) {
; SSE2-LABEL: test_v16i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
+; SSE2-NEXT: psubusw %xmm0, %xmm1
+; SSE2-NEXT: paddw %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT: psubusw %xmm1, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT: psubusw %xmm0, %xmm1
+; SSE2-NEXT: paddw %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrld $16, %xmm0
+; SSE2-NEXT: psubusw %xmm1, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
define i16 @test_v32i16(<32 x i16> %a0) {
; SSE2-LABEL: test_v32i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pmaxsw %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pmaxsw %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pmaxsw %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
+; SSE2-NEXT: psubusw %xmm0, %xmm2
+; SSE2-NEXT: paddw %xmm0, %xmm2
+; SSE2-NEXT: psubusw %xmm1, %xmm3
+; SSE2-NEXT: paddw %xmm1, %xmm3
+; SSE2-NEXT: psubusw %xmm2, %xmm3
+; SSE2-NEXT: paddw %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; SSE2-NEXT: psubusw %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm3, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT: psubusw %xmm0, %xmm1
+; SSE2-NEXT: paddw %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrld $16, %xmm0
+; SSE2-NEXT: psubusw %xmm1, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
define i16 @test_v64i16(<64 x i16> %a0) {
; SSE2-LABEL: test_v64i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm8, %xmm6
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: pmaxsw %xmm6, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: pmaxsw %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm7
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: pmaxsw %xmm7, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm5
-; SSE2-NEXT: pmaxsw %xmm3, %xmm5
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pmaxsw %xmm5, %xmm1
-; SSE2-NEXT: pmaxsw %xmm4, %xmm1
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; SSE2-NEXT: psubusw %xmm1, %xmm5
+; SSE2-NEXT: paddw %xmm1, %xmm5
+; SSE2-NEXT: psubusw %xmm3, %xmm7
+; SSE2-NEXT: paddw %xmm3, %xmm7
+; SSE2-NEXT: psubusw %xmm0, %xmm4
+; SSE2-NEXT: paddw %xmm0, %xmm4
+; SSE2-NEXT: psubusw %xmm2, %xmm6
+; SSE2-NEXT: paddw %xmm2, %xmm6
+; SSE2-NEXT: psubusw %xmm4, %xmm6
+; SSE2-NEXT: paddw %xmm4, %xmm6
+; SSE2-NEXT: psubusw %xmm5, %xmm7
+; SSE2-NEXT: paddw %xmm5, %xmm7
+; SSE2-NEXT: psubusw %xmm6, %xmm7
+; SSE2-NEXT: paddw %xmm6, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
+; SSE2-NEXT: psubusw %xmm7, %xmm0
+; SSE2-NEXT: paddw %xmm7, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; SSE2-NEXT: psubusw %xmm0, %xmm1
+; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; SSE2-NEXT: psubusw %xmm1, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psubusw %xmm1, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
; SSE2-LABEL: test_v4i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pminsw %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psubusw %xmm1, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psubusw %xmm1, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
; SSE2-LABEL: test_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; SSE2-NEXT: pminsw %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psubusw %xmm1, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psubusw %xmm1, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psubusw %xmm1, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
define i16 @test_v16i16(<16 x i16> %a0) {
; SSE2-LABEL: test_v16i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pminsw %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psubusw %xmm1, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; SSE2-NEXT: pminsw %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psubusw %xmm1, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psubusw %xmm1, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psubusw %xmm1, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
define i16 @test_v32i16(<32 x i16> %a0) {
; SSE2-LABEL: test_v32i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pminsw %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pminsw %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pminsw %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psubusw %xmm3, %xmm4
+; SSE2-NEXT: psubw %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psubusw %xmm2, %xmm3
+; SSE2-NEXT: psubw %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psubusw %xmm1, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; SSE2-NEXT: pminsw %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psubusw %xmm1, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psubusw %xmm1, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psubusw %xmm1, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
define i16 @test_v64i16(<64 x i16> %a0) {
; SSE2-LABEL: test_v64i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm8, %xmm6
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: pminsw %xmm6, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: pminsw %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm7
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: pminsw %xmm7, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm5
-; SSE2-NEXT: pminsw %xmm3, %xmm5
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pminsw %xmm5, %xmm1
-; SSE2-NEXT: pminsw %xmm4, %xmm1
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE2-NEXT: pminsw %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm8
+; SSE2-NEXT: psubusw %xmm6, %xmm8
+; SSE2-NEXT: psubw %xmm8, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: psubusw %xmm4, %xmm6
+; SSE2-NEXT: psubw %xmm6, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psubusw %xmm7, %xmm4
+; SSE2-NEXT: psubw %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psubusw %xmm5, %xmm4
+; SSE2-NEXT: psubw %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psubusw %xmm3, %xmm4
+; SSE2-NEXT: psubw %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psubusw %xmm2, %xmm3
+; SSE2-NEXT: psubw %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psubusw %xmm1, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psubusw %xmm1, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pminsw %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psubusw %xmm1, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psubusw %xmm1, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
define <8 x i8> @trunc_usat_v8i16_v8i8(<8 x i16> %a0) {
; SSE2-LABEL: trunc_usat_v8i16_v8i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: pminsw {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psubusw {{.*}}(%rip), %xmm1
+; SSE2-NEXT: psubw %xmm1, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_usat_v8i16_v8i8:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSSE3-NEXT: pxor %xmm1, %xmm0
-; SSSE3-NEXT: pminsw {{.*}}(%rip), %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: psubusw {{.*}}(%rip), %xmm1
+; SSSE3-NEXT: psubw %xmm1, %xmm0
; SSSE3-NEXT: packuswb %xmm0, %xmm0
; SSSE3-NEXT: retq
;
define void @trunc_usat_v8i16_v8i8_store(<8 x i16> %a0, <8 x i8> *%p1) {
; SSE2-LABEL: trunc_usat_v8i16_v8i8_store:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: pminsw {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psubusw {{.*}}(%rip), %xmm1
+; SSE2-NEXT: psubw %xmm1, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movq %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_usat_v8i16_v8i8_store:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSSE3-NEXT: pxor %xmm1, %xmm0
-; SSSE3-NEXT: pminsw {{.*}}(%rip), %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: psubusw {{.*}}(%rip), %xmm1
+; SSSE3-NEXT: psubw %xmm1, %xmm0
; SSSE3-NEXT: packuswb %xmm0, %xmm0
; SSSE3-NEXT: movq %xmm0, (%rdi)
; SSSE3-NEXT: retq
define <16 x i8> @trunc_usat_v16i16_v16i8(<16 x i16> %a0) {
; SSE2-LABEL: trunc_usat_v16i16_v16i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [33023,33023,33023,33023,33023,33023,33023,33023]
-; SSE2-NEXT: pminsw %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pminsw %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psubusw %xmm2, %xmm3
+; SSE2-NEXT: psubw %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psubusw %xmm2, %xmm3
+; SSE2-NEXT: psubw %xmm3, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_usat_v16i16_v16i8:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSSE3-NEXT: pxor %xmm2, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [33023,33023,33023,33023,33023,33023,33023,33023]
-; SSSE3-NEXT: pminsw %xmm3, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: pminsw %xmm3, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSSE3-NEXT: movdqa %xmm1, %xmm3
+; SSSE3-NEXT: psubusw %xmm2, %xmm3
+; SSSE3-NEXT: psubw %xmm3, %xmm1
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: psubusw %xmm2, %xmm3
+; SSSE3-NEXT: psubw %xmm3, %xmm0
; SSSE3-NEXT: packuswb %xmm1, %xmm0
; SSSE3-NEXT: retq
;
define <32 x i8> @trunc_usat_v32i16_v32i8(<32 x i16>* %p0) {
; SSE2-LABEL: trunc_usat_v32i16_v32i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: movdqa 48(%rdi), %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [33023,33023,33023,33023,33023,33023,33023,33023]
-; SSE2-NEXT: pminsw %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa 32(%rdi), %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pminsw %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: packuswb %xmm0, %xmm1
-; SSE2-NEXT: movdqa 16(%rdi), %xmm4
-; SSE2-NEXT: pxor %xmm2, %xmm4
-; SSE2-NEXT: pminsw %xmm3, %xmm4
-; SSE2-NEXT: pxor %xmm2, %xmm4
; SSE2-NEXT: movdqa (%rdi), %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pminsw %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: packuswb %xmm4, %xmm0
+; SSE2-NEXT: movdqa 16(%rdi), %xmm2
+; SSE2-NEXT: movdqa 32(%rdi), %xmm1
+; SSE2-NEXT: movdqa 48(%rdi), %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: psubusw %xmm4, %xmm5
+; SSE2-NEXT: psubw %xmm5, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psubusw %xmm4, %xmm5
+; SSE2-NEXT: psubw %xmm5, %xmm1
+; SSE2-NEXT: packuswb %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: psubusw %xmm4, %xmm3
+; SSE2-NEXT: psubw %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psubusw %xmm4, %xmm3
+; SSE2-NEXT: psubw %xmm3, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_usat_v32i16_v32i8:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSSE3-NEXT: movdqa 48(%rdi), %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [33023,33023,33023,33023,33023,33023,33023,33023]
-; SSSE3-NEXT: pminsw %xmm3, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: movdqa 32(%rdi), %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm1
-; SSSE3-NEXT: pminsw %xmm3, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm1
-; SSSE3-NEXT: packuswb %xmm0, %xmm1
-; SSSE3-NEXT: movdqa 16(%rdi), %xmm4
-; SSSE3-NEXT: pxor %xmm2, %xmm4
-; SSSE3-NEXT: pminsw %xmm3, %xmm4
-; SSSE3-NEXT: pxor %xmm2, %xmm4
; SSSE3-NEXT: movdqa (%rdi), %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: pminsw %xmm3, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: packuswb %xmm4, %xmm0
+; SSSE3-NEXT: movdqa 16(%rdi), %xmm2
+; SSSE3-NEXT: movdqa 32(%rdi), %xmm1
+; SSSE3-NEXT: movdqa 48(%rdi), %xmm3
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSSE3-NEXT: movdqa %xmm3, %xmm5
+; SSSE3-NEXT: psubusw %xmm4, %xmm5
+; SSSE3-NEXT: psubw %xmm5, %xmm3
+; SSSE3-NEXT: movdqa %xmm1, %xmm5
+; SSSE3-NEXT: psubusw %xmm4, %xmm5
+; SSSE3-NEXT: psubw %xmm5, %xmm1
+; SSSE3-NEXT: packuswb %xmm3, %xmm1
+; SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSSE3-NEXT: psubusw %xmm4, %xmm3
+; SSSE3-NEXT: psubw %xmm3, %xmm2
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: psubusw %xmm4, %xmm3
+; SSSE3-NEXT: psubw %xmm3, %xmm0
+; SSSE3-NEXT: packuswb %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_usat_v32i16_v32i8:
define <8 x i16> @test13(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: test13:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pminsw %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psubusw %xmm1, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE4-LABEL: test13:
define <8 x i16> @test14(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: test14:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pminsw %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psubusw %xmm1, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE4-LABEL: test14:
define <8 x i16> @test15(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: test15:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubusw %xmm0, %xmm1
+; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE4-LABEL: test15:
define <8 x i16> @test16(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: test16:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubusw %xmm0, %xmm1
+; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE4-LABEL: test16:
define <16 x i16> @test37(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: test37:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pminsw %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pminsw %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psubusw %xmm2, %xmm4
+; SSE2-NEXT: psubw %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psubusw %xmm3, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE4-LABEL: test37:
define <16 x i16> @test38(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: test38:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pminsw %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pminsw %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psubusw %xmm2, %xmm4
+; SSE2-NEXT: psubw %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psubusw %xmm3, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE4-LABEL: test38:
define <16 x i16> @test39(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: test39:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pmaxsw %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pmaxsw %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: psubusw %xmm0, %xmm2
+; SSE2-NEXT: paddw %xmm2, %xmm0
+; SSE2-NEXT: psubusw %xmm1, %xmm3
+; SSE2-NEXT: paddw %xmm3, %xmm1
; SSE2-NEXT: retq
;
; SSE4-LABEL: test39:
define <16 x i16> @test40(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: test40:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pmaxsw %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pmaxsw %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: psubusw %xmm0, %xmm2
+; SSE2-NEXT: paddw %xmm2, %xmm0
+; SSE2-NEXT: psubusw %xmm1, %xmm3
+; SSE2-NEXT: paddw %xmm3, %xmm1
; SSE2-NEXT: retq
;
; SSE4-LABEL: test40:
define <8 x i16> @test61(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: test61:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubusw %xmm0, %xmm1
+; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE4-LABEL: test61:
define <8 x i16> @test62(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: test62:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubusw %xmm0, %xmm1
+; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE4-LABEL: test62:
define <8 x i16> @test63(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: test63:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pminsw %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psubusw %xmm1, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE4-LABEL: test63:
define <8 x i16> @test64(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: test64:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pminsw %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psubusw %xmm1, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE4-LABEL: test64:
define <16 x i16> @test85(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: test85:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pmaxsw %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pmaxsw %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: psubusw %xmm0, %xmm2
+; SSE2-NEXT: paddw %xmm2, %xmm0
+; SSE2-NEXT: psubusw %xmm1, %xmm3
+; SSE2-NEXT: paddw %xmm3, %xmm1
; SSE2-NEXT: retq
;
; SSE4-LABEL: test85:
define <16 x i16> @test86(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: test86:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pmaxsw %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pmaxsw %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: psubusw %xmm0, %xmm2
+; SSE2-NEXT: paddw %xmm2, %xmm0
+; SSE2-NEXT: psubusw %xmm1, %xmm3
+; SSE2-NEXT: paddw %xmm3, %xmm1
; SSE2-NEXT: retq
;
; SSE4-LABEL: test86:
define <16 x i16> @test87(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: test87:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pminsw %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pminsw %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psubusw %xmm2, %xmm4
+; SSE2-NEXT: psubw %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psubusw %xmm3, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE4-LABEL: test87:
define <16 x i16> @test88(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: test88:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pminsw %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pminsw %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psubusw %xmm2, %xmm4
+; SSE2-NEXT: psubw %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psubusw %xmm3, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE4-LABEL: test88:
define <32 x i16> @test109(<32 x i16> %a, <32 x i16> %b) {
; SSE2-LABEL: test109:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: pminsw %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm5
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pminsw %xmm5, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm6
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: pminsw %xmm6, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm7
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: pminsw %xmm7, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm8
+; SSE2-NEXT: psubusw %xmm4, %xmm8
+; SSE2-NEXT: psubw %xmm8, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psubusw %xmm5, %xmm4
+; SSE2-NEXT: psubw %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psubusw %xmm6, %xmm4
+; SSE2-NEXT: psubw %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psubusw %xmm7, %xmm4
+; SSE2-NEXT: psubw %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSE4-LABEL: test109:
define <32 x i16> @test110(<32 x i16> %a, <32 x i16> %b) {
; SSE2-LABEL: test110:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: pminsw %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm5
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pminsw %xmm5, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm6
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: pminsw %xmm6, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm7
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: pminsw %xmm7, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm8
+; SSE2-NEXT: psubusw %xmm4, %xmm8
+; SSE2-NEXT: psubw %xmm8, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psubusw %xmm5, %xmm4
+; SSE2-NEXT: psubw %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psubusw %xmm6, %xmm4
+; SSE2-NEXT: psubw %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psubusw %xmm7, %xmm4
+; SSE2-NEXT: psubw %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSE4-LABEL: test110:
define <32 x i16> @test111(<32 x i16> %a, <32 x i16> %b) {
; SSE2-LABEL: test111:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: pmaxsw %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm5
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pmaxsw %xmm5, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm6
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: pmaxsw %xmm6, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm7
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: pmaxsw %xmm7, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm3
+; SSE2-NEXT: psubusw %xmm0, %xmm4
+; SSE2-NEXT: paddw %xmm4, %xmm0
+; SSE2-NEXT: psubusw %xmm1, %xmm5
+; SSE2-NEXT: paddw %xmm5, %xmm1
+; SSE2-NEXT: psubusw %xmm2, %xmm6
+; SSE2-NEXT: paddw %xmm6, %xmm2
+; SSE2-NEXT: psubusw %xmm3, %xmm7
+; SSE2-NEXT: paddw %xmm7, %xmm3
; SSE2-NEXT: retq
;
; SSE4-LABEL: test111:
define <32 x i16> @test112(<32 x i16> %a, <32 x i16> %b) {
; SSE2-LABEL: test112:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: pmaxsw %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm5
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pmaxsw %xmm5, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm6
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: pmaxsw %xmm6, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm7
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: pmaxsw %xmm7, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm3
+; SSE2-NEXT: psubusw %xmm0, %xmm4
+; SSE2-NEXT: paddw %xmm4, %xmm0
+; SSE2-NEXT: psubusw %xmm1, %xmm5
+; SSE2-NEXT: paddw %xmm5, %xmm1
+; SSE2-NEXT: psubusw %xmm2, %xmm6
+; SSE2-NEXT: paddw %xmm6, %xmm2
+; SSE2-NEXT: psubusw %xmm3, %xmm7
+; SSE2-NEXT: paddw %xmm7, %xmm3
; SSE2-NEXT: retq
;
; SSE4-LABEL: test112:
define <32 x i16> @test141(<32 x i16> %a, <32 x i16> %b) {
; SSE2-LABEL: test141:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: pmaxsw %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm5
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pmaxsw %xmm5, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm6
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: pmaxsw %xmm6, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm7
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: pmaxsw %xmm7, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm3
+; SSE2-NEXT: psubusw %xmm0, %xmm4
+; SSE2-NEXT: paddw %xmm4, %xmm0
+; SSE2-NEXT: psubusw %xmm1, %xmm5
+; SSE2-NEXT: paddw %xmm5, %xmm1
+; SSE2-NEXT: psubusw %xmm2, %xmm6
+; SSE2-NEXT: paddw %xmm6, %xmm2
+; SSE2-NEXT: psubusw %xmm3, %xmm7
+; SSE2-NEXT: paddw %xmm7, %xmm3
; SSE2-NEXT: retq
;
; SSE4-LABEL: test141:
define <32 x i16> @test142(<32 x i16> %a, <32 x i16> %b) {
; SSE2-LABEL: test142:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: pmaxsw %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm5
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pmaxsw %xmm5, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm6
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: pmaxsw %xmm6, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm7
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: pmaxsw %xmm7, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm3
+; SSE2-NEXT: psubusw %xmm0, %xmm4
+; SSE2-NEXT: paddw %xmm4, %xmm0
+; SSE2-NEXT: psubusw %xmm1, %xmm5
+; SSE2-NEXT: paddw %xmm5, %xmm1
+; SSE2-NEXT: psubusw %xmm2, %xmm6
+; SSE2-NEXT: paddw %xmm6, %xmm2
+; SSE2-NEXT: psubusw %xmm3, %xmm7
+; SSE2-NEXT: paddw %xmm7, %xmm3
; SSE2-NEXT: retq
;
; SSE4-LABEL: test142:
define <32 x i16> @test143(<32 x i16> %a, <32 x i16> %b) {
; SSE2-LABEL: test143:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: pminsw %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm5
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pminsw %xmm5, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm6
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: pminsw %xmm6, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm7
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: pminsw %xmm7, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm8
+; SSE2-NEXT: psubusw %xmm4, %xmm8
+; SSE2-NEXT: psubw %xmm8, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psubusw %xmm5, %xmm4
+; SSE2-NEXT: psubw %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psubusw %xmm6, %xmm4
+; SSE2-NEXT: psubw %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psubusw %xmm7, %xmm4
+; SSE2-NEXT: psubw %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSE4-LABEL: test143:
define <32 x i16> @test144(<32 x i16> %a, <32 x i16> %b) {
; SSE2-LABEL: test144:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: pminsw %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm5
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pminsw %xmm5, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm6
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: pminsw %xmm6, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm7
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: pminsw %xmm7, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm8
+; SSE2-NEXT: psubusw %xmm4, %xmm8
+; SSE2-NEXT: psubw %xmm8, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psubusw %xmm5, %xmm4
+; SSE2-NEXT: psubw %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psubusw %xmm6, %xmm4
+; SSE2-NEXT: psubw %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psubusw %xmm7, %xmm4
+; SSE2-NEXT: psubw %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSE4-LABEL: test144: