From 23cd0da9e9882fc534f430f3bf2e695b61a9904a Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 6 Aug 2019 10:52:44 +0000
Subject: [PATCH] [X86][SSE] Add tests for min/max partial reduction

As mentioned on D65047 / rL366933 the plan is to enable partial
reduction handling wherever possible.

llvm-svn: 368015
---
 llvm/test/CodeGen/X86/horizontal-reduce-smax.ll | 438 ++++++++++++++++++++++++
 llvm/test/CodeGen/X86/horizontal-reduce-smin.ll | 438 ++++++++++++++++++++++++
 llvm/test/CodeGen/X86/horizontal-reduce-umax.ll | 418 ++++++++++++++++++++++
 llvm/test/CodeGen/X86/horizontal-reduce-umin.ll | 394 +++++++++++++++++++++
 4 files changed, 1688 insertions(+)

diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll
index c08f08f..32fee3a 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll
@@ -1829,3 +1829,441 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
   %19 = extractelement <64 x i8> %18, i32 0
   ret i8 %19
 }
+
+;
+; Partial Vector Reductions
+;
+
+define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i16_v8i16:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i16_v8i16:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pxor LCPI12_0, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: xorl $32767, %eax ## imm = 0x7FFF
+; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v16i16_v8i16:
+; X86-AVX: ## %bb.0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax
+; X86-AVX-NEXT: vzeroupper
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i16_v8i16:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i16_v8i16:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pxor {{.*}}(%rip), %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: xorl $32767, %eax ## imm = 0x7FFF
+; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v16i16_v8i16:
+; X64-AVX: ## %bb.0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld
$16, %xmm0, %xmm1 +; X64-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovd %xmm0, %eax +; X64-AVX-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX-NEXT: vzeroupper +; X64-AVX-NEXT: retq + %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> + %2 = icmp sgt <16 x i16> %a0, %1 + %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1 + %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> + %5 = icmp sgt <16 x i16> %3, %4 + %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4 + %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> + %8 = icmp sgt <16 x i16> %6, %7 + %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7 + %10 = extractelement <16 x i16> %9, i32 0 + ret i16 %10 +} + +define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { +; X86-SSE2-LABEL: test_reduce_v32i16_v8i16: +; X86-SSE2: ## %bb.0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v32i16_v8i16: +; X86-SSE42: ## %bb.0: +; X86-SSE42-NEXT: pxor LCPI13_0, %xmm0 +; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v32i16_v8i16: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovd %xmm0, %eax +; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX-NEXT: vzeroupper +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v32i16_v8i16: +; X64-SSE2: ## %bb.0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrld $16, %xmm1 +; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v32i16_v8i16: +; X64-SSE42: ## %bb.0: +; X64-SSE42-NEXT: pxor {{.*}}(%rip), %xmm0 +; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-SSE42-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovd %xmm0, %eax +; X64-AVX-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX-NEXT: vzeroupper +; X64-AVX-NEXT: retq + %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> + %2 = icmp sgt <32 x i16> %a0, %1 + %3 = select <32 x i1> %2, <32 x 
i16> %a0, <32 x i16> %1 + %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> + %5 = icmp sgt <32 x i16> %3, %4 + %6 = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4 + %7 = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> + %8 = icmp sgt <32 x i16> %6, %7 + %9 = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7 + %10 = extractelement <32 x i16> %9, i32 0 + ret i16 %10 +} + +define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { +; X86-SSE2-LABEL: test_reduce_v32i8_v16i8: +; X86-SSE2: ## %bb.0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v32i8_v16i8: +; X86-SSE42: ## %bb.0: +; X86-SSE42-NEXT: pxor LCPI14_0, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrlw $8, %xmm1 +; X86-SSE42-NEXT: pminub %xmm0, %xmm1 +; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0 +; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X86-SSE42-NEXT: xorb $127, %al +; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX-NEXT: vzeroupper +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v32i8_v16i8: +; X64-SSE2: ## %bb.0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm2 +; X64-SSE2-NEXT: por %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE2-NEXT: psrlw $8, %xmm0 +; X64-SSE2-NEXT: movdqa 
%xmm2, %xmm1 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v32i8_v16i8: +; X64-SSE42: ## %bb.0: +; X64-SSE42-NEXT: pxor {{.*}}(%rip), %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrlw $8, %xmm1 +; X64-SSE42-NEXT: pminub %xmm0, %xmm1 +; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0 +; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X64-SSE42-NEXT: xorb $127, %al +; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax +; X64-SSE42-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX-NEXT: vzeroupper +; X64-AVX-NEXT: retq + %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> + %2 = icmp sgt <32 x i8> %a0, %1 + %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1 + %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> + %5 = icmp sgt <32 x i8> %3, %4 + %6 = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4 + %7 = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> + %8 = icmp sgt <32 x i8> %6, %7 + %9 = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7 + %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> + %11 = icmp sgt <32 x i8> %9, %10 + %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10 + %13 = extractelement <32 x i8> %12, i32 0 + ret i8 %13 +} + +define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { +; X86-SSE2-LABEL: test_reduce_v64i8_v16i8: +; X86-SSE2: ## %bb.0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v64i8_v16i8: +; X86-SSE42: ## %bb.0: +; X86-SSE42-NEXT: pxor LCPI15_0, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrlw $8, %xmm1 +; X86-SSE42-NEXT: pminub %xmm0, %xmm1 +; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0 +; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X86-SSE42-NEXT: xorb $127, %al +; 
X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX-NEXT: vzeroupper +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v64i8_v16i8: +; X64-SSE2: ## %bb.0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm2 +; X64-SSE2-NEXT: por %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE2-NEXT: psrlw $8, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v64i8_v16i8: +; X64-SSE42: ## %bb.0: +; X64-SSE42-NEXT: pxor {{.*}}(%rip), %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrlw $8, %xmm1 +; X64-SSE42-NEXT: pminub %xmm0, %xmm1 +; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0 +; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X64-SSE42-NEXT: xorb $127, %al +; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax +; X64-SSE42-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX-NEXT: vzeroupper +; X64-AVX-NEXT: retq + %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> + %2 = icmp sgt <64 x i8> %a0, %1 + %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1 + %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> + %5 = icmp sgt <64 x i8> %3, %4 + %6 = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4 + %7 = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> + %8 = icmp sgt <64 x i8> %6, %7 + %9 = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7 + %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> + %11 = icmp sgt <64 x i8> %9, %10 + %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10 + %13 = extractelement <64 x i8> %12, i32 0 + ret i8 %13 +} diff --git 
a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll index c526cb8..a344322 100644 --- a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll @@ -1833,3 +1833,441 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { %19 = extractelement <64 x i8> %18, i32 0 ret i8 %19 } + +; +; Partial Vector Reductions +; + +define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { +; X86-SSE2-LABEL: test_reduce_v16i16_v8i16: +; X86-SSE2: ## %bb.0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v16i16_v8i16: +; X86-SSE42: ## %bb.0: +; X86-SSE42-NEXT: pxor LCPI12_0, %xmm0 +; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovd %xmm0, %eax +; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX-NEXT: vzeroupper +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v16i16_v8i16: +; X64-SSE2: ## %bb.0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrld $16, %xmm1 +; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v16i16_v8i16: +; X64-SSE42: ## %bb.0: +; X64-SSE42-NEXT: pxor {{.*}}(%rip), %xmm0 +; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-SSE42-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovd %xmm0, %eax +; X64-AVX-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX-NEXT: vzeroupper +; X64-AVX-NEXT: retq + %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> + %2 = icmp slt <16 x i16> %a0, %1 + %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1 + %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> + %5 = icmp slt <16 x i16> %3, %4 + %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4 + %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> + %8 = icmp slt <16 x i16> %6, %7 + %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7 + 
%10 = extractelement <16 x i16> %9, i32 0 + ret i16 %10 +} + +define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { +; X86-SSE2-LABEL: test_reduce_v32i16_v8i16: +; X86-SSE2: ## %bb.0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v32i16_v8i16: +; X86-SSE42: ## %bb.0: +; X86-SSE42-NEXT: pxor LCPI13_0, %xmm0 +; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v32i16_v8i16: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovd %xmm0, %eax +; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX-NEXT: vzeroupper +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v32i16_v8i16: +; X64-SSE2: ## %bb.0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrld $16, %xmm1 +; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v32i16_v8i16: +; X64-SSE42: ## %bb.0: +; X64-SSE42-NEXT: pxor {{.*}}(%rip), %xmm0 +; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-SSE42-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovd %xmm0, %eax +; X64-AVX-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX-NEXT: vzeroupper +; X64-AVX-NEXT: retq + %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> + %2 = icmp slt <32 x i16> %a0, %1 + %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1 + %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> + %5 = icmp slt <32 x i16> %3, %4 + %6 = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4 + %7 = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> + %8 = icmp slt <32 x i16> %6, %7 + %9 = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7 + %10 = extractelement <32 x i16> %9, i32 0 + ret i16 %10 +} + +define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { +; X86-SSE2-LABEL: test_reduce_v32i8_v16i8: +; X86-SSE2: ## %bb.0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, 
%xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v32i8_v16i8: +; X86-SSE42: ## %bb.0: +; X86-SSE42-NEXT: pxor LCPI14_0, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrlw $8, %xmm1 +; X86-SSE42-NEXT: pminub %xmm0, %xmm1 +; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0 +; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X86-SSE42-NEXT: xorb $-128, %al +; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX-NEXT: vzeroupper +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v32i8_v16i8: +; X64-SSE2: ## %bb.0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm2 +; X64-SSE2-NEXT: por %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE2-NEXT: psrlw $8, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v32i8_v16i8: +; X64-SSE42: ## %bb.0: +; X64-SSE42-NEXT: pxor {{.*}}(%rip), %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrlw $8, %xmm1 +; X64-SSE42-NEXT: pminub %xmm0, %xmm1 +; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0 +; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X64-SSE42-NEXT: xorb $-128, %al +; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax 
+; X64-SSE42-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX-NEXT: vzeroupper +; X64-AVX-NEXT: retq + %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> + %2 = icmp slt <32 x i8> %a0, %1 + %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1 + %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> + %5 = icmp slt <32 x i8> %3, %4 + %6 = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4 + %7 = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> + %8 = icmp slt <32 x i8> %6, %7 + %9 = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7 + %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> + %11 = icmp slt <32 x i8> %9, %10 + %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10 + %13 = extractelement <32 x i8> %12, i32 0 + ret i8 %13 +} + +define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { +; X86-SSE2-LABEL: test_reduce_v64i8_v16i8: +; X86-SSE2: ## %bb.0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v64i8_v16i8: +; X86-SSE42: ## %bb.0: +; X86-SSE42-NEXT: pxor LCPI15_0, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrlw $8, %xmm1 +; X86-SSE42-NEXT: pminub %xmm0, %xmm1 +; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0 +; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X86-SSE42-NEXT: xorb $-128, %al +; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX-NEXT: vzeroupper +; X86-AVX-NEXT: retl +; +; 
X64-SSE2-LABEL: test_reduce_v64i8_v16i8: +; X64-SSE2: ## %bb.0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm2 +; X64-SSE2-NEXT: por %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE2-NEXT: psrlw $8, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v64i8_v16i8: +; X64-SSE42: ## %bb.0: +; X64-SSE42-NEXT: pxor {{.*}}(%rip), %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrlw $8, %xmm1 +; X64-SSE42-NEXT: pminub %xmm0, %xmm1 +; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0 +; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X64-SSE42-NEXT: xorb $-128, %al +; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax +; X64-SSE42-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX-NEXT: vzeroupper +; X64-AVX-NEXT: retq + %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> + %2 = icmp slt <64 x i8> %a0, %1 + %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1 + %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> + %5 = icmp slt <64 x i8> %3, %4 + %6 = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4 + %7 = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> + %8 = icmp slt <64 x i8> %6, %7 + %9 = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7 + %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> + %11 = icmp slt <64 x i8> %9, %10 + %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10 + %13 = extractelement <64 x i8> %12, i32 0 + ret i8 %13 +} diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll index 99038d7..05cd44d 100644 --- a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll @@ -2021,3 +2021,421 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { %19 = extractelement <64 x i8> %18, i32 0 ret i8 %19 } + +; +; Partial Vector Reductions +; + +define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { +; X86-SSE2-LABEL: test_reduce_v16i16_v8i16: +; X86-SSE2: ## %bb.0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; 
X86-SSE2-NEXT: pxor %xmm2, %xmm0 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: pxor %xmm2, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE2-NEXT: pxor %xmm2, %xmm0 +; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v16i16_v8i16: +; X86-SSE42: ## %bb.0: +; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; X86-SSE42-NEXT: pxor %xmm0, %xmm1 +; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: notl %eax +; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovd %xmm0, %eax +; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX-NEXT: vzeroupper +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v16i16_v8i16: +; X64-SSE2: ## %bb.0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-SSE2-NEXT: pxor %xmm2, %xmm0 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: pxor %xmm2, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X64-SSE2-NEXT: pxor %xmm2, %xmm0 +; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: psrld $16, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v16i16_v8i16: +; X64-SSE42: ## %bb.0: +; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; X64-SSE42-NEXT: pxor %xmm0, %xmm1 +; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: notl %eax +; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-SSE42-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovd %xmm0, %eax +; X64-AVX-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX-NEXT: vzeroupper +; X64-AVX-NEXT: retq + %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> + %2 = icmp ugt <16 x i16> %a0, %1 + %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1 + %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> + %5 = icmp ugt <16 x i16> %3, %4 + %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4 + %7 = 
shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> + %8 = icmp ugt <16 x i16> %6, %7 + %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7 + %10 = extractelement <16 x i16> %9, i32 0 + ret i16 %10 +} + +define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { +; X86-SSE2-LABEL: test_reduce_v32i16_v8i16: +; X86-SSE2: ## %bb.0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X86-SSE2-NEXT: pxor %xmm2, %xmm0 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: pxor %xmm2, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE2-NEXT: pxor %xmm2, %xmm0 +; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v32i16_v8i16: +; X86-SSE42: ## %bb.0: +; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; X86-SSE42-NEXT: pxor %xmm0, %xmm1 +; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: notl %eax +; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v32i16_v8i16: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovd %xmm0, %eax +; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX-NEXT: vzeroupper +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v32i16_v8i16: +; X64-SSE2: ## %bb.0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-SSE2-NEXT: pxor %xmm2, %xmm0 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: pxor %xmm2, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X64-SSE2-NEXT: pxor %xmm2, %xmm0 +; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: psrld $16, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v32i16_v8i16: +; X64-SSE42: ## %bb.0: +; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; X64-SSE42-NEXT: pxor %xmm0, %xmm1 +; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: notl %eax +; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-SSE42-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX-NEXT: vpmaxuw %xmm1, 
%xmm0, %xmm0 +; X64-AVX-NEXT: vmovd %xmm0, %eax +; X64-AVX-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX-NEXT: vzeroupper +; X64-AVX-NEXT: retq + %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> + %2 = icmp ugt <32 x i16> %a0, %1 + %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1 + %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> + %5 = icmp ugt <32 x i16> %3, %4 + %6 = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4 + %7 = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> + %8 = icmp ugt <32 x i16> %6, %7 + %9 = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7 + %10 = extractelement <32 x i16> %9, i32 0 + ret i16 %10 +} + +define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { +; X86-SSE2-LABEL: test_reduce_v32i8_v16i8: +; X86-SSE2: ## %bb.0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v32i8_v16i8: +; X86-SSE42: ## %bb.0: +; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; X86-SSE42-NEXT: pxor %xmm0, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: psrlw $8, %xmm0 +; X86-SSE42-NEXT: pminub %xmm1, %xmm0 +; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X86-SSE42-NEXT: notb %al +; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX-NEXT: vzeroupper +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v32i8_v16i8: +; X64-SSE2: ## %bb.0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrld $16, %xmm1 +; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrlw $8, %xmm0 +; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v32i8_v16i8: +; X64-SSE42: ## %bb.0: +; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; X64-SSE42-NEXT: pxor %xmm0, %xmm1 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: psrlw $8, %xmm0 +; X64-SSE42-NEXT: pminub %xmm1, %xmm0 +; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X64-SSE42-NEXT: notb %al +; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax +; X64-SSE42-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 
+; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX-NEXT: vzeroupper +; X64-AVX-NEXT: retq + %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> + %2 = icmp ugt <32 x i8> %a0, %1 + %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1 + %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> + %5 = icmp ugt <32 x i8> %3, %4 + %6 = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4 + %7 = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> + %8 = icmp ugt <32 x i8> %6, %7 + %9 = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7 + %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> + %11 = icmp ugt <32 x i8> %9, %10 + %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10 + %13 = extractelement <32 x i8> %12, i32 0 + ret i8 %13 +} + +define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { +; X86-SSE2-LABEL: test_reduce_v64i8_v16i8: +; X86-SSE2: ## %bb.0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v64i8_v16i8: +; X86-SSE42: ## %bb.0: +; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; X86-SSE42-NEXT: pxor %xmm0, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: psrlw $8, %xmm0 +; X86-SSE42-NEXT: pminub %xmm1, %xmm0 +; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X86-SSE42-NEXT: notb %al +; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX-NEXT: vzeroupper +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v64i8_v16i8: +; X64-SSE2: ## %bb.0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrld $16, %xmm1 +; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrlw $8, %xmm0 +; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v64i8_v16i8: +; X64-SSE42: ## %bb.0: +; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; X64-SSE42-NEXT: 
pxor %xmm0, %xmm1 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: psrlw $8, %xmm0 +; X64-SSE42-NEXT: pminub %xmm1, %xmm0 +; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X64-SSE42-NEXT: notb %al +; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax +; X64-SSE42-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX-NEXT: vzeroupper +; X64-AVX-NEXT: retq + %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> + %2 = icmp ugt <64 x i8> %a0, %1 + %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1 + %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> + %5 = icmp ugt <64 x i8> %3, %4 + %6 = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4 + %7 = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> + %8 = icmp ugt <64 x i8> %6, %7 + %9 = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7 + %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> + %11 = icmp ugt <64 x i8> %9, %10 + %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10 + %13 = extractelement <64 x i8> %12, i32 0 + ret i8 %13 +} diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll index 5e4d830..ed64ec2 100644 --- a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll @@ -1885,3 +1885,397 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { %19 = extractelement <64 x i8> %18, i32 0 ret i8 %19 } + +; +; Partial Vector Reductions +; + +define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { +; X86-SSE2-LABEL: test_reduce_v16i16_v8i16: +; X86-SSE2: ## %bb.0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X86-SSE2-NEXT: pxor %xmm2, %xmm0 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: pxor %xmm2, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE2-NEXT: pxor %xmm2, %xmm0 +; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v16i16_v8i16: +; X86-SSE42: ## %bb.0: +; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovd 
%xmm0, %eax
+; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax
+; X86-AVX-NEXT: vzeroupper
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i16_v8i16:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT: pxor %xmm2, %xmm0
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: pxor %xmm2, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X64-SSE2-NEXT: pxor %xmm2, %xmm0
+; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000
+; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i16_v8i16:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v16i16_v8i16:
+; X64-AVX: ## %bb.0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: ## kill: def $ax killed $ax killed $eax
+; X64-AVX-NEXT: vzeroupper
+; X64-AVX-NEXT: retq
+  %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32>
+  %2 = icmp ult <16 x i16> %a0, %1
+  %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1
+  %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32>
+  %5 = icmp ult <16 x i16> %3, %4
+  %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4
+  %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32>
+  %8 = icmp ult <16 x i16> %6, %7
+  %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7
+  %10 = extractelement <16 x i16> %9, i32 0
+  ret i16 %10
+}
+
+define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i16_v8i16:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT: pxor %xmm2, %xmm0
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: pxor %xmm2, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86-SSE2-NEXT: pxor %xmm2, %xmm0
+; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000
+; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i16_v8i16:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v32i16_v8i16:
+; X86-AVX: ## %bb.0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax
+; X86-AVX-NEXT: vzeroupper
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i16_v8i16:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT: pxor %xmm2, %xmm0
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: pxor %xmm2, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X64-SSE2-NEXT: pxor %xmm2, %xmm0
+; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000
+; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i16_v8i16:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v32i16_v8i16:
+; X64-AVX: ## %bb.0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: ## kill: def $ax killed $ax killed $eax
+; X64-AVX-NEXT: vzeroupper
+; X64-AVX-NEXT: retq
+  %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32>
+  %2 = icmp ult <32 x i16> %a0, %1
+  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1
+  %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32>
+  %5 = icmp ult <32 x i16> %3, %4
+  %6 = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4
+  %7 = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32>
+  %8 = icmp ult <32 x i16> %6, %7
+  %9 = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7
+  %10 = extractelement <32 x i16> %9, i32 0
+  ret i16 %10
+}
+
+define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i8_v16i8:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pminub %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pminub %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
+; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i8_v16i8:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrlw $8, %xmm1
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v32i8_v16i8:
+; X86-AVX: ## %bb.0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax
+; X86-AVX-NEXT: vzeroupper
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i8_v16i8:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pminub %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pminub %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i8_v16i8:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrlw $8, %xmm1
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v32i8_v16i8:
+; X64-AVX: ## %bb.0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX-NEXT: ## kill: def $al killed $al killed $eax
+; X64-AVX-NEXT: vzeroupper
+; X64-AVX-NEXT: retq
+  %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32>
+  %2 = icmp ult <32 x i8> %a0, %1
+  %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1
+  %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32>
+  %5 = icmp ult <32 x i8> %3, %4
+  %6 = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4
+  %7 = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32>
+  %8 = icmp ult <32 x i8> %6, %7
+  %9 = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7
+  %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32>
+  %11 = icmp ult <32 x i8> %9, %10
+  %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10
+  %13 = extractelement <32 x i8> %12, i32 0
+  ret i8 %13
+}
+
+define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v64i8_v16i8:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pminub %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pminub %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
+; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v64i8_v16i8:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrlw $8, %xmm1
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v64i8_v16i8:
+; X86-AVX: ## %bb.0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax
+; X86-AVX-NEXT: vzeroupper
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v64i8_v16i8:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pminub %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pminub %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v64i8_v16i8:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrlw $8, %xmm1
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v64i8_v16i8:
+; X64-AVX: ## %bb.0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX-NEXT: ## kill: def $al killed $al killed $eax
+; X64-AVX-NEXT: vzeroupper
+; X64-AVX-NEXT: retq
+  %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32>
+  %2 = icmp ult <64 x i8> %a0, %1
+  %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1
+  %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32>
+  %5 = icmp ult <64 x i8> %3, %4
+  %6 = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4
+  %7 = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32>
+  %8 = icmp ult <64 x i8> %6, %7
+  %9 = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7
+  %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32>
+  %11 = icmp ult <64 x i8> %9, %10
+  %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10
+  %13 = extractelement <64 x i8> %12, i32 0
+  ret i8 %13
+}
--
2.7.4
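For reference, the partial-reduction ladder these tests exercise, written out with explicit shuffle masks. This is a sketch for the test_reduce_v16i16_v8i16 case only: the function name reduce_low8_umin_sketch is hypothetical, and the mask constants are assumptions inferred from the expected assembly (pshufd xmm0[2,3,0,1], pshufd xmm0[1,1,2,3], psrld $16), not copied from the patch.

; Sketch (assumed masks, hypothetical name): umin of the low 8 lanes of a
; <16 x i16> vector. Each shufflevector halves the active region; the upper
; 8 lanes never participate, which is what makes this a *partial* reduction.
define i16 @reduce_low8_umin_sketch(<16 x i16> %a0) {
  ; Step 1: fold lanes 4-7 onto lanes 0-3.
  %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = icmp ult <16 x i16> %a0, %1
  %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1
  ; Step 2: fold lanes 2-3 onto lanes 0-1.
  %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5 = icmp ult <16 x i16> %3, %4
  %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4
  ; Step 3: fold lane 1 onto lane 0; lane 0 then holds the unsigned minimum.
  %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %8 = icmp ult <16 x i16> %6, %7
  %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7
  %10 = extractelement <16 x i16> %9, i32 0
  ret i16 %10
}

The i8 tests add one more rung to the ladder (the psrlw $8 step that folds odd bytes onto even bytes), and the wider v32i16/v64i8 inputs change only the source vector width, not the number of active lanes, which is why their expected assembly is identical to the v16i16/v32i8 forms above.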