From: Craig Topper Date: Mon, 12 Aug 2019 06:55:58 +0000 (+0000) Subject: [X86] Add some reduction add test cases that show sub-optimal code on avx2 and later. X-Git-Tag: llvmorg-11-init~12165 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=8750c77df4cdf8c39065f3053491b645576da93b;p=platform%2Fupstream%2Fllvm.git [X86] Add some reduction add test cases that show sub-optimal code on avx2 and later. For v4i8 and v8i8 when the reduction starts with a load we end up shifting the data in the scalar domain and copying to the vector domain a second time using a broadcast. We already copied it to the vector domain once. It's better to just shuffle it there. llvm-svn: 368544 --- diff --git a/llvm/test/CodeGen/X86/vector-reduce-add.ll b/llvm/test/CodeGen/X86/vector-reduce-add.ll index 3baa9c7..2922efc 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-add.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add.ll @@ -979,6 +979,53 @@ define i8 @test_v2i8(<2 x i8> %a0) { ret i8 %1 } +define i8 @test_v2i8_load(<2 x i8>* %p) { +; SSE2-LABEL: test_v2i8_load: +; SSE2: # %bb.0: +; SSE2-NEXT: movzwl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2i8_load: +; SSE41: # %bb.0: +; SSE41-NEXT: movzwl (%rdi), %eax +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: paddb %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v2i8_load: +; AVX: # %bb.0: +; AVX-NEXT: movzwl (%rdi), %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v2i8_load: +; AVX512: # %bb.0: +; AVX512-NEXT: movzwl (%rdi), %eax +; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %a0 = load <2 x i8>, <2 x i8>* %p + %1 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> %a0) + ret i8 %1 +} + define i8 @test_v4i8(<4 x i8> %a0) { ; SSE2-LABEL: test_v4i8: ; SSE2: # %bb.0: @@ -1027,6 +1074,89 @@ define i8 @test_v4i8(<4 x i8> %a0) { ret i8 %1 } +define i8 @test_v4i8_load(<4 x i8>* %p) { +; SSE2-LABEL: test_v4i8_load: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4i8_load: +; SSE41: # %bb.0: +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: paddb %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: paddb %xmm1, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v4i8_load: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v4i8_load: +; AVX2: # %bb.0: +; AVX2-NEXT: movl (%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: shrl $16, %eax +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v4i8_load: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movl (%rdi), %eax +; AVX512BW-NEXT: vmovd %eax, %xmm0 +; AVX512BW-NEXT: shrl $16, %eax +; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax +; AVX512BW-NEXT: # kill: def $al killed $al killed $eax +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v4i8_load: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movl (%rdi), %eax +; AVX512VL-NEXT: vmovd %eax, %xmm0 +; AVX512VL-NEXT: shrl $16, %eax +; AVX512VL-NEXT: vpbroadcastw %eax, %xmm1 +; AVX512VL-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512VL-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax +; AVX512VL-NEXT: # kill: def $al killed $al killed $eax +; AVX512VL-NEXT: retq + %a0 = load <4 x i8>, <4 x i8>* %p + %1 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> %a0) + ret i8 %1 +} + define i8 @test_v8i8(<8 x i8> %a0) { ; SSE2-LABEL: test_v8i8: ; SSE2: # %bb.0: @@ -1083,6 +1213,101 @@ define i8 @test_v8i8(<8 x i8> %a0) { ret i8 %1 } +define i8 @test_v8i8_load(<8 x i8>* %p) { +; SSE2-LABEL: test_v8i8_load: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8i8_load: +; SSE41: # %bb.0: +; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: paddb %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: paddb %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: paddb %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v8i8_load: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i8_load: +; AVX2: # %bb.0: +; AVX2-NEXT: movq (%rdi), %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: shrq $32, %rax +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v8i8_load: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movq (%rdi), %rax +; AVX512BW-NEXT: vmovq %rax, %xmm0 +; AVX512BW-NEXT: shrq $32, %rax +; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax +; AVX512BW-NEXT: # kill: def $al killed $al killed $eax +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v8i8_load: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movq (%rdi), %rax +; AVX512VL-NEXT: vmovq %rax, %xmm0 +; AVX512VL-NEXT: shrq $32, %rax +; AVX512VL-NEXT: vpbroadcastd %eax, %xmm1 +; AVX512VL-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512VL-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512VL-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax +; AVX512VL-NEXT: # kill: def $al killed $al killed $eax +; AVX512VL-NEXT: retq + %a0 = load <8 x i8>, <8 x i8>* %p + %1 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %a0) + ret i8 %1 +} + define i8 @test_v16i8(<16 x i8> %a0) { ; SSE2-LABEL: test_v16i8: ; SSE2: # %bb.0: