ret i8 %1
}
+define i8 @test_v2i8_load(<2 x i8>* %p) {
+; SSE2-LABEL: test_v2i8_load:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movzwl (%rdi), %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: # kill: def $al killed $al killed $eax
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v2i8_load:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movzwl (%rdi), %eax
+; SSE41-NEXT: movd %eax, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: paddb %xmm0, %xmm1
+; SSE41-NEXT: pextrb $0, %xmm1, %eax
+; SSE41-NEXT: # kill: def $al killed $al killed $eax
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_v2i8_load:
+; AVX: # %bb.0:
+; AVX-NEXT: movzwl (%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpextrb $0, %xmm0, %eax
+; AVX-NEXT: # kill: def $al killed $al killed $eax
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_v2i8_load:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movzwl (%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: retq
+ %a0 = load <2 x i8>, <2 x i8>* %p
+ %1 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> %a0)
+ ret i8 %1
+}
+
define i8 @test_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i8:
; SSE2: # %bb.0:
ret i8 %1
}
+define i8 @test_v4i8_load(<4 x i8>* %p) {
+; SSE2-LABEL: test_v4i8_load:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: # kill: def $al killed $al killed $eax
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v4i8_load:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: paddb %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: paddb %xmm1, %xmm0
+; SSE41-NEXT: pextrb $0, %xmm0, %eax
+; SSE41-NEXT: # kill: def $al killed $al killed $eax
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_v4i8_load:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: # kill: def $al killed $al killed $eax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v4i8_load:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movl (%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: shrl $16, %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: # kill: def $al killed $al killed $eax
+; AVX2-NEXT: retq
+;
+; AVX512BW-LABEL: test_v4i8_load:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: movl (%rdi), %eax
+; AVX512BW-NEXT: vmovd %eax, %xmm0
+; AVX512BW-NEXT: shrl $16, %eax
+; AVX512BW-NEXT: vmovd %eax, %xmm1
+; AVX512BW-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v4i8_load:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: movl (%rdi), %eax
+; AVX512VL-NEXT: vmovd %eax, %xmm0
+; AVX512VL-NEXT: shrl $16, %eax
+; AVX512VL-NEXT: vpbroadcastw %eax, %xmm1
+; AVX512VL-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512VL-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT: retq
+ %a0 = load <4 x i8>, <4 x i8>* %p
+ %1 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> %a0)
+ ret i8 %1
+}
+
define i8 @test_v8i8(<8 x i8> %a0) {
; SSE2-LABEL: test_v8i8:
; SSE2: # %bb.0:
ret i8 %1
}
+define i8 @test_v8i8_load(<8 x i8>* %p) {
+; SSE2-LABEL: test_v8i8_load:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: paddb %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrld $16, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: # kill: def $al killed $al killed $eax
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v8i8_load:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT: paddb %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: paddb %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: paddb %xmm0, %xmm1
+; SSE41-NEXT: pextrb $0, %xmm1, %eax
+; SSE41-NEXT: # kill: def $al killed $al killed $eax
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_v8i8_load:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: # kill: def $al killed $al killed $eax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v8i8_load:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movq (%rdi), %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: shrq $32, %rax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: # kill: def $al killed $al killed $eax
+; AVX2-NEXT: retq
+;
+; AVX512BW-LABEL: test_v8i8_load:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: movq (%rdi), %rax
+; AVX512BW-NEXT: vmovq %rax, %xmm0
+; AVX512BW-NEXT: shrq $32, %rax
+; AVX512BW-NEXT: vmovd %eax, %xmm1
+; AVX512BW-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v8i8_load:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: movq (%rdi), %rax
+; AVX512VL-NEXT: vmovq %rax, %xmm0
+; AVX512VL-NEXT: shrq $32, %rax
+; AVX512VL-NEXT: vpbroadcastd %eax, %xmm1
+; AVX512VL-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512VL-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512VL-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT: retq
+ %a0 = load <8 x i8>, <8 x i8>* %p
+ %1 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %a0)
+ ret i8 %1
+}
+
define i8 @test_v16i8(<16 x i8> %a0) {
; SSE2-LABEL: test_v16i8:
; SSE2: # %bb.0: