From 25f6464b320274df9b7cabd5cf33152b56c0c9f6 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Wed, 13 May 2020 13:30:34 -0700
Subject: [PATCH] [X86] Fix one of the PMADDWD tests to not have dead code.

There are two reductions in this test. It looks like I intended to
combine them by packing one of them into the upper 32 bits of the
result. But the OR instruction was missing.
---
 llvm/test/CodeGen/X86/madd.ll | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll
index 5846ebf..5058cf3 100644
--- a/llvm/test/CodeGen/X86/madd.ll
+++ b/llvm/test/CodeGen/X86/madd.ll
@@ -2797,12 +2797,20 @@ define i64 @sum_and_sum_of_squares(i8* %a, i32 %n) {
 ; SSE2-NEXT:    addq $-8, %rax
 ; SSE2-NEXT:    jne .LBB33_1
 ; SSE2-NEXT:  # %bb.2: # %middle.block
+; SSE2-NEXT:    paddd %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; SSE2-NEXT:    paddd %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,2,3]
+; SSE2-NEXT:    paddd %xmm3, %xmm2
+; SSE2-NEXT:    movd %xmm2, %ecx
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
 ; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    shlq $32, %rcx
+; SSE2-NEXT:    orq %rcx, %rax
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: sum_and_sum_of_squares:
@@ -2829,6 +2837,13 @@ define i64 @sum_and_sum_of_squares(i8* %a, i32 %n) {
 ; AVX1-NEXT:    addq $-8, %rax
 ; AVX1-NEXT:    jne .LBB33_1
 ; AVX1-NEXT:  # %bb.2: # %middle.block
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vmovd %xmm1, %ecx
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
@@ -2836,6 +2851,8 @@ define i64 @sum_and_sum_of_squares(i8* %a, i32 %n) {
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    shlq $32, %rcx
+; AVX1-NEXT:    orq %rcx, %rax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
@@ -2855,6 +2872,13 @@ define i64 @sum_and_sum_of_squares(i8* %a, i32 %n) {
 ; AVX256-NEXT:    addq $-8, %rax
 ; AVX256-NEXT:    jne .LBB33_1
 ; AVX256-NEXT:  # %bb.2: # %middle.block
+; AVX256-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX256-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; AVX256-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; AVX256-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; AVX256-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
+; AVX256-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; AVX256-NEXT:    vmovd %xmm1, %ecx
 ; AVX256-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
@@ -2862,6 +2886,8 @@ define i64 @sum_and_sum_of_squares(i8* %a, i32 %n) {
 ; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX256-NEXT:    vmovd %xmm0, %eax
+; AVX256-NEXT:    shlq $32, %rcx
+; AVX256-NEXT:    orq %rcx, %rax
 ; AVX256-NEXT:    vzeroupper
 ; AVX256-NEXT:    retq
 entry:
@@ -2901,5 +2927,6 @@ middle.block:
   %tmp = zext i32 %8 to i64
   %tmp28 = shl nuw i64 %tmp, 32
   %tmp29 = zext i32 %9 to i64
-  ret i64 %tmp29
+  %tmp30 = or i64 %tmp28, %tmp29
+  ret i64 %tmp30
 }
-- 
2.7.4
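
For readers unfamiliar with the idiom the restored IR exercises: the test's
middle.block packs two i32 reduction results into a single i64 by widening
both, shifting one into the upper half, and OR'ing the halves together. Below
is a minimal standalone sketch of that pattern in LLVM IR; the function and
value names are illustrative only, not taken from the patch.

define i64 @pack_two_i32(i32 %hi, i32 %lo) {
  ; Widen both 32-bit values to 64 bits.
  %hi64 = zext i32 %hi to i64
  %lo64 = zext i32 %lo to i64
  ; Move %hi into bits 63:32; nuw holds because %hi64 < 2^32,
  ; so the shift cannot overflow.
  %shifted = shl nuw i64 %hi64, 32
  ; The halves are disjoint, so OR combines them losslessly.
  %packed = or i64 %shifted, %lo64
  ret i64 %packed
}

On x86-64 this lowers to the shlq $32 / orq pair that the updated CHECK
lines expect; without the final or, one whole reduction was dead and its
CHECK lines were never generated.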