From 3d9511a311a038d83022a31f2a846e76e21d70be Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Wed, 13 May 2020 14:13:25 -0700
Subject: [PATCH] [X86] Add test case for a regression from D76649. NFC

When combineLoopMAdd was moved to IR we got stricter about ensuring
that the truncate was free. This prevents us from matching the sum of
squares of byte differences pattern shown here. We used to catch this
case when the combine was done in SelectionDAG.

(A minimal scalar C sketch of the pattern follows the patch, for
reference.)

---
 llvm/test/CodeGen/X86/madd.ll | 129 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 129 insertions(+)

diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll
index 5058cf3..3f221d4 100644
--- a/llvm/test/CodeGen/X86/madd.ll
+++ b/llvm/test/CodeGen/X86/madd.ll
@@ -2930,3 +2930,132 @@ middle.block:
   %tmp30 = or i64 %tmp28, %tmp29
   ret i64 %tmp30
 }
+
+define i32 @sum_of_square_differences(i8* %a, i8* %b, i32 %n) {
+; SSE2-LABEL: sum_of_square_differences:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movl %edx, %eax
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    xorl %ecx, %ecx
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    .p2align 4, 0x90
+; SSE2-NEXT:  .LBB34_1: # %vector.body
+; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
+; SSE2-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
+; SSE2-NEXT:    movq {{.*#+}} xmm4 = mem[0],zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-NEXT:    psubw %xmm3, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm3
+; SSE2-NEXT:    pmulhw %xmm4, %xmm3
+; SSE2-NEXT:    pmullw %xmm4, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; SSE2-NEXT:    paddd %xmm5, %xmm0
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSE2-NEXT:    paddd %xmm4, %xmm2
+; SSE2-NEXT:    addq $8, %rcx
+; SSE2-NEXT:    cmpq %rcx, %rax
+; SSE2-NEXT:    jne .LBB34_1
+; SSE2-NEXT:  # %bb.2: # %middle.block
+; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: sum_of_square_differences:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    movl %edx, %eax
+; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    xorl %ecx, %ecx
+; AVX1-NEXT:    .p2align 4, 0x90
+; AVX1-NEXT:  .LBB34_1: # %vector.body
+; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpmulld %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpmulld %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    addq $8, %rcx
+; AVX1-NEXT:    cmpq %rcx, %rax
+; AVX1-NEXT:    jne .LBB34_1
+; AVX1-NEXT:  # %bb.2: # %middle.block
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX256-LABEL: sum_of_square_differences:
+; AVX256:       # %bb.0: # %entry
+; AVX256-NEXT:    movl %edx, %eax
+; AVX256-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; AVX256-NEXT:    xorl %ecx, %ecx
+; AVX256-NEXT:    .p2align 4, 0x90
+; AVX256-NEXT:  .LBB34_1: # %vector.body
+; AVX256-NEXT:    # =>This Inner Loop Header: Depth=1
+; AVX256-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX256-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX256-NEXT:    vpsubd %ymm1, %ymm2, %ymm1
+; AVX256-NEXT:    vpmulld %ymm1, %ymm1, %ymm1
+; AVX256-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; AVX256-NEXT:    addq $8, %rcx
+; AVX256-NEXT:    cmpq %rcx, %rax
+; AVX256-NEXT:    jne .LBB34_1
+; AVX256-NEXT:  # %bb.2: # %middle.block
+; AVX256-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX256-NEXT:    vmovd %xmm0, %eax
+; AVX256-NEXT:    vzeroupper
+; AVX256-NEXT:    retq
+entry:
+  %0 = zext i32 %n to i64
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
+  %vec.phi = phi <8 x i32> [ %9, %vector.body ], [ zeroinitializer, %entry ]
+  %1 = getelementptr inbounds i8, i8* %a, i64 %index
+  %2 = bitcast i8* %1 to <8 x i8>*
+  %wide.load = load <8 x i8>, <8 x i8>* %2, align 1
+  %3 = zext <8 x i8> %wide.load to <8 x i32>
+  %4 = getelementptr inbounds i8, i8* %b, i64 %index
+  %5 = bitcast i8* %4 to <8 x i8>*
+  %wide.load2 = load <8 x i8>, <8 x i8>* %5, align 1
+  %6 = zext <8 x i8> %wide.load2 to <8 x i32>
+  %7 = sub <8 x i32> %6, %3
+  %8 = mul <8 x i32> %7, %7
+  %9 = add nsw <8 x i32> %8, %vec.phi
+  %index.next = add i64 %index, 8
+  %10 = icmp eq i64 %index.next, %0
+  br i1 %10, label %middle.block, label %vector.body
+
+middle.block:
+  %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i32> %9, %rdx.shuf
+  %rdx.shuf31 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx32 = add <8 x i32> %bin.rdx, %rdx.shuf31
+  %rdx.shuf33 = shufflevector <8 x i32> %bin.rdx32, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx34 = add <8 x i32> %bin.rdx32, %rdx.shuf33
+  %11 = extractelement <8 x i32> %bin.rdx34, i32 0
+  ret i32 %11
+}
-- 
2.7.4
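
For reference, a minimal scalar C sketch of the loop the new test was
vectorized from. This is an illustration only, not part of the patch:
the function name mirrors the IR test, but the exact signature and the
source the test was reduced from are assumptions.

    /* Sum of squares of byte differences. The vectorizer widens this
     * to the <8 x i8> loads, <8 x i32> zero-extends, subtract, square,
     * and accumulate seen in the IR test above, then reduces the
     * accumulator in middle.block. */
    unsigned sum_of_square_differences(const unsigned char *a,
                                       const unsigned char *b,
                                       unsigned n) {
      unsigned sum = 0;
      for (unsigned i = 0; i != n; ++i) {
        int d = (int)b[i] - (int)a[i];  /* matches %7 = sub <8 x i32> %6, %3 */
        sum += (unsigned)(d * d);       /* matches %8 = mul and %9 = add */
      }
      return sum;
    }

Each byte difference fits in 16 bits, so the squares can be computed as
16-bit multiplies whose adjacent pairs are accumulated into 32-bit
lanes by pmaddwd; that is the madd pattern the loop combine is expected
to form once the truncate is proven free.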