%tmp30 = or i64 %tmp28, %tmp29
ret i64 %tmp30
}
+
+; Computes sum((b[i] - a[i])^2) over two u8 arrays, vectorized 8 lanes at a
+; time: both inputs are zero-extended to i32, subtracted (b - a), squared,
+; and accumulated into an <8 x i32> vector PHI, then horizontally reduced.
+; NOTE(review): the loop is do-while shaped (entry branches straight into
+; vector.body and the index steps by 8 until it equals zext(%n)), so the
+; test presumably assumes %n is a nonzero multiple of 8 -- confirm.
+; CHECK lines below are auto-generated (update_llc_test_checks.py); do not
+; hand-edit them -- regenerate instead.
+define i32 @sum_of_square_differences(i8* %a, i8* %b, i32 %n) {
+; SSE2-LABEL: sum_of_square_differences:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: movl %edx, %eax
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: xorl %ecx, %ecx
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: .p2align 4, 0x90
+; SSE2-NEXT: .LBB34_1: # %vector.body
+; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
+; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
+; SSE2-NEXT: movq {{.*#+}} xmm4 = mem[0],zero
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-NEXT: psubw %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm3
+; SSE2-NEXT: pmulhw %xmm4, %xmm3
+; SSE2-NEXT: pmullw %xmm4, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; SSE2-NEXT: paddd %xmm5, %xmm0
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSE2-NEXT: paddd %xmm4, %xmm2
+; SSE2-NEXT: addq $8, %rcx
+; SSE2-NEXT: cmpq %rcx, %rax
+; SSE2-NEXT: jne .LBB34_1
+; SSE2-NEXT: # %bb.2: # %middle.block
+; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: retq
+;
+; AVX1-LABEL: sum_of_square_differences:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: movl %edx, %eax
+; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: xorl %ecx, %ecx
+; AVX1-NEXT: .p2align 4, 0x90
+; AVX1-NEXT: .LBB34_1: # %vector.body
+; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmulld %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpmulld %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: addq $8, %rcx
+; AVX1-NEXT: cmpq %rcx, %rax
+; AVX1-NEXT: jne .LBB34_1
+; AVX1-NEXT: # %bb.2: # %middle.block
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX256-LABEL: sum_of_square_differences:
+; AVX256: # %bb.0: # %entry
+; AVX256-NEXT: movl %edx, %eax
+; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX256-NEXT: xorl %ecx, %ecx
+; AVX256-NEXT: .p2align 4, 0x90
+; AVX256-NEXT: .LBB34_1: # %vector.body
+; AVX256-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX256-NEXT: vpsubd %ymm1, %ymm2, %ymm1
+; AVX256-NEXT: vpmulld %ymm1, %ymm1, %ymm1
+; AVX256-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX256-NEXT: addq $8, %rcx
+; AVX256-NEXT: cmpq %rcx, %rax
+; AVX256-NEXT: jne .LBB34_1
+; AVX256-NEXT: # %bb.2: # %middle.block
+; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX256-NEXT: vmovd %xmm0, %eax
+; AVX256-NEXT: vzeroupper
+; AVX256-NEXT: retq
+entry:
+ %0 = zext i32 %n to i64
+ br label %vector.body
+
+; Main loop: load 8 bytes from each array, widen u8 -> i32, accumulate
+; (b - a)^2 lane-wise into %vec.phi; index advances by 8 per iteration.
+vector.body:
+ %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
+ %vec.phi = phi <8 x i32> [ %9, %vector.body ], [ zeroinitializer, %entry ]
+ %1 = getelementptr inbounds i8, i8* %a, i64 %index
+ %2 = bitcast i8* %1 to <8 x i8>*
+ %wide.load = load <8 x i8>, <8 x i8>* %2, align 1
+ %3 = zext <8 x i8> %wide.load to <8 x i32>
+ %4 = getelementptr inbounds i8, i8* %b, i64 %index
+ %5 = bitcast i8* %4 to <8 x i8>*
+ %wide.load2 = load <8 x i8>, <8 x i8>* %5, align 1
+ %6 = zext <8 x i8> %wide.load2 to <8 x i32>
+ %7 = sub <8 x i32> %6, %3
+ %8 = mul <8 x i32> %7, %7
+ %9 = add nsw <8 x i32> %8, %vec.phi
+ %index.next = add i64 %index, 8
+ %10 = icmp eq i64 %index.next, %0
+ br i1 %10, label %middle.block, label %vector.body
+
+; Horizontal reduction of the <8 x i32> accumulator by repeated halving
+; (4+4, then 2+2, then 1+1); lane 0 holds the final scalar sum.
+middle.block:
+ %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx = add <8 x i32> %9, %rdx.shuf
+ %rdx.shuf31 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx32 = add <8 x i32> %bin.rdx, %rdx.shuf31
+ %rdx.shuf33 = shufflevector <8 x i32> %bin.rdx32, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx34 = add <8 x i32> %bin.rdx32, %rdx.shuf33
+ %11 = extractelement <8 x i32> %bin.rdx34, i32 0
+ ret i32 %11
+}