- add third_party src.
[platform/framework/web/crosswalk.git] / src / third_party / libvpx / source / libvpx / vp9 / encoder / x86 / vp9_variance_impl_sse2.asm
1 ;
2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ;  Use of this source code is governed by a BSD-style license
5 ;  that can be found in the LICENSE file in the root of the source
6 ;  tree. An additional intellectual property rights grant can be found
7 ;  in the file PATENTS.  All contributing project authors may
8 ;  be found in the AUTHORS file in the root of the source tree.
9 ;
10
11
12 %include "vpx_ports/x86_abi_support.asm"
13
14 ;unsigned int vp9_get_mb_ss_sse2(short *src_ptr)
15 ;  Returns the sum of squares of the 256 signed 16-bit values at src_ptr
16 ;  (8 loop passes x 64 bytes). Data is read with movdqa, so src_ptr must
17 ;  be 16-byte aligned. Only the low 32 bits of rax carry the result.
18 global sym(vp9_get_mb_ss_sse2) PRIVATE
19 sym(vp9_get_mb_ss_sse2):
20     push        rbp
21     mov         rbp, rsp
22     SHADOW_ARGS_TO_STACK 1
23     GET_GOT     rbx
24     push rsi
25     push rdi
26     sub         rsp, 16
27     ; end prolog
28
29
30         mov         rax, arg(0) ;[src_ptr]
31         mov         rcx, 8                  ; pass counter: 8 x 32 words = 256 words
32         pxor        xmm4, xmm4              ; xmm4 = four dword partial sums
33
34 .NEXTROW:
35         movdqa      xmm0, [rax]             ; load 32 words (64 bytes) for this pass
36         movdqa      xmm1, [rax+16]
37         movdqa      xmm2, [rax+32]
38         movdqa      xmm3, [rax+48]
39         pmaddwd     xmm0, xmm0              ; square each word, add adjacent pairs -> dwords
40         pmaddwd     xmm1, xmm1
41         pmaddwd     xmm2, xmm2
42         pmaddwd     xmm3, xmm3
43
44         paddd       xmm0, xmm1
45         paddd       xmm2, xmm3
46         paddd       xmm4, xmm0              ; accumulate into the running partial sums
47         paddd       xmm4, xmm2
48
49         add         rax, 0x40               ; advance to the next 64 bytes
50         dec         rcx
51         jnz         .NEXTROW                ; dec does not write CF, so branch on ZF alone
52
53         movdqa      xmm3,xmm4               ; horizontal add: fold upper two dwords onto lower two
54         psrldq      xmm4,8
55         paddd       xmm4,xmm3
56         movdqa      xmm3,xmm4               ; then fold dword 1 onto dword 0
57         psrldq      xmm4,4
58         paddd       xmm4,xmm3
59         movq        rax,xmm4                ; low dword = total; bits above it hold a partial sum
60
61
62     ; begin epilog
63     add rsp, 16
64     pop rdi
65     pop rsi
66     RESTORE_GOT
67     UNSHADOW_ARGS
68     pop         rbp
69     ret
70
71
72 ;unsigned int vp9_get16x16var_sse2  -- sum and sum of squares of (src - ref) over 16 rows x 16 pixels
73 ;(
74 ;    unsigned char   *  src_ptr,       first row of the source block (read unaligned)
75 ;    int             source_stride,    bytes between source rows
76 ;    unsigned char   *  ref_ptr,       first row of the reference block (read unaligned)
77 ;    int             recon_stride,     bytes between reference rows
78 ;    unsigned int    *  SSE,           out: sum of squared differences
79 ;    int             *  Sum            out: signed sum of differences
80 ;)  Results go through SSE/Sum only; rax holds the Sum pointer at exit, no separate return value is computed.
81 global sym(vp9_get16x16var_sse2) PRIVATE
82 sym(vp9_get16x16var_sse2):
83     push        rbp
84     mov         rbp, rsp
85     SHADOW_ARGS_TO_STACK 6
86     SAVE_XMM 7
87     push rbx
88     push rsi
89     push rdi
90     ; end prolog
91
92         mov         rsi,            arg(0) ;[src_ptr]
93         mov         rdi,            arg(2) ;[ref_ptr]
94
95         movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
96         movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
97
98         ; Prefetch data: touch the first eight rows of src, then of ref (rbx is scratch)
99         lea             rcx,    [rax+rax*2]     ; rcx = 3 * source_stride
100         prefetcht0      [rsi]
101         prefetcht0      [rsi+rax]
102         prefetcht0      [rsi+rax*2]
103         prefetcht0      [rsi+rcx]
104         lea             rbx,    [rsi+rax*4]     ; rbx = src row 4
105         prefetcht0      [rbx]
106         prefetcht0      [rbx+rax]
107         prefetcht0      [rbx+rax*2]
108         prefetcht0      [rbx+rcx]
109
110         lea             rcx,    [rdx+rdx*2]     ; rcx = 3 * recon_stride
111         prefetcht0      [rdi]
112         prefetcht0      [rdi+rdx]
113         prefetcht0      [rdi+rdx*2]
114         prefetcht0      [rdi+rcx]
115         lea             rbx,    [rdi+rdx*4]     ; rbx = ref row 4
116         prefetcht0      [rbx]
117         prefetcht0      [rbx+rdx]
118         prefetcht0      [rbx+rdx*2]
119         prefetcht0      [rbx+rcx]
120
121         pxor        xmm0,           xmm0                        ; clear xmm0 for unpack
122         pxor        xmm7,           xmm7                        ; clear xmm7 for accumulating diffs
123
124         pxor        xmm6,           xmm6                        ; clear xmm6 for accumulating sse
125         mov         rcx,            16                          ; row counter
126
127 .var16loop:
128         movdqu      xmm1,           XMMWORD PTR [rsi]           ; 16 source pixels
129         movdqu      xmm2,           XMMWORD PTR [rdi]           ; 16 reference pixels
130
131         prefetcht0      [rsi+rax*8]                             ; prefetch eight rows ahead
132         prefetcht0      [rdi+rdx*8]
133
134         movdqa      xmm3,           xmm1                        ; copies for the high eight pixels
135         movdqa      xmm4,           xmm2
136
137
138         punpcklbw   xmm1,           xmm0                        ; src pixels 0-7  -> words
139         punpckhbw   xmm3,           xmm0                        ; src pixels 8-15 -> words
140
141         punpcklbw   xmm2,           xmm0                        ; ref pixels 0-7  -> words
142         punpckhbw   xmm4,           xmm0                        ; ref pixels 8-15 -> words
143
144
145         psubw       xmm1,           xmm2                        ; differences, low half
146         psubw       xmm3,           xmm4                        ; differences, high half
147
148         paddw       xmm7,           xmm1                        ; each word lane gains two diffs per row
149         pmaddwd     xmm1,           xmm1                        ; square and pair-add -> dwords
150
151         paddw       xmm7,           xmm3
152         pmaddwd     xmm3,           xmm3
153
154         paddd       xmm6,           xmm1                        ; accumulate squared differences
155         paddd       xmm6,           xmm3
156
157         add         rsi,            rax                         ; next source row
158         add         rdi,            rdx                         ; next reference row
159
160         sub         rcx,            1
161         jnz         .var16loop
162
163         ; Reduction: the sum (xmm7 words) and sse (xmm6 dwords) folds below are interleaved.
164         movdqa      xmm1,           xmm6                        ; xmm1 = sse dword partials
165         pxor        xmm6,           xmm6
166
167         pxor        xmm5,           xmm5
168         punpcklwd   xmm6,           xmm7                        ; diff words 0-3 into the high half of each dword
169
170         punpckhwd   xmm5,           xmm7                        ; diff words 4-7 likewise
171         psrad       xmm5,           16                          ; arithmetic shift = sign-extend to 32 bits
172
173         psrad       xmm6,           16
174         paddd       xmm6,           xmm5                        ; four signed dword sums
175
176         movdqa      xmm2,           xmm1
177         punpckldq   xmm1,           xmm0                        ; sse dwords 0,1 spread to qwords (upper halves zero)
178
179         punpckhdq   xmm2,           xmm0                        ; sse dwords 2,3 likewise
180         movdqa      xmm7,           xmm6
181
182         paddd       xmm1,           xmm2                        ; sse: two partials left
183         punpckldq   xmm6,           xmm0                        ; sum dwords 0,1 spread to qwords
184
185         punpckhdq   xmm7,           xmm0                        ; sum dwords 2,3 likewise
186         paddd       xmm6,           xmm7                        ; sum: two partials left
187
188         movdqa      xmm2,           xmm1
189         movdqa      xmm7,           xmm6
190
191         psrldq      xmm1,           8                           ; bring the upper partial down
192         psrldq      xmm6,           8
193
194         paddd       xmm7,           xmm6                        ; xmm7 dword 0 = total sum of differences
195         paddd       xmm1,           xmm2                        ; xmm1 dword 0 = total sse
196
197         mov         rax,            arg(5) ;[Sum]
198         mov         rdi,            arg(4) ;[SSE]
199
200         movd DWORD PTR [rax],       xmm7
201         movd DWORD PTR [rdi],       xmm1
202
203
204     ; begin epilog
205     pop rdi
206     pop rsi
207     pop rbx
208     RESTORE_XMM
209     UNSHADOW_ARGS
210     pop         rbp
211     ret
212
213
214
215
216 ;unsigned int vp9_get8x8var_sse2  -- sum and sum of squares of (src - ref) over 8 rows x 8 pixels
217 ;(
218 ;    unsigned char   *  src_ptr,       first row of the source block
219 ;    int             source_stride,    bytes between source rows
220 ;    unsigned char   *  ref_ptr,       first row of the reference block
221 ;    int             recon_stride,     bytes between reference rows
222 ;    unsigned int    *  SSE,           out: sum of squared differences
223 ;    int             *  Sum            out: signed sum of differences (taken from a 16-bit total)
224 ;)  Results go through SSE/Sum only; rax holds the Sum pointer at exit.
225 global sym(vp9_get8x8var_sse2) PRIVATE
226 sym(vp9_get8x8var_sse2):
227     push        rbp
228     mov         rbp, rsp
229     SHADOW_ARGS_TO_STACK 6
230     SAVE_XMM 7
231     GET_GOT     rbx
232     push rsi
233     push rdi
234     sub         rsp, 16                                         ; 16 bytes reserved; the body below never references them
235     ; end prolog
236
237         mov         rsi,            arg(0) ;[src_ptr]
238         mov         rdi,            arg(2) ;[ref_ptr]
239
240         movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
241         movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
242
243         pxor        xmm0,           xmm0                        ; clear xmm0 for unpack
244         pxor        xmm7,           xmm7                        ; clear xmm7 for accumulating diffs
245         ; row 0 (xmm1 doubles as the sse accumulator from here on)
246         movq        xmm1,           QWORD PTR [rsi]
247         movq        xmm2,           QWORD PTR [rdi]
248
249         punpcklbw   xmm1,           xmm0                        ; bytes -> words
250         punpcklbw   xmm2,           xmm0
251
252         psubsw      xmm1,           xmm2                        ; src - ref
253         paddw       xmm7,           xmm1                        ; accumulate differences (words)
254
255         pmaddwd     xmm1,           xmm1                        ; squared differences, pair-added -> dwords
256         ; row 1
257         movq        xmm2,           QWORD PTR[rsi + rax]
258         movq        xmm3,           QWORD PTR[rdi + rdx]
259
260         punpcklbw   xmm2,           xmm0
261         punpcklbw   xmm3,           xmm0
262
263         psubsw      xmm2,           xmm3
264         paddw       xmm7,           xmm2
265
266         pmaddwd     xmm2,           xmm2
267         paddd       xmm1,           xmm2
268
269         ; row 2
270         movq        xmm2,           QWORD PTR[rsi + rax * 2]
271         movq        xmm3,           QWORD PTR[rdi + rdx * 2]
272
273         punpcklbw   xmm2,           xmm0
274         punpcklbw   xmm3,           xmm0
275
276         psubsw      xmm2,           xmm3
277         paddw       xmm7,           xmm2
278
279         pmaddwd     xmm2,           xmm2
280         paddd       xmm1,           xmm2
281
282         ; advance both pointers two rows, then row 3
283         lea         rsi,            [rsi + rax * 2]
284         lea         rdi,            [rdi + rdx * 2]
285         movq        xmm2,           QWORD PTR[rsi + rax]
286         movq        xmm3,           QWORD PTR[rdi + rdx]
287
288         punpcklbw   xmm2,           xmm0
289         punpcklbw   xmm3,           xmm0
290
291         psubsw      xmm2,           xmm3
292         paddw       xmm7,           xmm2
293
294         pmaddwd     xmm2,           xmm2
295         paddd       xmm1,           xmm2
296         ; row 4
297         movq        xmm2,           QWORD PTR[rsi + rax *2]
298         movq        xmm3,           QWORD PTR[rdi + rdx *2]
299
300         punpcklbw   xmm2,           xmm0
301         punpcklbw   xmm3,           xmm0
302
303         psubsw      xmm2,           xmm3
304         paddw       xmm7,           xmm2
305
306         pmaddwd     xmm2,           xmm2
307         paddd       xmm1,           xmm2
308
309         ; advance two rows
310         lea         rsi,            [rsi + rax * 2]
311         lea         rdi,            [rdi + rdx * 2]
312
313         ; row 5
314         movq        xmm2,           QWORD PTR[rsi + rax]
315         movq        xmm3,           QWORD PTR[rdi + rdx]
316
317         punpcklbw   xmm2,           xmm0
318         punpcklbw   xmm3,           xmm0
319
320         psubsw      xmm2,           xmm3
321         paddw       xmm7,           xmm2
322
323         pmaddwd     xmm2,           xmm2
324         paddd       xmm1,           xmm2
325         ; row 6
326         movq        xmm2,           QWORD PTR[rsi + rax *2]
327         movq        xmm3,           QWORD PTR[rdi + rdx *2]
328
329         punpcklbw   xmm2,           xmm0
330         punpcklbw   xmm3,           xmm0
331
332         psubsw      xmm2,           xmm3
333         paddw       xmm7,           xmm2
334
335         pmaddwd     xmm2,           xmm2
336         paddd       xmm1,           xmm2
337
338         ; advance two rows, then row 7
339         lea         rsi,            [rsi + rax * 2]
340         lea         rdi,            [rdi + rdx * 2]
341
342         movq        xmm2,           QWORD PTR[rsi + rax]
343         movq        xmm3,           QWORD PTR[rdi + rdx]
344
345         punpcklbw   xmm2,           xmm0
346         punpcklbw   xmm3,           xmm0
347
348         psubsw      xmm2,           xmm3
349         paddw       xmm7,           xmm2
350
351         pmaddwd     xmm2,           xmm2
352         paddd       xmm1,           xmm2
353
354         ; Reduction: the sum (xmm7 words) and sse (xmm1 dwords) folds below are interleaved.
355         movdqa      xmm6,           xmm7
356         punpcklwd   xmm6,           xmm0                        ; diff words 0-3 in the low half of each dword
357
358         punpckhwd   xmm7,           xmm0                        ; diff words 4-7 likewise
359         movdqa      xmm2,           xmm1
360
361         paddw       xmm6,           xmm7                        ; 16-bit adds: four word partials remain
362         punpckldq   xmm1,           xmm0                        ; sse dwords 0,1 spread to qwords
363
364         punpckhdq   xmm2,           xmm0                        ; sse dwords 2,3 likewise
365         movdqa      xmm7,           xmm6
366
367         paddd       xmm1,           xmm2                        ; sse: two partials left
368         punpckldq   xmm6,           xmm0                        ; sum partials 0,1 spread to qwords
369
370         punpckhdq   xmm7,           xmm0                        ; sum partials 2,3 likewise
371         paddw       xmm6,           xmm7                        ; sum: two partials left
372
373         movdqa      xmm2,           xmm1
374         movdqa      xmm7,           xmm6
375
376         psrldq      xmm1,           8                           ; bring the upper partial down
377         psrldq      xmm6,           8
378
379         paddw       xmm7,           xmm6                        ; xmm7 word 0 = 16-bit total of differences
380         paddd       xmm1,           xmm2                        ; xmm1 dword 0 = total sse
381
382         mov         rax,            arg(5) ;[Sum]
383         mov         rdi,            arg(4) ;[SSE]
384
385         movq        rdx,            xmm7
386         movsx       rcx,            dx                          ; sign-extend the 16-bit total
387
388         mov  dword ptr [rax],       ecx
389         movd DWORD PTR [rdi],       xmm1
390
391     ; begin epilog
392     add rsp, 16
393     pop rdi
394     pop rsi
395     RESTORE_GOT
396     RESTORE_XMM
397     UNSHADOW_ARGS
398     pop         rbp
399     ret
400
401 ;void vp9_half_horiz_vert_variance8x_h_sse2  -- diff sum / sse of an 8-wide block against a half-pel (x and y) reference
402 ;(
403 ;    unsigned char *ref_ptr,        reference rows; 9 bytes are read per row, Height + 1 rows in total
404 ;    int ref_pixels_per_line,       bytes between reference rows
405 ;    unsigned char *src_ptr,        source rows; 8 bytes are read per row
406 ;    int src_pixels_per_line,       bytes between source rows
407 ;    unsigned int Height,           rows to process; must be non-zero (count is tested after each pass)
408 ;    int *sum,                      out: signed sum of (averaged ref - src)
409 ;    unsigned int *sumsquared       out: sum of squared differences
410 ;)  Uses SSE2 registers only, so no MMX state is left behind for the caller to clear.
411 global sym(vp9_half_horiz_vert_variance8x_h_sse2) PRIVATE
412 sym(vp9_half_horiz_vert_variance8x_h_sse2):
413     push        rbp
414     mov         rbp, rsp
415     SHADOW_ARGS_TO_STACK 7
416     SAVE_XMM 7
417     GET_GOT     rbx
418     push rsi
419     push rdi
420     ; end prolog
421
422 %if ABI_IS_32BIT=0
423     movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
424     movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
425 %endif
426
427         pxor            xmm6,           xmm6                ;  difference accumulator (8 words)
428         pxor            xmm7,           xmm7                ;  sse accumulator (4 dwords)
429         mov             rsi,            arg(0) ;ref_ptr
430
431         mov             rdi,            arg(2) ;src_ptr
432         movsxd          rcx,            dword ptr arg(4) ;Height (loop counter)
433
434
435         pxor            xmm0,           xmm0                ;  zero for byte -> word unpack
436
437         movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0..s7 of the first row
438         movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = s1..s8
439         pavgb           xmm5,           xmm3                ;  xmm5 = horizontal average of the first row
440
441 %if ABI_IS_32BIT
442         add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
443 %else
444         add             rsi, r8
445 %endif
446
447 .half_horiz_vert_variance8x_h_1:
448         ; on entry xmm5 = horizontal average of the previous reference row
449         movq            xmm1,           QWORD PTR [rsi]     ;
450         movq            xmm2,           QWORD PTR [rsi+1]   ;
451         pavgb           xmm1,           xmm2                ;  xmm1 = horizontal average of the current row
452
453         pavgb           xmm5,           xmm1                ;  xmm5 = vertical average of the two row averages
454         punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
455
456         movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0..d7
457         punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
458
459         psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
460         paddw           xmm6,           xmm5                ;  xmm6 += column differences
461         pmaddwd         xmm5,           xmm5                ;  xmm5 = squared differences, pair-added
462         paddd           xmm7,           xmm5                ;  xmm7 += squared column differences
463
464         movdqa          xmm5,           xmm1                ;  keep this row's average for the next pass
465
466 %if ABI_IS_32BIT
467         add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
468         add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
469 %else
470         add             rsi, r8
471         add             rdi, r9
472 %endif
473
474         sub             rcx,            1                   ;
475         jnz             .half_horiz_vert_variance8x_h_1     ;
476         ; Reduce xmm6 (words) and xmm7 (dwords) to single totals using SSE2 only.
477         pxor            xmm1,           xmm1                ;
478         pxor            xmm2,           xmm2                ;
479
480         punpcklwd       xmm1,           xmm6                ;  diff words 0-3 -> high half of each dword
481         punpckhwd       xmm2,           xmm6                ;  diff words 4-7 -> high half of each dword
482
483         psrad           xmm1,           16                  ;  arithmetic shift = sign-extend to 32 bits
484         psrad           xmm2,           16                  ;
485
486         paddd           xmm1,           xmm2                ;  four signed dword partial sums
487         movdqa          xmm2,           xmm1                ;
488
489         psrldq          xmm2,           8                   ;
490         paddd           xmm1,           xmm2                ;  two partials left
491
492         movdqa          xmm2,           xmm1                ;
493         psrldq          xmm2,           4                   ;
494
495         paddd           xmm1,           xmm2                ;  xmm1 dword 0 = signed sum of differences
496
497         movdqa          xmm3,           xmm7                ;
498         psrldq          xmm3,           8                   ;
499
500         paddd           xmm7,           xmm3                ;  two sse partials left
501         movdqa          xmm3,           xmm7                ;
502
503         psrldq          xmm3,           4                   ;
504         paddd           xmm7,           xmm3                ;  xmm7 dword 0 = sum of squared differences
505
506
507         mov             rsi,            arg(5) ; sum
508         mov             rdi,            arg(6) ; sumsquared
509
510         movd DWORD PTR [rsi],           xmm1                ;
511         movd DWORD PTR [rdi],           xmm7                ;
512
513
514     ; begin epilog
515     pop rdi
516     pop rsi
517     RESTORE_GOT
518     RESTORE_XMM
519     UNSHADOW_ARGS
520     pop         rbp
521     ret
522
523 ;void vp9_half_vert_variance8x_h_sse2  -- diff sum / sse of an 8-wide block against a vertically half-pel reference
524 ;(
525 ;    unsigned char *ref_ptr,        reference rows; 8 bytes are read per row, Height + 1 rows in total
526 ;    int ref_pixels_per_line,       bytes between reference rows
527 ;    unsigned char *src_ptr,        source rows; 8 bytes are read per row
528 ;    int src_pixels_per_line,       bytes between source rows
529 ;    unsigned int Height,           rows to process; must be non-zero (count is tested after each pass)
530 ;    int *sum,                      out: signed sum of (averaged ref - src)
531 ;    unsigned int *sumsquared       out: sum of squared differences
532 ;)  Uses SSE2 registers only, so no MMX state is left behind for the caller to clear.
533 global sym(vp9_half_vert_variance8x_h_sse2) PRIVATE
534 sym(vp9_half_vert_variance8x_h_sse2):
535     push        rbp
536     mov         rbp, rsp
537     SHADOW_ARGS_TO_STACK 7
538     SAVE_XMM 7
539     GET_GOT     rbx
540     push rsi
541     push rdi
542     ; end prolog
543
544 %if ABI_IS_32BIT=0
545     movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
546     movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
547 %endif
548
549         pxor            xmm6,           xmm6                ;  difference accumulator (8 words)
550         pxor            xmm7,           xmm7                ;  sse accumulator (4 dwords)
551         mov             rsi,            arg(0) ;ref_ptr
552
553         mov             rdi,            arg(2) ;src_ptr
554         movsxd          rcx,            dword ptr arg(4) ;Height (loop counter)
555         movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line (offset to the row below)
556
557         pxor            xmm0,           xmm0                ;  zero for byte -> word unpack
558 .half_vert_variance8x_h_1:
559         movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0..s7 of the current row
560         movq            xmm3,           QWORD PTR [rsi+rax] ;  xmm3 = s0..s7 of the row below
561
562         pavgb           xmm5,           xmm3                ;  xmm5 = vertical average of the two rows
563         punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
564
565         movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0..d7
566         punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
567
568         psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
569         paddw           xmm6,           xmm5                ;  xmm6 += column differences
570         pmaddwd         xmm5,           xmm5                ;  xmm5 = squared differences, pair-added
571         paddd           xmm7,           xmm5                ;  xmm7 += squared column differences
572
573 %if ABI_IS_32BIT
574         add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
575         add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
576 %else
577         add             rsi, r8
578         add             rdi, r9
579 %endif
580
581         sub             rcx,            1                   ;
582         jnz             .half_vert_variance8x_h_1          ;
583         ; Reduce xmm6 (words) and xmm7 (dwords) to single totals using SSE2 only.
584         pxor            xmm1,           xmm1                ;
585         pxor            xmm2,           xmm2                ;
586
587         punpcklwd       xmm1,           xmm6                ;  diff words 0-3 -> high half of each dword
588         punpckhwd       xmm2,           xmm6                ;  diff words 4-7 -> high half of each dword
589
590         psrad           xmm1,           16                  ;  arithmetic shift = sign-extend to 32 bits
591         psrad           xmm2,           16                  ;
592
593         paddd           xmm1,           xmm2                ;  four signed dword partial sums
594         movdqa          xmm2,           xmm1                ;
595
596         psrldq          xmm2,           8                   ;
597         paddd           xmm1,           xmm2                ;  two partials left
598
599         movdqa          xmm2,           xmm1                ;
600         psrldq          xmm2,           4                   ;
601
602         paddd           xmm1,           xmm2                ;  xmm1 dword 0 = signed sum of differences
603
604         movdqa          xmm3,           xmm7                ;
605         psrldq          xmm3,           8                   ;
606
607         paddd           xmm7,           xmm3                ;  two sse partials left
608         movdqa          xmm3,           xmm7                ;
609
610         psrldq          xmm3,           4                   ;
611         paddd           xmm7,           xmm3                ;  xmm7 dword 0 = sum of squared differences
612
613
614         mov             rsi,            arg(5) ; sum
615         mov             rdi,            arg(6) ; sumsquared
616
617         movd DWORD PTR [rsi],           xmm1                ;
618         movd DWORD PTR [rdi],           xmm7                ;
619
620
621     ; begin epilog
622     pop rdi
623     pop rsi
624     RESTORE_GOT
625     RESTORE_XMM
626     UNSHADOW_ARGS
627     pop         rbp
628     ret
629
630
631 ;void vp9_half_horiz_variance8x_h_sse2  -- diff sum / sse of an 8-wide block against a horizontally half-pel reference
632 ;(
633 ;    unsigned char *ref_ptr,        reference rows; 9 bytes are read per row, Height rows in total
634 ;    int ref_pixels_per_line,       bytes between reference rows
635 ;    unsigned char *src_ptr,        source rows; 8 bytes are read per row
636 ;    int src_pixels_per_line,       bytes between source rows
637 ;    unsigned int Height,           rows to process; must be non-zero (count is tested after each pass)
638 ;    int *sum,                      out: signed sum of (averaged ref - src)
639 ;    unsigned int *sumsquared       out: sum of squared differences
640 ;)  Uses SSE2 registers only, so no MMX state is left behind for the caller to clear.
641 global sym(vp9_half_horiz_variance8x_h_sse2) PRIVATE
642 sym(vp9_half_horiz_variance8x_h_sse2):
643     push        rbp
644     mov         rbp, rsp
645     SHADOW_ARGS_TO_STACK 7
646     SAVE_XMM 7
647     GET_GOT     rbx
648     push rsi
649     push rdi
650     ; end prolog
651
652 %if ABI_IS_32BIT=0
653     movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
654     movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
655 %endif
656
657         pxor            xmm6,           xmm6                ;  difference accumulator (8 words)
658         pxor            xmm7,           xmm7                ;  sse accumulator (4 dwords)
659         mov             rsi,            arg(0) ;ref_ptr
660
661         mov             rdi,            arg(2) ;src_ptr
662         movsxd          rcx,            dword ptr arg(4) ;Height (loop counter)
663
664         pxor            xmm0,           xmm0                ;  zero for byte -> word unpack
665 .half_horiz_variance8x_h_1:
666         movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0..s7
667         movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = s1..s8
668
669         pavgb           xmm5,           xmm3                ;  xmm5 = horizontal average of neighbouring pixels
670         punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
671
672         movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0..d7
673         punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
674
675         psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
676         paddw           xmm6,           xmm5                ;  xmm6 += column differences
677         pmaddwd         xmm5,           xmm5                ;  xmm5 = squared differences, pair-added
678         paddd           xmm7,           xmm5                ;  xmm7 += squared column differences
679
680 %if ABI_IS_32BIT
681         add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
682         add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
683 %else
684         add             rsi, r8
685         add             rdi, r9
686 %endif
687         sub             rcx,            1                   ;
688         jnz             .half_horiz_variance8x_h_1          ;
689         ; Reduce xmm6 (words) and xmm7 (dwords) to single totals using SSE2 only.
690         pxor            xmm1,           xmm1                ;
691         pxor            xmm2,           xmm2                ;
692
693         punpcklwd       xmm1,           xmm6                ;  diff words 0-3 -> high half of each dword
694         punpckhwd       xmm2,           xmm6                ;  diff words 4-7 -> high half of each dword
695
696         psrad           xmm1,           16                  ;  arithmetic shift = sign-extend to 32 bits
697         psrad           xmm2,           16                  ;
698
699         paddd           xmm1,           xmm2                ;  four signed dword partial sums
700         movdqa          xmm2,           xmm1                ;
701
702         psrldq          xmm2,           8                   ;
703         paddd           xmm1,           xmm2                ;  two partials left
704
705         movdqa          xmm2,           xmm1                ;
706         psrldq          xmm2,           4                   ;
707
708         paddd           xmm1,           xmm2                ;  xmm1 dword 0 = signed sum of differences
709
710         movdqa          xmm3,           xmm7                ;
711         psrldq          xmm3,           8                   ;
712
713         paddd           xmm7,           xmm3                ;  two sse partials left
714         movdqa          xmm3,           xmm7                ;
715
716         psrldq          xmm3,           4                   ;
717         paddd           xmm7,           xmm3                ;  xmm7 dword 0 = sum of squared differences
718
719
720         mov             rsi,            arg(5) ; sum
721         mov             rdi,            arg(6) ; sumsquared
722
723         movd DWORD PTR [rsi],           xmm1                ;
724         movd DWORD PTR [rdi],           xmm7                ;
725
726
727     ; begin epilog
728     pop rdi
729     pop rsi
730     RESTORE_GOT
731     RESTORE_XMM
732     UNSHADOW_ARGS
733     pop         rbp
734     ret