Make libvpx Chromium build friendly
[profile/ivi/libvpx.git] / vp8 / common / x86 / postproc_sse2.asm
1 ;
2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ;  Use of this source code is governed by a BSD-style license
5 ;  that can be found in the LICENSE file in the root of the source
6 ;  tree. An additional intellectual property rights grant can be found
7 ;  in the file PATENTS.  All contributing project authors may
8 ;  be found in the AUTHORS file in the root of the source tree.
9 ;
10
11
12 %include "vpx_ports/x86_abi_support.asm"
13
;-----------------------------------------------------------------------
;void vp8_post_proc_down_and_across_xmm
;(
;    unsigned char *src_ptr,
;    unsigned char *dst_ptr,
;    int src_pixels_per_line,
;    int dst_pixels_per_line,
;    int rows,
;    int cols,
;    int flimit
;)
;
; For every row, two passes are run:
;   1. "down":   8 pixels at a time, read the source rows -2..+2 around the
;                current row and compute
;                    (4*r0 + r-2 + r-1 + r+1 + r+2 + 4) >> 3
;                A pixel keeps its source value when any |r0 - neighbour|
;                exceeds flimit, otherwise it takes the blurred value.
;                The result is written to dst_ptr.
;   2. "across": the same 5-tap kernel and threshold test applied along the
;                row, in place on the destination row. Before the pass the
;                first destination byte is replicated into dst[-8..-1] and
;                the last one into dst[cols..cols+7], so the destination
;                row must have 8 writable bytes on each side.
;
; Register roles:
;   rsi = source pointer, rdi = destination pointer
;   rax = source pitch (temporarily negated / reused at row end)
;   rcx = rows remaining, rdx = column counter
;   xmm0 = zero, xmm2 = flimit replicated into 8 words
;
; NOTE(review): mm0/mm1 are used and no emms is executed in this function;
; callers presumably clear the MMX state themselves - confirm.
;-----------------------------------------------------------------------
global sym(vp8_post_proc_down_and_across_xmm) PRIVATE
sym(vp8_post_proc_down_and_across_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    ALIGN_STACK 16, rax
    ; copy the rounding constant onto the stack, since there are not enough
    ; registers left to do PIC addressing inside the loops
    movdqa      xmm0, [GLOBAL(rd42)]
    sub         rsp, 16
    movdqa      [rsp], xmm0
%define RD42 [rsp]
%else
%define RD42 [GLOBAL(rd42)]
%endif


        ; broadcast the low 16 bits of flimit into all 8 words of xmm2
        movd        xmm2,       dword ptr arg(6) ;flimit
        punpcklwd   xmm2,       xmm2
        punpckldq   xmm2,       xmm2
        punpcklqdq  xmm2,       xmm2

        mov         rsi,        arg(0) ;src_ptr
        mov         rdi,        arg(1) ;dst_ptr

        movsxd      rcx,        DWORD PTR arg(4) ;rows
        movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line
        pxor        xmm0,       xmm0              ; xmm0 = 0, used for byte->word unpacking

.nextrow:

        xor         rdx,        rdx       ; rdx = column counter
.nextcol:
        movq        xmm3,       QWORD PTR [rsi]         ; row 0, p0..p7 (bytes)
        punpcklbw   xmm3,       xmm0                    ; xmm3 = r0 p0..p7 (words)
        movdqa      xmm1,       xmm3                    ; xmm1 = r0 p0..p7 (kept as centre)
        psllw       xmm3,       2                       ; xmm3 = 4 * r0

        movq        xmm5,       QWORD PTR [rsi + rax]   ; row +1, p0..p7
        punpcklbw   xmm5,       xmm0                    ; xmm5 = r1 p0..p7
        paddusw     xmm3,       xmm5                    ; xmm3 += r1

        ; thresholding
        movdqa      xmm7,       xmm1                    ; xmm7 = r0
        psubusw     xmm7,       xmm5                    ; xmm7 = sat(r0 - r1)
        psubusw     xmm5,       xmm1                    ; xmm5 = sat(r1 - r0)
        paddusw     xmm7,       xmm5                    ; xmm7 = abs(r0 - r1)
        pcmpgtw     xmm7,       xmm2                    ; xmm7 = mask(abs > flimit)

        movq        xmm5,       QWORD PTR [rsi + 2*rax] ; row +2, p0..p7
        punpcklbw   xmm5,       xmm0                    ; xmm5 = r2 p0..p7
        paddusw     xmm3,       xmm5                    ; xmm3 += r2

        ; thresholding
        movdqa      xmm6,       xmm1                    ; xmm6 = r0
        psubusw     xmm6,       xmm5                    ; xmm6 = sat(r0 - r2)
        psubusw     xmm5,       xmm1                    ; xmm5 = sat(r2 - r0)
        paddusw     xmm6,       xmm5                    ; xmm6 = abs(r0 - r2)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6                    ; accumulate thresholds


        neg         rax                                 ; rax = -pitch, to reach the rows above
        movq        xmm5,       QWORD PTR [rsi+2*rax]   ; row -2, p0..p7
        punpcklbw   xmm5,       xmm0                    ; xmm5 = r-2 p0..p7
        paddusw     xmm3,       xmm5                    ; xmm3 += r-2

        ; thresholding
        movdqa      xmm6,       xmm1                    ; xmm6 = r0
        psubusw     xmm6,       xmm5                    ; xmm6 = sat(r0 - r-2)
        psubusw     xmm5,       xmm1                    ; xmm5 = sat(r-2 - r0)
        paddusw     xmm6,       xmm5                    ; xmm6 = abs(r0 - r-2)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6                    ; accumulate thresholds

        movq        xmm4,       QWORD PTR [rsi+rax]     ; row -1, p0..p7
        punpcklbw   xmm4,       xmm0                    ; xmm4 = r-1 p0..p7
        paddusw     xmm3,       xmm4                    ; xmm3 += r-1

        ; thresholding
        movdqa      xmm6,       xmm1                    ; xmm6 = r0
        psubusw     xmm6,       xmm4                    ; xmm6 = sat(r0 - r-1)
        psubusw     xmm4,       xmm1                    ; xmm4 = sat(r-1 - r0)
        paddusw     xmm6,       xmm4                    ; xmm6 = abs(r0 - r-1)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6                    ; accumulate thresholds


        paddusw     xmm3,       RD42                    ; xmm3 += rounding value (4)
        psraw       xmm3,       3                       ; xmm3 /= 8

        pand        xmm1,       xmm7                    ; source values where a threshold was exceeded
        pandn       xmm7,       xmm3                    ; blurred values everywhere else
        paddusw     xmm1,       xmm7                    ; combination

        packuswb    xmm1,       xmm0                    ; pack to bytes
        movq        QWORD PTR [rdi], xmm1             ; store 8 filtered pixels

        neg         rax                   ; pitch is positive again
        add         rsi,        8
        add         rdi,        8

        add         rdx,        8
        cmp         edx,        dword arg(5) ;cols

        jl          .nextcol

        ; done with all the columns, rewind to the start of the row and
        ; start the across filtering in place on the destination
        sub         rsi,        rdx
        sub         rdi,        rdx


        ; dup the first byte into the left border 8 times
        movq        mm1,   [rdi]
        punpcklbw   mm1,   mm1
        punpcklwd   mm1,   mm1
        punpckldq   mm1,   mm1

        mov         rdx,    -8
        movq        [rdi+rdx], mm1

        ; dup the last byte into the right border 8 times
        movsxd      rdx,    dword arg(5)
        movq        mm1,   [rdi + rdx + -1]
        punpcklbw   mm1,   mm1
        punpcklwd   mm1,   mm1
        punpckldq   mm1,   mm1
        movq        [rdi+rdx], mm1

        xor         rdx,        rdx
        ; mm0 holds the 8 bytes that are written back one iteration late, so
        ; that the filter always reads unfiltered left neighbours. Prime it
        ; with the left border so the first delayed store is a no-op.
        movq        mm0,        QWORD PTR [rdi-8];

.acrossnextcol:
        movq        xmm7,       QWORD PTR [rdi +rdx -2] ; p-2..p5
        movd        xmm4,       DWORD PTR [rdi +rdx +6] ; p6..p9

        pslldq      xmm4,       8
        por         xmm4,       xmm7              ; xmm4 = p-2..p9 (12 bytes)

        movdqa      xmm3,       xmm4
        psrldq      xmm3,       2
        punpcklbw   xmm3,       xmm0              ; xmm3 = p0..p7 (words)
        movdqa      xmm1,       xmm3              ; xmm1 = p0..p7 (kept as centre)
        psllw       xmm3,       2                 ; xmm3 = 4 * centre


        movdqa      xmm5,       xmm4
        psrldq      xmm5,       3
        punpcklbw   xmm5,       xmm0              ; xmm5 = p1..p8
        paddusw     xmm3,       xmm5              ; xmm3 += right neighbour

        ; thresholding
        movdqa      xmm7,       xmm1              ; xmm7 = p0..p7
        psubusw     xmm7,       xmm5              ; xmm7 = sat(p0..p7 - p1..p8)
        psubusw     xmm5,       xmm1              ; xmm5 = sat(p1..p8 - p0..p7)
        paddusw     xmm7,       xmm5              ; xmm7 = abs(p0..p7 - p1..p8)
        pcmpgtw     xmm7,       xmm2

        movdqa      xmm5,       xmm4
        psrldq      xmm5,       4
        punpcklbw   xmm5,       xmm0              ; xmm5 = p2..p9
        paddusw     xmm3,       xmm5              ; xmm3 += second right neighbour

        ; thresholding
        movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
        psubusw     xmm6,       xmm5              ; xmm6 = sat(p0..p7 - p2..p9)
        psubusw     xmm5,       xmm1              ; xmm5 = sat(p2..p9 - p0..p7)
        paddusw     xmm6,       xmm5              ; xmm6 = abs(p0..p7 - p2..p9)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6              ; accumulate thresholds


        movdqa      xmm5,       xmm4              ; xmm5 = p-2..p9
        punpcklbw   xmm5,       xmm0              ; xmm5 = p-2..p5
        paddusw     xmm3,       xmm5              ; xmm3 += second left neighbour

        ; thresholding
        movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
        psubusw     xmm6,       xmm5              ; xmm6 = sat(p0..p7 - p-2..p5)
        psubusw     xmm5,       xmm1              ; xmm5 = sat(p-2..p5 - p0..p7)
        paddusw     xmm6,       xmm5              ; xmm6 = abs(p0..p7 - p-2..p5)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6              ; accumulate thresholds

        psrldq      xmm4,       1                 ; xmm4 = p-1..p9
        punpcklbw   xmm4,       xmm0              ; xmm4 = p-1..p6
        paddusw     xmm3,       xmm4              ; xmm3 += left neighbour

        ; thresholding
        movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
        psubusw     xmm6,       xmm4              ; xmm6 = sat(p0..p7 - p-1..p6)
        psubusw     xmm4,       xmm1              ; xmm4 = sat(p-1..p6 - p0..p7)
        paddusw     xmm6,       xmm4              ; xmm6 = abs(p0..p7 - p-1..p6)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6              ; accumulate thresholds

        paddusw     xmm3,       RD42              ; xmm3 += rounding value (4)
        psraw       xmm3,       3                 ; xmm3 /= 8

        pand        xmm1,       xmm7              ; source values where a threshold was exceeded
        pandn       xmm7,       xmm3              ; blurred values everywhere else
        paddusw     xmm1,       xmm7              ; combination

        packuswb    xmm1,       xmm0              ; pack to bytes
        movq        QWORD PTR [rdi+rdx-8],  mm0   ; store the previous 8 bytes
        movdq2q     mm0,        xmm1              ; hold this result for the next iteration

        add         rdx,        8
        cmp         edx,        dword arg(5) ;cols
        jl          .acrossnextcol;

        ; flush the last 8 pixels
        movq        QWORD PTR [rdi+rdx-8],  mm0

        ; done with this row
        add         rsi,rax               ; next source line
        ; The pitches are signed ints. Sign-extend them exactly like the load
        ; at function entry so that a negative pitch still steps the 64-bit
        ; pointers correctly (a plain 32-bit mov would zero-extend).
        movsxd      rax, dword arg(3)     ; dst_pixels_per_line
        add         rdi,rax               ; next destination line
        movsxd      rax, dword arg(2)     ; src_pixels_per_line, restored for the next row

        dec         rcx                   ; decrement count
        jnz         .nextrow              ; next row

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    add rsp,16                            ; drop the stack copy of rd42
    pop rsp                               ; undo ALIGN_STACK
%endif
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef RD42
267
268
;-----------------------------------------------------------------------
;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
;                            int pitch, int rows, int cols,int flimit)
;
; Vertical filter applied in place, one 8-pixel-wide strip at a time.
; For every strip:
;   * the first row is copied into the 8 rows above the image and the last
;     row into the 8 rows below it (those rows must be writable);
;   * a running sum (xmm5, 8 words) and a running sum of squares
;     (xmm6/xmm7, 8 dwords) are kept over a 15-row window;
;   * where 15*sumsq - sum*sum < flimit the pixel becomes
;     (vp8_rv[...] + sum + pixel) >> 4, otherwise it is left unchanged;
;   * results go through a 16-entry ring buffer d[16][8] on the stack and
;     are written back 8 rows late, so the window always reads unfiltered
;     pixels.
;
; Stack locals: d[16][8] at [rsp], flimit replicated 4x at [rsp+128].
; The shadowed copies of dst, rows and cols (arg(0), arg(2), arg(3)) are
; modified in place.
;
; NOTE(review): mm0 is used and no emms is executed in this function;
; callers presumably clear the MMX state themselves - confirm.
;-----------------------------------------------------------------------
extern sym(vp8_rv)
global sym(vp8_mbpost_proc_down_xmm) PRIVATE
sym(vp8_mbpost_proc_down_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 128+16

    ; unsigned char d[16][8] at [rsp]
    ; create flimit4 at [rsp+128]
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp+128], eax
    mov         [rsp+128+4], eax
    mov         [rsp+128+8], eax
    mov         [rsp+128+12], eax
%define flimit4 [rsp+128]

%if ABI_IS_32BIT=0
    lea         r8,       [GLOBAL(sym(vp8_rv))]
%endif

    ;rows +=8;
    add         dword arg(2), 8

    ;for(c=0; c<cols; c+=8)
.loop_col:
            mov         rsi,        arg(0) ; s
            pxor        xmm0,       xmm0        ; xmm0 = 0, used for unpacking

            movsxd      rax,        dword ptr arg(1) ;pitch       ;

            ; this copies the last row down into the border 8 rows
            mov         rdi,        rsi
            ; rows is an int: load only its 32 bits and sign-extend them,
            ; the upper half of the argument slot is not guaranteed valid
            movsxd      rdx,        dword arg(2)
            sub         rdx,        9                           ; (rows + 8) - 9 = index of the last row
            imul        rdx,        rax
            lea         rdi,        [rdi+rdx]
            movq        xmm1,       QWORD ptr[rdi]              ; last row
            mov         rcx,        8
.init_borderd:                                                   ; initialize bottom border
            lea         rdi,        [rdi + rax]
            movq        [rdi],      xmm1

            dec         rcx
            jne         .init_borderd

            neg         rax                                     ; rax = -pitch

            ; this copies the first row up into the border 8 rows
            mov         rdi,        rsi
            movq        xmm1,       QWORD ptr[rdi]              ; first row
            mov         rcx,        8
.init_border:                                                    ; initialize top border
            lea         rdi,        [rdi + rax]
            movq        [rdi],      xmm1

            dec         rcx
            jne         .init_border



            lea         rsi,        [rsi + rax*8];              ; rsi = s[-pitch*8]
            neg         rax                                     ; rax = +pitch again

            pxor        xmm5,       xmm5        ; xmm5 = running sum (8 words)
            pxor        xmm6,       xmm6        ; xmm6 = running sumsq, pixels 0..3

            pxor        xmm7,       xmm7        ; xmm7 = running sumsq, pixels 4..7
            mov         rdi,        rsi

            mov         rcx,        15          ; window height

            ; accumulate sum and sumsq over rows -8..6
.loop_initvar:
            movq        xmm1,       QWORD PTR [rdi];
            punpcklbw   xmm1,       xmm0        ;

            paddw       xmm5,       xmm1        ;
            pmullw      xmm1,       xmm1        ;

            movdqa      xmm2,       xmm1        ;
            punpcklwd   xmm1,       xmm0        ;

            punpckhwd   xmm2,       xmm0        ;
            paddd       xmm6,       xmm1        ;

            paddd       xmm7,       xmm2        ;
            lea         rdi,        [rdi+rax]   ;

            dec         rcx
            jne         .loop_initvar
            ; here rdi = s + 7*pitch; rdx counts the rows processed (r)
            xor         rdx,        rdx
.loop_row:
            movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8], row leaving the window
            movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7], row entering the window

            punpcklbw   xmm1,       xmm0
            punpcklbw   xmm2,       xmm0

            paddw       xmm5,       xmm2                ; sum += entering
            psubw       xmm5,       xmm1                ; sum -= leaving

            pmullw      xmm2,       xmm2
            movdqa      xmm4,       xmm2

            punpcklwd   xmm2,       xmm0
            punpckhwd   xmm4,       xmm0

            paddd       xmm6,       xmm2                ; sumsq += entering^2
            paddd       xmm7,       xmm4

            pmullw      xmm1,       xmm1
            movdqa      xmm2,       xmm1

            punpcklwd   xmm1,       xmm0
            psubd       xmm6,       xmm1                ; sumsq -= leaving^2

            punpckhwd   xmm2,       xmm0
            psubd       xmm7,       xmm2


            movdqa      xmm3,       xmm6
            pslld       xmm3,       4

            psubd       xmm3,       xmm6                ; xmm3 = 15 * sumsq (pixels 0..3)
            movdqa      xmm1,       xmm5

            movdqa      xmm4,       xmm5
            pmullw      xmm1,       xmm1                ; low  16 bits of sum*sum

            pmulhw      xmm4,       xmm4                ; high 16 bits of sum*sum
            movdqa      xmm2,       xmm1

            punpcklwd   xmm1,       xmm4                ; xmm1 = sum*sum as dwords, pixels 0..3
            punpckhwd   xmm2,       xmm4                ; xmm2 = sum*sum as dwords, pixels 4..7

            movdqa      xmm4,       xmm7
            pslld       xmm4,       4

            psubd       xmm4,       xmm7                ; xmm4 = 15 * sumsq (pixels 4..7)

            psubd       xmm3,       xmm1
            psubd       xmm4,       xmm2

            psubd       xmm3,       flimit4
            psubd       xmm4,       flimit4

            psrad       xmm3,       31                  ; all ones where 15*sumsq - sum*sum < flimit
            psrad       xmm4,       31

            packssdw    xmm3,       xmm4
            packsswb    xmm3,       xmm0                ; xmm3 = byte mask for the 8 pixels

            movq        xmm1,       QWORD PTR [rsi+rax*8] ; current row r

            movq        xmm2,       xmm1                ; keep the unfiltered pixels
            punpcklbw   xmm1,       xmm0

            paddw       xmm1,       xmm5                ; pixel + sum
            mov         rcx,        rdx

            and         rcx,        127
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
            push        rax
            lea         rax,        [GLOBAL(sym(vp8_rv))]
            movdqu      xmm4,       [rax + rcx*2] ;vp8_rv[rcx*2]
            pop         rax
%elif ABI_IS_32BIT=0
            movdqu      xmm4,       [r8 + rcx*2] ;vp8_rv[rcx*2]
%else
            movdqu      xmm4,       [sym(vp8_rv) + rcx*2]
%endif

            paddw       xmm1,       xmm4                ; + 8 words read from vp8_rv
            ;paddw     xmm1,       eight8s
            psraw       xmm1,       4

            packuswb    xmm1,       xmm0
            pand        xmm1,       xmm3                ; filtered value where the mask is set

            pandn       xmm3,       xmm2                ; original value elsewhere
            por         xmm1,       xmm3

            and         rcx,        15
            movq        QWORD PTR   [rsp + rcx*8], xmm1 ;d[r & 15]

            ; d[(r-8) & 15] has only been written in this strip once r >= 8.
            ; Skipping the write-back before that avoids copying stale or
            ; uninitialised stack bytes into the top border rows.
            cmp         edx,        8
            jl          .skip_assignment

            mov         rcx,        rdx
            sub         rcx,        8

            and         rcx,        15
            movq        mm0,        [rsp + rcx*8] ;d[(r-8) & 15]

            movq        [rsi],      mm0                 ; write back row r-8

.skip_assignment:
            lea         rsi,        [rsi+rax]

            lea         rdi,        [rdi+rax]
            add         rdx,        1

            cmp         edx,        dword arg(2) ;rows
            jl          .loop_row

        ; s += 8, done at pointer width: a dword add would lose the carry
        ; into the upper half of a 64-bit pointer. rax is free here, it is
        ; reloaded at .loop_col and unused after the loop.
        mov         rax,        arg(0)
        add         rax,        8
        mov         arg(0),     rax
        sub         dword arg(3), 8 ; cols -= 8
        cmp         dword arg(3), 0
        jg          .loop_col

    add         rsp, 128+16
    pop         rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4
496
497
;-----------------------------------------------------------------------
;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src,
;                                int pitch, int rows, int cols,int flimit)
;
; Horizontal filter applied in place, one row at a time.
; For every row:
;   * the first byte is replicated into s[-8..-1] and the last byte into
;     s[cols..cols+7] (those bytes must be writable);
;   * sum and sumsq are initialised over s[-8..6] and then slid along the
;     row, 4 pixels per iteration;
;   * where 15*sumsq - sum*sum < flimit the pixel becomes
;     (8 + sum + pixel) >> 4, otherwise it is left unchanged;
;   * results are held in mm1/mm0 and stored 8 bytes behind the current
;     position, so the window always reads unfiltered pixels.
;
; Stack locals: flimit replicated 4x at [rsp].
; The shadowed copies of src and rows (arg(0), arg(2)) are modified in place.
;
; NOTE(review): mm0/mm1 are used and no emms is executed in this function;
; callers presumably clear the MMX state themselves - confirm.
;-----------------------------------------------------------------------
global sym(vp8_mbpost_proc_across_ip_xmm) PRIVATE
sym(vp8_mbpost_proc_across_ip_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16

    ; create flimit4 at [rsp]
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp], eax
    mov         [rsp+4], eax
    mov         [rsp+8], eax
    mov         [rsp+12], eax
%define flimit4 [rsp]


    ;for(r=0;r<rows;r++)
.ip_row_loop:

        xor         rcx,    rcx ;sum=0;
        mov         rsi,    arg(0); s


        ; dup the first byte into the left border 8 times
        movq        mm1,   [rsi]
        punpcklbw   mm1,   mm1
        punpcklwd   mm1,   mm1
        punpckldq   mm1,   mm1

        mov         rdi,    -8          ; rdi = i, start index of the initial window
        movq        [rsi+rdi], mm1

        ; dup the last byte into the right border 8 times
        ; (rdx is only a scratch register for cols here)
        movsxd      rdx,    dword arg(3)
        movq        mm1,   [rsi + rdx + -1]
        punpcklbw   mm1,   mm1
        punpcklwd   mm1,   mm1
        punpckldq   mm1,   mm1
        movq        [rsi+rdx], mm1

        ; sumsq must start from zero: clear rdx only now, after it has been
        ; used to hold cols for the border replication above
        xor         rdx,    rdx ;sumsq=0;

.ip_var_loop:
        ;for(i=-8;i<=6;i++)
        ;{
        ;    sumsq += s[i]*s[i];
        ;    sum   += s[i];
        ;}
        movzx       eax, byte [rsi+rdi]
        add         ecx, eax
        mul         al                  ; ax = al*al, edx is not touched
        add         edx, eax
        add         rdi, 1
        cmp         rdi, 6
        jle         .ip_var_loop


            ;mov         rax,    sumsq
            ;movd        xmm7,   rax
            movd        xmm7,   edx     ; xmm7 (low dword) = sumsq

            ;mov         rax,    sum
            ;movd        xmm6,   rax
            movd        xmm6,   ecx     ; xmm6 (low dword) = sum

            mov         rsi,    arg(0) ;s
            xor         rcx,    rcx     ; rcx = column index c

            movsxd      rdx,    dword arg(3) ;cols
            add         rdx,    8       ; run 8 columns past the end to flush the delayed stores
            pxor        mm0,    mm0
            pxor        mm1,    mm1

            pxor        xmm0,   xmm0    ; xmm0 = 0, used for unpacking
.nextcol4:

            movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5  (leaving the window)
            movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10 (entering the window)

            punpcklbw   xmm1,   xmm0                    ; expanding
            punpcklbw   xmm2,   xmm0                    ; expanding

            punpcklwd   xmm1,   xmm0                    ; expanding to dwords
            punpcklwd   xmm2,   xmm0                    ; expanding to dwords

            psubd       xmm2,   xmm1                    ; xmm2 = entering - leaving
            paddd       xmm1,   xmm1                    ; xmm1 = 2 * leaving

            paddd       xmm1,   xmm2                    ; xmm1 = entering + leaving
            pmaddwd     xmm1,   xmm2                    ; xmm1 = entering^2 - leaving^2

            paddd       xmm6,   xmm2                    ; lane 0: sum   += delta 0
            paddd       xmm7,   xmm1                    ; lane 0: sumsq += delta 0

            pshufd      xmm6,   xmm6,   0               ; broadcast lane 0 to all lanes
            pshufd      xmm7,   xmm7,   0               ; broadcast lane 0 to all lanes

            psrldq      xmm1,       4                   ; deltas 1, 2, 3, 0
            psrldq      xmm2,       4                   ; deltas 1, 2, 3, 0

            pshufd      xmm3,   xmm1,   3               ; 0, delta1, delta1, delta1 (squares)
            pshufd      xmm4,   xmm2,   3               ; 0, delta1, delta1, delta1

            paddd       xmm6,   xmm4
            paddd       xmm7,   xmm3

            pshufd      xmm3,   xmm1,   01011111b       ; 0, 0, delta2, delta2 (squares)
            pshufd      xmm4,   xmm2,   01011111b       ; 0, 0, delta2, delta2

            paddd       xmm7,   xmm3
            paddd       xmm6,   xmm4

            pshufd      xmm3,   xmm1,   10111111b       ; 0, 0, 0, delta3 (squares)
            pshufd      xmm4,   xmm2,   10111111b       ; 0, 0, 0, delta3

            paddd       xmm7,   xmm3                    ; xmm7 = sumsq for columns c..c+3
            paddd       xmm6,   xmm4                    ; xmm6 = sum   for columns c..c+3

            movdqa      xmm3,   xmm6
            pmaddwd     xmm3,   xmm3                    ; xmm3 = sum * sum

            movdqa      xmm5,   xmm7
            pslld       xmm5,   4

            psubd       xmm5,   xmm7                    ; xmm5 = 15 * sumsq
            psubd       xmm5,   xmm3                    ;      - sum * sum

            psubd       xmm5,   flimit4
            psrad       xmm5,   31                      ; all ones where 15*sumsq - sum*sum < flimit

            packssdw    xmm5,   xmm0
            packsswb    xmm5,   xmm0                    ; xmm5 = byte mask for the 4 pixels

            movd        xmm1,   DWORD PTR [rsi+rcx]     ; current 4 pixels
            movq        xmm2,   xmm1                    ; keep the unfiltered pixels

            punpcklbw   xmm1,   xmm0
            punpcklwd   xmm1,   xmm0

            paddd       xmm1,   xmm6                    ; pixel + sum
            paddd       xmm1,   [GLOBAL(four8s)]        ; + 8 (rounding)

            psrad       xmm1,   4
            packssdw    xmm1,   xmm0

            packuswb    xmm1,   xmm0
            pand        xmm1,   xmm5                    ; filtered value where the mask is set

            pandn       xmm5,   xmm2                    ; original value elsewhere
            por         xmm5,   xmm1

            movd        [rsi+rcx-8],  mm0               ; store the result computed two iterations ago
            movq        mm0,    mm1

            movdq2q     mm1,    xmm5                    ; queue the new result
            psrldq      xmm7,   12                      ; carry the last lane's sumsq into lane 0

            psrldq      xmm6,   12                      ; carry the last lane's sum into lane 0
            add         rcx,    4

            cmp         rcx,    rdx
            jl          .nextcol4

        ;s+=pitch;
        movsxd rax, dword arg(1)
        add    arg(0), rax

        sub dword arg(2), 1 ;rows-=1
        cmp dword arg(2), 0
        jg .ip_row_loop

    add         rsp, 16
    pop         rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4
689
690
;-----------------------------------------------------------------------
;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
;                            unsigned char blackclamp[16],
;                            unsigned char whiteclamp[16],
;                            unsigned char bothclamp[16],
;                            unsigned int Width, unsigned int Height, int Pitch)
;
; For every row: call rand(), use its low 8 bits as a byte offset into the
; noise buffer, then clamp and add noise 16 bytes at a time across the row.
; The row loop always processes whole 16-byte groups.
; The shadowed copies of Start and Height (arg(0), arg(6)) are modified.
;-----------------------------------------------------------------------
extern sym(rand)
global sym(vp8_plane_add_noise_wmt) PRIVATE
sym(vp8_plane_add_noise_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

.next_row:
    call        sym(rand) WRT_PLT
    and         rax, 0xff               ; offset = rand() & 0xff

    ; every register below is (re)loaded after the call, because rand() is
    ; free to trash the scratch registers
    mov         rdi, arg(1)             ; noise
    add         rdi, rax                ; rdi = noise + offset
    mov         rsi, arg(0)             ; rsi = start of the current row

    ; the code relies on the clamping vectors being stored contiguously in
    ; black/white/both order, so only the first address is loaded
    mov         rdx, arg(2)             ; blackclamp
    movsxd      rcx, dword arg(5)       ; rcx = Width
    xor         rax, rax                ; rax = byte index within the row

.next_16_bytes:
    movdqu      xmm2, [rdi+rax]         ; noise for these 16 pixels
    movdqu      xmm1, [rsi+rax]         ; source pixels

    ; clamp both ends of the range so that adding the noise cannot wrap
    psubusb     xmm1, [rdx]             ; blackclamp
    paddusb     xmm1, [rdx+32]          ; bothclamp
    psubusb     xmm1, [rdx+16]          ; whiteclamp

    paddb       xmm1, xmm2              ; add the noise
    movdqu      [rsi+rax], xmm1         ; write the pixels back

    add         rax, 16                 ; advance to the next 16 pixels
    cmp         rax, rcx
    jl          .next_16_bytes

    movsxd      rax, dword arg(7)       ; Pitch
    add         arg(0), rax             ; Start += Pitch
    sub         dword arg(6), 1         ; Height -= 1
    jg          .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
752
753
SECTION_RODATA
align 16
; eight words of 4: rounding term added before the >> 3 in
; vp8_post_proc_down_and_across_xmm
rd42:
    dw 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04
; four dwords of 8: rounding term added before the >> 4 in
; vp8_mbpost_proc_across_ip_xmm
four8s:
    dd 8, 8, 8, 8