vp8/common/x86/postproc_sse2.asm

   1 ;
   2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
   3 ;
   4 ;  Use of this source code is governed by a BSD-style license
   5 ;  that can be found in the LICENSE file in the root of the source
   6 ;  tree. An additional intellectual property rights grant can be found
   7 ;  in the file PATENTS.  All contributing project authors may
   8 ;  be found in the AUTHORS file in the root of the source tree.
   9 ;
  10
  11
  12 %include "vpx_ports/x86_abi_support.asm"
  13
  14 ;void vp8_post_proc_down_and_across_xmm
  15 ;(
  16 ;    unsigned char *src_ptr,     ; source; rows -2..+2 around each processed row are read
  17 ;    unsigned char *dst_ptr,     ; destination; 8 border bytes are written on each side of a row
  18 ;    int src_pixels_per_line,    ; source stride in bytes
  19 ;    int dst_pixels_per_line,    ; destination stride in bytes
  20 ;    int rows,                   ; number of rows to process
  21 ;    int cols,                   ; pixels per row, handled in groups of 8
  22 ;    int flimit                  ; a pixel stays unfiltered when any tap differs from it by more than this
  23 ;)
  24 global sym(vp8_post_proc_down_and_across_xmm) PRIVATE
  25 sym(vp8_post_proc_down_and_across_xmm):
  26     push        rbp
  27     mov         rbp, rsp
  28     SHADOW_ARGS_TO_STACK 7
  29     SAVE_XMM 7
  30     GET_GOT     rbx
  31     push        rsi
  32     push        rdi
  33     ; end prolog
  34     ; per row: 5-tap vertical filter src -> dst, then the same 5-tap filter horizontally, in place on dst
  35 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
  36     ALIGN_STACK 16, rax
  37     ; move the global rd onto the stack, since we don't have enough registers
  38     ; to do PIC addressing
  39     movdqa      xmm0, [GLOBAL(rd42)]
  40     sub         rsp, 16
  41     movdqa      [rsp], xmm0
  42 %define RD42 [rsp]
  43 %else
  44 %define RD42 [GLOBAL(rd42)]
  45 %endif
  46
  47
  48         movd        xmm2,       dword ptr arg(6) ;flimit
  49         punpcklwd   xmm2,       xmm2
  50         punpckldq   xmm2,       xmm2
  51         punpcklqdq  xmm2,       xmm2              ; xmm2 = low word of flimit in all 8 words
  52
  53         mov         rsi,        arg(0) ;src_ptr
  54         mov         rdi,        arg(1) ;dst_ptr
  55
  56         movsxd      rcx,        DWORD PTR arg(4) ;rows
  57         movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line
  58         pxor        xmm0,       xmm0              ; xmm0 = 0, used to widen bytes to words
  59
  60 .nextrow:
  61
  62         xor         rdx,        rdx       ; clear out rdx for use as loop counter
  63 .nextcol:
  64         movq        xmm3,       QWORD PTR [rsi]         ; xmm3 = r0 p0..p7 (bytes)
  65         punpcklbw   xmm3,       xmm0                    ; xmm3 = r0 p0..p7 (words)
  66         movdqa      xmm1,       xmm3                    ; xmm1 = r0 p0..p7, kept as the unfiltered value
  67         psllw       xmm3,       2                       ; xmm3 = 4 * r0
  68
  69         movq        xmm5,       QWORD PTR [rsi + rax]   ; xmm5 = r1 p0..p7 (bytes)
  70         punpcklbw   xmm5,       xmm0                    ; xmm5 = r1 p0..p7 (words)
  71         paddusw     xmm3,       xmm5                    ; xmm3 += r1
  72
  73         ; thresholding
  74         movdqa      xmm7,       xmm1                    ; xmm7 = r0
  75         psubusw     xmm7,       xmm5                    ; xmm7 = saturated r0 - r1
  76         psubusw     xmm5,       xmm1                    ; xmm5 = saturated r1 - r0
  77         paddusw     xmm7,       xmm5                    ; xmm7 = abs(r0 - r1)
  78         pcmpgtw     xmm7,       xmm2                    ; xmm7 = mask of abs(r0 - r1) > flimit
  79
  80         movq        xmm5,       QWORD PTR [rsi + 2*rax] ; xmm5 = r2 p0..p7 (bytes)
  81         punpcklbw   xmm5,       xmm0                    ; xmm5 = r2 p0..p7 (words)
  82         paddusw     xmm3,       xmm5                    ; xmm3 += r2
  83
  84         ; thresholding
  85         movdqa      xmm6,       xmm1                    ; xmm6 = r0
  86         psubusw     xmm6,       xmm5                    ; xmm6 = saturated r0 - r2
  87         psubusw     xmm5,       xmm1                    ; xmm5 = saturated r2 - r0
  88         paddusw     xmm6,       xmm5                    ; xmm6 = abs(r0 - r2)
  89         pcmpgtw     xmm6,       xmm2                    ; xmm6 = mask of abs(r0 - r2) > flimit
  90         por         xmm7,       xmm6                    ; accumulate thresholds
  91
  92
  93         neg         rax                                 ; rax = -pitch, to address the rows above
  94         movq        xmm5,       QWORD PTR [rsi+2*rax]   ; xmm5 = r-2 p0..p7 (bytes)
  95         punpcklbw   xmm5,       xmm0                    ; xmm5 = r-2 p0..p7 (words)
  96         paddusw     xmm3,       xmm5                    ; xmm3 += r-2
  97
  98         ; thresholding
  99         movdqa      xmm6,       xmm1                    ; xmm6 = r0
 100         psubusw     xmm6,       xmm5                    ; xmm6 = saturated r0 - r-2
 101         psubusw     xmm5,       xmm1                    ; xmm5 = saturated r-2 - r0
 102         paddusw     xmm6,       xmm5                    ; xmm6 = abs(r0 - r-2)
 103         pcmpgtw     xmm6,       xmm2                    ; xmm6 = mask of abs(r0 - r-2) > flimit
 104         por         xmm7,       xmm6                    ; accumulate thresholds
 105
 106         movq        xmm4,       QWORD PTR [rsi+rax]     ; xmm4 = r-1 p0..p7 (bytes)
 107         punpcklbw   xmm4,       xmm0                    ; xmm4 = r-1 p0..p7 (words)
 108         paddusw     xmm3,       xmm4                    ; xmm3 += r-1
 109
 110         ; thresholding
 111         movdqa      xmm6,       xmm1                    ; xmm6 = r0
 112         psubusw     xmm6,       xmm4                    ; xmm6 = saturated r0 - r-1
 113         psubusw     xmm4,       xmm1                    ; xmm4 = saturated r-1 - r0
 114         paddusw     xmm6,       xmm4                    ; xmm6 = abs(r0 - r-1)
 115         pcmpgtw     xmm6,       xmm2                    ; xmm6 = mask of abs(r0 - r-1) > flimit
 116         por         xmm7,       xmm6                    ; accumulate thresholds
 117
 118
 119         paddusw     xmm3,       RD42                    ; xmm3 += round value (4 per word)
 120         psraw       xmm3,       3                       ; xmm3 = (4*r0 + r1 + r2 + r-1 + r-2 + 4) >> 3
 121
 122         pand        xmm1,       xmm7                    ; keep the source where a difference exceeded flimit
 123         pandn       xmm7,       xmm3                    ; keep the blurred result everywhere else
 124         paddusw     xmm1,       xmm7                    ; combination
 125
 126         packuswb    xmm1,       xmm0                    ; pack to bytes
 127         movq        QWORD PTR [rdi], xmm1             ; store 8 filtered pixels to dst
 128
 129         neg         rax                   ; pitch is positive
 130         add         rsi,        8
 131         add         rdi,        8
 132
 133         add         rdx,        8
 134         cmp         edx,        dword arg(5) ;cols
 135
 136         jl          .nextcol
 137
 138         ; done with all the cols, start the across filtering in place
 139         sub         rsi,        rdx       ; rewind both pointers to the start of the row
 140         sub         rdi,        rdx
 141
 142
 143         ; dup the first byte into the left border 8 times
 144         movq        mm1,   [rdi]
 145         punpcklbw   mm1,   mm1
 146         punpcklwd   mm1,   mm1
 147         punpckldq   mm1,   mm1
 148
 149         mov         rdx,    -8
 150         movq        [rdi+rdx], mm1
 151
 152         ; dup the last byte into the right border
 153         movsxd      rdx,    dword arg(5)
 154         movq        mm1,   [rdi + rdx + -1]
 155         punpcklbw   mm1,   mm1
 156         punpcklwd   mm1,   mm1
 157         punpckldq   mm1,   mm1
 158         movq        [rdi+rdx], mm1
 159
 160         xor         rdx,        rdx
 161         movq        mm0,        QWORD PTR [rdi-8]; mm0 = pending output, starts as the left border
 162
 163 .acrossnextcol:
 164         movq        xmm7,       QWORD PTR [rdi +rdx -2]   ; xmm7 = p-2..p5
 165         movd        xmm4,       DWORD PTR [rdi +rdx +6]   ; xmm4 = p6..p9
 166
 167         pslldq      xmm4,       8
 168         por         xmm4,       xmm7          ; xmm4 = p-2..p9 (12 bytes)
 169
 170         movdqa      xmm3,       xmm4
 171         psrldq      xmm3,       2
 172         punpcklbw   xmm3,       xmm0              ; xmm3 = p0..p7 (words)
 173         movdqa      xmm1,       xmm3              ; xmm1 = p0..p7, kept as the unfiltered value
 174         psllw       xmm3,       2                 ; xmm3 = 4 * p0..p7
 175
 176
 177         movdqa      xmm5,       xmm4
 178         psrldq      xmm5,       3
 179         punpcklbw   xmm5,       xmm0              ; xmm5 = p1..p8
 180         paddusw     xmm3,       xmm5              ; xmm3 += p1..p8
 181
 182         ; thresholding
 183         movdqa      xmm7,       xmm1              ; xmm7 = p0..p7
 184         psubusw     xmm7,       xmm5              ; xmm7 = saturated p0..p7 - p1..p8
 185         psubusw     xmm5,       xmm1              ; xmm5 = saturated p1..p8 - p0..p7
 186         paddusw     xmm7,       xmm5              ; xmm7 = abs(p0..p7 - p1..p8)
 187         pcmpgtw     xmm7,       xmm2              ; xmm7 = mask of differences > flimit
 188
 189         movdqa      xmm5,       xmm4
 190         psrldq      xmm5,       4
 191         punpcklbw   xmm5,       xmm0              ; xmm5 = p2..p9
 192         paddusw     xmm3,       xmm5              ; xmm3 += p2..p9
 193
 194         ; thresholding
 195         movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
 196         psubusw     xmm6,       xmm5              ; xmm6 = saturated p0..p7 - p2..p9
 197         psubusw     xmm5,       xmm1              ; xmm5 = saturated p2..p9 - p0..p7
 198         paddusw     xmm6,       xmm5              ; xmm6 = abs(p0..p7 - p2..p9)
 199         pcmpgtw     xmm6,       xmm2              ; xmm6 = mask of differences > flimit
 200         por         xmm7,       xmm6              ; accumulate thresholds
 201
 202
 203         movdqa      xmm5,       xmm4              ; xmm5 = p-2..p9
 204         punpcklbw   xmm5,       xmm0              ; xmm5 = p-2..p5
 205         paddusw     xmm3,       xmm5              ; xmm3 += p-2..p5
 206
 207         ; thresholding
 208         movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
 209         psubusw     xmm6,       xmm5              ; xmm6 = saturated p0..p7 - p-2..p5
 210         psubusw     xmm5,       xmm1              ; xmm5 = saturated p-2..p5 - p0..p7
 211         paddusw     xmm6,       xmm5              ; xmm6 = abs(p0..p7 - p-2..p5)
 212         pcmpgtw     xmm6,       xmm2              ; xmm6 = mask of differences > flimit
 213         por         xmm7,       xmm6              ; accumulate thresholds
 214
 215         psrldq      xmm4,       1                   ; xmm4 = p-1..p9
 216         punpcklbw   xmm4,       xmm0              ; xmm4 = p-1..p6
 217         paddusw     xmm3,       xmm4              ; xmm3 += p-1..p6
 218
 219         ; thresholding
 220         movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
 221         psubusw     xmm6,       xmm4              ; xmm6 = saturated p0..p7 - p-1..p6
 222         psubusw     xmm4,       xmm1              ; xmm4 = saturated p-1..p6 - p0..p7
 223         paddusw     xmm6,       xmm4              ; xmm6 = abs(p0..p7 - p-1..p6)
 224         pcmpgtw     xmm6,       xmm2              ; xmm6 = mask of differences > flimit
 225         por         xmm7,       xmm6              ; accumulate thresholds
 226
 227         paddusw     xmm3,       RD42              ; xmm3 += round value (4 per word)
 228         psraw       xmm3,       3                 ; xmm3 /= 8
 229
 230         pand        xmm1,       xmm7              ; keep the source where a difference exceeded flimit
 231         pandn       xmm7,       xmm3              ; keep the blurred result everywhere else
 232         paddusw     xmm1,       xmm7              ; combination
 233
 234         packuswb    xmm1,       xmm0              ; pack to bytes
 235         movq        QWORD PTR [rdi+rdx-8],  mm0   ; store the previous group of 8; done after this group's loads
 236         movdq2q     mm0,        xmm1              ; hold this group's result until the next iteration
 237
 238         add         rdx,        8
 239         cmp         edx,        dword arg(5) ;cols
 240         jl          .acrossnextcol;
 241
 242         ; last 8 pixels
 243         movq        QWORD PTR [rdi+rdx-8],  mm0
 244
 245         ; done with this row
 246         add         rsi,rax               ; next line
 247         movsxd      rax, dword arg(3) ;dst_pixels_per_line, sign-extended so a negative stride works
 248         add         rdi,rax               ; next destination
 249         movsxd      rax, dword arg(2) ;src_pixels_per_line, reloaded for the next row
 250
 251         dec         rcx                   ; decrement count
 252         jnz         .nextrow              ; next row
 253
 254 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
 255     add rsp,16
 256     pop rsp
 257 %endif
 258     ; begin epilog
 259     pop rdi
 260     pop rsi
 261     RESTORE_GOT
 262     RESTORE_XMM
 263     UNSHADOW_ARGS
 264     pop         rbp
 265     ret
 266 %undef RD42
 267
 268
 269 ;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
 270 ;                            int pitch, int rows, int cols,int flimit)
 271 extern sym(vp8_rv)
 272 global sym(vp8_mbpost_proc_down_xmm) PRIVATE
 273 sym(vp8_mbpost_proc_down_xmm):
 274     push        rbp
 275     mov         rbp, rsp
 276     SHADOW_ARGS_TO_STACK 5
 277     SAVE_XMM 7
 278     GET_GOT     rbx
 279     push        rsi
 280     push        rdi
 281     ; end prolog
 282     ; in-place vertical filter over 8-pixel-wide column strips, using a 15-row running sum / sum of squares
 283     ALIGN_STACK 16, rax
 284     sub         rsp, 128+16
 285
 286     ; unsigned char d[16][8] at [rsp]
 287     ; create flimit4 (flimit in 4 dwords) at [rsp+128]
 288     mov         eax, dword ptr arg(4) ;flimit
 289     mov         [rsp+128], eax
 290     mov         [rsp+128+4], eax
 291     mov         [rsp+128+8], eax
 292     mov         [rsp+128+12], eax
 293 %define flimit4 [rsp+128]
 294
 295 %if ABI_IS_32BIT=0
 296     lea         r8,       [GLOBAL(sym(vp8_rv))]
 297 %endif
 298
 299     ;rows +=8;
 300     add         dword arg(2), 8
 301
 302     ;for(c=0; c<cols; c+=8)
 303 .loop_col:
 304             mov         rsi,        arg(0) ; s
 305             pxor        xmm0,       xmm0        ; xmm0 = 0, used to widen bytes/words
 306
 307             movsxd      rax,        dword ptr arg(1) ;pitch       ;
 308
 309             ; this copies the last row down into the border 8 rows
 310             mov         rdi,        rsi
 311             movsxd      rdx,        dword arg(2)                ; rows is an int: load only its 32 bits
 312             sub         rdx,        9                           ; (rows + 8) - 9 = index of the last row
 313             imul        rdx,        rax
 314             lea         rdi,        [rdi+rdx]
 315             movq        xmm1,       QWORD ptr[rdi]              ; last row
 316             mov         rcx,        8
 317 .init_borderd:                                                   ; copy the last row into the 8 rows below it
 318             lea         rdi,        [rdi + rax]
 319             movq        [rdi],      xmm1
 320
 321             dec         rcx
 322             jne         .init_borderd
 323
 324             neg         rax                                     ; rax = -pitch
 325
 326             ; this copies the first row up into the border 8 rows
 327             mov         rdi,        rsi
 328             movq        xmm1,       QWORD ptr[rdi]              ; first row
 329             mov         rcx,        8
 330 .init_border:                                                   ; copy the first row into the 8 rows above it
 331             lea         rdi,        [rdi + rax]
 332             movq        [rdi],      xmm1
 333
 334             dec         rcx
 335             jne         .init_border
 336
 337
 338
 339             lea         rsi,        [rsi + rax*8];              ; rsi = s - 8*pitch
 340             neg         rax                                     ; rax = +pitch again
 341
 342             pxor        xmm5,       xmm5        ; xmm5 = running sum, 8 words
 343             pxor        xmm6,       xmm6        ; xmm6 = running sum of squares, columns 0..3 (dwords)
 344
 345             pxor        xmm7,       xmm7        ; xmm7 = running sum of squares, columns 4..7 (dwords)
 346             mov         rdi,        rsi
 347
 348             mov         rcx,        15          ; accumulate the first 15 rows, starting at s - 8*pitch
 349
 350 .loop_initvar:
 351             movq        xmm1,       QWORD PTR [rdi];
 352             punpcklbw   xmm1,       xmm0        ; widen 8 pixels to words
 353
 354             paddw       xmm5,       xmm1        ; sum += row
 355             pmullw      xmm1,       xmm1        ; xmm1 = row * row
 356
 357             movdqa      xmm2,       xmm1        ;
 358             punpcklwd   xmm1,       xmm0        ; squares of columns 0..3 as dwords
 359
 360             punpckhwd   xmm2,       xmm0        ; squares of columns 4..7 as dwords
 361             paddd       xmm6,       xmm1        ;
 362
 363             paddd       xmm7,       xmm2        ;
 364             lea         rdi,        [rdi+rax]   ; next row
 365
 366             dec         rcx
 367             jne         .loop_initvar
 368             ;save the var and sum
 369             xor         rdx,        rdx         ; rdx = row counter
 370 .loop_row:
 371             movq        xmm1,       QWORD PTR [rsi]     ; row leaving the window (8 rows above current)
 372             movq        xmm2,       QWORD PTR [rdi]     ; row entering the window (7 rows below current)
 373
 374             punpcklbw   xmm1,       xmm0
 375             punpcklbw   xmm2,       xmm0
 376
 377             paddw       xmm5,       xmm2                ; sum += entering row
 378             psubw       xmm5,       xmm1                ; sum -= leaving row
 379
 380             pmullw      xmm2,       xmm2
 381             movdqa      xmm4,       xmm2
 382
 383             punpcklwd   xmm2,       xmm0
 384             punpckhwd   xmm4,       xmm0
 385
 386             paddd       xmm6,       xmm2                ; sumsq += entering row squared
 387             paddd       xmm7,       xmm4
 388
 389             pmullw      xmm1,       xmm1
 390             movdqa      xmm2,       xmm1
 391
 392             punpcklwd   xmm1,       xmm0
 393             psubd       xmm6,       xmm1                ; sumsq -= leaving row squared
 394
 395             punpckhwd   xmm2,       xmm0
 396             psubd       xmm7,       xmm2
 397
 398
 399             movdqa      xmm3,       xmm6
 400             pslld       xmm3,       4
 401
 402             psubd       xmm3,       xmm6                ; xmm3 = 15 * sumsq, columns 0..3
 403             movdqa      xmm1,       xmm5
 404
 405             movdqa      xmm4,       xmm5
 406             pmullw      xmm1,       xmm1                ; low 16 bits of sum * sum
 407
 408             pmulhw      xmm4,       xmm4                ; high 16 bits of sum * sum
 409             movdqa      xmm2,       xmm1
 410
 411             punpcklwd   xmm1,       xmm4                ; xmm1 = sum * sum as dwords, columns 0..3
 412             punpckhwd   xmm2,       xmm4                ; xmm2 = sum * sum as dwords, columns 4..7
 413
 414             movdqa      xmm4,       xmm7
 415             pslld       xmm4,       4
 416
 417             psubd       xmm4,       xmm7                ; xmm4 = 15 * sumsq, columns 4..7
 418
 419             psubd       xmm3,       xmm1
 420             psubd       xmm4,       xmm2
 421
 422             psubd       xmm3,       flimit4
 423             psubd       xmm4,       flimit4
 424
 425             psrad       xmm3,       31                  ; all ones where 15*sumsq - sum*sum < flimit
 426             psrad       xmm4,       31
 427
 428             packssdw    xmm3,       xmm4
 429             packsswb    xmm3,       xmm0                ; xmm3 = byte mask of the pixels to filter
 430
 431             movq        xmm1,       QWORD PTR [rsi+rax*8] ; current row (rsi points 8 rows above it)
 432
 433             movq        xmm2,       xmm1                ; xmm2 = unfiltered copy
 434             punpcklbw   xmm1,       xmm0
 435
 436             paddw       xmm1,       xmm5                ; pixel + sum
 437             mov         rcx,        rdx
 438
 439             and         rcx,        127                 ; rcx = row counter mod 128
 440 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
 441             push        rax
 442             lea         rax,        [GLOBAL(sym(vp8_rv))]
 443             movdqu      xmm4,       [rax + rcx*2] ; 8 words starting at byte offset rcx*2 of vp8_rv
 444             pop         rax
 445 %elif ABI_IS_32BIT=0
 446             movdqu      xmm4,       [r8 + rcx*2] ; 8 words starting at byte offset rcx*2 of vp8_rv
 447 %else
 448             movdqu      xmm4,       [sym(vp8_rv) + rcx*2]
 449 %endif
 450
 451             paddw       xmm1,       xmm4                ; + value from vp8_rv
 452             ;paddw     xmm1,       eight8s
 453             psraw       xmm1,       4                   ; (pixel + sum + rv) >> 4
 454
 455             packuswb    xmm1,       xmm0
 456             pand        xmm1,       xmm3                ; filtered value where the mask is set
 457
 458             pandn       xmm3,       xmm2                ; original value elsewhere
 459             por         xmm1,       xmm3
 460
 461             and         rcx,        15
 462             movq        QWORD PTR   [rsp + rcx*8], xmm1 ; d[row & 15] = result for the current row
 463
 464             mov         rcx,        rdx
 465             sub         rcx,        8
 466
 467             and         rcx,        15
 468             movq        mm0,        [rsp + rcx*8] ; d[(row - 8) & 15]: result buffered 8 iterations ago
 469
 470             movq        [rsi],      mm0           ; written 8 rows above the current row
 471             lea         rsi,        [rsi+rax]
 472
 473             lea         rdi,        [rdi+rax]
 474             add         rdx,        1
 475
 476             cmp         edx,        dword arg(2) ;rows (already increased by 8 above)
 477             jl          .loop_row
 478         mov         rax,        arg(0)          ; s += 8 at full pointer width; rax is reloaded at .loop_col
 479         add         rax,        8
 480         mov         arg(0),     rax
 481         sub         dword arg(3), 8 ; cols -= 8; sub sets the flags tested by jg
 482         jg          .loop_col
 483
 484     add         rsp, 128+16
 485     pop         rsp
 486
 487     ; begin epilog
 488     pop rdi
 489     pop rsi
 490     RESTORE_GOT
 491     RESTORE_XMM
 492     UNSHADOW_ARGS
 493     pop         rbp
 494     ret
 495 %undef flimit4
 496
 497
 498 ;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src,
 499 ;                                int pitch, int rows, int cols,int flimit)
 500 global sym(vp8_mbpost_proc_across_ip_xmm) PRIVATE
 501 sym(vp8_mbpost_proc_across_ip_xmm):
 502     push        rbp
 503     mov         rbp, rsp
 504     SHADOW_ARGS_TO_STACK 5
 505     SAVE_XMM 7
 506     GET_GOT     rbx
 507     push        rsi
 508     push        rdi
 509     ; end prolog
 510     ; in-place horizontal filter, one row at a time, using a 15-pixel running sum / sum of squares
 511     ALIGN_STACK 16, rax
 512     sub         rsp, 16
 513
 514     ; create flimit4 at [rsp]
 515     mov         eax, dword ptr arg(4) ;flimit
 516     mov         [rsp], eax
 517     mov         [rsp+4], eax
 518     mov         [rsp+8], eax
 519     mov         [rsp+12], eax
 520 %define flimit4 [rsp]
 521
 522
 523     ;for(r=0;r<rows;r++)
 524 .ip_row_loop:
 525
 526         xor         rdx,    rdx ;sumsq=0;
 527         xor         rcx,    rcx ;sum=0;
 528         mov         rsi,    arg(0); s
 529
 530
 531         ; dup the first byte into the left border 8 times
 532         movq        mm1,   [rsi]
 533         punpcklbw   mm1,   mm1
 534         punpcklwd   mm1,   mm1
 535         punpckldq   mm1,   mm1
 536
 537         mov         rdi,    -8
 538         movq        [rsi+rdi], mm1
 539
 540         ; dup the last byte into the right border
 541         movsxd      rax,    dword arg(3)   ; cols goes in rax so that rdx (sumsq) is still 0 below
 542         movq        mm1,   [rsi + rax + -1]
 543         punpcklbw   mm1,   mm1
 544         punpcklwd   mm1,   mm1
 545         punpckldq   mm1,   mm1
 546         movq        [rsi+rax], mm1
 547
 548 .ip_var_loop:
 549         ;for(i=-8;i<=6;i++)
 550         ;{
 551         ;    sumsq += s[i]*s[i];
 552         ;    sum   += s[i];
 553         ;}
 554         movzx       eax, byte [rsi+rdi]
 555         add         ecx, eax
 556         mul         al                  ; ax = s[i]*s[i]; the upper half of eax is still 0 from movzx
 557         add         edx, eax
 558         add         rdi, 1
 559         cmp         rdi, 6
 560         jle         .ip_var_loop
 561
 562
 563             ;mov         rax,    sumsq
 564             ;movd        xmm7,   rax
 565             movd        xmm7,   edx                     ; xmm7 lane 0 = sumsq
 566
 567             ;mov         rax,    sum
 568             ;movd        xmm6,   rax
 569             movd        xmm6,   ecx                     ; xmm6 lane 0 = sum
 570
 571             mov         rsi,    arg(0) ;s
 572             xor         rcx,    rcx                     ; rcx = column index, 4 pixels per iteration
 573
 574             movsxd      rdx,    dword arg(3) ;cols
 575             add         rdx,    8                       ; run 8 columns further to flush the delayed stores
 576             pxor        mm0,    mm0
 577             pxor        mm1,    mm1
 578
 579             pxor        xmm0,   xmm0
 580 .nextcol4:
 581
 582             movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5 (pixels leaving the window)
 583             movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10 (pixels entering the window)
 584
 585             punpcklbw   xmm1,   xmm0                    ; expanding
 586             punpcklbw   xmm2,   xmm0                    ; expanding
 587
 588             punpcklwd   xmm1,   xmm0                    ; expanding to dwords
 589             punpcklwd   xmm2,   xmm0                    ; expanding to dwords
 590
 591             psubd       xmm2,   xmm1                    ; xmm2 = new - old, the change in sum per step
 592             paddd       xmm1,   xmm1                    ; xmm1 = 2 * old
 593
 594             paddd       xmm1,   xmm2                    ; xmm1 = new + old
 595             pmaddwd     xmm1,   xmm2                    ; xmm1 = (new+old)*(new-old) = new^2 - old^2
 596
 597             paddd       xmm6,   xmm2                    ; lane 0 = sum + first delta; lanes 1..3 = other deltas
 598             paddd       xmm7,   xmm1                    ; same for sumsq
 599
 600             pshufd      xmm6,   xmm6,   0               ; broadcast lane 0 to all four lanes
 601             pshufd      xmm7,   xmm7,   0               ; broadcast lane 0 to all four lanes
 602
 603             psrldq      xmm1,       4                   ; drop the first delta: lanes = d1 d2 d3 0
 604             psrldq      xmm2,       4                   ; drop the first delta: lanes = d1 d2 d3 0
 605
 606             pshufd      xmm3,   xmm1,   3               ; lanes = 0 d1 d1 d1 (sumsq deltas)
 607             pshufd      xmm4,   xmm2,   3               ; lanes = 0 d1 d1 d1 (sum deltas)
 608
 609             paddd       xmm6,   xmm4
 610             paddd       xmm7,   xmm3
 611
 612             pshufd      xmm3,   xmm1,   01011111b       ; lanes = 0 0 d2 d2 (sumsq deltas)
 613             pshufd      xmm4,   xmm2,   01011111b       ; lanes = 0 0 d2 d2 (sum deltas)
 614
 615             paddd       xmm7,   xmm3
 616             paddd       xmm6,   xmm4
 617
 618             pshufd      xmm3,   xmm1,   10111111b       ; lanes = 0 0 0 d3 (sumsq deltas)
 619             pshufd      xmm4,   xmm2,   10111111b       ; lanes = 0 0 0 d3 (sum deltas)
 620
 621             paddd       xmm7,   xmm3                    ; xmm7 = sumsq for each of the 4 pixels
 622             paddd       xmm6,   xmm4                    ; xmm6 = sum for each of the 4 pixels
 623
 624             movdqa      xmm3,   xmm6
 625             pmaddwd     xmm3,   xmm3                    ; xmm3 = sum * sum (sum fits in the low word)
 626
 627             movdqa      xmm5,   xmm7
 628             pslld       xmm5,   4
 629
 630             psubd       xmm5,   xmm7                    ; xmm5 = 15 * sumsq
 631             psubd       xmm5,   xmm3                    ; xmm5 = 15 * sumsq - sum * sum
 632
 633             psubd       xmm5,   flimit4
 634             psrad       xmm5,   31                      ; all ones where 15*sumsq - sum*sum < flimit
 635
 636             packssdw    xmm5,   xmm0
 637             packsswb    xmm5,   xmm0                    ; xmm5 = byte mask of the pixels to filter
 638
 639             movd        xmm1,   DWORD PTR [rsi+rcx]     ; the 4 current pixels
 640             movq        xmm2,   xmm1                    ; xmm2 = unfiltered copy
 641
 642             punpcklbw   xmm1,   xmm0
 643             punpcklwd   xmm1,   xmm0
 644
 645             paddd       xmm1,   xmm6                    ; pixel + sum
 646             paddd       xmm1,   [GLOBAL(four8s)]        ; + 8 for rounding
 647
 648             psrad       xmm1,   4                       ; (pixel + sum + 8) >> 4
 649             packssdw    xmm1,   xmm0
 650
 651             packuswb    xmm1,   xmm0
 652             pand        xmm1,   xmm5                    ; filtered value where the mask is set
 653
 654             pandn       xmm5,   xmm2                    ; original value elsewhere
 655             por         xmm5,   xmm1                    ; xmm5 = 4 output pixels
 656
 657             movd        [rsi+rcx-8],  mm0               ; store the result computed two iterations ago
 658             movq        mm0,    mm1                     ; mm0 = result of the previous iteration
 659
 660             movdq2q     mm1,    xmm5                    ; mm1 = result of this iteration
 661             psrldq      xmm7,   12                      ; carry the last pixel's sumsq into lane 0
 662
 663             psrldq      xmm6,   12                      ; carry the last pixel's sum into lane 0
 664             add         rcx,    4
 665
 666             cmp         rcx,    rdx
 667             jl          .nextcol4
 668
 669         ;s+=pitch;
 670         movsxd rax, dword arg(1)
 671         add    arg(0), rax
 672
 673         sub dword arg(2), 1 ;rows-=1
 674         cmp dword arg(2), 0
 675         jg .ip_row_loop
 676
 677     add         rsp, 16
 678     pop         rsp
 679
 680     ; begin epilog
 681     pop rdi
 682     pop rsi
 683     RESTORE_GOT
 684     RESTORE_XMM
 685     UNSHADOW_ARGS
 686     pop         rbp
 687     ret
 688 %undef flimit4
 689
 690
 691 ;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
 692 ;                            unsigned char blackclamp[16],
 693 ;                            unsigned char whiteclamp[16],
 694 ;                            unsigned char bothclamp[16],
 695 ;                            unsigned int Width, unsigned int Height, int Pitch)
 696 extern sym(rand)
 697 global sym(vp8_plane_add_noise_wmt) PRIVATE
 698 sym(vp8_plane_add_noise_wmt):
 699     push        rbp
 700     mov         rbp, rsp
 701     SHADOW_ARGS_TO_STACK 8
 702     GET_GOT     rbx
 703     push        rsi
 704     push        rdi
 705     ; end prolog
 706
 707 .noise_row:
 708     call        sym(rand) WRT_PLT
 709     and         rax, 0xff               ; random offset 0..255 into the noise buffer
 710     mov         rdi, arg(1)             ; noise
 711     add         rdi, rax                ; rdi = noise + offset for this row
 712
 713     ; The three clamp vectors are addressed from a single base pointer at
 714     ; offsets 0 (black), 16 (white) and 32 (both), so they must be laid out
 715     ; back to back. Every register is loaded after rand() returns.
 716     mov         rdx, arg(2)             ; blackclamp
 717
 718
 719     mov         rsi, arg(0)             ; start of the current row
 720     movsxd      rcx, dword arg(5)       ; Width
 721     xor         rax, rax                ; rax = byte offset within the row
 722
 723
 724 .noise_chunk:
 725     movdqu      xmm2, [rdi+rax]         ; 16 bytes of noise
 726
 727     movdqu      xmm1, [rsi+rax]         ; 16 source pixels
 728     psubusb     xmm1, [rdx]             ; saturating subtract of blackclamp
 729     paddusb     xmm1, [rdx+32]          ; saturating add of bothclamp
 730     psubusb     xmm1, [rdx+16]          ; saturating subtract of whiteclamp
 731
 732     paddb       xmm1, xmm2              ; add the noise (wrapping byte add)
 733     movdqu      [rsi+rax], xmm1         ; write the pixels back in place
 734
 735     add         rax, 16                 ; advance to the next 16 pixels
 736
 737     cmp         rax, rcx
 738     jl          .noise_chunk
 739
 740     movsxd      rax, dword arg(7)       ; Pitch
 741     add         arg(0), rax             ; Start += Pitch
 742     sub         dword arg(6), 1         ; Height -= 1
 743     jg          .noise_row
 744
 745     ; begin epilog
 746     pop rdi
 747     pop rsi
 748     RESTORE_GOT
 749     UNSHADOW_ARGS
 750     pop         rbp
 751     ret
 752
 753
 754 SECTION_RODATA
 755 align 16
 756 rd42:                   ; 8 words of 4: rounding term added before the >> 3 in vp8_post_proc_down_and_across_xmm
 757     times 8 dw 0x04
 758 four8s:                 ; 4 dwords of 8: rounding term added before the >> 4 in vp8_mbpost_proc_across_ip_xmm
 759     times 4 dd 8