src/third_party/libvpx/source/libvpx/vp8/common/x86/postproc_sse2.asm

   1 ;
   2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
   3 ;
   4 ;  Use of this source code is governed by a BSD-style license
   5 ;  that can be found in the LICENSE file in the root of the source
   6 ;  tree. An additional intellectual property rights grant can be found
   7 ;  in the file PATENTS.  All contributing project authors may
   8 ;  be found in the AUTHORS file in the root of the source tree.
   9 ;
  10
  11
  12 %include "vpx_ports/x86_abi_support.asm"
  13
  14 ;macro in deblock functions
  15 %macro FIRST_2_ROWS 0
  16         movdqa      xmm4,       xmm0
  17         movdqa      xmm6,       xmm0
  18         movdqa      xmm5,       xmm1
  19         pavgb       xmm5,       xmm3
  20
  21         ;calculate absolute value
  22         psubusb     xmm4,       xmm1
  23         psubusb     xmm1,       xmm0
  24         psubusb     xmm6,       xmm3
  25         psubusb     xmm3,       xmm0
  26         paddusb     xmm4,       xmm1
  27         paddusb     xmm6,       xmm3
  28
  29         ;get threshold
  30         movdqa      xmm2,       flimit
  31         pxor        xmm1,       xmm1
  32         movdqa      xmm7,       xmm2
  33
  34         ;get mask
  35         psubusb     xmm2,       xmm4
  36         psubusb     xmm7,       xmm6
  37         pcmpeqb     xmm2,       xmm1
  38         pcmpeqb     xmm7,       xmm1
  39         por         xmm7,       xmm2
  40 %endmacro
  41
  42 %macro SECOND_2_ROWS 0
  43         movdqa      xmm6,       xmm0
  44         movdqa      xmm4,       xmm0
  45         movdqa      xmm2,       xmm1
  46         pavgb       xmm1,       xmm3
  47
  48         ;calculate absolute value
  49         psubusb     xmm6,       xmm2
  50         psubusb     xmm2,       xmm0
  51         psubusb     xmm4,       xmm3
  52         psubusb     xmm3,       xmm0
  53         paddusb     xmm6,       xmm2
  54         paddusb     xmm4,       xmm3
  55
  56         pavgb       xmm5,       xmm1
  57
  58         ;get threshold
  59         movdqa      xmm2,       flimit
  60         pxor        xmm1,       xmm1
  61         movdqa      xmm3,       xmm2
  62
  63         ;get mask
  64         psubusb     xmm2,       xmm6
  65         psubusb     xmm3,       xmm4
  66         pcmpeqb     xmm2,       xmm1
  67         pcmpeqb     xmm3,       xmm1
  68
  69         por         xmm7,       xmm2
  70         por         xmm7,       xmm3
  71
  72         pavgb       xmm5,       xmm0
  73
  74         ;decide if or not to use filtered value
  75         pand        xmm0,       xmm7
  76         pandn       xmm7,       xmm5
  77         paddusb     xmm0,       xmm7
  78 %endmacro
  79
  80 %macro UPDATE_FLIMIT 0
  81         movdqa      xmm2,       XMMWORD PTR [rbx]
  82         movdqa      [rsp],      xmm2
  83         add         rbx,        16
  84 %endmacro
  85
  86 ;void vp8_post_proc_down_and_across_mb_row_sse2
  87 ;(
  88 ;    unsigned char *src_ptr,
  89 ;    unsigned char *dst_ptr,
  90 ;    int src_pixels_per_line,
  91 ;    int dst_pixels_per_line,
  92 ;    int cols,
  93 ;    int *flimits,
  94 ;    int size
  95 ;)
  96 global sym(vp8_post_proc_down_and_across_mb_row_sse2) PRIVATE
  97 sym(vp8_post_proc_down_and_across_mb_row_sse2):
  98     push        rbp
  99     mov         rbp, rsp
 100     SHADOW_ARGS_TO_STACK 7
 101     SAVE_XMM 7
 102     push        rbx
 103     push        rsi
 104     push        rdi
 105     ; end prolog
 106     ALIGN_STACK 16, rax
 107     sub         rsp, 16
 108
 109         ; put flimit on stack
 110         mov         rbx,        arg(5)           ;flimits ptr
 111         UPDATE_FLIMIT
 112
 113 %define flimit [rsp]
 114
 115         mov         rsi,        arg(0)           ;src_ptr
 116         mov         rdi,        arg(1)           ;dst_ptr
 117
 118         movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line
 119         movsxd      rcx,        DWORD PTR arg(6) ;rows in a macroblock
 120 .nextrow:
 121         xor         rdx,        rdx              ;col
 122 .nextcol:
 123         ;load current and next 2 rows
 124         movdqu      xmm0,       XMMWORD PTR [rsi]
 125         movdqu      xmm1,       XMMWORD PTR [rsi + rax]
 126         movdqu      xmm3,       XMMWORD PTR [rsi + 2*rax]
 127
 128         FIRST_2_ROWS
 129
 130         ;load above 2 rows
 131         neg         rax
 132         movdqu      xmm1,       XMMWORD PTR [rsi + 2*rax]
 133         movdqu      xmm3,       XMMWORD PTR [rsi + rax]
 134
 135         SECOND_2_ROWS
 136
 137         movdqu      XMMWORD PTR [rdi], xmm0
 138
 139         neg         rax                          ; positive stride
 140         add         rsi,        16
 141         add         rdi,        16
 142
 143         add         rdx,        16
 144         cmp         edx,        dword arg(4)     ;cols
 145         jge         .downdone
 146         UPDATE_FLIMIT
 147         jmp         .nextcol
 148
 149 .downdone:
 150         ; done with the all cols, start the across filtering in place
 151         sub         rsi,        rdx
 152         sub         rdi,        rdx
 153
 154         mov         rbx,        arg(5) ; flimits
 155         UPDATE_FLIMIT
 156
 157         ; dup the first byte into the left border 8 times
 158         movq        mm1,   [rdi]
 159         punpcklbw   mm1,   mm1
 160         punpcklwd   mm1,   mm1
 161         punpckldq   mm1,   mm1
 162         mov         rdx,    -8
 163         movq        [rdi+rdx], mm1
 164
 165         ; dup the last byte into the right border
 166         movsxd      rdx,    dword arg(4)
 167         movq        mm1,   [rdi + rdx + -1]
 168         punpcklbw   mm1,   mm1
 169         punpcklwd   mm1,   mm1
 170         punpckldq   mm1,   mm1
 171         movq        [rdi+rdx], mm1
 172
 173         xor         rdx,        rdx
 174         movq        mm0,        QWORD PTR [rdi-16];
 175         movq        mm1,        QWORD PTR [rdi-8];
 176
 177 .acrossnextcol:
 178         movdqu      xmm0,       XMMWORD PTR [rdi + rdx]
 179         movdqu      xmm1,       XMMWORD PTR [rdi + rdx -2]
 180         movdqu      xmm3,       XMMWORD PTR [rdi + rdx -1]
 181
 182         FIRST_2_ROWS
 183
 184         movdqu      xmm1,       XMMWORD PTR [rdi + rdx +1]
 185         movdqu      xmm3,       XMMWORD PTR [rdi + rdx +2]
 186
 187         SECOND_2_ROWS
 188
 189         movq        QWORD PTR [rdi+rdx-16], mm0  ; store previous 8 bytes
 190         movq        QWORD PTR [rdi+rdx-8], mm1   ; store previous 8 bytes
 191         movdq2q     mm0,        xmm0
 192         psrldq      xmm0,       8
 193         movdq2q     mm1,        xmm0
 194
 195         add         rdx,        16
 196         cmp         edx,        dword arg(4)     ;cols
 197         jge         .acrossdone
 198         UPDATE_FLIMIT
 199         jmp         .acrossnextcol
 200
 201 .acrossdone
 202         ; last 16 pixels
 203         movq        QWORD PTR [rdi+rdx-16], mm0
 204
 205         cmp         edx,        dword arg(4)
 206         jne         .throw_last_8
 207         movq        QWORD PTR [rdi+rdx-8], mm1
 208 .throw_last_8:
 209         ; done with this rwo
 210         add         rsi,rax                      ;next src line
 211         mov         eax, dword arg(3)            ;dst_pixels_per_line
 212         add         rdi,rax                      ;next destination
 213         mov         eax, dword arg(2)            ;src_pixels_per_line
 214
 215         mov         rbx,        arg(5)           ;flimits
 216         UPDATE_FLIMIT
 217
 218         dec         rcx                          ;decrement count
 219         jnz         .nextrow                     ;next row
 220
 221     add rsp, 16
 222     pop rsp
 223     ; begin epilog
 224     pop rdi
 225     pop rsi
 226     pop rbx
 227     RESTORE_XMM
 228     UNSHADOW_ARGS
 229     pop         rbp
 230     ret
 231 %undef flimit
 232
 233 ;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
 234 ;                            int pitch, int rows, int cols,int flimit)
 235 extern sym(vp8_rv)
 236 global sym(vp8_mbpost_proc_down_xmm) PRIVATE
 237 sym(vp8_mbpost_proc_down_xmm):
 238     push        rbp
 239     mov         rbp, rsp
 240     SHADOW_ARGS_TO_STACK 5
 241     SAVE_XMM 7
 242     GET_GOT     rbx
 243     push        rsi
 244     push        rdi
 245     ; end prolog
 246
 247     ALIGN_STACK 16, rax
 248     sub         rsp, 128+16
 249
 250     ; unsigned char d[16][8] at [rsp]
 251     ; create flimit2 at [rsp+128]
 252     mov         eax, dword ptr arg(4) ;flimit
 253     mov         [rsp+128], eax
 254     mov         [rsp+128+4], eax
 255     mov         [rsp+128+8], eax
 256     mov         [rsp+128+12], eax
 257 %define flimit4 [rsp+128]
 258
 259 %if ABI_IS_32BIT=0
 260     lea         r8,       [GLOBAL(sym(vp8_rv))]
 261 %endif
 262
 263     ;rows +=8;
 264     add         dword arg(2), 8
 265
 266     ;for(c=0; c<cols; c+=8)
 267 .loop_col:
 268             mov         rsi,        arg(0) ; s
 269             pxor        xmm0,       xmm0        ;
 270
 271             movsxd      rax,        dword ptr arg(1) ;pitch       ;
 272
 273             ; this copies the last row down into the border 8 rows
 274             mov         rdi,        rsi
 275             mov         rdx,        arg(2)
 276             sub         rdx,        9
 277             imul        rdx,        rax
 278             lea         rdi,        [rdi+rdx]
 279             movq        xmm1,       QWORD ptr[rdi]              ; first row
 280             mov         rcx,        8
 281 .init_borderd                                                    ; initialize borders
 282             lea         rdi,        [rdi + rax]
 283             movq        [rdi],      xmm1
 284
 285             dec         rcx
 286             jne         .init_borderd
 287
 288             neg         rax                                     ; rax = -pitch
 289
 290             ; this copies the first row up into the border 8 rows
 291             mov         rdi,        rsi
 292             movq        xmm1,       QWORD ptr[rdi]              ; first row
 293             mov         rcx,        8
 294 .init_border                                                    ; initialize borders
 295             lea         rdi,        [rdi + rax]
 296             movq        [rdi],      xmm1
 297
 298             dec         rcx
 299             jne         .init_border
 300
 301
 302
 303             lea         rsi,        [rsi + rax*8];              ; rdi = s[-pitch*8]
 304             neg         rax
 305
 306             pxor        xmm5,       xmm5
 307             pxor        xmm6,       xmm6        ;
 308
 309             pxor        xmm7,       xmm7        ;
 310             mov         rdi,        rsi
 311
 312             mov         rcx,        15          ;
 313
 314 .loop_initvar:
 315             movq        xmm1,       QWORD PTR [rdi];
 316             punpcklbw   xmm1,       xmm0        ;
 317
 318             paddw       xmm5,       xmm1        ;
 319             pmullw      xmm1,       xmm1        ;
 320
 321             movdqa      xmm2,       xmm1        ;
 322             punpcklwd   xmm1,       xmm0        ;
 323
 324             punpckhwd   xmm2,       xmm0        ;
 325             paddd       xmm6,       xmm1        ;
 326
 327             paddd       xmm7,       xmm2        ;
 328             lea         rdi,        [rdi+rax]   ;
 329
 330             dec         rcx
 331             jne         .loop_initvar
 332             ;save the var and sum
 333             xor         rdx,        rdx
 334 .loop_row:
 335             movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8]
 336             movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7]
 337
 338             punpcklbw   xmm1,       xmm0
 339             punpcklbw   xmm2,       xmm0
 340
 341             paddw       xmm5,       xmm2
 342             psubw       xmm5,       xmm1
 343
 344             pmullw      xmm2,       xmm2
 345             movdqa      xmm4,       xmm2
 346
 347             punpcklwd   xmm2,       xmm0
 348             punpckhwd   xmm4,       xmm0
 349
 350             paddd       xmm6,       xmm2
 351             paddd       xmm7,       xmm4
 352
 353             pmullw      xmm1,       xmm1
 354             movdqa      xmm2,       xmm1
 355
 356             punpcklwd   xmm1,       xmm0
 357             psubd       xmm6,       xmm1
 358
 359             punpckhwd   xmm2,       xmm0
 360             psubd       xmm7,       xmm2
 361
 362
 363             movdqa      xmm3,       xmm6
 364             pslld       xmm3,       4
 365
 366             psubd       xmm3,       xmm6
 367             movdqa      xmm1,       xmm5
 368
 369             movdqa      xmm4,       xmm5
 370             pmullw      xmm1,       xmm1
 371
 372             pmulhw      xmm4,       xmm4
 373             movdqa      xmm2,       xmm1
 374
 375             punpcklwd   xmm1,       xmm4
 376             punpckhwd   xmm2,       xmm4
 377
 378             movdqa      xmm4,       xmm7
 379             pslld       xmm4,       4
 380
 381             psubd       xmm4,       xmm7
 382
 383             psubd       xmm3,       xmm1
 384             psubd       xmm4,       xmm2
 385
 386             psubd       xmm3,       flimit4
 387             psubd       xmm4,       flimit4
 388
 389             psrad       xmm3,       31
 390             psrad       xmm4,       31
 391
 392             packssdw    xmm3,       xmm4
 393             packsswb    xmm3,       xmm0
 394
 395             movq        xmm1,       QWORD PTR [rsi+rax*8]
 396
 397             movq        xmm2,       xmm1
 398             punpcklbw   xmm1,       xmm0
 399
 400             paddw       xmm1,       xmm5
 401             mov         rcx,        rdx
 402
 403             and         rcx,        127
 404 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
 405             push        rax
 406             lea         rax,        [GLOBAL(sym(vp8_rv))]
 407             movdqu      xmm4,       [rax + rcx*2] ;vp8_rv[rcx*2]
 408             pop         rax
 409 %elif ABI_IS_32BIT=0
 410             movdqu      xmm4,       [r8 + rcx*2] ;vp8_rv[rcx*2]
 411 %else
 412             movdqu      xmm4,       [sym(vp8_rv) + rcx*2]
 413 %endif
 414
 415             paddw       xmm1,       xmm4
 416             ;paddw     xmm1,       eight8s
 417             psraw       xmm1,       4
 418
 419             packuswb    xmm1,       xmm0
 420             pand        xmm1,       xmm3
 421
 422             pandn       xmm3,       xmm2
 423             por         xmm1,       xmm3
 424
 425             and         rcx,        15
 426             movq        QWORD PTR   [rsp + rcx*8], xmm1 ;d[rcx*8]
 427
 428             cmp         edx,        8
 429             jl          .skip_assignment
 430
 431             mov         rcx,        rdx
 432             sub         rcx,        8
 433             and         rcx,        15
 434             movq        mm0,        [rsp + rcx*8] ;d[rcx*8]
 435             movq        [rsi],      mm0
 436
 437 .skip_assignment
 438             lea         rsi,        [rsi+rax]
 439
 440             lea         rdi,        [rdi+rax]
 441             add         rdx,        1
 442
 443             cmp         edx,        dword arg(2) ;rows
 444             jl          .loop_row
 445
 446         add         dword arg(0), 8 ; s += 8
 447         sub         dword arg(3), 8 ; cols -= 8
 448         cmp         dword arg(3), 0
 449         jg          .loop_col
 450
 451     add         rsp, 128+16
 452     pop         rsp
 453
 454     ; begin epilog
 455     pop rdi
 456     pop rsi
 457     RESTORE_GOT
 458     RESTORE_XMM
 459     UNSHADOW_ARGS
 460     pop         rbp
 461     ret
 462 %undef flimit4
 463
 464
 465 ;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src,
 466 ;                                int pitch, int rows, int cols,int flimit)
 467 global sym(vp8_mbpost_proc_across_ip_xmm) PRIVATE
 468 sym(vp8_mbpost_proc_across_ip_xmm):
 469     push        rbp
 470     mov         rbp, rsp
 471     SHADOW_ARGS_TO_STACK 5
 472     SAVE_XMM 7
 473     GET_GOT     rbx
 474     push        rsi
 475     push        rdi
 476     ; end prolog
 477
 478     ALIGN_STACK 16, rax
 479     sub         rsp, 16
 480
 481     ; create flimit4 at [rsp]
 482     mov         eax, dword ptr arg(4) ;flimit
 483     mov         [rsp], eax
 484     mov         [rsp+4], eax
 485     mov         [rsp+8], eax
 486     mov         [rsp+12], eax
 487 %define flimit4 [rsp]
 488
 489
 490     ;for(r=0;r<rows;r++)
 491 .ip_row_loop:
 492
 493         xor         rdx,    rdx ;sumsq=0;
 494         xor         rcx,    rcx ;sum=0;
 495         mov         rsi,    arg(0); s
 496
 497
 498         ; dup the first byte into the left border 8 times
 499         movq        mm1,   [rsi]
 500         punpcklbw   mm1,   mm1
 501         punpcklwd   mm1,   mm1
 502         punpckldq   mm1,   mm1
 503
 504         mov         rdi,    -8
 505         movq        [rsi+rdi], mm1
 506
 507         ; dup the last byte into the right border
 508         movsxd      rdx,    dword arg(3)
 509         movq        mm1,   [rsi + rdx + -1]
 510         punpcklbw   mm1,   mm1
 511         punpcklwd   mm1,   mm1
 512         punpckldq   mm1,   mm1
 513         movq        [rsi+rdx], mm1
 514
 515 .ip_var_loop:
 516         ;for(i=-8;i<=6;i++)
 517         ;{
 518         ;    sumsq += s[i]*s[i];
 519         ;    sum   += s[i];
 520         ;}
 521         movzx       eax, byte [rsi+rdi]
 522         add         ecx, eax
 523         mul         al
 524         add         edx, eax
 525         add         rdi, 1
 526         cmp         rdi, 6
 527         jle         .ip_var_loop
 528
 529
 530             ;mov         rax,    sumsq
 531             ;movd        xmm7,   rax
 532             movd        xmm7,   edx
 533
 534             ;mov         rax,    sum
 535             ;movd        xmm6,   rax
 536             movd        xmm6,   ecx
 537
 538             mov         rsi,    arg(0) ;s
 539             xor         rcx,    rcx
 540
 541             movsxd      rdx,    dword arg(3) ;cols
 542             add         rdx,    8
 543             pxor        mm0,    mm0
 544             pxor        mm1,    mm1
 545
 546             pxor        xmm0,   xmm0
 547 .nextcol4:
 548
 549             movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5
 550             movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10
 551
 552             punpcklbw   xmm1,   xmm0                    ; expanding
 553             punpcklbw   xmm2,   xmm0                    ; expanding
 554
 555             punpcklwd   xmm1,   xmm0                    ; expanding to dwords
 556             punpcklwd   xmm2,   xmm0                    ; expanding to dwords
 557
 558             psubd       xmm2,   xmm1                    ; 7--8   8--7   9--6 10--5
 559             paddd       xmm1,   xmm1                    ; -8*2   -7*2   -6*2 -5*2
 560
 561             paddd       xmm1,   xmm2                    ; 7+-8   8+-7   9+-6 10+-5
 562             pmaddwd     xmm1,   xmm2                    ; squared of 7+-8   8+-7   9+-6 10+-5
 563
 564             paddd       xmm6,   xmm2
 565             paddd       xmm7,   xmm1
 566
 567             pshufd      xmm6,   xmm6,   0               ; duplicate the last ones
 568             pshufd      xmm7,   xmm7,   0               ; duplicate the last ones
 569
 570             psrldq      xmm1,       4                   ; 8--7   9--6 10--5  0000
 571             psrldq      xmm2,       4                   ; 8--7   9--6 10--5  0000
 572
 573             pshufd      xmm3,   xmm1,   3               ; 0000  8--7   8--7   8--7 squared
 574             pshufd      xmm4,   xmm2,   3               ; 0000  8--7   8--7   8--7 squared
 575
 576             paddd       xmm6,   xmm4
 577             paddd       xmm7,   xmm3
 578
 579             pshufd      xmm3,   xmm1,   01011111b       ; 0000  0000   9--6   9--6 squared
 580             pshufd      xmm4,   xmm2,   01011111b       ; 0000  0000   9--6   9--6 squared
 581
 582             paddd       xmm7,   xmm3
 583             paddd       xmm6,   xmm4
 584
 585             pshufd      xmm3,   xmm1,   10111111b       ; 0000  0000   8--7   8--7 squared
 586             pshufd      xmm4,   xmm2,   10111111b       ; 0000  0000   8--7   8--7 squared
 587
 588             paddd       xmm7,   xmm3
 589             paddd       xmm6,   xmm4
 590
 591             movdqa      xmm3,   xmm6
 592             pmaddwd     xmm3,   xmm3
 593
 594             movdqa      xmm5,   xmm7
 595             pslld       xmm5,   4
 596
 597             psubd       xmm5,   xmm7
 598             psubd       xmm5,   xmm3
 599
 600             psubd       xmm5,   flimit4
 601             psrad       xmm5,   31
 602
 603             packssdw    xmm5,   xmm0
 604             packsswb    xmm5,   xmm0
 605
 606             movd        xmm1,   DWORD PTR [rsi+rcx]
 607             movq        xmm2,   xmm1
 608
 609             punpcklbw   xmm1,   xmm0
 610             punpcklwd   xmm1,   xmm0
 611
 612             paddd       xmm1,   xmm6
 613             paddd       xmm1,   [GLOBAL(four8s)]
 614
 615             psrad       xmm1,   4
 616             packssdw    xmm1,   xmm0
 617
 618             packuswb    xmm1,   xmm0
 619             pand        xmm1,   xmm5
 620
 621             pandn       xmm5,   xmm2
 622             por         xmm5,   xmm1
 623
 624             movd        [rsi+rcx-8],  mm0
 625             movq        mm0,    mm1
 626
 627             movdq2q     mm1,    xmm5
 628             psrldq      xmm7,   12
 629
 630             psrldq      xmm6,   12
 631             add         rcx,    4
 632
 633             cmp         rcx,    rdx
 634             jl          .nextcol4
 635
 636         ;s+=pitch;
 637         movsxd rax, dword arg(1)
 638         add    arg(0), rax
 639
 640         sub dword arg(2), 1 ;rows-=1
 641         cmp dword arg(2), 0
 642         jg .ip_row_loop
 643
 644     add         rsp, 16
 645     pop         rsp
 646
 647     ; begin epilog
 648     pop rdi
 649     pop rsi
 650     RESTORE_GOT
 651     RESTORE_XMM
 652     UNSHADOW_ARGS
 653     pop         rbp
 654     ret
 655 %undef flimit4
 656
 657
 658 ;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
 659 ;                            unsigned char blackclamp[16],
 660 ;                            unsigned char whiteclamp[16],
 661 ;                            unsigned char bothclamp[16],
 662 ;                            unsigned int Width, unsigned int Height, int Pitch)
 663 global sym(vp8_plane_add_noise_wmt) PRIVATE
 664 sym(vp8_plane_add_noise_wmt):
 665     push        rbp
 666     mov         rbp, rsp
 667     SHADOW_ARGS_TO_STACK 8
 668     GET_GOT     rbx
 669     push        rsi
 670     push        rdi
 671     ; end prolog
 672
 673 .addnoise_loop:
 674     call sym(LIBVPX_RAND) WRT_PLT
 675     mov     rcx, arg(1) ;noise
 676     and     rax, 0xff
 677     add     rcx, rax
 678
 679     ; we rely on the fact that the clamping vectors are stored contiguously
 680     ; in black/white/both order. Note that we have to reload this here because
 681     ; rdx could be trashed by rand()
 682     mov     rdx, arg(2) ; blackclamp
 683
 684
 685             mov     rdi, rcx
 686             movsxd  rcx, dword arg(5) ;[Width]
 687             mov     rsi, arg(0) ;Pos
 688             xor         rax,rax
 689
 690 .addnoise_nextset:
 691             movdqu      xmm1,[rsi+rax]         ; get the source
 692
 693             psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
 694             paddusb     xmm1, [rdx+32] ;bothclamp
 695             psubusb     xmm1, [rdx+16] ;whiteclamp
 696
 697             movdqu      xmm2,[rdi+rax]         ; get the noise for this line
 698             paddb       xmm1,xmm2              ; add it in
 699             movdqu      [rsi+rax],xmm1         ; store the result
 700
 701             add         rax,16                 ; move to the next line
 702
 703             cmp         rax, rcx
 704             jl          .addnoise_nextset
 705
 706     movsxd  rax, dword arg(7) ; Pitch
 707     add     arg(0), rax ; Start += Pitch
 708     sub     dword arg(6), 1   ; Height -= 1
 709     jg      .addnoise_loop
 710
 711     ; begin epilog
 712     pop rdi
 713     pop rsi
 714     RESTORE_GOT
 715     UNSHADOW_ARGS
 716     pop         rbp
 717     ret
 718
 719
 720 SECTION_RODATA
 721 align 16
 722 four8s:
 723     times 4 dd 8