vp8/common/x86/subpixel_mmx.asm

   1 ;
   2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
   3 ;
   4 ;  Use of this source code is governed by a BSD-style license
   5 ;  that can be found in the LICENSE file in the root of the source
   6 ;  tree. An additional intellectual property rights grant can be found
   7 ;  in the file PATENTS.  All contributing project authors may
   8 ;  be found in the AUTHORS file in the root of the source tree.
   9 ;
  10
  11
  12 %include "vpx_ports/x86_abi_support.asm"
  13 extern sym(vp8_bilinear_filters_x86_8)
  14 ; VP8_FILTER_SHIFT is the right shift applied after the rounding constant rd (0x40, half of 1 << 7) is added.
  15 ; BLOCK_HEIGHT_WIDTH and vp8_filter_weight are not referenced by the code visible in this file.
  16 %define BLOCK_HEIGHT_WIDTH 4
  17 %define vp8_filter_weight 128
  18 %define VP8_FILTER_SHIFT  7
  19 ; Horizontal 6-tap filter pass (MMX). Every loop iteration filters one source row into four 16-bit results:
  20 ;   out[i] = clamp_0_255((tap0*p[i-2] + tap1*p[i-1] + tap2*p[i] + tap3*p[i+1] + tap4*p[i+2] + tap5*p[i+3] + 64) >> 7), i = 0..3
  21 ;void vp8_filter_block1d_h6_mmx
  22 ;(
  23 ;    unsigned char   *src_ptr,             // arg(0): bytes src_ptr[-2..6] of every row are read
  24 ;    unsigned short  *output_ptr,          // arg(1): four words (8 bytes) are stored per row
  25 ;    unsigned int    src_pixels_per_line,  // arg(2): added to the source pointer after every row
  26 ;    unsigned int    pixel_step,           // arg(3): not read by this routine
  27 ;    unsigned int    output_height,        // arg(4): row count; tested only after a row is done, so 0 is not handled
  28 ;    unsigned int    output_width,         // arg(5): added to the output pointer, as a byte offset, after every row
  29 ;    short           * vp8_filter          // arg(6): taps at byte offsets 0,16,32,48,64,80; four words are read per tap
  30 ;)
  31 global sym(vp8_filter_block1d_h6_mmx) PRIVATE
  32 sym(vp8_filter_block1d_h6_mmx):
  33     push        rbp
  34     mov         rbp, rsp
  35     SHADOW_ARGS_TO_STACK 7
  36     GET_GOT     rbx
  37     push        rsi
  38     push        rdi
  39     ; end prolog (the prolog/epilog macros, arg() and GLOBAL() are not defined here; presumably they come from the included x86_abi_support.asm)
  40
  41         mov         rdx,    arg(6) ;vp8_filter
  42
  43         movq        mm1,    [rdx + 16]             ; mm1 = tap 1 (four words); taps 1 and 4 are accumulated first (original note: "do both the negative taps first!!!")
  44         movq        mm2,    [rdx + 32]         ; mm2 = tap 2
  45         movq        mm6,    [rdx + 48]        ; mm6 = tap 3
  46         movq        mm7,    [rdx + 64]        ; mm7 = tap 4 (taps 0 and 5 are used straight from memory inside the loop)
  47
  48         mov         rdi,    arg(1) ;output_ptr
  49         mov         rsi,    arg(0) ;src_ptr
  50         movsxd      rcx,    dword ptr arg(4) ;output_height -> loop counter
  51         movsxd      rax,    dword ptr arg(5) ;output_width, used below as the destination row stride in bytes
  52         pxor        mm0,    mm0              ; mm0 = 0, the zero source for byte->word unpacking
  53
  54 .nextrow:
  55         movq        mm3,    [rsi-2]          ; mm3 = bytes p-2..p5
  56         movq        mm4,    mm3              ; mm4 = p-2..p5 (copy)
  57         psrlq       mm3,    8                ; drop the lowest byte: mm3 = p-1..p5
  58         punpcklbw   mm3,    mm0              ; mm3 = p-1..p2 widened to words
  59         pmullw      mm3,    mm1              ; mm3 = p[i-1] * tap 1
  60
  61         movq        mm5,    mm4              ; mm5 = p-2..p5 (copy kept for later taps)
  62         punpckhbw   mm4,    mm0              ; mm4 = p2..p5 widened to words
  63         pmullw      mm4,    mm7              ; mm4 = p[i+2] * tap 4
  64         paddsw      mm3,    mm4              ; mm3 += mm4 (signed saturating)
  65
  66         movq        mm4,    mm5              ; mm4 = p-2..p5 (copy)
  67         psrlq       mm5,    16               ; drop two bytes: mm5 = p0..p5
  68         punpcklbw   mm5,    mm0              ; mm5 = p0..p3 widened to words
  69         pmullw      mm5,    mm2              ; mm5 = p[i] * tap 2
  70         paddsw      mm3,    mm5              ; mm3 += mm5
  71
  72         movq        mm5,    mm4              ; mm5 = p-2..p5 (copy, consumed by the tap 0 step below)
  73         psrlq       mm4,    24               ; drop three bytes: mm4 = p1..p5
  74         punpcklbw   mm4,    mm0              ; mm4 = p1..p4 widened to words
  75         pmullw      mm4,    mm6              ; mm4 = p[i+1] * tap 3
  76         paddsw      mm3,    mm4              ; mm3 += mm4
  77
  78         ; outer taps (0 and 5), multiplied directly from the filter in memory
  79         movd        mm4,    [rsi+3]          ; mm4 = bytes p3..p6
  80         punpcklbw   mm4,    mm0              ; mm4 = p3..p6 widened to words
  81         pmullw      mm4,    [rdx+80]         ; mm4 = p[i+3] * tap 5
  82         paddsw      mm3,    mm4              ; mm3 += mm4
  83
  84         punpcklbw   mm5,    mm0              ; mm5 = p-2..p1 widened to words
  85         pmullw      mm5,    [rdx]            ; mm5 = p[i-2] * tap 0
  86         paddsw      mm3,    mm5              ; mm3 += mm5
  87
  88         paddsw      mm3,    [GLOBAL(rd)]              ; mm3 += rounding constant (0x40 per word)
  89         psraw       mm3,    VP8_FILTER_SHIFT     ; arithmetic shift right by 7 (divide by 128)
  90         packuswb    mm3,    mm0              ; saturate each word to 0..255 by packing to bytes ...
  91         punpcklbw   mm3,    mm0              ; ... then widen back to four words
  92
  93         movq        [rdi],  mm3              ; store four 16-bit results
  94
  95 %if ABI_IS_32BIT
  96         add         rsi,    dword ptr arg(2) ;src_pixels_per_line ; next source row
  97         add         rdi,    rax; next output row
  98 %else
  99         movsxd      r8,     dword ptr arg(2) ;src_pixels_per_line, sign-extended to 64 bits
 100         add         rdi,    rax; next output row
 101
 102         add         rsi,    r8               ; next source row
 103 %endif
 104
 105         dec         rcx                      ; one row done
 106         jnz         .nextrow                 ; loop until the row counter reaches zero
 107
 108     ; begin epilog
 109     pop rdi
 110     pop rsi
 111     RESTORE_GOT
 112     UNSHADOW_ARGS
 113     pop         rbp
 114     ret
 115 ; Vertical 6-tap filter pass (MMX) over 16-bit samples. Every loop iteration produces one row of four bytes:
 116 ;   out[i] = clamp_0_255((sum over k = 0..5 of tap[k] * row[k-2][i] + 64) >> 7), i = 0..3, where row[0] is the current source row
 117 ;void vp8_filter_block1dc_v6_mmx
 118 ;(
 119 ;   short *src_ptr,                    // arg(0): 16-bit samples; four words are read from each of rows -2..+3
 120 ;   unsigned char *output_ptr,         // arg(1): four bytes are stored per row
 121 ;    int output_pitch,                 // arg(2): byte offset added to the output pointer after every row
 122 ;   unsigned int pixels_per_line,      // arg(3): added to the source address as a BYTE offset between rows
 123 ;   unsigned int pixel_step,           // arg(4): not read by this routine
 124 ;   unsigned int output_height,        // arg(5): row count; tested only after a row is done, so 0 is not handled
 125 ;   unsigned int output_width,         // arg(6): not read by this routine
 126 ;   short * vp8_filter                 // arg(7): taps at byte offsets 0,16,32,48,64,80; four words are read per tap
 127 ;)
 128 global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
 129 sym(vp8_filter_block1dc_v6_mmx):
 130     push        rbp
 131     mov         rbp, rsp
 132     SHADOW_ARGS_TO_STACK 8
 133     GET_GOT     rbx
 134     push        rsi
 135     push        rdi
 136     ; end prolog
 137
 138         movq      mm5, [GLOBAL(rd)]       ; mm5 = rounding constant; loaded before rbx is repurposed (presumably GLOBAL() may rely on rbx after GET_GOT - confirm in x86_abi_support.asm)
 139         push        rbx                   ; preserve rbx while it holds the filter pointer
 140         mov         rbx, arg(7) ;vp8_filter
 141         movq      mm1, [rbx + 16]             ; mm1 = tap 1 (four words); original note: "do both the negative taps first!!!"
 142         movq      mm2, [rbx + 32]         ; mm2 = tap 2
 143         movq      mm6, [rbx + 48]        ; mm6 = tap 3
 144         movq      mm7, [rbx + 64]        ; mm7 = tap 4 (taps 0 and 5 are used straight from memory inside the loop)
 145
 146         movsxd      rdx, dword ptr arg(3) ;pixels_per_line, used as the source row stride in bytes
 147         mov         rdi, arg(1) ;output_ptr
 148         mov         rsi, arg(0) ;src_ptr
 149         sub         rsi, rdx              ; back up two rows ...
 150         sub         rsi, rdx              ; ... so that [rsi] addresses row -2
 151         movsxd      rcx, DWORD PTR arg(5) ;output_height -> loop counter
 152         movsxd      rax, DWORD PTR arg(2) ;output_pitch, the destination row stride in bytes
 153         pxor        mm0, mm0              ; mm0 = 0, used as the high half when packing to bytes
 154
 155
 156 .nextrow_cv:
 157         movq        mm3, [rsi+rdx]        ; mm3 = four words of row -1
 158         pmullw      mm3, mm1              ; mm3 *= tap 1
 159
 160
 161         movq        mm4, [rsi + 4*rdx]      ; mm4 = four words of row 2
 162         pmullw      mm4, mm7              ; mm4 *= tap 4
 163         paddsw      mm3, mm4              ; mm3 += mm4 (signed saturating)
 164
 165         movq        mm4, [rsi + 2*rdx]           ; mm4 = four words of row 0
 166         pmullw      mm4, mm2              ; mm4 *= tap 2
 167         paddsw      mm3, mm4              ; mm3 += mm4
 168
 169         movq        mm4, [rsi]            ; mm4 = four words of row -2
 170         pmullw      mm4, [rbx]            ; mm4 *= tap 0
 171         paddsw      mm3, mm4              ; mm3 += mm4
 172
 173
 174         add         rsi, rdx              ; advance one row: avoids a 3*pitch address and is the loop's only source step
 175         movq        mm4, [rsi + 2*rdx]     ; mm4 = four words of row 1 (relative to the row this iteration started on)
 176         pmullw      mm4, mm6              ; mm4 *= tap 3
 177         paddsw      mm3, mm4              ; mm3 += mm4
 178
 179         movq        mm4, [rsi + 4*rdx]    ; mm4 = four words of row 3
 180         pmullw      mm4, [rbx +80]        ; mm4 *= tap 5
 181         paddsw      mm3, mm4              ; mm3 += mm4
 182
 183
 184         paddsw      mm3, mm5               ; mm3 += rounding constant (0x40 per word)
 185         psraw       mm3, VP8_FILTER_SHIFT     ; arithmetic shift right by 7 (divide by 128)
 186         packuswb    mm3, mm0              ; saturate each word to 0..255 and pack to bytes
 187
 188         movd        [rdi],mm3             ; store four result bytes
 189         ; Each iteration re-reads five of the six source rows loaded by the previous
 190         ; one.  Original author's note: the block should be in cache so this should
 191         ; not cost much, and the repeated reads are avoidable.
 192         lea         rdi,  [rdi+rax] ; next output row
 193         dec         rcx                   ; one row done
 194         jnz         .nextrow_cv           ; loop until the row counter reaches zero
 195
 196         pop         rbx                   ; restore rbx saved after the prolog
 197
 198     ; begin epilog
 199     pop rdi
 200     pop rsi
 201     RESTORE_GOT
 202     UNSHADOW_ARGS
 203     pop         rbp
 204     ret
 205 ; Bilinear 8x8 prediction (MMX), two passes with rounding after each; H rows are saturated to bytes between passes:
 206 ;   H[r][i] = (src[r][i]*hf0 + src[r][i+1]*hf1 + 64) >> 7;   dst[r][i] = (H[r][i]*vf0 + H[r+1][i]*vf1 + 64) >> 7,  i = 0..7
 207 ;void vp8_bilinear_predict8x8_mmx
 208 ;(
 209 ;    unsigned char  *src_ptr,          // arg(0): one row is read per output row plus one extra; bytes [0..8] of each
 210 ;    int   src_pixels_per_line,        // arg(1): byte stride between source rows
 211 ;    int  xoffset,                     // arg(2): horizontal taps are read at vp8_bilinear_filters_x86_8 + xoffset*32
 212 ;    int  yoffset,                     // arg(3): vertical taps are read at vp8_bilinear_filters_x86_8 + yoffset*32
 213 ;   unsigned char *dst_ptr,            // arg(4): eight bytes are stored per row
 214 ;    int dst_pitch                     // arg(5): byte stride between destination rows; the loop ends at dst_ptr + 8*dst_pitch
 215 ;)
 216 global sym(vp8_bilinear_predict8x8_mmx) PRIVATE
 217 sym(vp8_bilinear_predict8x8_mmx):
 218     push        rbp
 219     mov         rbp, rsp
 220     SHADOW_ARGS_TO_STACK 6
 221     GET_GOT     rbx
 222     push        rsi
 223     push        rdi
 224     ; end prolog
 225
 226     ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
 227     ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
 228
 229         movsxd      rax,        dword ptr arg(2) ;xoffset
 230         mov         rdi,        arg(4) ;dst_ptr
 231
 232         shl         rax,        5 ; xoffset * 32 (byte size of one filter entry as indexed here)
 233         lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
 234
 235         add         rax,        rcx ; rax = HFilter
 236         mov         rsi,        arg(0) ;src_ptr
 237
 238         movsxd      rdx,        dword ptr arg(5) ;dst_pitch (only needed to compute the end pointer)
 239         movq        mm1,        [rax]               ; mm1 = horizontal tap 0 (four words)
 240
 241         movq        mm2,        [rax+16]            ; mm2 = horizontal tap 1 (four words)
 242         movsxd      rax,        dword ptr arg(3) ;yoffset
 243
 244         pxor        mm0,        mm0                 ; mm0 = 0, the zero source for byte->word unpacking
 245
 246         shl         rax,        5 ; yoffset * 32
 247         add         rax,        rcx ; rax = VFilter: tap 0 at [rax], tap 1 at [rax+16]
 248
 249         lea         rcx,        [rdi+rdx*8]          ; rcx = dst_ptr + 8*dst_pitch, the loop's end marker
 250         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line (rdx is the source stride from here on)
 251
 252
 253
 254         ; horizontal pass over the first source row; the packed result seeds mm7
 255         movq        mm3,        [rsi]               ; mm3 = source bytes 0..7
 256         movq        mm4,        mm3                 ; make a copy of current line
 257
 258         punpcklbw   mm3,        mm0                 ; mm3 = bytes 0..3 widened to words
 259         punpckhbw   mm4,        mm0                 ; mm4 = bytes 4..7 widened to words
 260
 261         pmullw      mm3,        mm1                 ; src[i] * hf0, low half
 262         pmullw      mm4,        mm1                 ; src[i] * hf0, high half
 263
 264         movq        mm5,        [rsi+1]             ; mm5 = source bytes 1..8
 265         movq        mm6,        mm5                 ; copy
 266
 267         punpcklbw   mm5,        mm0                 ; mm5 = bytes 1..4 widened to words
 268         punpckhbw   mm6,        mm0                 ; mm6 = bytes 5..8 widened to words
 269
 270         pmullw      mm5,        mm2                 ; src[i+1] * hf1, low half
 271         pmullw      mm6,        mm2                 ; src[i+1] * hf1, high half
 272
 273         paddw       mm3,        mm5                 ; sum of both horizontal taps, low half
 274         paddw       mm4,        mm6                 ; sum of both horizontal taps, high half
 275
 276         paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
 277         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7 (divide by 128)
 278
 279         paddw       mm4,        [GLOBAL(rd)]                 ; mm4 += round value
 280         psraw       mm4,        VP8_FILTER_SHIFT        ; mm4 >>= 7
 281
 282         movq        mm7,        mm3                 ; mm7 = low half ...
 283         packuswb    mm7,        mm4                 ; ... packed with high half: H row as 8 bytes
 284
 285         add         rsi,        rdx                 ; next line
 286 .next_row_8x8:
 287         movq        mm3,        [rsi]               ; mm3 = source bytes 0..7 of the next row
 288         movq        mm4,        mm3                 ; make a copy of current line
 289
 290         punpcklbw   mm3,        mm0                 ; mm3 = bytes 0..3 widened to words
 291         punpckhbw   mm4,        mm0                 ; mm4 = bytes 4..7 widened to words
 292
 293         pmullw      mm3,        mm1                 ; src[i] * hf0, low half
 294         pmullw      mm4,        mm1                 ; src[i] * hf0, high half
 295
 296         movq        mm5,        [rsi+1]             ; mm5 = source bytes 1..8
 297         movq        mm6,        mm5                 ; copy
 298
 299         punpcklbw   mm5,        mm0                 ; mm5 = bytes 1..4 widened to words
 300         punpckhbw   mm6,        mm0                 ; mm6 = bytes 5..8 widened to words
 301
 302         pmullw      mm5,        mm2                 ; src[i+1] * hf1, low half
 303         pmullw      mm6,        mm2                 ; src[i+1] * hf1, high half
 304
 305         paddw       mm3,        mm5                 ; horizontal sum, low half
 306         paddw       mm4,        mm6                 ; horizontal sum, high half
 307
 308         movq        mm5,        mm7                 ; mm5 = previous H row (packed bytes)
 309         movq        mm6,        mm7                 ; mm6 = previous H row (packed bytes)
 310
 311         punpcklbw   mm5,        mm0                 ; previous H row, low half as words
 312         punpckhbw   mm6,        mm0                 ; previous H row, high half as words
 313
 314         pmullw      mm5,        [rax]               ; previous H row * vf0, low half
 315         pmullw      mm6,        [rax]               ; previous H row * vf0, high half
 316
 317         paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
 318         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7: current H row, low half
 319
 320         paddw       mm4,        [GLOBAL(rd)]                 ; mm4 += round value
 321         psraw       mm4,        VP8_FILTER_SHIFT        ; mm4 >>= 7: current H row, high half
 322
 323         movq        mm7,        mm3                 ; keep the current H row ...
 324         packuswb    mm7,        mm4                 ; ... packed to bytes, as "previous" for the next iteration
 325
 326
 327         pmullw      mm3,        [rax+16]            ; current H row * vf1, low half
 328         pmullw      mm4,        [rax+16]            ; current H row * vf1, high half
 329
 330         paddw       mm3,        mm5                 ; vertical sum, low half
 331         paddw       mm4,        mm6                 ; vertical sum, high half
 332
 333
 334         paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
 335         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7
 336
 337         paddw       mm4,        [GLOBAL(rd)]                 ; mm4 += round value
 338         psraw       mm4,        VP8_FILTER_SHIFT        ; mm4 >>= 7
 339
 340         packuswb    mm3,        mm4                 ; saturate and pack to 8 bytes
 341
 342         movq        [rdi],      mm3                 ; store the results in the destination
 343
 344 %if ABI_IS_32BIT
 345         add         rsi,        rdx                 ; next line
 346         add         rdi,        dword ptr arg(5) ;dst_pitch
 347 %else
 348         movsxd      r8,         dword ptr arg(5) ;dst_pitch, reloaded because rdx now holds the source stride
 349         add         rsi,        rdx                 ; next line
 350         add         rdi,        r8                  ;dst_pitch
 351 %endif
 352         cmp         rdi,        rcx                 ; reached dst_ptr + 8*dst_pitch?
 353         jne         .next_row_8x8
 354
 355     ; begin epilog
 356     pop rdi
 357     pop rsi
 358     RESTORE_GOT
 359     UNSHADOW_ARGS
 360     pop         rbp
 361     ret
 362 ; Bilinear prediction, 8 pixels wide by 4 rows (MMX); two passes with rounding after each, H rows saturated to bytes between passes:
 363 ;   H[r][i] = (src[r][i]*hf0 + src[r][i+1]*hf1 + 64) >> 7;   dst[r][i] = (H[r][i]*vf0 + H[r+1][i]*vf1 + 64) >> 7,  i = 0..7
 364 ;void vp8_bilinear_predict8x4_mmx
 365 ;(
 366 ;    unsigned char  *src_ptr,          // arg(0): one row is read per output row plus one extra; bytes [0..8] of each
 367 ;    int   src_pixels_per_line,        // arg(1): byte stride between source rows
 368 ;    int  xoffset,                     // arg(2): horizontal taps are read at vp8_bilinear_filters_x86_8 + xoffset*32
 369 ;    int  yoffset,                     // arg(3): vertical taps are read at vp8_bilinear_filters_x86_8 + yoffset*32
 370 ;    unsigned char *dst_ptr,           // arg(4): eight bytes are stored per row
 371 ;    int dst_pitch                     // arg(5): byte stride between destination rows; the loop ends at dst_ptr + 4*dst_pitch
 372 ;)
 373 global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
 374 sym(vp8_bilinear_predict8x4_mmx):
 375     push        rbp
 376     mov         rbp, rsp
 377     SHADOW_ARGS_TO_STACK 6
 378     GET_GOT     rbx
 379     push        rsi
 380     push        rdi
 381     ; end prolog
 382
 383     ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
 384     ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
 385
 386         movsxd      rax,        dword ptr arg(2) ;xoffset
 387         mov         rdi,        arg(4) ;dst_ptr
 388
 389         lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
 390         shl         rax,        5                   ; xoffset * 32
 391
 392         mov         rsi,        arg(0) ;src_ptr
 393         add         rax,        rcx                 ; rax = HFilter
 394
 395         movsxd      rdx,        dword ptr arg(5) ;dst_pitch (only needed to compute the end pointer)
 396         movq        mm1,        [rax]               ; mm1 = horizontal tap 0 (four words)
 397
 398         movq        mm2,        [rax+16]            ; mm2 = horizontal tap 1 (four words)
 399         movsxd      rax,        dword ptr arg(3) ;yoffset
 400
 401         pxor        mm0,        mm0                 ; mm0 = 0, the zero source for byte->word unpacking
 402         shl         rax,        5                   ; yoffset * 32
 403
 404         add         rax,        rcx                 ; rax = VFilter: tap 0 at [rax], tap 1 at [rax+16]
 405         lea         rcx,        [rdi+rdx*4]          ; rcx = dst_ptr + 4*dst_pitch, the loop's end marker
 406
 407         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line (rdx is the source stride from here on)
 408
 409         ; horizontal pass over the first source row; the packed result seeds mm7
 410         movq        mm3,        [rsi]               ; mm3 = source bytes 0..7
 411         movq        mm4,        mm3                 ; make a copy of current line
 412
 413         punpcklbw   mm3,        mm0                 ; mm3 = bytes 0..3 widened to words
 414         punpckhbw   mm4,        mm0                 ; mm4 = bytes 4..7 widened to words
 415
 416         pmullw      mm3,        mm1                 ; src[i] * hf0, low half
 417         pmullw      mm4,        mm1                 ; src[i] * hf0, high half
 418
 419         movq        mm5,        [rsi+1]             ; mm5 = source bytes 1..8
 420         movq        mm6,        mm5                 ; copy
 421
 422         punpcklbw   mm5,        mm0                 ; mm5 = bytes 1..4 widened to words
 423         punpckhbw   mm6,        mm0                 ; mm6 = bytes 5..8 widened to words
 424
 425         pmullw      mm5,        mm2                 ; src[i+1] * hf1, low half
 426         pmullw      mm6,        mm2                 ; src[i+1] * hf1, high half
 427
 428         paddw       mm3,        mm5                 ; horizontal sum, low half
 429         paddw       mm4,        mm6                 ; horizontal sum, high half
 430
 431         paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
 432         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7 (divide by 128)
 433
 434         paddw       mm4,        [GLOBAL(rd)]                 ; mm4 += round value
 435         psraw       mm4,        VP8_FILTER_SHIFT        ; mm4 >>= 7
 436
 437         movq        mm7,        mm3                 ; mm7 = low half ...
 438         packuswb    mm7,        mm4                 ; ... packed with high half: H row as 8 bytes
 439
 440         add         rsi,        rdx                 ; next line
 441 .next_row_8x4:
 442         movq        mm3,        [rsi]               ; mm3 = source bytes 0..7 of the next row
 443         movq        mm4,        mm3                 ; make a copy of current line
 444
 445         punpcklbw   mm3,        mm0                 ; mm3 = bytes 0..3 widened to words
 446         punpckhbw   mm4,        mm0                 ; mm4 = bytes 4..7 widened to words
 447
 448         pmullw      mm3,        mm1                 ; src[i] * hf0, low half
 449         pmullw      mm4,        mm1                 ; src[i] * hf0, high half
 450
 451         movq        mm5,        [rsi+1]             ; mm5 = source bytes 1..8
 452         movq        mm6,        mm5                 ; copy
 453
 454         punpcklbw   mm5,        mm0                 ; mm5 = bytes 1..4 widened to words
 455         punpckhbw   mm6,        mm0                 ; mm6 = bytes 5..8 widened to words
 456
 457         pmullw      mm5,        mm2                 ; src[i+1] * hf1, low half
 458         pmullw      mm6,        mm2                 ; src[i+1] * hf1, high half
 459
 460         paddw       mm3,        mm5                 ; horizontal sum, low half
 461         paddw       mm4,        mm6                 ; horizontal sum, high half
 462
 463         movq        mm5,        mm7                 ; mm5 = previous H row (packed bytes)
 464         movq        mm6,        mm7                 ; mm6 = previous H row (packed bytes)
 465
 466         punpcklbw   mm5,        mm0                 ; previous H row, low half as words
 467         punpckhbw   mm6,        mm0                 ; previous H row, high half as words
 468
 469         pmullw      mm5,        [rax]               ; previous H row * vf0, low half
 470         pmullw      mm6,        [rax]               ; previous H row * vf0, high half
 471
 472         paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
 473         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7: current H row, low half
 474
 475         paddw       mm4,        [GLOBAL(rd)]                 ; mm4 += round value
 476         psraw       mm4,        VP8_FILTER_SHIFT        ; mm4 >>= 7: current H row, high half
 477
 478         movq        mm7,        mm3                 ; keep the current H row ...
 479         packuswb    mm7,        mm4                 ; ... packed to bytes, as "previous" for the next iteration
 480
 481
 482         pmullw      mm3,        [rax+16]            ; current H row * vf1, low half
 483         pmullw      mm4,        [rax+16]            ; current H row * vf1, high half
 484
 485         paddw       mm3,        mm5                 ; vertical sum, low half
 486         paddw       mm4,        mm6                 ; vertical sum, high half
 487
 488
 489         paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
 490         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7
 491
 492         paddw       mm4,        [GLOBAL(rd)]                 ; mm4 += round value
 493         psraw       mm4,        VP8_FILTER_SHIFT        ; mm4 >>= 7
 494
 495         packuswb    mm3,        mm4                 ; saturate and pack to 8 bytes
 496
 497         movq        [rdi],      mm3                 ; store the results in the destination
 498
 499 %if ABI_IS_32BIT
 500         add         rsi,        rdx                 ; next line
 501         add         rdi,        dword ptr arg(5) ;dst_pitch
 502 %else
 503         movsxd      r8,         dword ptr arg(5) ;dst_pitch, reloaded because rdx now holds the source stride
 504         add         rsi,        rdx                 ; next line
 505         add         rdi,        r8                  ; next destination row
 506 %endif
 507         cmp         rdi,        rcx                 ; reached dst_ptr + 4*dst_pitch?
 508         jne         .next_row_8x4
 509
 510     ; begin epilog
 511     pop rdi
 512     pop rsi
 513     RESTORE_GOT
 514     UNSHADOW_ARGS
 515     pop         rbp
 516     ret
 517 ; Bilinear 4x4 prediction (MMX); two passes with rounding after each, H rows saturated to bytes between passes:
 518 ;   H[r][i] = (src[r][i]*hf0 + src[r][i+1]*hf1 + 64) >> 7;   dst[r][i] = (H[r][i]*vf0 + H[r+1][i]*vf1 + 64) >> 7,  i = 0..3
 519 ;void vp8_bilinear_predict4x4_mmx
 520 ;(
 521 ;    unsigned char  *src_ptr,          // arg(0): one row is read per output row plus one extra; bytes [0..4] of each
 522 ;    int   src_pixels_per_line,        // arg(1): byte stride between source rows
 523 ;    int  xoffset,                     // arg(2): horizontal taps are read at vp8_bilinear_filters_x86_8 + xoffset*32
 524 ;    int  yoffset,                     // arg(3): vertical taps are read at vp8_bilinear_filters_x86_8 + yoffset*32
 525 ;    unsigned char *dst_ptr,           // arg(4): four bytes are stored per row
 526 ;    int dst_pitch                     // arg(5): byte stride between destination rows; the loop ends at dst_ptr + 4*dst_pitch
 527 ;)
 528 global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
 529 sym(vp8_bilinear_predict4x4_mmx):
 530     push        rbp
 531     mov         rbp, rsp
 532     SHADOW_ARGS_TO_STACK 6
 533     GET_GOT     rbx
 534     push        rsi
 535     push        rdi
 536     ; end prolog
 537
 538     ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
 539     ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
 540
 541         movsxd      rax,        dword ptr arg(2) ;xoffset
 542         mov         rdi,        arg(4) ;dst_ptr
 543
 544         lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
 545         shl         rax,        5                   ; xoffset * 32
 546
 547         add         rax,        rcx ; rax = HFilter
 548         mov         rsi,        arg(0) ;src_ptr
 549
 550         movsxd      rdx,        dword ptr arg(5) ;dst_pitch (only needed to compute the end pointer)
 551         movq        mm1,        [rax]               ; mm1 = horizontal tap 0 (four words)
 552
 553         movq        mm2,        [rax+16]            ; mm2 = horizontal tap 1 (four words)
 554         movsxd      rax,        dword ptr arg(3) ;yoffset
 555
 556         pxor        mm0,        mm0                 ; mm0 = 0, the zero source for unpacking and packing
 557         shl         rax,        5                   ; yoffset * 32
 558
 559         add         rax,        rcx                 ; rax = VFilter: tap 0 at [rax], tap 1 at [rax+16]
 560         lea         rcx,        [rdi+rdx*4]          ; rcx = dst_ptr + 4*dst_pitch, the loop's end marker
 561
 562         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line (rdx is the source stride from here on)
 563
 564         ; horizontal pass over the first source row; the packed result seeds mm7
 565         movd        mm3,        [rsi]               ; mm3 = source bytes 0..3
 566         punpcklbw   mm3,        mm0                 ; widen to four words
 567
 568         pmullw      mm3,        mm1                 ; src[i] * hf0
 569         movd        mm5,        [rsi+1]             ; mm5 = source bytes 1..4
 570
 571         punpcklbw   mm5,        mm0                 ; widen to four words
 572         pmullw      mm5,        mm2                 ; src[i+1] * hf1
 573
 574         paddw       mm3,        mm5                 ; horizontal sum
 575         paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
 576
 577         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7 (divide by 128)
 578
 579         movq        mm7,        mm3                 ; mm7 = H row ...
 580         packuswb    mm7,        mm0                 ; ... saturated and packed to 4 bytes (upper half zero)
 581
 582         add         rsi,        rdx                 ; next line
 583 .next_row_4x4:
 584         movd        mm3,        [rsi]               ; mm3 = source bytes 0..3 of the next row
 585         punpcklbw   mm3,        mm0                 ; widen to four words
 586
 587         pmullw      mm3,        mm1                 ; src[i] * hf0
 588         movd        mm5,        [rsi+1]             ; mm5 = source bytes 1..4
 589
 590         punpcklbw   mm5,        mm0                 ; widen to four words
 591         pmullw      mm5,        mm2                 ; src[i+1] * hf1
 592
 593         paddw       mm3,        mm5                 ; horizontal sum
 594
 595         movq        mm5,        mm7                 ; mm5 = previous H row (packed bytes)
 596         punpcklbw   mm5,        mm0                 ; previous H row as four words
 597
 598         pmullw      mm5,        [rax]               ; previous H row * vf0
 599         paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
 600
 601         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7: current H row
 602         movq        mm7,        mm3                 ; keep the current H row ...
 603
 604         packuswb    mm7,        mm0                 ; ... packed to bytes, as "previous" for the next iteration
 605
 606         pmullw      mm3,        [rax+16]            ; current H row * vf1
 607         paddw       mm3,        mm5                 ; vertical sum
 608
 609
 610         paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
 611         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7
 612
 613         packuswb    mm3,        mm0                 ; saturate and pack to 4 bytes
 614         movd        [rdi],      mm3                 ; store the results in the destination
 615
 616 %if ABI_IS_32BIT
 617         add         rsi,        rdx                 ; next line
 618         add         rdi,        dword ptr arg(5) ;dst_pitch
 619 %else
 620         movsxd      r8,         dword ptr arg(5) ;dst_pitch, reloaded because rdx now holds the source stride
 621         add         rsi,        rdx                 ; next line
 622         add         rdi,        r8                  ; next destination row
 623 %endif
 624
 625         cmp         rdi,        rcx                 ; reached dst_ptr + 4*dst_pitch?
 626         jne         .next_row_4x4
 627
 628     ; begin epilog
 629     pop rdi
 630     pop rsi
 631     RESTORE_GOT
 632     UNSHADOW_ARGS
 633     pop         rbp
 634     ret
 635
 636
 637
 638 SECTION_RODATA
 639 align 16
 640 rd:                          ; rounding constant added before every >> VP8_FILTER_SHIFT
 641     times 4 dw 0x40          ; four words of 64, i.e. half of 128
 642
 643 align 16
 644 global HIDDEN_DATA(sym(vp8_six_tap_mmx))
 645 sym(vp8_six_tap_mmx):        ; six-tap filters: 6 taps per filter, each tap replicated as 8 words (16 bytes), 96 bytes per filter
 646     times 8 dw 0             ; filter 0: taps 0, 0, 128, 0, 0, 0 (the taps of every filter shown here sum to 128)
 647     times 8 dw 0
 648     times 8 dw 128
 649     times 8 dw 0
 650     times 8 dw 0
 651     times 8 dw 0
 652
 653     times 8 dw 0             ; filter 1: taps 0, -6, 123, 12, -1, 0
 654     times 8 dw -6
 655     times 8 dw 123
 656     times 8 dw 12
 657     times 8 dw -1
 658     times 8 dw 0
 659
 660     times 8 dw 2             ; filter 2: taps 2, -11, 108, 36, -8, 1
 661     times 8 dw -11
 662     times 8 dw 108
 663     times 8 dw 36
 664     times 8 dw -8
 665     times 8 dw 1
 666
 667     times 8 dw 0             ; filter 3: taps 0, -9, 93, 50, -6, 0
 668     times 8 dw -9
 669     times 8 dw 93
 670     times 8 dw 50
 671     times 8 dw -6
 672     times 8 dw 0
 673
 674     times 8 dw 3             ; filter 4: taps 3, -16, 77, 77, -16, 3
 675     times 8 dw -16
 676     times 8 dw 77
 677     times 8 dw 77
 678     times 8 dw -16
 679     times 8 dw 3
 680
 681     times 8 dw 0             ; filter 5: taps 0, -6, 50, 93, -9, 0
 682     times 8 dw -6
 683     times 8 dw 50
 684     times 8 dw 93
 685     times 8 dw -9
 686     times 8 dw 0
 687
 688     times 8 dw 1             ; filter 6: taps 1, -8, 36, 108, -11, 2
 689     times 8 dw -8
 690     times 8 dw 36
 691     times 8 dw 108
 692     times 8 dw -11
 693     times 8 dw 2
 694
 695     times 8 dw 0             ; filter 7: taps 0, -1, 12, 123, -6, 0
 696     times 8 dw -1
 697     times 8 dw 12
 698     times 8 dw 123
 699     times 8 dw -6
 700     times 8 dw 0
 701
 702