; jcphuff-sse2.asm - prepare data for progressive Huffman encoding
;
; Copyright (C) 2016, 2018, Matthieu Darbois
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation of data preparation for progressive
; Huffman encoding. See jcphuff.c for more details.
20 %include "jsimdext.inc"
22 ; --------------------------------------------------------------------------
26 ; --------------------------------------------------------------------------
27 ; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
28 ; jsimd_encode_mcu_AC_refine_prepare_sse2()
34 mov T0d, INT [LUT + 0*SIZEOF_INT]
35 mov T1d, INT [LUT + 8*SIZEOF_INT]
36 pinsrw X0, word [BLOCK + T0 * 2], 0
37 pinsrw X1, word [BLOCK + T1 * 2], 0
39 mov T0d, INT [LUT + 1*SIZEOF_INT]
40 mov T1d, INT [LUT + 9*SIZEOF_INT]
41 pinsrw X0, word [BLOCK + T0 * 2], 1
42 pinsrw X1, word [BLOCK + T1 * 2], 1
44 mov T0d, INT [LUT + 2*SIZEOF_INT]
45 mov T1d, INT [LUT + 10*SIZEOF_INT]
46 pinsrw X0, word [BLOCK + T0 * 2], 2
47 pinsrw X1, word [BLOCK + T1 * 2], 2
49 mov T0d, INT [LUT + 3*SIZEOF_INT]
50 mov T1d, INT [LUT + 11*SIZEOF_INT]
51 pinsrw X0, word [BLOCK + T0 * 2], 3
52 pinsrw X1, word [BLOCK + T1 * 2], 3
54 mov T0d, INT [LUT + 4*SIZEOF_INT]
55 mov T1d, INT [LUT + 12*SIZEOF_INT]
56 pinsrw X0, word [BLOCK + T0 * 2], 4
57 pinsrw X1, word [BLOCK + T1 * 2], 4
59 mov T0d, INT [LUT + 5*SIZEOF_INT]
60 mov T1d, INT [LUT + 13*SIZEOF_INT]
61 pinsrw X0, word [BLOCK + T0 * 2], 5
62 pinsrw X1, word [BLOCK + T1 * 2], 5
64 mov T0d, INT [LUT + 6*SIZEOF_INT]
65 mov T1d, INT [LUT + 14*SIZEOF_INT]
66 pinsrw X0, word [BLOCK + T0 * 2], 6
67 pinsrw X1, word [BLOCK + T1 * 2], 6
69 mov T0d, INT [LUT + 7*SIZEOF_INT]
70 mov T1d, INT [LUT + 15*SIZEOF_INT]
71 pinsrw X0, word [BLOCK + T0 * 2], 7
72 pinsrw X1, word [BLOCK + T1 * 2], 7
80 mov T0d, INT [LUT + 0*SIZEOF_INT]
81 mov T1d, INT [LUT + 8*SIZEOF_INT]
82 pinsrw X0, word [BLOCK + T0 * 2], 0
83 pinsrw X1, word [BLOCK + T1 * 2], 0
85 mov T0d, INT [LUT + 1*SIZEOF_INT]
86 pinsrw X0, word [BLOCK + T0 * 2], 1
88 mov T0d, INT [LUT + 2*SIZEOF_INT]
89 pinsrw X0, word [BLOCK + T0 * 2], 2
91 mov T0d, INT [LUT + 3*SIZEOF_INT]
92 pinsrw X0, word [BLOCK + T0 * 2], 3
94 mov T0d, INT [LUT + 4*SIZEOF_INT]
95 pinsrw X0, word [BLOCK + T0 * 2], 4
97 mov T0d, INT [LUT + 5*SIZEOF_INT]
98 pinsrw X0, word [BLOCK + T0 * 2], 5
100 mov T0d, INT [LUT + 6*SIZEOF_INT]
101 pinsrw X0, word [BLOCK + T0 * 2], 6
103 mov T0d, INT [LUT + 7*SIZEOF_INT]
104 pinsrw X0, word [BLOCK + T0 * 2], 7
108 mov T1d, INT [LUT + 9*SIZEOF_INT]
109 pinsrw X1, word [BLOCK + T1 * 2], 1
113 mov T1d, INT [LUT + 10*SIZEOF_INT]
114 pinsrw X1, word [BLOCK + T1 * 2], 2
118 mov T1d, INT [LUT + 11*SIZEOF_INT]
119 pinsrw X1, word [BLOCK + T1 * 2], 3
123 mov T1d, INT [LUT + 12*SIZEOF_INT]
124 pinsrw X1, word [BLOCK + T1 * 2], 4
128 mov T1d, INT [LUT + 13*SIZEOF_INT]
129 pinsrw X1, word [BLOCK + T1 * 2], 5
133 mov T1d, INT [LUT + 14*SIZEOF_INT]
134 pinsrw X1, word [BLOCK + T1 * 2], 6
141 mov T0d, INT [LUT + 0*SIZEOF_INT]
142 pinsrw X0, word [BLOCK + T0 * 2], 0
144 mov T0d, INT [LUT + 1*SIZEOF_INT]
145 pinsrw X0, word [BLOCK + T0 * 2], 1
147 mov T0d, INT [LUT + 2*SIZEOF_INT]
148 pinsrw X0, word [BLOCK + T0 * 2], 2
150 mov T0d, INT [LUT + 3*SIZEOF_INT]
151 pinsrw X0, word [BLOCK + T0 * 2], 3
153 mov T0d, INT [LUT + 4*SIZEOF_INT]
154 pinsrw X0, word [BLOCK + T0 * 2], 4
156 mov T0d, INT [LUT + 5*SIZEOF_INT]
157 pinsrw X0, word [BLOCK + T0 * 2], 5
159 mov T0d, INT [LUT + 6*SIZEOF_INT]
160 pinsrw X0, word [BLOCK + T0 * 2], 6
162 mov T0d, INT [LUT + 7*SIZEOF_INT]
163 pinsrw X0, word [BLOCK + T0 * 2], 7
170 mov T1d, INT [LUT + 0*SIZEOF_INT]
171 pinsrw X0, word [BLOCK + T1 * 2], 0
175 mov T1d, INT [LUT + 1*SIZEOF_INT]
176 pinsrw X0, word [BLOCK + T1 * 2], 1
180 mov T1d, INT [LUT + 2*SIZEOF_INT]
181 pinsrw X0, word [BLOCK + T1 * 2], 2
185 mov T1d, INT [LUT + 3*SIZEOF_INT]
186 pinsrw X0, word [BLOCK + T1 * 2], 3
190 mov T1d, INT [LUT + 4*SIZEOF_INT]
191 pinsrw X0, word [BLOCK + T1 * 2], 4
195 mov T1d, INT [LUT + 5*SIZEOF_INT]
196 pinsrw X0, word [BLOCK + T1 * 2], 5
200 mov T1d, INT [LUT + 6*SIZEOF_INT]
201 pinsrw X0, word [BLOCK + T1 * 2], 6
206 movdqa xmm0, XMMWORD [VALUES + ( 0*2)]
207 movdqa xmm1, XMMWORD [VALUES + ( 8*2)]
208 movdqa xmm2, XMMWORD [VALUES + (16*2)]
209 movdqa xmm3, XMMWORD [VALUES + (24*2)]
210 movdqa xmm4, XMMWORD [VALUES + (32*2)]
211 movdqa xmm5, XMMWORD [VALUES + (40*2)]
212 movdqa xmm6, XMMWORD [VALUES + (48*2)]
213 movdqa xmm7, XMMWORD [VALUES + (56*2)]
244 mov MMWORD [r15], rax
; Prepare data for jsimd_encode_mcu_AC_first().
;
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
;                                        const int *jpeg_natural_order_start,
;                                        int Sl, int Al, JCOEF *values,
;                                        size_t *zerobits)
;
; r10 = const JCOEF *block
; r11 = const int *jpeg_natural_order_start
; r14 = JCOEF *values
; r15 = size_t *zerobits
281 GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)
283 EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
285 mov rax, rsp ; rax = original rbp
287 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
289 mov rbp, rsp ; rbp = aligned rbp
293 movdqa XMMWORD [rbp - 16], ZERO
315 movdqa XMMWORD [VALUES + (0) * 2], X0
316 movdqa XMMWORD [VALUES + (8) * 2], X1
317 movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
318 movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
320 add LUT, 16*SIZEOF_INT
342 movdqa XMMWORD [VALUES + (0) * 2], X0
343 movdqa XMMWORD [VALUES + (8) * 2], X1
344 movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
345 movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
355 movdqa XMMWORD [VALUES + (0) * 2], X0
356 movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
366 movdqa XMMWORD [VALUES + (0) * 2], X0
367 movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
378 movdqa XMMWORD [VALUES + 0], ZERO
383 sub VALUES, DCTSIZE2*2
387 movdqa ZERO, XMMWORD [rbp - 16]
389 mov rsp, rbp ; rsp <- aligned rbp
390 pop rsp ; rsp <- original rbp
; Prepare data for jsimd_encode_mcu_AC_refine().
;
; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
;                                         const int *jpeg_natural_order_start,
;                                         int Sl, int Al, JCOEF *absvalues,
;                                         size_t *bits)
;
; r10 = const JCOEF *block
; r11 = const int *jpeg_natural_order_start
; r14 = JCOEF *absvalues
449 GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)
451 EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
453 mov rax, rsp ; rax = original rbp
455 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
457 mov rbp, rsp ; rbp = aligned rbp
461 movdqa XMMWORD [rbp - 16], ZERO
486 movdqa XMMWORD [VALUES + (0) * 2], X0
487 movdqa XMMWORD [VALUES + (8) * 2], X1
492 pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
493 pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1);
494 shr SIGN, 16 ; make room for sizebits
497 bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1);
498 jz .CONTINUER16 ; if (idx) {
500 add EOB, T1d ; EOB = k + idx;
503 add LUT, 16*SIZEOF_INT
524 movdqa XMMWORD [VALUES + (0) * 2], X0
525 movdqa XMMWORD [VALUES + (8) * 2], X1
530 pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
531 pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1);
532 shr SIGN, 16 ; make room for sizebits
535 bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1);
536 jz .CONTINUER15 ; if (idx) {
538 add EOB, T1d ; EOB = k + idx;
549 movdqa XMMWORD [VALUES + (0) * 2], X0
553 pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
554 pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1);
555 shr SIGN, 8 ; make room for sizebits
558 bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1);
559 jz .CONTINUER8 ; if (idx) {
561 add EOB, T1d ; EOB = k + idx;
572 movdqa XMMWORD [VALUES + (0) * 2], X0
576 pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
577 pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1);
578 shr SIGN, 8 ; make room for sizebits
581 bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1);
582 jz .CONTINUER7 ; if (idx) {
584 add EOB, T1d ; EOB = k + idx;
596 movdqa XMMWORD [VALUES + 0], ZERO
603 sub VALUES, DCTSIZE2*2
604 mov MMWORD [r15+SIZEOF_MMWORD], SIGN
609 movdqa ZERO, XMMWORD [rbp - 16]
611 mov rsp, rbp ; rsp <- aligned rbp
612 pop rsp ; rsp <- original rbp
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.