src/third_party/skia/src/opts/SkBlitRow_opts_SSE4_asm.S

   1 /*
   2  * Copyright 2014 The Android Open Source Project
   3  *
   4  * Use of this source code is governed by a BSD-style license that can be
   5  * found in the LICENSE file.
   6  */
   7
   8 #if defined(__clang__) || (defined(__GNUC__) && !defined(SK_BUILD_FOR_MAC))
   9
  10 #define CFI_PUSH(REG) \
         /* Unwind annotations to emit right after a 4-byte push of REG */ \
  11     .cfi_adjust_cfa_offset 4;       /* The push moved %esp 4 bytes further from the CFA */ \
  12     .cfi_rel_offset REG, 0          /* The previous value of REG is now saved at 0(%esp) */
  13
  14 #define CFI_POP(REG) \
         /* Unwind annotations to emit right after a 4-byte pop into REG */ \
  15     .cfi_adjust_cfa_offset -4;      /* The pop moved %esp 4 bytes back toward the CFA */ \
  16     .cfi_restore REG                /* REG is described as holding its function-entry value again */
  17
  18 #define PUSH(REG) pushl REG; CFI_PUSH (REG)   /* Push REG and record the push in the unwind info */
  19 #define POP(REG)  popl REG; CFI_POP (REG)     /* Pop REG and record the pop in the unwind info */
     /*
      * Restore %edi and return.  RETURN is expanded several times in the middle
      * of the function, and CFI directives apply to every instruction assembled
      * after them, not only to the current control-flow path.  The unwind state
      * is therefore saved before the pop and reinstated after the ret, so code
      * placed after a RETURN is still described as having %edi pushed instead
      * of inheriting the post-pop state.  The generated machine code is only
      * "popl %edi; ret".
      */
  20 #define RETURN    .cfi_remember_state; POP(%edi); ret; .cfi_restore_state
  21
  22 #define EXTRACT_ALPHA(var1, var2) \
         /* In:  %var1 = four source pixels, %xmm5 = four destination pixels, */ \
         /*      %xmm6 = 256 in every 16-bit word.                            */ \
         /* Out: %var2 = each pixel's alpha repeated in both of its words,    */ \
         /*      %xmm4 = 256 - alpha per word, %xmm3 = copy of %xmm5.         */ \
         /* %var1 and %xmm5 are left unchanged.                               */ \
  23     movdqa      %var1, %var2;           /* Clone source pixels to extract alpha */\
  24     psrlw       $8, %var2;              /* Discard red and blue, leaving alpha and green */\
  25     pshufhw     $0xF5, %var2, %var2;    /* Repeat alpha for scaling (high) */\
  26     movdqa      %xmm6, %xmm4;           /* Start the inverse alpha from the 256 constant */\
  27     pshuflw     $0xF5, %var2, %var2;    /* Repeat alpha for scaling (low) */\
  28     movdqa      %xmm5, %xmm3;           /* Clone destination pixels for the alpha/green half */\
  29     psubw       %var2, %xmm4            /* Finalize alpha calculations */
  30
  31 #define SCALE_PIXELS \
         /* In:  %xmm5 and %xmm3 = the same four destination pixels,           */ \
         /*      %xmm4 = per-word scale factor (256 - alpha).                  */ \
         /* Out: %xmm5 = scaled red/blue in the low byte of each word,         */ \
         /*      %xmm3 = scaled alpha/green in the high byte of each word.     */ \
         /* The two halves are meant to be merged with a byte blend afterwards. */ \
  32     psllw       $8, %xmm5;              /* Filter out red and blue components */\
  33     pmulhuw     %xmm4, %xmm5;           /* Scale red and blue */\
  34     psrlw       $8, %xmm3;              /* Filter out alpha and green components */\
  35     pmullw      %xmm4, %xmm3            /* Scale alpha and green */
  36
  37
  38 /*
  39  * void S32A_Opaque_BlitRow32_SSE4_asm(SkPMColor* SK_RESTRICT dst,
  40  *                                     const SkPMColor* SK_RESTRICT src,
  41  *                                     int count, U8CPU alpha)
  42  *
  43  * This function is divided into six blocks: initialization, blit 4-15 pixels,
  44  * blit 0-3 pixels, align destination for 16+ pixel blits,
  45  * blit 16+ pixels with source unaligned, blit 16+ pixels with source aligned.
  46  * There is some code reuse between the blocks.
  47  *
  48  * The primary optimization comes from checking the source pixels' alpha value.
  49  * If the alpha is zero, the pixel can be skipped entirely.
  50  * If the alpha is fully opaque, the pixel can be copied directly to the destination.
  51  * According to collected statistics, these two cases are the most common.
  52  * The main loops use pre-loading and unrolling in an attempt to reduce the
  53  * worst-case memory latency.
  54  */
  55
  56 #ifdef __clang__
         // clang builds place the code in the plain .text section and emit no .type
  57     .text
  58 #else
         // Other builds use a dedicated allocatable, executable section and mark
         // the symbol as a function
  59     .section .text.sse4.2,"ax",@progbits
  60     .type S32A_Opaque_BlitRow32_SSE4_asm, @function
  61 #endif
         // Align the entry point to 2^4 = 16 bytes
  62     .p2align 4
  63 #if defined(SK_BUILD_FOR_MAC)
         // Mac builds: the symbol takes a leading underscore and is marked .private_extern
  64     .global _S32A_Opaque_BlitRow32_SSE4_asm
  65     .private_extern _S32A_Opaque_BlitRow32_SSE4_asm
  66 _S32A_Opaque_BlitRow32_SSE4_asm:
  67 #else
         // Other builds: the symbol is global but marked .hidden
  68     .global S32A_Opaque_BlitRow32_SSE4_asm
  69     .hidden S32A_Opaque_BlitRow32_SSE4_asm
  70 S32A_Opaque_BlitRow32_SSE4_asm:
  71 #endif
  72     .cfi_startproc
         // Arguments are read from the stack: 4(%esp) = dst, 8(%esp) = src,
         // 12(%esp) = count.  The alpha argument of the C prototype is never read.
         //
         // Register roles for the rest of the function:
         //   eax  = source pointer, edx = destination pointer
         //   ecx  = pixel counter, kept biased (see the comment of each block)
         //   edi  = byte offset applied to both pointers, or scratch; only used
         //          after PUSH(%edi)
         //   xmm0 = 0x00FF00FF byte-blend mask (implicit operand of pblendvb)
         //   xmm6 = 256 in every 16-bit word
         //   xmm7 = 0xFF000000 alpha mask
         //   xmm1-xmm5 = pixel data and scratch
  73     movl        8(%esp), %eax           // Source pointer
  74     movl        12(%esp), %ecx          // Pixel count
  75     movl        4(%esp), %edx           // Destination pointer
  76     prefetcht0  (%eax)                  // Prefetch the start of the source row
  77
  78     // Setup SSE constants
  79     pcmpeqd     %xmm7, %xmm7            // 0xFF000000 mask to check alpha
  80     pslld       $24, %xmm7              // All ones shifted left: only the top byte of each pixel stays set
  81     pcmpeqw     %xmm6, %xmm6            // 16-bit 256 to calculate inv. alpha
  82     psrlw       $15, %xmm6              // 0xFFFF -> 1
  83     psllw       $8, %xmm6               // 1 -> 256
  84     pcmpeqw     %xmm0, %xmm0            // 0x00FF00FF mask (Must be in xmm0 because of pblendvb)
  85     psrlw       $8, %xmm0               // 0xFFFF -> 0x00FF in every word
  86     subl        $4, %ecx                // Check if we have only 0-3 pixels
  87     js          .LReallySmall           // count < 4: that path never touches edi, so nothing is pushed
  88     PUSH(%edi)                          // Saved here; every exit reached from this point pops it via RETURN
  89     cmpl        $11, %ecx               // Do we have enough pixels to run the main loop?
  90     ja          .LBigBlit               // ecx = count - 4 > 11, i.e. 16 or more pixels
  91
  92     // Handle small blits (4-15 pixels)
  93     ////////////////////////////////////////////////////////////////////////////////
         // On entry ecx = count - 4 (0..11).  Each pass handles four pixels at byte
         // offset edi and subtracts four from ecx.  The loop repeats while ecx stays
         // non-negative, so it exits with ecx in -4..-1, meaning ecx + 4 pixels are
         // still pending.
  94     xorl        %edi, %edi              // Reset offset to zero
  95
  96 .LSmallLoop:
  97     lddqu       (%eax, %edi), %xmm1     // Load four source pixels
  98     ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
  99     ja          .LSmallAlphaNotOpaqueOrZero // Neither ZF nor CF set: the four alphas are mixed
 100     jz          .LSmallAlphaZero        // If all alphas are zero, skip the pixels completely
 101     movdqu      %xmm1, (%edx, %edi)     // Store four destination pixels
 102 .LSmallAlphaZero:
 103     addl        $16, %edi
 104     subl        $4, %ecx                // Check if there are four additional pixels, at least
 105     jns         .LSmallLoop
 106     jmp         .LSmallRemaining
 107
 108     // Handle mixed alphas (calculate and scale)
 109     .p2align 4
 110 .LSmallAlphaNotOpaqueOrZero:
 111     lddqu       (%edx, %edi), %xmm5     // Load four destination pixels
 112     EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
 113     SCALE_PIXELS                        // Scale pixels using alpha
 114
 115     addl        $16, %edi
 116     subl        $4, %ecx                // Check if there are four additional pixels, at least
 117     pblendvb    %xmm5, %xmm3            // Mask in %xmm0, implicitly
 118     paddb       %xmm3, %xmm1            // Add source and destination pixels together
 119     movdqu      %xmm1, -16(%edx, %edi)  // Store four destination pixels
 120     jns         .LSmallLoop             // Still tests the subl above; the SSE instructions leave the flags alone
 121
 122     // Handle the last 0-3 pixels (also used by the main loops)
         // Entered with ecx = (pixels left) - 4, i.e. -4..-1, and edi just past the
         // last pixel handled.  The final pixels are processed as a full four-pixel
         // window moved back so that it ends at the end of the row.  The window
         // overlaps pixels that were already blended, so in the mixed-alpha case
         // only the new pixels are merged into the destination.
 123 .LSmallRemaining:
 124     cmpl        $-4, %ecx               // Check if we are done
 125     je          .LSmallExit
 126     sall        $2, %ecx                // Calculate offset for last pixels
 127     addl        %ecx, %edi              // ecx is -12, -8 or -4 bytes here: step the window back
 128
 129     lddqu       (%eax, %edi), %xmm1     // Load last four source pixels (overlapping)
 130     ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
 131     jc          .LSmallRemainingStoreAll// If all alphas are opaque, just store (overlapping)
 132     jz          .LSmallExit             // If all alphas are zero, skip the pixels completely
 133
 134     // Handle mixed alphas (calculate and scale)
         // Unlike SCALE_PIXELS, the scaling below works on the copies in xmm3 and
         // xmm2 so that xmm5 keeps the unmodified destination for the final merge.
 135     lddqu       (%edx, %edi), %xmm5     // Load last four destination pixels (overlapping)
 136     EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
 137
 138     psllw       $8, %xmm3               // Filter out red and blue components
 139     pmulhuw     %xmm4, %xmm3            // Scale red and blue
 140     movdqa      %xmm5, %xmm2
 141     psrlw       $8, %xmm2               // Filter out alpha and green components
 142     pmullw      %xmm4, %xmm2            // Scale alpha and green
 143
 144     cmpl        $-8, %ecx               // Check how many pixels should be written
 145     pblendvb    %xmm3, %xmm2            // Combine results (mask in %xmm0, implicitly)
 146     paddb       %xmm2, %xmm1            // Add source and destination pixels together
 147     jb          .LSmallPixelsLeft1      // ecx = -12: one pixel left
 148     ja          .LSmallPixelsLeft3      // To avoid double-blending the overlapping pixels...
 149     pblendw     $0xF0, %xmm1, %xmm5     // Merge only the final two pixels to the destination
 150     movdqu      %xmm5, (%edx, %edi)     // Store last two destination pixels
 151 .LSmallExit:
 152     RETURN
 153
 154 .LSmallPixelsLeft1:
 155     pblendw     $0xC0, %xmm1, %xmm5     // Merge only the final pixel to the destination
 156     movdqu      %xmm5, (%edx, %edi)     // Store last destination pixel
 157     RETURN
 158
 159 .LSmallPixelsLeft3:
 160     pblendw     $0xFC, %xmm1, %xmm5     // Merge only the final three pixels to the destination
 161     movdqu      %xmm5, (%edx, %edi)     // Store last three destination pixels
 162     RETURN
 163
 164 .LSmallRemainingStoreAll:
 165     movdqu      %xmm1, (%edx, %edi)     // Store last destination pixels (overwrite)
 166     RETURN
 167
 168     // Handle really small blits (0-3 pixels)
 169     ////////////////////////////////////////////////////////////////////////////////
         // Reached before PUSH(%edi), so this path returns with a plain ret.
         // Pixels are moved one at a time with pinsrd/pextrd, so no memory beyond
         // the requested pixels is read or written.
 170 .LReallySmall:
 171     addl        $4, %ecx                // Undo the earlier bias: ecx = count again
 172     jle         .LReallySmallExit       // Nothing to do for count <= 0
 173     pcmpeqd     %xmm1, %xmm1            // All ones: lanes that are not loaded below look opaque
 174     cmp         $2, %ecx                // Check how many pixels should be read
 175     pinsrd      $0x0, (%eax), %xmm1     // Load one source pixel
 176     pinsrd      $0x0, (%edx), %xmm5     // Load one destination pixel
 177     jb          .LReallySmallCalc       // count == 1 (flags still come from the cmp above)
 178     pinsrd      $0x1, 4(%eax), %xmm1    // Load second source pixel
 179     pinsrd      $0x1, 4(%edx), %xmm5    // Load second destination pixel
 180     je          .LReallySmallCalc       // count == 2
 181     pinsrd      $0x2, 8(%eax), %xmm1    // Load third source pixel
 182     pinsrd      $0x2, 8(%edx), %xmm5    // Load third destination pixel
 183
 184 .LReallySmallCalc:
 185     ptest       %xmm7, %xmm1            // Check if all alphas are opaque
 186     jc          .LReallySmallStore      // If all alphas are opaque, just store
 187
 188     // Handle mixed alphas (calculate and scale)
         // There is no separate all-zero shortcut here; such pixels go through the
         // blend as well.
 189     EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
 190
 191     pand        %xmm0, %xmm5            // Filter out red and blue components
 192     pmullw      %xmm4, %xmm5            // Scale red and blue
 193     psrlw       $8, %xmm3               // Filter out alpha and green components
 194     pmullw      %xmm4, %xmm3            // Scale alpha and green
 195
 196     psrlw       $8, %xmm5               // Combine results
 197     pblendvb    %xmm5, %xmm3            // Mask in %xmm0, implicitly
 198     paddb       %xmm3, %xmm1            // Add source and destination pixels together
 199
 200 .LReallySmallStore:
 201     cmp         $2, %ecx                // Check how many pixels should be written
 202     pextrd      $0x0, %xmm1, (%edx)     // Store one destination pixel
 203     jb          .LReallySmallExit       // count == 1
 204     pextrd      $0x1, %xmm1, 4(%edx)    // Store second destination pixel
 205     je          .LReallySmallExit       // count == 2
 206     pextrd      $0x2, %xmm1, 8(%edx)    // Store third destination pixel
 207 .LReallySmallExit:
 208     ret
 209
 210     // Handle bigger blit operations (16+ pixels)
 211     ////////////////////////////////////////////////////////////////////////////////
         // Entered with ecx = count - 4 and edi saved on the stack.  If the
         // destination is not 16-byte aligned, the first 1-3 pixels are blended
         // separately so that the main loops can use aligned destination accesses.
         // NOTE(review): the 1/2/3-pixel selection below only makes sense if dst is
         // at least 4-byte aligned; confirm against the callers.
 212     .p2align 4
 213 .LBigBlit:
 214     // Align destination?
 215     testl       $0xF, %edx
 216     lddqu       (%eax), %xmm1           // Pre-load four source pixels
 217     jz          .LAligned               // Flags still come from the testl above
 218
 219     movl        %edx, %edi              // Calculate alignment of destination pointer
 220     negl        %edi
 221     andl        $0xF, %edi              // edi = number of bytes up to the next 16-byte boundary
 222
 223     // Handle 1-3 pixels to align destination
         // All four pre-loaded pixels are tested and blended, but only the leading
         // ones that lie before the boundary are merged into the destination.
 224     ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
 225     jz          .LAlignDone             // If all alphas are zero, just skip
 226     lddqu       (%edx), %xmm5           // Load four destination pixels
 227     jc          .LAlignStore            // If all alphas are opaque, just store
 228
 229     // Handle mixed alphas (calculate and scale)
         // The scaling uses the copies in xmm3 and xmm2 so that xmm5 keeps the
         // unmodified destination for the merge below.
 230     EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
 231
 232     psllw       $8, %xmm3               // Filter out red and blue components
 233     pmulhuw     %xmm4, %xmm3            // Scale red and blue
 234     movdqa      %xmm5, %xmm2
 235     psrlw       $8, %xmm2               // Filter out alpha and green components
 236     pmullw      %xmm4, %xmm2            // Scale alpha and green
 237
 238     pblendvb    %xmm3, %xmm2            // Combine results (mask in %xmm0, implicitly)
 239     paddb       %xmm2, %xmm1            // Add source and destination pixels together
 240
 241 .LAlignStore:
 242     cmp         $8, %edi                // Check how many pixels should be written
 243     jb          .LAlignPixelsLeft1      // Fewer than 8 bytes to the boundary: one pixel
 244     ja          .LAlignPixelsLeft3      // More than 8 bytes to the boundary: three pixels
 245     pblendw     $0x0F, %xmm1, %xmm5     // Blend two pixels
 246     jmp .LAlignStorePixels
 247
 248 .LAlignPixelsLeft1:
 249     pblendw     $0x03, %xmm1, %xmm5     // Blend one pixel
 250     jmp .LAlignStorePixels
 251
 252 .LAlignPixelsLeft3:
 253     pblendw     $0x3F, %xmm1, %xmm5     // Blend three pixels
 254
 255 .LAlignStorePixels:
 256     movdqu      %xmm5, (%edx)           // Store destination pixels
 257
 258 .LAlignDone:
 259     addl        %edi, %eax              // Adjust pointers and pixel count
 260     addl        %edi, %edx
 261     shrl        $2, %edi                // Bytes -> pixels
 262     lddqu       (%eax), %xmm1           // Pre-load new source pixels (after alignment)
 263     subl        %edi, %ecx
 264
 265 .LAligned:                              // Destination is guaranteed to be 16 byte aligned
         // xmm1 already holds the four source pixels at (%eax), pre-loaded on the
         // way here.
 266     xorl        %edi, %edi              // Reset offset to zero
 267     subl        $8, %ecx                // Decrease counter (Reserve four pixels for the cleanup)
 268     testl       $0xF, %eax              // Check alignment of source pointer
 269     jz          .LAlignedLoop           // Source is 16-byte aligned too: use the aligned-load loop
 270
 271     // Source not aligned to destination
 272     ////////////////////////////////////////////////////////////////////////////////
         // Source loads use lddqu; destination accesses use movdqa.  Each pass
         // handles the four pixels already held in xmm1 and the next four in xmm2,
         // then pre-loads the following four into xmm1 for the next pass or for the
         // cleanup.  Loads placed between a ptest and a later jz do not touch the
         // flags.
 273     .p2align 4
 274 .LUnalignedLoop:                        // Main loop for unaligned, handles eight pixels per iteration
 275     ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
 276     ja          .LAlphaNotOpaqueOrZero00
 277     lddqu       16(%eax, %edi), %xmm2   // Pre-load four source pixels
 278     jz          .LAlphaZero00           // All alphas zero: leave the destination untouched
 279     movdqa      %xmm1, (%edx, %edi)     // Store four destination pixels
 280
 281 .LAlphaZero00:
 282     ptest       %xmm7, %xmm2            // Check if all alphas are zero or opaque
 283     ja          .LAlphaNotOpaqueOrZero01
 284     lddqu       32(%eax, %edi), %xmm1   // Pre-load four source pixels
 285     jz          .LAlphaZero01           // All alphas zero: leave the destination untouched
 286     movdqa      %xmm2, 16(%edx, %edi)   // Store four destination pixels
 287
 288 .LAlphaZero01:
 289     addl        $32, %edi               // Adjust offset and pixel count
 290     subl        $8, %ecx
 291     jae         .LUnalignedLoop         // No borrow: ecx was at least 8, so run another pass
 292     addl        $8, %ecx                // Adjust pixel count
 293     jmp         .LLoopCleanup0
 294
 295     .p2align 4
 296 .LAlphaNotOpaqueOrZero00:
 297     movdqa      (%edx, %edi), %xmm5     // Load four destination pixels
 298     EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
 299     SCALE_PIXELS                        // Scale pixels using alpha
 300
 301     lddqu       16(%eax, %edi), %xmm2   // Pre-load four source pixels
 302     pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
 303     paddb       %xmm3, %xmm1            // Add source and destination pixels together
 304     movdqa      %xmm1, (%edx, %edi)     // Store four destination pixels
 305
 306     // Handle next four pixels
 307     ptest       %xmm7, %xmm2            // Check if all alphas are zero or opaque
 308     ja          .LAlphaNotOpaqueOrZero01
 309     lddqu       32(%eax, %edi), %xmm1   // Pre-load four source pixels
 310     jz          .LAlphaZero02
 311     movdqa      %xmm2, 16(%edx, %edi)   // Store four destination pixels
 312 .LAlphaZero02:
 313     addl        $32, %edi               // Adjust offset and pixel count
 314     subl        $8, %ecx
 315     jae         .LUnalignedLoop
 316     addl        $8, %ecx                // Adjust pixel count
 317     jmp         .LLoopCleanup0
 318
 319     .p2align 4
 320 .LAlphaNotOpaqueOrZero01:
 321     movdqa      16(%edx, %edi), %xmm5   // Load four destination pixels
 322     EXTRACT_ALPHA(xmm2, xmm1)           // Extract and clone alpha value
 323     SCALE_PIXELS                        // Scale pixels using alpha
 324
 325     lddqu       32(%eax, %edi), %xmm1   // Pre-load four source pixels
 326     addl        $32, %edi
 327     pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
 328     paddb       %xmm3, %xmm2            // Add source and destination pixels together
 329     subl        $8, %ecx
 330     movdqa      %xmm2, -16(%edx, %edi)  // Store four destination pixels
 331     jae         .LUnalignedLoop         // Tests the subl above; the store does not change the flags
 332     addl        $8, %ecx                // Adjust pixel count
 333
 334     // Cleanup - handle pending pixels from loop
         // xmm1 holds four pre-loaded pixels for offset edi.  After they are handled,
         // at most one more full group of four is processed at .LRemain0 before the
         // shared 0-3 pixel tail at .LSmallRemaining takes over.
 335 .LLoopCleanup0:
 336     ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
 337     ja          .LAlphaNotOpaqueOrZero02
 338     jz          .LAlphaZero03
 339     movdqa      %xmm1, (%edx, %edi)     // Store four destination pixels
 340 .LAlphaZero03:
 341     addl        $16, %edi
 342     subl        $4, %ecx
 343     js          .LSmallRemaining        // Reuse code from small loop
 344
 345 .LRemain0:
 346     lddqu       (%eax, %edi), %xmm1     // Load four source pixels
 347     ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
 348     ja          .LAlphaNotOpaqueOrZero02
 349     jz          .LAlphaZero04
 350     movdqa      %xmm1, (%edx, %edi)     // Store four destination pixels
 351 .LAlphaZero04:
 352     addl        $16, %edi
 353     subl        $4, %ecx
 354     jmp         .LSmallRemaining        // Reuse code from small loop
 355
 356 .LAlphaNotOpaqueOrZero02:
 357     movdqa      (%edx, %edi), %xmm5     // Load four destination pixels
 358     EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
 359     SCALE_PIXELS                        // Scale pixels using alpha
 360
 361     addl        $16, %edi
 362     subl        $4, %ecx
 363     pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
 364     paddb       %xmm3, %xmm1            // Add source and destination pixels together
 365     movdqa      %xmm1, -16(%edx, %edi)  // Store four destination pixels
 366     js          .LSmallRemaining        // Reuse code from small loop
 367     jmp         .LRemain0
 368
 369     // Source aligned to destination
 370     ////////////////////////////////////////////////////////////////////////////////
         // Same structure as the unaligned main loop, but the source pointer is
         // 16-byte aligned as well, so source loads use movdqa instead of lddqu.
         // xmm1 holds the four source pixels for offset edi on entry to each pass.
 371     .p2align 4
 372 .LAlignedLoop:                          // Main loop for aligned, handles eight pixels per iteration
 373     ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
 374     ja          .LAlphaNotOpaqueOrZero10
 375     movdqa      16(%eax, %edi), %xmm2   // Pre-load four source pixels
 376     jz          .LAlphaZero10           // All alphas zero: leave the destination untouched
 377     movdqa      %xmm1, (%edx, %edi)     // Store four destination pixels
 378
 379 .LAlphaZero10:
 380     ptest       %xmm7, %xmm2            // Check if all alphas are zero or opaque
 381     ja          .LAlphaNotOpaqueOrZero11
 382     movdqa      32(%eax, %edi), %xmm1   // Pre-load four source pixels
 383     jz          .LAlphaZero11           // All alphas zero: leave the destination untouched
 384     movdqa      %xmm2, 16(%edx, %edi)   // Store four destination pixels
 385
 386 .LAlphaZero11:
 387     addl        $32, %edi               // Adjust offset and pixel count
 388     subl        $8, %ecx
 389     jae         .LAlignedLoop           // No borrow: ecx was at least 8, so run another pass
 390     addl        $8, %ecx                // Adjust pixel count
 391     jmp         .LLoopCleanup1
 392
 393     .p2align 4
 394 .LAlphaNotOpaqueOrZero10:
 395     movdqa      (%edx, %edi), %xmm5     // Load four destination pixels
 396     EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
 397     SCALE_PIXELS                        // Scale pixels using alpha
 398
 399     movdqa      16(%eax, %edi), %xmm2   // Pre-load four source pixels
 400     pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
 401     paddb       %xmm3, %xmm1            // Add source and destination pixels together
 402     movdqa      %xmm1, (%edx, %edi)     // Store four destination pixels
 403
 404     // Handle next four pixels
 405     ptest       %xmm7, %xmm2            // Check if all alphas are zero or opaque
 406     ja          .LAlphaNotOpaqueOrZero11
 407     movdqa      32(%eax, %edi), %xmm1   // Pre-load four source pixels
 408     jz          .LAlphaZero12
 409     movdqa      %xmm2, 16(%edx, %edi)   // Store four destination pixels
 410 .LAlphaZero12:
 411     addl        $32, %edi               // Adjust offset and pixel count
 412     subl        $8, %ecx
 413     jae         .LAlignedLoop
 414     addl        $8, %ecx                // Adjust pixel count
 415     jmp         .LLoopCleanup1
 416
 417     .p2align 4
 418 .LAlphaNotOpaqueOrZero11:
 419     movdqa      16(%edx, %edi), %xmm5   // Load four destination pixels
 420     EXTRACT_ALPHA(xmm2, xmm1)           // Extract and clone alpha value
 421     SCALE_PIXELS                        // Scale pixels using alpha
 422     movdqa      32(%eax, %edi), %xmm1   // Pre-load four source pixels
 423
 424     addl        $32, %edi
 425     pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
 426     paddb       %xmm3, %xmm2            // Add source and destination pixels together
 427     subl        $8, %ecx
 428     movdqa      %xmm2, -16(%edx, %edi)  // Store four destination pixels
 429     jae         .LAlignedLoop           // Tests the subl above; the store does not change the flags
 430     addl        $8, %ecx                // Adjust pixel count
 431
 432     // Cleanup - handle pending pixels from loop
         // xmm1 holds four pre-loaded pixels for offset edi.  After they are handled,
         // at most one more full group of four is processed at .LRemain1 before the
         // shared 0-3 pixel tail at .LSmallRemaining takes over.
 433 .LLoopCleanup1:
 434     ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
 435     ja          .LAlphaNotOpaqueOrZero12
 436     jz          .LAlphaZero13
 437     movdqa      %xmm1, (%edx, %edi)     // Store four destination pixels
 438 .LAlphaZero13:
 439     addl        $16, %edi
 440     subl        $4, %ecx
 441     js          .LSmallRemaining        // Reuse code from small loop
 442
 443 .LRemain1:
 444     movdqa      (%eax, %edi), %xmm1     // Load four source pixels
 445     ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
 446     ja          .LAlphaNotOpaqueOrZero12
 447     jz          .LAlphaZero14
 448     movdqa      %xmm1, (%edx, %edi)     // Store four destination pixels
 449 .LAlphaZero14:
 450     addl        $16, %edi
 451     subl        $4, %ecx
 452     jmp         .LSmallRemaining        // Reuse code from small loop
 453
 454 .LAlphaNotOpaqueOrZero12:
 455     movdqa      (%edx, %edi), %xmm5     // Load four destination pixels
 456     EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
 457     SCALE_PIXELS                        // Scale pixels using alpha
 458
 459     addl        $16, %edi
 460     subl        $4, %ecx
 461     pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
 462     paddb       %xmm3, %xmm1            // Add source and destination pixels together
 463     movdqa      %xmm1, -16(%edx, %edi)  // Store four destination pixels
 464     js          .LSmallRemaining        // Reuse code from small loop
 465     jmp         .LRemain1
 466
 467     .cfi_endproc
 468 #ifndef __clang__
         // Symbol size for non-clang builds, matching the .type directive emitted
         // before the function
 469     .size S32A_Opaque_BlitRow32_SSE4_asm, .-S32A_Opaque_BlitRow32_SSE4_asm
 470 #endif
 471 #endif