src/third_party/WebKit/Source/platform/graphics/cpu/arm/WebGLImageConversionNEON.h

   1 /*
   2  * Copyright (C) 2012 Gabor Rapcsanyi (rgabor@inf.u-szeged.hu), University of Szeged
   3  *
   4  * Redistribution and use in source and binary forms, with or without
   5  * modification, are permitted provided that the following conditions
   6  * are met:
   7  * 1. Redistributions of source code must retain the above copyright
   8  *    notice, this list of conditions and the following disclaimer.
   9  * 2. Redistributions in binary form must reproduce the above copyright
  10  *    notice, this list of conditions and the following disclaimer in the
  11  *    documentation and/or other materials provided with the distribution.
  12  *
  13  * THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF SZEGED ``AS IS'' AND ANY
  14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL UNIVERSITY OF SZEGED OR
  17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  24  */
  25
  26 #ifndef WebGLImageConversionNEON_h
  27 #define WebGLImageConversionNEON_h
  28
  29 #if HAVE(ARM_NEON_INTRINSICS)
  30
  31 #include <arm_neon.h>
  32
  33 namespace blink {
  34
  35 namespace SIMD {
  36
  37 ALWAYS_INLINE void unpackOneRowOfRGBA16LittleToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
  38 {
  39     unsigned componentsPerRow = pixelsPerRow * 4;
  40     unsigned tailComponents = componentsPerRow % 16;
  41     unsigned componentsSize = componentsPerRow - tailComponents;
  42     const uint8_t* src = reinterpret_cast<const uint8_t*>(source);
  43
  44     for (unsigned i = 0; i < componentsSize; i += 16) {
  45         uint8x16x2_t components = vld2q_u8(src + i * 2);
  46         vst1q_u8(destination + i, components.val[1]);
  47     }
  48
  49     source += componentsSize;
  50     destination += componentsSize;
  51     pixelsPerRow = tailComponents / 4;
  52 }
  53
  54 ALWAYS_INLINE void unpackOneRowOfRGB16LittleToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
  55 {
  56     unsigned componentsPerRow = pixelsPerRow * 3;
  57     unsigned tailComponents = componentsPerRow % 24;
  58     unsigned componentsSize = componentsPerRow - tailComponents;
  59
  60     uint8x8_t componentA = vdup_n_u8(0xFF);
  61     for (unsigned i = 0; i < componentsSize; i += 24) {
  62         uint16x8x3_t RGB16 = vld3q_u16(source + i);
  63         uint8x8_t componentR = vqmovn_u16(vshrq_n_u16(RGB16.val[0], 8));
  64         uint8x8_t componentG = vqmovn_u16(vshrq_n_u16(RGB16.val[1], 8));
  65         uint8x8_t componentB = vqmovn_u16(vshrq_n_u16(RGB16.val[2], 8));
  66         uint8x8x4_t RGBA8 = {{componentR, componentG, componentB, componentA}};
  67         vst4_u8(destination, RGBA8);
  68         destination += 32;
  69     }
  70
  71     source += componentsSize;
  72     pixelsPerRow = tailComponents / 3;
  73 }
  74
  75 ALWAYS_INLINE void unpackOneRowOfARGB16LittleToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
  76 {
  77     unsigned componentsPerRow = pixelsPerRow * 4;
  78     unsigned tailComponents = componentsPerRow % 32;
  79     unsigned componentsSize = componentsPerRow - tailComponents;
  80
  81     for (unsigned i = 0; i < componentsSize; i += 32) {
  82         uint16x8x4_t ARGB16 = vld4q_u16(source + i);
  83         uint8x8_t componentA = vqmovn_u16(vshrq_n_u16(ARGB16.val[0], 8));
  84         uint8x8_t componentR = vqmovn_u16(vshrq_n_u16(ARGB16.val[1], 8));
  85         uint8x8_t componentG = vqmovn_u16(vshrq_n_u16(ARGB16.val[2], 8));
  86         uint8x8_t componentB = vqmovn_u16(vshrq_n_u16(ARGB16.val[3], 8));
  87         uint8x8x4_t RGBA8 = {{componentR, componentG, componentB, componentA}};
  88         vst4_u8(destination + i, RGBA8);
  89     }
  90
  91     source += componentsSize;
  92     destination += componentsSize;
  93     pixelsPerRow = tailComponents / 4;
  94 }
  95
  96 ALWAYS_INLINE void unpackOneRowOfBGRA16LittleToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
  97 {
  98     unsigned componentsPerRow = pixelsPerRow * 4;
  99     unsigned tailComponents = componentsPerRow % 32;
 100     unsigned componentsSize = componentsPerRow - tailComponents;
 101
 102     for (unsigned i = 0; i < componentsSize; i += 32) {
 103         uint16x8x4_t ARGB16 = vld4q_u16(source + i);
 104         uint8x8_t componentB = vqmovn_u16(vshrq_n_u16(ARGB16.val[0], 8));
 105         uint8x8_t componentG = vqmovn_u16(vshrq_n_u16(ARGB16.val[1], 8));
 106         uint8x8_t componentR = vqmovn_u16(vshrq_n_u16(ARGB16.val[2], 8));
 107         uint8x8_t componentA = vqmovn_u16(vshrq_n_u16(ARGB16.val[3], 8));
 108         uint8x8x4_t RGBA8 = {{componentR, componentG, componentB, componentA}};
 109         vst4_u8(destination + i, RGBA8);
 110     }
 111
 112     source += componentsSize;
 113     destination += componentsSize;
 114     pixelsPerRow = tailComponents / 4;
 115 }
 116
 117 ALWAYS_INLINE void unpackOneRowOfRGBA4444ToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
 118 {
 119     unsigned tailPixels = pixelsPerRow % 8;
 120     unsigned pixelSize = pixelsPerRow - tailPixels;
 121
 122     uint16x8_t immediate0x0f = vdupq_n_u16(0x0F);
 123     for (unsigned i = 0; i < pixelSize; i += 8) {
 124         uint16x8_t eightPixels = vld1q_u16(source + i);
 125
 126         uint8x8_t componentR = vqmovn_u16(vshrq_n_u16(eightPixels, 12));
 127         uint8x8_t componentG = vqmovn_u16(vandq_u16(vshrq_n_u16(eightPixels, 8), immediate0x0f));
 128         uint8x8_t componentB = vqmovn_u16(vandq_u16(vshrq_n_u16(eightPixels, 4), immediate0x0f));
 129         uint8x8_t componentA = vqmovn_u16(vandq_u16(eightPixels, immediate0x0f));
 130
 131         componentR = vorr_u8(vshl_n_u8(componentR, 4), componentR);
 132         componentG = vorr_u8(vshl_n_u8(componentG, 4), componentG);
 133         componentB = vorr_u8(vshl_n_u8(componentB, 4), componentB);
 134         componentA = vorr_u8(vshl_n_u8(componentA, 4), componentA);
 135
 136         uint8x8x4_t destComponents = {{componentR, componentG, componentB, componentA}};
 137         vst4_u8(destination, destComponents);
 138         destination += 32;
 139     }
 140
 141     source += pixelSize;
 142     pixelsPerRow = tailPixels;
 143 }
 144
 145 ALWAYS_INLINE void packOneRowOfRGBA8ToUnsignedShort4444(const uint8_t*& source, uint16_t*& destination, unsigned& pixelsPerRow)
 146 {
 147     unsigned componentsPerRow = pixelsPerRow * 4;
 148     unsigned tailComponents = componentsPerRow % 32;
 149     unsigned componentsSize = componentsPerRow - tailComponents;
 150
 151     uint8_t* dst = reinterpret_cast<uint8_t*>(destination);
 152     uint8x8_t immediate0xf0 = vdup_n_u8(0xF0);
 153     for (unsigned i = 0; i < componentsSize; i += 32) {
 154         uint8x8x4_t RGBA8 = vld4_u8(source + i);
 155
 156         uint8x8_t componentR = vand_u8(RGBA8.val[0], immediate0xf0);
 157         uint8x8_t componentG = vshr_n_u8(vand_u8(RGBA8.val[1], immediate0xf0), 4);
 158         uint8x8_t componentB = vand_u8(RGBA8.val[2], immediate0xf0);
 159         uint8x8_t componentA = vshr_n_u8(vand_u8(RGBA8.val[3], immediate0xf0), 4);
 160
 161         uint8x8x2_t RGBA4;
 162         RGBA4.val[0] = vorr_u8(componentB, componentA);
 163         RGBA4.val[1] = vorr_u8(componentR, componentG);
 164         vst2_u8(dst, RGBA4);
 165         dst += 16;
 166     }
 167
 168     source += componentsSize;
 169     destination += componentsSize / 4;
 170     pixelsPerRow = tailComponents / 4;
 171 }
 172
 173 ALWAYS_INLINE void unpackOneRowOfRGBA5551ToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
 174 {
 175     unsigned tailPixels = pixelsPerRow % 8;
 176     unsigned pixelSize = pixelsPerRow - tailPixels;
 177
 178     uint8x8_t immediate0x7 = vdup_n_u8(0x7);
 179     uint8x8_t immediate0xff = vdup_n_u8(0xFF);
 180     uint16x8_t immediate0x1f = vdupq_n_u16(0x1F);
 181     uint16x8_t immediate0x1 = vdupq_n_u16(0x1);
 182
 183     for (unsigned i = 0; i < pixelSize; i += 8) {
 184         uint16x8_t eightPixels = vld1q_u16(source + i);
 185
 186         uint8x8_t componentR = vqmovn_u16(vshrq_n_u16(eightPixels, 11));
 187         uint8x8_t componentG = vqmovn_u16(vandq_u16(vshrq_n_u16(eightPixels, 6), immediate0x1f));
 188         uint8x8_t componentB = vqmovn_u16(vandq_u16(vshrq_n_u16(eightPixels, 1), immediate0x1f));
 189         uint8x8_t componentA = vqmovn_u16(vandq_u16(eightPixels, immediate0x1));
 190
 191         componentR = vorr_u8(vshl_n_u8(componentR, 3), vand_u8(componentR, immediate0x7));
 192         componentG = vorr_u8(vshl_n_u8(componentG, 3), vand_u8(componentG, immediate0x7));
 193         componentB = vorr_u8(vshl_n_u8(componentB, 3), vand_u8(componentB, immediate0x7));
 194         componentA = vmul_u8(componentA, immediate0xff);
 195
 196         uint8x8x4_t destComponents = {{componentR, componentG, componentB, componentA}};
 197         vst4_u8(destination, destComponents);
 198         destination += 32;
 199     }
 200
 201     source += pixelSize;
 202     pixelsPerRow = tailPixels;
 203 }
 204
 205 ALWAYS_INLINE void packOneRowOfRGBA8ToUnsignedShort5551(const uint8_t*& source, uint16_t*& destination, unsigned& pixelsPerRow)
 206 {
 207     unsigned componentsPerRow = pixelsPerRow * 4;
 208     unsigned tailComponents = componentsPerRow % 32;
 209     unsigned componentsSize = componentsPerRow - tailComponents;
 210
 211     uint8_t* dst = reinterpret_cast<uint8_t*>(destination);
 212
 213     uint8x8_t immediate0xf8 = vdup_n_u8(0xF8);
 214     uint8x8_t immediate0x18 = vdup_n_u8(0x18);
 215     for (unsigned i = 0; i < componentsSize; i += 32) {
 216         uint8x8x4_t RGBA8 = vld4_u8(source + i);
 217
 218         uint8x8_t componentR = vand_u8(RGBA8.val[0], immediate0xf8);
 219         uint8x8_t componentG3bit = vshr_n_u8(RGBA8.val[1], 5);
 220
 221         uint8x8_t componentG2bit = vshl_n_u8(vand_u8(RGBA8.val[1], immediate0x18), 3);
 222         uint8x8_t componentB = vshr_n_u8(vand_u8(RGBA8.val[2], immediate0xf8), 2);
 223         uint8x8_t componentA = vshr_n_u8(RGBA8.val[3], 7);
 224
 225         uint8x8x2_t RGBA5551;
 226         RGBA5551.val[0] = vorr_u8(vorr_u8(componentG2bit, componentB), componentA);
 227         RGBA5551.val[1] = vorr_u8(componentR, componentG3bit);
 228         vst2_u8(dst, RGBA5551);
 229         dst += 16;
 230     }
 231
 232     source += componentsSize;
 233     destination += componentsSize / 4;
 234     pixelsPerRow = tailComponents / 4;
 235 }
 236
 237 ALWAYS_INLINE void unpackOneRowOfRGB565ToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
 238 {
 239     unsigned tailPixels = pixelsPerRow % 8;
 240     unsigned pixelSize = pixelsPerRow - tailPixels;
 241
 242     uint16x8_t immediate0x3f = vdupq_n_u16(0x3F);
 243     uint16x8_t immediate0x1f = vdupq_n_u16(0x1F);
 244     uint8x8_t immediate0x3 = vdup_n_u8(0x3);
 245     uint8x8_t immediate0x7 = vdup_n_u8(0x7);
 246
 247     uint8x8_t componentA = vdup_n_u8(0xFF);
 248
 249     for (unsigned i = 0; i < pixelSize; i += 8) {
 250         uint16x8_t eightPixels = vld1q_u16(source + i);
 251
 252         uint8x8_t componentR = vqmovn_u16(vshrq_n_u16(eightPixels, 11));
 253         uint8x8_t componentG = vqmovn_u16(vandq_u16(vshrq_n_u16(eightPixels, 5), immediate0x3f));
 254         uint8x8_t componentB = vqmovn_u16(vandq_u16(eightPixels, immediate0x1f));
 255
 256         componentR = vorr_u8(vshl_n_u8(componentR, 3), vand_u8(componentR, immediate0x7));
 257         componentG = vorr_u8(vshl_n_u8(componentG, 2), vand_u8(componentG, immediate0x3));
 258         componentB = vorr_u8(vshl_n_u8(componentB, 3), vand_u8(componentB, immediate0x7));
 259
 260         uint8x8x4_t destComponents = {{componentR, componentG, componentB, componentA}};
 261         vst4_u8(destination, destComponents);
 262         destination += 32;
 263     }
 264
 265     source += pixelSize;
 266     pixelsPerRow = tailPixels;
 267 }
 268
 269 ALWAYS_INLINE void packOneRowOfRGBA8ToUnsignedShort565(const uint8_t*& source, uint16_t*& destination, unsigned& pixelsPerRow)
 270 {
 271     unsigned componentsPerRow = pixelsPerRow * 4;
 272     unsigned tailComponents = componentsPerRow % 32;
 273     unsigned componentsSize = componentsPerRow - tailComponents;
 274     uint8_t* dst = reinterpret_cast<uint8_t*>(destination);
 275
 276     uint8x8_t immediate0xf8 = vdup_n_u8(0xF8);
 277     uint8x8_t immediate0x1c = vdup_n_u8(0x1C);
 278     for (unsigned i = 0; i < componentsSize; i += 32) {
 279         uint8x8x4_t RGBA8 = vld4_u8(source + i);
 280
 281         uint8x8_t componentR = vand_u8(RGBA8.val[0], immediate0xf8);
 282         uint8x8_t componentGLeft = vshr_n_u8(RGBA8.val[1], 5);
 283         uint8x8_t componentGRight = vshl_n_u8(vand_u8(RGBA8.val[1], immediate0x1c), 3);
 284         uint8x8_t componentB = vshr_n_u8(vand_u8(RGBA8.val[2], immediate0xf8), 3);
 285
 286         uint8x8x2_t RGB565;
 287         RGB565.val[0] = vorr_u8(componentGRight, componentB);
 288         RGB565.val[1] = vorr_u8(componentR, componentGLeft);
 289         vst2_u8(dst, RGB565);
 290         dst += 16;
 291     }
 292
 293     source += componentsSize;
 294     destination += componentsSize / 4;
 295     pixelsPerRow = tailComponents / 4;
 296 }
 297
 298 } // namespace SIMD
 299
 300 } // namespace blink
 301
 302 #endif // HAVE(ARM_NEON_INTRINSICS)
 303
 304 #endif // WebGLImageConversionNEON_h