2 * Copyright (C) 2012 Gabor Rapcsanyi (rgabor@inf.u-szeged.hu), University of Szeged
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF SZEGED ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL UNIVERSITY OF SZEGED OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 #ifndef WebGLImageConversionNEON_h
27 #define WebGLImageConversionNEON_h
29 #if HAVE(ARM_NEON_INTRINSICS)
37 ALWAYS_INLINE void unpackOneRowOfRGBA16LittleToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
39 unsigned componentsPerRow = pixelsPerRow * 4;
40 unsigned tailComponents = componentsPerRow % 16;
41 unsigned componentsSize = componentsPerRow - tailComponents;
42 const uint8_t* src = reinterpret_cast<const uint8_t*>(source);
44 for (unsigned i = 0; i < componentsSize; i += 16) {
45 uint8x16x2_t components = vld2q_u8(src + i * 2);
46 vst1q_u8(destination + i, components.val[1]);
49 source += componentsSize;
50 destination += componentsSize;
51 pixelsPerRow = tailComponents / 4;
54 ALWAYS_INLINE void unpackOneRowOfRGB16LittleToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
56 unsigned componentsPerRow = pixelsPerRow * 3;
57 unsigned tailComponents = componentsPerRow % 24;
58 unsigned componentsSize = componentsPerRow - tailComponents;
60 uint8x8_t componentA = vdup_n_u8(0xFF);
61 for (unsigned i = 0; i < componentsSize; i += 24) {
62 uint16x8x3_t RGB16 = vld3q_u16(source + i);
63 uint8x8_t componentR = vqmovn_u16(vshrq_n_u16(RGB16.val[0], 8));
64 uint8x8_t componentG = vqmovn_u16(vshrq_n_u16(RGB16.val[1], 8));
65 uint8x8_t componentB = vqmovn_u16(vshrq_n_u16(RGB16.val[2], 8));
66 uint8x8x4_t RGBA8 = {{componentR, componentG, componentB, componentA}};
67 vst4_u8(destination, RGBA8);
71 source += componentsSize;
72 pixelsPerRow = tailComponents / 3;
75 ALWAYS_INLINE void unpackOneRowOfARGB16LittleToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
77 unsigned componentsPerRow = pixelsPerRow * 4;
78 unsigned tailComponents = componentsPerRow % 32;
79 unsigned componentsSize = componentsPerRow - tailComponents;
81 for (unsigned i = 0; i < componentsSize; i += 32) {
82 uint16x8x4_t ARGB16 = vld4q_u16(source + i);
83 uint8x8_t componentA = vqmovn_u16(vshrq_n_u16(ARGB16.val[0], 8));
84 uint8x8_t componentR = vqmovn_u16(vshrq_n_u16(ARGB16.val[1], 8));
85 uint8x8_t componentG = vqmovn_u16(vshrq_n_u16(ARGB16.val[2], 8));
86 uint8x8_t componentB = vqmovn_u16(vshrq_n_u16(ARGB16.val[3], 8));
87 uint8x8x4_t RGBA8 = {{componentR, componentG, componentB, componentA}};
88 vst4_u8(destination + i, RGBA8);
91 source += componentsSize;
92 destination += componentsSize;
93 pixelsPerRow = tailComponents / 4;
96 ALWAYS_INLINE void unpackOneRowOfBGRA16LittleToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
98 unsigned componentsPerRow = pixelsPerRow * 4;
99 unsigned tailComponents = componentsPerRow % 32;
100 unsigned componentsSize = componentsPerRow - tailComponents;
102 for (unsigned i = 0; i < componentsSize; i += 32) {
103 uint16x8x4_t ARGB16 = vld4q_u16(source + i);
104 uint8x8_t componentB = vqmovn_u16(vshrq_n_u16(ARGB16.val[0], 8));
105 uint8x8_t componentG = vqmovn_u16(vshrq_n_u16(ARGB16.val[1], 8));
106 uint8x8_t componentR = vqmovn_u16(vshrq_n_u16(ARGB16.val[2], 8));
107 uint8x8_t componentA = vqmovn_u16(vshrq_n_u16(ARGB16.val[3], 8));
108 uint8x8x4_t RGBA8 = {{componentR, componentG, componentB, componentA}};
109 vst4_u8(destination + i, RGBA8);
112 source += componentsSize;
113 destination += componentsSize;
114 pixelsPerRow = tailComponents / 4;
117 ALWAYS_INLINE void unpackOneRowOfRGBA4444ToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
119 unsigned tailPixels = pixelsPerRow % 8;
120 unsigned pixelSize = pixelsPerRow - tailPixels;
122 uint16x8_t immediate0x0f = vdupq_n_u16(0x0F);
123 for (unsigned i = 0; i < pixelSize; i += 8) {
124 uint16x8_t eightPixels = vld1q_u16(source + i);
126 uint8x8_t componentR = vqmovn_u16(vshrq_n_u16(eightPixels, 12));
127 uint8x8_t componentG = vqmovn_u16(vandq_u16(vshrq_n_u16(eightPixels, 8), immediate0x0f));
128 uint8x8_t componentB = vqmovn_u16(vandq_u16(vshrq_n_u16(eightPixels, 4), immediate0x0f));
129 uint8x8_t componentA = vqmovn_u16(vandq_u16(eightPixels, immediate0x0f));
131 componentR = vorr_u8(vshl_n_u8(componentR, 4), componentR);
132 componentG = vorr_u8(vshl_n_u8(componentG, 4), componentG);
133 componentB = vorr_u8(vshl_n_u8(componentB, 4), componentB);
134 componentA = vorr_u8(vshl_n_u8(componentA, 4), componentA);
136 uint8x8x4_t destComponents = {{componentR, componentG, componentB, componentA}};
137 vst4_u8(destination, destComponents);
142 pixelsPerRow = tailPixels;
145 ALWAYS_INLINE void packOneRowOfRGBA8ToUnsignedShort4444(const uint8_t*& source, uint16_t*& destination, unsigned& pixelsPerRow)
147 unsigned componentsPerRow = pixelsPerRow * 4;
148 unsigned tailComponents = componentsPerRow % 32;
149 unsigned componentsSize = componentsPerRow - tailComponents;
151 uint8_t* dst = reinterpret_cast<uint8_t*>(destination);
152 uint8x8_t immediate0xf0 = vdup_n_u8(0xF0);
153 for (unsigned i = 0; i < componentsSize; i += 32) {
154 uint8x8x4_t RGBA8 = vld4_u8(source + i);
156 uint8x8_t componentR = vand_u8(RGBA8.val[0], immediate0xf0);
157 uint8x8_t componentG = vshr_n_u8(vand_u8(RGBA8.val[1], immediate0xf0), 4);
158 uint8x8_t componentB = vand_u8(RGBA8.val[2], immediate0xf0);
159 uint8x8_t componentA = vshr_n_u8(vand_u8(RGBA8.val[3], immediate0xf0), 4);
162 RGBA4.val[0] = vorr_u8(componentB, componentA);
163 RGBA4.val[1] = vorr_u8(componentR, componentG);
168 source += componentsSize;
169 destination += componentsSize / 4;
170 pixelsPerRow = tailComponents / 4;
173 ALWAYS_INLINE void unpackOneRowOfRGBA5551ToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
175 unsigned tailPixels = pixelsPerRow % 8;
176 unsigned pixelSize = pixelsPerRow - tailPixels;
178 uint8x8_t immediate0x7 = vdup_n_u8(0x7);
179 uint8x8_t immediate0xff = vdup_n_u8(0xFF);
180 uint16x8_t immediate0x1f = vdupq_n_u16(0x1F);
181 uint16x8_t immediate0x1 = vdupq_n_u16(0x1);
183 for (unsigned i = 0; i < pixelSize; i += 8) {
184 uint16x8_t eightPixels = vld1q_u16(source + i);
186 uint8x8_t componentR = vqmovn_u16(vshrq_n_u16(eightPixels, 11));
187 uint8x8_t componentG = vqmovn_u16(vandq_u16(vshrq_n_u16(eightPixels, 6), immediate0x1f));
188 uint8x8_t componentB = vqmovn_u16(vandq_u16(vshrq_n_u16(eightPixels, 1), immediate0x1f));
189 uint8x8_t componentA = vqmovn_u16(vandq_u16(eightPixels, immediate0x1));
191 componentR = vorr_u8(vshl_n_u8(componentR, 3), vand_u8(componentR, immediate0x7));
192 componentG = vorr_u8(vshl_n_u8(componentG, 3), vand_u8(componentG, immediate0x7));
193 componentB = vorr_u8(vshl_n_u8(componentB, 3), vand_u8(componentB, immediate0x7));
194 componentA = vmul_u8(componentA, immediate0xff);
196 uint8x8x4_t destComponents = {{componentR, componentG, componentB, componentA}};
197 vst4_u8(destination, destComponents);
202 pixelsPerRow = tailPixels;
205 ALWAYS_INLINE void packOneRowOfRGBA8ToUnsignedShort5551(const uint8_t*& source, uint16_t*& destination, unsigned& pixelsPerRow)
207 unsigned componentsPerRow = pixelsPerRow * 4;
208 unsigned tailComponents = componentsPerRow % 32;
209 unsigned componentsSize = componentsPerRow - tailComponents;
211 uint8_t* dst = reinterpret_cast<uint8_t*>(destination);
213 uint8x8_t immediate0xf8 = vdup_n_u8(0xF8);
214 uint8x8_t immediate0x18 = vdup_n_u8(0x18);
215 for (unsigned i = 0; i < componentsSize; i += 32) {
216 uint8x8x4_t RGBA8 = vld4_u8(source + i);
218 uint8x8_t componentR = vand_u8(RGBA8.val[0], immediate0xf8);
219 uint8x8_t componentG3bit = vshr_n_u8(RGBA8.val[1], 5);
221 uint8x8_t componentG2bit = vshl_n_u8(vand_u8(RGBA8.val[1], immediate0x18), 3);
222 uint8x8_t componentB = vshr_n_u8(vand_u8(RGBA8.val[2], immediate0xf8), 2);
223 uint8x8_t componentA = vshr_n_u8(RGBA8.val[3], 7);
225 uint8x8x2_t RGBA5551;
226 RGBA5551.val[0] = vorr_u8(vorr_u8(componentG2bit, componentB), componentA);
227 RGBA5551.val[1] = vorr_u8(componentR, componentG3bit);
228 vst2_u8(dst, RGBA5551);
232 source += componentsSize;
233 destination += componentsSize / 4;
234 pixelsPerRow = tailComponents / 4;
237 ALWAYS_INLINE void unpackOneRowOfRGB565ToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
239 unsigned tailPixels = pixelsPerRow % 8;
240 unsigned pixelSize = pixelsPerRow - tailPixels;
242 uint16x8_t immediate0x3f = vdupq_n_u16(0x3F);
243 uint16x8_t immediate0x1f = vdupq_n_u16(0x1F);
244 uint8x8_t immediate0x3 = vdup_n_u8(0x3);
245 uint8x8_t immediate0x7 = vdup_n_u8(0x7);
247 uint8x8_t componentA = vdup_n_u8(0xFF);
249 for (unsigned i = 0; i < pixelSize; i += 8) {
250 uint16x8_t eightPixels = vld1q_u16(source + i);
252 uint8x8_t componentR = vqmovn_u16(vshrq_n_u16(eightPixels, 11));
253 uint8x8_t componentG = vqmovn_u16(vandq_u16(vshrq_n_u16(eightPixels, 5), immediate0x3f));
254 uint8x8_t componentB = vqmovn_u16(vandq_u16(eightPixels, immediate0x1f));
256 componentR = vorr_u8(vshl_n_u8(componentR, 3), vand_u8(componentR, immediate0x7));
257 componentG = vorr_u8(vshl_n_u8(componentG, 2), vand_u8(componentG, immediate0x3));
258 componentB = vorr_u8(vshl_n_u8(componentB, 3), vand_u8(componentB, immediate0x7));
260 uint8x8x4_t destComponents = {{componentR, componentG, componentB, componentA}};
261 vst4_u8(destination, destComponents);
266 pixelsPerRow = tailPixels;
269 ALWAYS_INLINE void packOneRowOfRGBA8ToUnsignedShort565(const uint8_t*& source, uint16_t*& destination, unsigned& pixelsPerRow)
271 unsigned componentsPerRow = pixelsPerRow * 4;
272 unsigned tailComponents = componentsPerRow % 32;
273 unsigned componentsSize = componentsPerRow - tailComponents;
274 uint8_t* dst = reinterpret_cast<uint8_t*>(destination);
276 uint8x8_t immediate0xf8 = vdup_n_u8(0xF8);
277 uint8x8_t immediate0x1c = vdup_n_u8(0x1C);
278 for (unsigned i = 0; i < componentsSize; i += 32) {
279 uint8x8x4_t RGBA8 = vld4_u8(source + i);
281 uint8x8_t componentR = vand_u8(RGBA8.val[0], immediate0xf8);
282 uint8x8_t componentGLeft = vshr_n_u8(RGBA8.val[1], 5);
283 uint8x8_t componentGRight = vshl_n_u8(vand_u8(RGBA8.val[1], immediate0x1c), 3);
284 uint8x8_t componentB = vshr_n_u8(vand_u8(RGBA8.val[2], immediate0xf8), 3);
287 RGB565.val[0] = vorr_u8(componentGRight, componentB);
288 RGB565.val[1] = vorr_u8(componentR, componentGLeft);
289 vst2_u8(dst, RGB565);
293 source += componentsSize;
294 destination += componentsSize / 4;
295 pixelsPerRow = tailComponents / 4;
302 #endif // HAVE(ARM_NEON_INTRINSICS)
304 #endif // WebGLImageConversionNEON_h