1 /****************************************************************************
3 ** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies).
4 ** Contact: http://www.qt-project.org/
6 ** This file is part of the QtGui module of the Qt Toolkit.
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** GNU Lesser General Public License Usage
10 ** This file may be used under the terms of the GNU Lesser General Public
11 ** License version 2.1 as published by the Free Software Foundation and
12 ** appearing in the file LICENSE.LGPL included in the packaging of this
13 ** file. Please review the following information to ensure the GNU Lesser
14 ** General Public License version 2.1 requirements will be met:
15 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
17 ** In addition, as a special exception, Nokia gives you certain additional
18 ** rights. These rights are described in the Nokia Qt LGPL Exception
19 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
21 ** GNU General Public License Usage
22 ** Alternatively, this file may be used under the terms of the GNU General
23 ** Public License version 3.0 as published by the Free Software Foundation
24 ** and appearing in the file LICENSE.GPL included in the packaging of this
25 ** file. Please review the following information to ensure the GNU General
26 ** Public License version 3.0 requirements will be met:
27 ** http://www.gnu.org/copyleft/gpl.html.
30 ** Alternatively, this file may be used in accordance with the terms and
31 ** conditions contained in a signed written agreement between you and Nokia.
40 ****************************************************************************/
42 #include <private/qdrawhelper_p.h>
43 #include <private/qblendfunctions_p.h>
44 #include <private/qmath_p.h>
46 #ifdef QT_COMPILER_SUPPORTS_NEON
48 #include <private/qdrawhelper_neon_p.h>
49 #include <private/qpaintengine_raster_p.h>
// Fast 32-bit fill: writes `value` into `count` words at `dest`.
// The NEON loop stores 16 words per iteration (two vst2.32 of q0/q1),
// then a Duff's-device fall-through switch writes the 0..15-word tail.
// NOTE(review): several interior lines (braces, the asm statement header,
// the switch header) are elided in this view.
54 void qt_memfill32_neon(quint32 *dest, quint32 value, int count)
// Words left over after the 16-at-a-time NEON loop.
56 const int epilogueSize = count % 16;
58 quint32 *const neonEnd = dest + count - epilogueSize;
// Pin the fill value into q0/q1 so the inline asm can name them directly.
59 register uint32x4_t valueVector1 asm ("q0") = vdupq_n_u32(value);
60 register uint32x4_t valueVector2 asm ("q1") = valueVector1;
61 while (dest != neonEnd) {
// Two post-incrementing interleaved stores of d0-d3 (= q0,q1): 16 words total.
63 "vst2.32 { d0, d1, d2, d3 }, [%[DST]] !\n\t"
64 "vst2.32 { d0, d1, d2, d3 }, [%[DST]] !\n\t"
66 : [VALUE1]"w"(valueVector1), [VALUE2]"w"(valueVector2)
// Epilogue: intentional case fall-through writes the remaining words.
74 case 15: *dest++ = value;
75 case 14: *dest++ = value;
76 case 13: *dest++ = value;
77 case 12: *dest++ = value;
78 case 11: *dest++ = value;
79 case 10: *dest++ = value;
80 case 9: *dest++ = value;
81 case 8: *dest++ = value;
82 case 7: *dest++ = value;
83 case 6: *dest++ = value;
84 case 5: *dest++ = value;
85 case 4: *dest++ = value;
86 case 3: *dest++ = value;
87 case 2: *dest++ = value;
88 case 1: *dest++ = value;
// Rounded division by 255 on eight 16-bit lanes:
// result = (x + (x >> 8) + 0x80) >> 8.  `half` must hold 0x80 per lane.
92 static inline uint16x8_t qvdiv_255_u16(uint16x8_t x, uint16x8_t half)
94 // result = (x + (x >> 8) + 0x80) >> 8
96 const uint16x8_t temp = vshrq_n_u16(x, 8); // x >> 8
97 const uint16x8_t sum_part = vaddq_u16(x, half); // x + 0x80
98 const uint16x8_t sum = vaddq_u16(temp, sum_part);
100 return vshrq_n_u16(sum, 8);
// Per-lane byte multiply: qRound(x * alpha / 255.0) for eight 16-bit lanes.
// `half` is the 0x80 rounding constant forwarded to qvdiv_255_u16.
103 static inline uint16x8_t qvbyte_mul_u16(uint16x8_t x, uint16x8_t alpha, uint16x8_t half)
105 // t = qRound(x * alpha / 255.0)
107 const uint16x8_t t = vmulq_u16(x, alpha); // t
108 return qvdiv_255_u16(t, half);
// Vector analogue of INTERPOLATE_PIXEL_255:
// result = (x*a + y*b) / 255 with rounding, per 16-bit lane.
111 static inline uint16x8_t qvinterpolate_pixel_255(uint16x8_t x, uint16x8_t a, uint16x8_t y, uint16x8_t b, uint16x8_t half)
115 const uint16x8_t ta = vmulq_u16(x, a);
116 const uint16x8_t tb = vmulq_u16(y, b);
118 return qvdiv_255_u16(vaddq_u16(ta, tb), half);
// SourceOver for two premultiplied ARGB pixels widened to 16-bit lanes:
// result = src + dst * (255 - src.alpha).  Lane 3 of each 4-lane half
// holds that pixel's alpha; `full` must hold 0xff per lane.
121 static inline uint16x8_t qvsource_over_u16(uint16x8_t src16, uint16x8_t dst16, uint16x8_t half, uint16x8_t full)
// Broadcast each pixel's alpha lane (index 3) across its own 4 lanes.
123 const uint16x4_t alpha16_high = vdup_lane_u16(vget_high_u16(src16), 3);
124 const uint16x4_t alpha16_low = vdup_lane_u16(vget_low_u16(src16), 3);
// 255 - alpha for both pixels at once.
126 const uint16x8_t alpha16 = vsubq_u16(full, vcombine_u16(alpha16_low, alpha16_high));
128 return vaddq_u16(src16, qvbyte_mul_u16(dst16, alpha16, half));
// Declarations of hand-written pixman NEON compositing routines (linked
// in as assembly).  The first argument is the width in pixels.
// NOTE(review): full parameter lists are elided in this view.
132 pixman_composite_over_8888_0565_asm_neon (int32_t w,
140 pixman_composite_over_8888_8888_asm_neon (int32_t w,
148 pixman_composite_src_0565_8888_asm_neon (int32_t w,
156 pixman_composite_over_n_8_0565_asm_neon (int32_t w,
163 int32_t mask_stride);
166 pixman_composite_scanline_over_asm_neon (int32_t w,
168 const uint32_t *src);
171 pixman_composite_src_0565_0565_asm_neon (int32_t w,
// Forward declaration of the portable const-alpha blender implemented in
// qblendfunctions.cpp.
178 // qblendfunctions.cpp
179 void qt_blend_argb32_on_rgb16_const_alpha(uchar *destPixels, int dbpl,
180 const uchar *srcPixels, int sbpl,
// Blit an RGB16 source onto an ARGB32 destination.  With partial
// const_alpha each pixel is widened to 32-bit and interpolated in C;
// at full alpha the pixman 0565->8888 routine does a converting copy.
// NOTE(review): braces and some loop lines are elided in this view.
184 void qt_blend_rgb16_on_argb32_neon(uchar *destPixels, int dbpl,
185 const uchar *srcPixels, int sbpl,
192 quint32 *dst = (quint32 *) destPixels;
193 quint16 *src = (quint16 *) srcPixels;
195 if (const_alpha != 256) {
// Rescale 0..256 alpha onto the 0..255 scale INTERPOLATE_PIXEL_255 uses.
196 quint8 a = (255 * const_alpha) >> 8;
200 for (int x=0; x<w; ++x)
201 dst[x] = INTERPOLATE_PIXEL_255(qConvertRgb16To32(src[x]), a, dst[x], ia);
// const_alpha == 256: straight format-converting copy via pixman.
208 pixman_composite_src_0565_8888_asm_neon(w, h, dst, dbpl, src, sbpl);
// Forward declaration of the portable RGB16-on-RGB16 blend from
// qblendfunctions.cpp; used below as the wide-image / partial-alpha fallback.
211 // qblendfunctions.cpp
212 void qt_blend_rgb16_on_rgb16(uchar *dst, int dbpl,
213 const uchar *src, int sbpl,
// Copy one scanline of N quint16 pixels using 32-bit word copies,
// prefetching the next destination line for writing.
// NOTE(review): N appears to be a compile-time template width whose
// declaration is elided here; the casts assume 32-bit-aligned dst/src.
218 static inline void scanLineBlit16(quint16 *dst, quint16 *src, int dstride)
221 ((quint32 *)dst)[0] = ((quint32 *)src)[0];
// Prefetch next destination scanline (write hint 1, low temporal locality 0).
222 __builtin_prefetch(dst + dstride, 1, 0);
224 for (int i = 1; i < N/2; ++i)
225 ((quint32 *)dst)[i] = ((quint32 *)src)[i];
// Blit a Width x h block of quint16 pixels: an unaligned leading column is
// handled separately (Width-1 path on dst+1/src+1), otherwise full
// scanlines are copied.  NOTE(review): this block is heavily elided —
// the alignment test and loop structure are not visible here.
231 static inline void blockBlit16(quint16 *dst, quint16 *src, int dstride, int sstride, int h)
245 scanLineBlit16<Width-1>(dst + 1, src + 1, dstride);
251 scanLineBlit16<Width>(dst, src, dstride);
// RGB16-on-RGB16 blit.  Wide images (>= 150 px) or partial alpha go to the
// portable implementation (plain memcpy wins there); narrow opaque images
// use the compile-time block blitter or the pixman 0565 copy.
// NOTE(review): the switch using BLOCKBLIT is elided in this view.
259 void qt_blend_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
260 const uchar *srcPixels, int sbpl,
264 // testing show that the default memcpy is faster for widths 150 and up
265 if (const_alpha != 256 || w >= 150) {
266 qt_blend_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
// Strides in 16-bit pixels rather than bytes from here on.
270 int dstride = dbpl / 2;
271 int sstride = sbpl / 2;
273 quint16 *dst = (quint16 *) destPixels;
274 quint16 *src = (quint16 *) srcPixels;
// Dispatch small fixed widths to the templated blitter.
277 #define BLOCKBLIT(n) case n: blockBlit16<n>(dst, src, dstride, sstride, h); return;
298 pixman_composite_src_0565_0565_asm_neon (w, h, dst, dstride, src, sstride);
// Assembly helper: blends exactly 8 ARGB32 source pixels onto 8 RGB16
// destination pixels with a constant alpha (0..256 scale).
301 extern "C" void blend_8_pixels_argb32_on_rgb16_neon(quint16 *dst, const quint32 *src, int const_alpha);
// ARGB32-over-RGB16 blend.  Partial const_alpha: 8 pixels at a time via
// the assembly helper, staging the < 8-pixel row tail through fixed stack
// buffers so the helper never reads or writes beyond the row.  Full alpha:
// a single pixman over_8888_0565 call covers the whole rectangle.
303 void qt_blend_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
304 const uchar *srcPixels, int sbpl,
308 quint16 *dst = (quint16 *) destPixels;
309 quint32 *src = (quint32 *) srcPixels;
311 if (const_alpha != 256) {
312 for (int y=0; y<h; ++y) {
313 // 8 aligned pixels per call while a full group remains.
314 for (; i < w-7; i += 8)
315 blend_8_pixels_argb32_on_rgb16_neon(&dst[i], &src[i], const_alpha);
// Tail (< 8 pixels): copy into 8-slot buffers, blend, copy back.
320 quint16 dstBuffer[8];
321 quint32 srcBuffer[8];
323 for (int j = 0; j < tail; ++j) {
324 dstBuffer[j] = dst[i + j];
325 srcBuffer[j] = src[i + j];
328 blend_8_pixels_argb32_on_rgb16_neon(dstBuffer, srcBuffer, const_alpha);
330 for (int j = 0; j < tail; ++j)
331 dst[i + j] = dstBuffer[j];
// Advance one scanline; dbpl/sbpl are byte strides.
334 dst = (quint16 *)(((uchar *) dst) + dbpl);
335 src = (quint32 *)(((uchar *) src) + sbpl);
// const_alpha == 256: pixman takes strides in pixels (dbpl/2, sbpl/4).
340 pixman_composite_over_8888_0565_asm_neon(w, h, dst, dbpl / 2, src, sbpl / 4);
// Single-scanline ARGB32 SourceOver: pixman fast path at full opacity,
// otherwise delegate to the rectangle blender on a 1-pixel-high span.
343 void qt_blend_argb32_on_argb32_scanline_neon(uint *dest, const uint *src, int length, uint const_alpha)
345 if (const_alpha == 255) {
346 pixman_composite_scanline_over_asm_neon(length, dest, src);
// Map 0..255 alpha onto the 0..256 scale the rectangle blender expects.
348 qt_blend_argb32_on_argb32_neon((uchar *)dest, 4 * length, (uchar *)src, 4 * length, length, 1, (const_alpha * 256) / 255);
// ARGB32-over-ARGB32 SourceOver of a w x h rectangle.
// const_alpha == 256: whole job handed to pixman (strides in pixels).
// Otherwise: scale src by const_alpha, then src-over, 4 pixels per NEON
// iteration, with a scalar tail; groups of four fully transparent source
// pixels are skipped entirely.
352 void qt_blend_argb32_on_argb32_neon(uchar *destPixels, int dbpl,
353 const uchar *srcPixels, int sbpl,
357 const uint *src = (const uint *) srcPixels;
358 uint *dst = (uint *) destPixels;
// Constants for rounded /255 and for the (255 - alpha) computation.
359 uint16x8_t half = vdupq_n_u16(0x80);
360 uint16x8_t full = vdupq_n_u16(0xff);
361 if (const_alpha == 256) {
362 pixman_composite_over_8888_8888_asm_neon(w, h, (uint32_t *)destPixels, dbpl / 4, (uint32_t *)srcPixels, sbpl / 4);
363 } else if (const_alpha != 0) {
// Rescale 0..256 alpha onto 0..255 for the byte multiplies.
364 const_alpha = (const_alpha * 255) >> 8;
365 uint16x8_t const_alpha16 = vdupq_n_u16(const_alpha);
366 for (int y = 0; y < h; ++y) {
368 for (; x < w-3; x += 4) {
// Skip groups that are entirely transparent (all four words zero).
369 if (src[x] | src[x+1] | src[x+2] | src[x+3]) {
370 uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
371 uint32x4_t dst32 = vld1q_u32((uint32_t *)&dst[x]);
// Widen each 16-byte vector into two 8-lane 16-bit vectors.
373 const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
374 const uint8x16_t dst8 = vreinterpretq_u8_u32(dst32);
376 const uint8x8_t src8_low = vget_low_u8(src8);
377 const uint8x8_t dst8_low = vget_low_u8(dst8);
379 const uint8x8_t src8_high = vget_high_u8(src8);
380 const uint8x8_t dst8_high = vget_high_u8(dst8);
382 const uint16x8_t src16_low = vmovl_u8(src8_low);
383 const uint16x8_t dst16_low = vmovl_u8(dst8_low);
385 const uint16x8_t src16_high = vmovl_u8(src8_high);
386 const uint16x8_t dst16_high = vmovl_u8(dst8_high);
// src *= const_alpha; premultiplied channels all scale uniformly.
388 const uint16x8_t srcalpha16_low = qvbyte_mul_u16(src16_low, const_alpha16, half);
389 const uint16x8_t srcalpha16_high = qvbyte_mul_u16(src16_high, const_alpha16, half);
391 const uint16x8_t result16_low = qvsource_over_u16(srcalpha16_low, dst16_low, half, full);
392 const uint16x8_t result16_high = qvsource_over_u16(srcalpha16_high, dst16_high, half, full);
// Narrow back to bytes and store the four blended pixels.
394 const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
395 const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));
397 vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
// Scalar tail: same math, one pixel at a time (qAlpha(~s) == 255 - alpha).
403 s = BYTE_MUL(s, const_alpha);
404 dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
// Next scanline; dbpl/sbpl are byte strides.
407 dst = (quint32 *)(((uchar *) dst) + dbpl);
408 src = (const quint32 *)(((const uchar *) src) + sbpl);
// Forward declaration of the portable RGB32 rect blend from
// qblendfunctions.cpp; used below for the fully opaque case.
413 // qblendfunctions.cpp
414 void qt_blend_rgb32_on_rgb32(uchar *destPixels, int dbpl,
415 const uchar *srcPixels, int sbpl,
// RGB32-on-RGB32 blend with constant alpha.  Partial alpha:
// dst = (src*a + dst*(255-a)) / 255 with rounding, 4 pixels per NEON
// iteration plus a scalar tail.  const_alpha == 256 falls through to the
// portable copy; const_alpha == 0 draws nothing.
419 void qt_blend_rgb32_on_rgb32_neon(uchar *destPixels, int dbpl,
420 const uchar *srcPixels, int sbpl,
424 if (const_alpha != 256) {
425 if (const_alpha != 0) {
426 const uint *src = (const uint *) srcPixels;
427 uint *dst = (uint *) destPixels;
428 uint16x8_t half = vdupq_n_u16(0x80);
// Rescale 0..256 alpha onto 0..255.
429 const_alpha = (const_alpha * 255) >> 8;
430 int one_minus_const_alpha = 255 - const_alpha;
431 uint16x8_t const_alpha16 = vdupq_n_u16(const_alpha);
432 uint16x8_t one_minus_const_alpha16 = vdupq_n_u16(255 - const_alpha);
433 for (int y = 0; y < h; ++y) {
435 for (; x < w-3; x += 4) {
436 uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
437 uint32x4_t dst32 = vld1q_u32((uint32_t *)&dst[x]);
// Widen bytes to 16-bit lanes for the interpolation arithmetic.
439 const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
440 const uint8x16_t dst8 = vreinterpretq_u8_u32(dst32);
442 const uint8x8_t src8_low = vget_low_u8(src8);
443 const uint8x8_t dst8_low = vget_low_u8(dst8);
445 const uint8x8_t src8_high = vget_high_u8(src8);
446 const uint8x8_t dst8_high = vget_high_u8(dst8);
448 const uint16x8_t src16_low = vmovl_u8(src8_low);
449 const uint16x8_t dst16_low = vmovl_u8(dst8_low);
451 const uint16x8_t src16_high = vmovl_u8(src8_high);
452 const uint16x8_t dst16_high = vmovl_u8(dst8_high);
454 const uint16x8_t result16_low = qvinterpolate_pixel_255(src16_low, const_alpha16, dst16_low, one_minus_const_alpha16, half);
455 const uint16x8_t result16_high = qvinterpolate_pixel_255(src16_high, const_alpha16, dst16_high, one_minus_const_alpha16, half);
// Narrow back to bytes and store four pixels.
457 const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
458 const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));
460 vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
// Scalar tail for the last w % 4 pixels.
464 s = BYTE_MUL(s, const_alpha);
465 dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
// Next scanline; strides are in bytes.
467 dst = (quint32 *)(((uchar *) dst) + dbpl);
468 src = (const quint32 *)(((const uchar *) src) + sbpl);
// Fully opaque: plain rect copy via the portable implementation.
472 qt_blend_rgb32_on_rgb32(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
// Blit an 8-bit alpha map filled with a solid color onto an RGB16 raster
// buffer, delegating to pixman's over_n_8_0565 routine.
476 void qt_alphamapblit_quint16_neon(QRasterBuffer *rasterBuffer,
477 int x, int y, quint32 color,
479 int mapWidth, int mapHeight, int mapStride,
482 quint16 *dest = reinterpret_cast<quint16*>(rasterBuffer->scanLine(y)) + x;
// Destination stride in 16-bit pixels.
483 const int destStride = rasterBuffer->bytesPerLine() / sizeof(quint16);
// pixman's signature takes a non-const mask pointer; presumably the mask
// is only read — TODO(review): confirm against the asm routine.
485 uchar *mask = const_cast<uchar *>(bitmap);
487 pixman_composite_over_n_8_0565_asm_neon(mapWidth, mapHeight, dest, destStride, color, 0, mask, mapStride);
// Assembly helper: blends exactly 8 RGB16 source pixels onto 8 RGB16
// destination pixels with a constant alpha.
490 extern "C" void blend_8_pixels_rgb16_on_rgb16_neon(quint16 *dst, const quint16 *src, int const_alpha);
// Pixel-writer functor for the generic scale/transform templates: buffers
// incoming SRC pixels eight at a time and hands complete groups to an
// 8-pixel NEON blender; flush() drains a partial group through stack
// buffers so the blender never touches memory outside the row.
// NOTE(review): member fields and the m_index reset are elided in this view.
492 template <typename SRC, typename BlendFunc>
493 struct Blend_on_RGB16_SourceAndConstAlpha_Neon {
494 Blend_on_RGB16_SourceAndConstAlpha_Neon(BlendFunc blender, int const_alpha)
497 , m_const_alpha(const_alpha)
// Accumulate one source pixel; once 8 are buffered, blend them onto the
// 8 destination pixels ending at `dst` (hence dst - 7).
501 inline void write(quint16 *dst, quint32 src)
503 srcBuffer[m_index++] = src;
506 m_blender(dst - 7, srcBuffer, m_const_alpha);
// Blend any remaining buffered pixels (< 8) via temporary buffers.
511 inline void flush(quint16 *dst)
514 quint16 dstBuffer[8];
515 for (int i = 0; i < m_index; ++i)
516 dstBuffer[i] = dst[i - m_index];
518 m_blender(dstBuffer, srcBuffer, m_const_alpha);
520 for (int i = 0; i < m_index; ++i)
521 dst[i - m_index] = dstBuffer[i];
// Factory function so call sites get template-argument deduction for
// BlendFunc (only SRC must be spelled out).
534 template <typename SRC, typename BlendFunc>
535 Blend_on_RGB16_SourceAndConstAlpha_Neon<SRC, BlendFunc>
536 Blend_on_RGB16_SourceAndConstAlpha_Neon_create(BlendFunc blender, int const_alpha)
538 return Blend_on_RGB16_SourceAndConstAlpha_Neon<SRC, BlendFunc>(blender, const_alpha);
// Scaled ARGB32-over-RGB16 blit: drives the generic 16-bit scaler with
// the 8-pixel ARGB32 NEON blender wrapped in the buffering functor.
541 void qt_scale_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
542 const uchar *srcPixels, int sbpl,
543 const QRectF &targetRect,
544 const QRectF &sourceRect,
// Fully transparent: nothing to draw.
548 if (const_alpha == 0)
551 qt_scale_image_16bit<quint32>(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip,
552 Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint32>(blend_8_pixels_argb32_on_rgb16_neon, const_alpha));
// Forward declaration of the portable scaled RGB16 blit; used below for
// the fully opaque case.
555 void qt_scale_image_rgb16_on_rgb16(uchar *destPixels, int dbpl,
556 const uchar *srcPixels, int sbpl,
557 const QRectF &targetRect,
558 const QRectF &sourceRect,
// Scaled RGB16-on-RGB16 blit with constant alpha.  Opaque images take the
// portable path; translucent ones use the buffered 8-pixel NEON blender.
562 void qt_scale_image_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
563 const uchar *srcPixels, int sbpl,
564 const QRectF &targetRect,
565 const QRectF &sourceRect,
// Fully transparent: nothing to draw.
569 if (const_alpha == 0)
572 if (const_alpha == 256) {
573 qt_scale_image_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip, const_alpha);
577 qt_scale_image_16bit<quint16>(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip,
578 Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint16>(blend_8_pixels_rgb16_on_rgb16_neon, const_alpha));
// Forward declaration of the portable transformed RGB16 blit; used below
// for the fully opaque case.
581 extern void qt_transform_image_rgb16_on_rgb16(uchar *destPixels, int dbpl,
582 const uchar *srcPixels, int sbpl,
583 const QRectF &targetRect,
584 const QRectF &sourceRect,
586 const QTransform &targetRectTransform,
// Transformed RGB16-on-RGB16 blit: opaque case uses the portable routine;
// translucent case drives the generic transformer with the buffered
// 8-pixel RGB16 NEON blender.
589 void qt_transform_image_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
590 const uchar *srcPixels, int sbpl,
591 const QRectF &targetRect,
592 const QRectF &sourceRect,
594 const QTransform &targetRectTransform,
// Fully transparent: nothing to draw.
597 if (const_alpha == 0)
600 if (const_alpha == 256) {
601 qt_transform_image_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip, targetRectTransform, const_alpha);
605 qt_transform_image(reinterpret_cast<quint16 *>(destPixels), dbpl,
606 reinterpret_cast<const quint16 *>(srcPixels), sbpl, targetRect, sourceRect, clip, targetRectTransform,
607 Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint16>(blend_8_pixels_rgb16_on_rgb16_neon, const_alpha));
// Transformed ARGB32-over-RGB16 blit via the generic transformer and the
// buffered 8-pixel ARGB32 NEON blender.
610 void qt_transform_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
611 const uchar *srcPixels, int sbpl,
612 const QRectF &targetRect,
613 const QRectF &sourceRect,
615 const QTransform &targetRectTransform,
// Fully transparent: nothing to draw.
618 if (const_alpha == 0)
621 qt_transform_image(reinterpret_cast<quint16 *>(destPixels), dbpl,
622 reinterpret_cast<const quint32 *>(srcPixels), sbpl, targetRect, sourceRect, clip, targetRectTransform,
623 Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint32>(blend_8_pixels_argb32_on_rgb16_neon, const_alpha));
// Expand exactly 8 RGB565 pixels to opaque ARGB32 with NEON inline asm.
// The 5/6-bit channels are widened to 8 bits by replicating their top
// bits (vsri); alpha is forced to 0xff before the interleaved store.
// NOTE(review): the asm statement header and the alpha-fill instructions
// are elided in this view (note the "r2" clobber).
626 static inline void convert_8_pixels_rgb16_to_argb32(quint32 *dst, const quint16 *src)
629 "vld1.16 { d0, d1 }, [%[SRC]]\n\t"
631 /* convert 8 r5g6b5 pixel data from {d0, d1} to planar 8-bit format
632 and put data into d4 - red, d3 - green, d2 - blue */
633 "vshrn.u16 d4, q0, #8\n\t"
634 "vshrn.u16 d3, q0, #3\n\t"
635 "vsli.u16 q0, q0, #5\n\t"
636 "vsri.u8 d4, d4, #5\n\t"
637 "vsri.u8 d3, d3, #6\n\t"
638 "vshrn.u16 d2, q0, #2\n\t"
640 /* fill d5 - alpha with 0xff */
// Interleaved store: byte order b,g,r,a == little-endian ARGB32.
644 "vst4.8 { d2, d3, d4, d5 }, [%[DST]]"
645 : : [DST]"r" (dst), [SRC]"r" (src)
646 : "memory", "r2", "d0", "d1", "d2", "d3", "d4", "d5"
// destFetch hook for RGB16 raster buffers: reads `length` 565 pixels at
// (x, y) and expands them to ARGB32 into `buffer`, 8 at a time, staging
// the < 8-pixel tail through stack buffers so the converter never reads
// past the end of the row.
650 uint * QT_FASTCALL qt_destFetchRGB16_neon(uint *buffer, QRasterBuffer *rasterBuffer, int x, int y, int length)
652 const ushort *data = (const ushort *)rasterBuffer->scanLine(y) + x;
655 for (; i < length - 7; i += 8)
656 convert_8_pixels_rgb16_to_argb32(&buffer[i], &data[i]);
// Tail: stage the last few pixels through fixed 8-slot buffers.
659 quint16 srcBuffer[8];
660 quint32 dstBuffer[8];
662 int tail = length - i;
663 for (int j = 0; j < tail; ++j)
664 srcBuffer[j] = data[i + j];
666 convert_8_pixels_rgb16_to_argb32(dstBuffer, srcBuffer);
668 for (int j = 0; j < tail; ++j)
669 buffer[i + j] = dstBuffer[j];
// Pack exactly 8 ARGB32 pixels down to RGB565 with NEON inline asm.
// vld4.8 de-interleaves into d0=blue, d1=green, d2=red, d3=alpha
// (little-endian ARGB32); alpha is discarded.
// NOTE(review): the asm statement header is elided in this view.
675 static inline void convert_8_pixels_argb32_to_rgb16(quint16 *dst, const quint32 *src)
678 "vld4.8 { d0, d1, d2, d3 }, [%[SRC]]\n\t"
680 /* convert to r5g6b5 and store it into {d28, d29} */
// Widen each channel into the top of a 16-bit lane, then
// shift-right-insert green (bit 5) and blue (bit 0) below red.
681 "vshll.u8 q14, d2, #8\n\t"
682 "vshll.u8 q8, d1, #8\n\t"
683 "vshll.u8 q9, d0, #8\n\t"
684 "vsri.u16 q14, q8, #5\n\t"
685 "vsri.u16 q14, q9, #11\n\t"
687 "vst1.16 { d28, d29 }, [%[DST]]"
688 : : [DST]"r" (dst), [SRC]"r" (src)
689 : "memory", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d28", "d29"
// destStore hook for RGB16 raster buffers: packs `length` ARGB32 pixels
// from `buffer` back into the 565 scanline at (x, y), 8 at a time, with a
// stack-buffered tail so the converter never writes past the row.
693 void QT_FASTCALL qt_destStoreRGB16_neon(QRasterBuffer *rasterBuffer, int x, int y, const uint *buffer, int length)
695 quint16 *data = (quint16*)rasterBuffer->scanLine(y) + x;
698 for (; i < length - 7; i += 8)
699 convert_8_pixels_argb32_to_rgb16(&data[i], &buffer[i]);
// Tail (< 8 pixels) staged through fixed 8-slot buffers.
702 quint32 srcBuffer[8];
703 quint16 dstBuffer[8];
705 int tail = length - i;
706 for (int j = 0; j < tail; ++j)
707 srcBuffer[j] = buffer[i + j];
709 convert_8_pixels_argb32_to_rgb16(dstBuffer, srcBuffer);
711 for (int j = 0; j < tail; ++j)
712 data[i + j] = dstBuffer[j];
// Solid-color SourceOver over a scanline.  If both the color's alpha and
// const_alpha are saturated, the span is a plain fill; otherwise the color
// is premultiplied by const_alpha once and each destination pixel becomes
// color + dst * (255 - alpha(color)), four pixels per NEON iteration.
716 void QT_FASTCALL comp_func_solid_SourceOver_neon(uint *destPixels, int length, uint color, uint const_alpha)
// Opaque fast path: (a & b) == 255 iff both alphas are 255.
718 if ((const_alpha & qAlpha(color)) == 255) {
719 QT_MEMFILL_UINT(destPixels, length, color);
721 if (const_alpha != 255)
722 color = BYTE_MUL(color, const_alpha);
// qAlpha(~color) == 255 - qAlpha(color).
724 const quint32 minusAlphaOfColor = qAlpha(~color);
727 uint32_t *dst = (uint32_t *) destPixels;
728 const uint32x4_t colorVector = vdupq_n_u32(color);
729 uint16x8_t half = vdupq_n_u16(0x80);
730 const uint16x8_t minusAlphaOfColorVector = vdupq_n_u16(minusAlphaOfColor);
// 4 pixels per pass: widen dst, scale by (255 - alpha), narrow, add color.
732 for (; x < length-3; x += 4) {
733 uint32x4_t dstVector = vld1q_u32(&dst[x]);
735 const uint8x16_t dst8 = vreinterpretq_u8_u32(dstVector);
737 const uint8x8_t dst8_low = vget_low_u8(dst8);
738 const uint8x8_t dst8_high = vget_high_u8(dst8);
740 const uint16x8_t dst16_low = vmovl_u8(dst8_low);
741 const uint16x8_t dst16_high = vmovl_u8(dst8_high);
743 const uint16x8_t result16_low = qvbyte_mul_u16(dst16_low, minusAlphaOfColorVector, half);
744 const uint16x8_t result16_high = qvbyte_mul_u16(dst16_high, minusAlphaOfColorVector, half);
746 const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
747 const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));
749 uint32x4_t blendedPixels = vcombine_u32(result32_low, result32_high);
750 uint32x4_t colorPlusBlendedPixels = vaddq_u32(colorVector, blendedPixels);
751 vst1q_u32(&dst[x], colorPlusBlendedPixels);
// Scalar tail for the last length % 4 pixels.
754 for (;x < length; ++x)
755 destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
// Plus (additive) composition: dst = saturate(dst + src), optionally
// interpolated with the original dst by const_alpha.  Full alpha uses a
// short inline-asm loop (vqadd, 4 pixels per pass); partial alpha uses
// intrinsics.  Scalar helpers handle the per-pixel tails.
759 void QT_FASTCALL comp_func_Plus_neon(uint *dst, const uint *src, int length, uint const_alpha)
761 if (const_alpha == 255) {
762 uint *const end = dst + length;
// Last full group of 4 pixels the asm loop may process.
763 uint *const neonEnd = end - 3;
765 while (dst < neonEnd) {
// Saturating per-byte add of 4 pixels; SRC and DST post-increment.
767 "vld2.8 { d0, d1 }, [%[SRC]] !\n\t"
768 "vld2.8 { d2, d3 }, [%[DST]]\n\t"
769 "vqadd.u8 q0, q0, q1\n\t"
770 "vst2.8 { d0, d1 }, [%[DST]] !\n\t"
771 : [DST]"+r" (dst), [SRC]"+r" (src)
773 : "memory", "d0", "d1", "d2", "d3", "q0", "q1"
// Scalar tail, one pixel at a time.
778 *dst = comp_func_Plus_one_pixel(*dst, *src);
// Partial alpha: result = lerp(dst, saturate(dst + src), const_alpha).
784 const int one_minus_const_alpha = 255 - const_alpha;
785 const uint16x8_t constAlphaVector = vdupq_n_u16(const_alpha);
786 const uint16x8_t oneMinusconstAlphaVector = vdupq_n_u16(one_minus_const_alpha);
788 const uint16x8_t half = vdupq_n_u16(0x80);
789 for (; x < length - 3; x += 4) {
790 const uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
791 const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
792 uint8x16_t dst8 = vld1q_u8((uint8_t *)&dst[x]);
// Saturating per-byte sum.
793 uint8x16_t result = vqaddq_u8(dst8, src8);
795 uint16x8_t result_low = vmovl_u8(vget_low_u8(result));
796 uint16x8_t result_high = vmovl_u8(vget_high_u8(result));
798 uint16x8_t dst_low = vmovl_u8(vget_low_u8(dst8));
799 uint16x8_t dst_high = vmovl_u8(vget_high_u8(dst8));
801 result_low = qvinterpolate_pixel_255(result_low, constAlphaVector, dst_low, oneMinusconstAlphaVector, half);
802 result_high = qvinterpolate_pixel_255(result_high, constAlphaVector, dst_high, oneMinusconstAlphaVector, half);
804 const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result_low));
805 const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result_high));
806 vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
// Scalar tail for the last length % 4 pixels.
809 for (; x < length; ++x)
810 dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x], const_alpha, one_minus_const_alpha);
// Tile edge (in pixels) used by the 16-bit rotation routines below for
// cache-friendly traversal.
814 static const int tileSize = 32;
// Assembly helper: rotates an 8-column strip, writing eight destination
// rows four pixels at a time (see qt_memrotate90_16_neon).
816 extern "C" void qt_rotate90_16_neon(quint16 *dst, const quint16 *src, int sstride, int dstride, int count);
// Rotate a w x h 16-bit image 90 degrees into `destPixels`, traversed in
// tileSize x tileSize tiles for cache locality.  Stages: a leading
// `unaligned` strip copied scalar so destination rows become 32-bit
// aligned; 8-column strips via the assembly helper; remaining columns
// packed two pixels per 32-bit store; and a trailing `unoptimizedY` strip
// copied scalar.  NOTE(review): several interior lines (tile loop bodies,
// the packed store itself) are elided in this view.
818 void qt_memrotate90_16_neon(const uchar *srcPixels, int w, int h, int sstride, uchar *destPixels, int dstride)
820 const ushort *src = (const ushort *)srcPixels;
821 ushort *dest = (ushort *)destPixels;
// Strides in 16-bit pixels from here on.
823 sstride /= sizeof(ushort);
824 dstride /= sizeof(ushort);
// Number of 16-bit pixels per packed 32-bit write.
826 const int pack = sizeof(quint32) / sizeof(ushort);
// Rows to copy one-by-one until `dest` is 4-byte aligned (capped at h).
827 const int unaligned =
828 qMin(uint((quintptr(dest) & (sizeof(quint32)-1)) / sizeof(ushort)), uint(h));
829 const int restX = w % tileSize;
830 const int restY = (h - unaligned) % tileSize;
// Rows at the end that cannot fill a packed word.
831 const int unoptimizedY = restY % pack;
832 const int numTilesX = w / tileSize + (restX > 0);
833 const int numTilesY = (h - unaligned) / tileSize + (restY >= pack);
835 for (int tx = 0; tx < numTilesX; ++tx) {
// Columns are walked right-to-left; (w - x - 1) is the destination row.
836 const int startx = w - tx * tileSize - 1;
837 const int stopx = qMax(startx - tileSize, 0);
// Alignment strip: scalar copies of the first `unaligned` source rows.
840 for (int x = startx; x >= stopx; --x) {
841 ushort *d = dest + (w - x - 1) * dstride;
842 for (int y = 0; y < unaligned; ++y) {
843 *d++ = src[y * sstride + x];
848 for (int ty = 0; ty < numTilesY; ++ty) {
849 const int starty = ty * tileSize + unaligned;
850 const int stopy = qMin(starty + tileSize, h - unoptimizedY);
853 // qt_rotate90_16_neon writes to eight rows, four pixels at a time
854 for (; x >= stopx + 7; x -= 8) {
855 ushort *d = dest + (w - x - 1) * dstride + starty;
856 const ushort *s = &src[starty * sstride + x - 7];
// Strides passed in bytes (hence * 2) to the asm helper.
857 qt_rotate90_16_neon(d, s, sstride * 2, dstride * 2, stopy - starty);
// Remaining columns: pack `pack` pixels into one 32-bit value per store.
860 for (; x >= stopx; --x) {
861 quint32 *d = reinterpret_cast<quint32*>(dest + (w - x - 1) * dstride + starty);
862 for (int y = starty; y < stopy; y += pack) {
863 quint32 c = src[y * sstride + x];
864 for (int i = 1; i < pack; ++i) {
// Bit position of pixel i within the packed word.
865 const int shift = (sizeof(int) * 8 / pack * i);
866 const ushort color = src[(y + i) * sstride + x];
// Trailing rows that cannot fill a packed word: scalar copy.
875 const int starty = h - unoptimizedY;
876 for (int x = startx; x >= stopx; --x) {
877 ushort *d = dest + (w - x - 1) * dstride + starty;
878 for (int y = starty; y < h; ++y) {
879 *d++ = src[y * sstride + x];
// Assembly helper for 270-degree strip rotation.  NOTE(review): the
// function below actually calls qt_rotate90_16_neon with negated strides;
// whether this symbol is used elsewhere is not visible in this view.
886 extern "C" void qt_rotate270_16_neon(quint16 *dst, const quint16 *src, int sstride, int dstride, int count);
// Rotate a w x h 16-bit image 270 degrees, mirroring the tiling strategy
// of qt_memrotate90_16_neon: alignment strip, 8-column strips via
// qt_rotate90_16_neon driven with negative strides, packed 32-bit column
// stores, and a trailing scalar strip.  NOTE(review): several interior
// lines are elided in this view.
888 void qt_memrotate270_16_neon(const uchar *srcPixels, int w, int h,
890 uchar *destPixels, int dstride)
892 const ushort *src = (const ushort *)srcPixels;
893 ushort *dest = (ushort *)destPixels;
// Strides in 16-bit pixels from here on.
895 sstride /= sizeof(ushort);
896 dstride /= sizeof(ushort);
898 const int pack = sizeof(quint32) / sizeof(ushort);
// Rows to copy one-by-one until `dest` is 4-byte aligned (capped at h).
899 const int unaligned =
900 qMin(uint((long(dest) & (sizeof(quint32)-1)) / sizeof(ushort)), uint(h));
901 const int restX = w % tileSize;
902 const int restY = (h - unaligned) % tileSize;
903 const int unoptimizedY = restY % pack;
904 const int numTilesX = w / tileSize + (restX > 0);
905 const int numTilesY = (h - unaligned) / tileSize + (restY >= pack);
907 for (int tx = 0; tx < numTilesX; ++tx) {
908 const int startx = tx * tileSize;
909 const int stopx = qMin(startx + tileSize, w);
// Alignment strip: scalar copies of the last `unaligned` source rows,
// walked bottom-up (270-degree orientation).
912 for (int x = startx; x < stopx; ++x) {
913 ushort *d = dest + x * dstride;
914 for (int y = h - 1; y >= h - unaligned; --y) {
915 *d++ = src[y * sstride + x];
920 for (int ty = 0; ty < numTilesY; ++ty) {
921 const int starty = h - 1 - unaligned - ty * tileSize;
922 const int stopy = qMax(starty - tileSize, unoptimizedY);
925 // qt_rotate90_16_neon writes to eight rows, four pixels at a time
// Reuse the 90-degree asm helper with negated byte strides.
926 for (; x < stopx - 7; x += 8) {
927 ushort *d = dest + x * dstride + h - 1 - starty;
928 const ushort *s = &src[starty * sstride + x];
929 qt_rotate90_16_neon(d + 7 * dstride, s, -sstride * 2, -dstride * 2, starty - stopy);
// Remaining columns: pack pixels into 32-bit stores, walking y downward.
932 for (; x < stopx; ++x) {
933 quint32 *d = reinterpret_cast<quint32*>(dest + x * dstride
935 for (int y = starty; y > stopy; y -= pack) {
936 quint32 c = src[y * sstride + x];
937 for (int i = 1; i < pack; ++i) {
// Bit position of pixel i within the packed word.
938 const int shift = (sizeof(int) * 8 / pack * i);
939 const ushort color = src[(y - i) * sstride + x];
// Trailing rows that cannot fill a packed word: scalar copy.
947 const int starty = unoptimizedY - 1;
948 for (int x = startx; x < stopx; ++x) {
949 ushort *d = dest + x * dstride + h - 1 - starty;
950 for (int y = starty; y >= 0; --y) {
951 *d++ = src[y * sstride + x];
// NEON lane-wise vector primitives implementing the generic SIMD
// interface (QSimdNeon) consumed by the templated radial-gradient fetcher
// below.  NOTE(review): the enclosing declaration (presumably a
// struct/class named QSimdNeon) is elided in this view.
961 typedef int32x4_t Int32x4;
962 typedef float32x4_t Float32x4;
// Unions giving scalar access to individual vector lanes.
964 union Vect_buffer_i { Int32x4 v; int i[4]; };
965 union Vect_buffer_f { Float32x4 v; float f[4]; };
967 static inline Float32x4 v_dup(float x) { return vdupq_n_f32(x); }
968 static inline Int32x4 v_dup(int x) { return vdupq_n_s32(x); }
969 static inline Int32x4 v_dup(uint x) { return vdupq_n_s32(x); }
971 static inline Float32x4 v_add(Float32x4 a, Float32x4 b) { return vaddq_f32(a, b); }
972 static inline Int32x4 v_add(Int32x4 a, Int32x4 b) { return vaddq_s32(a, b); }
974 static inline Float32x4 v_max(Float32x4 a, Float32x4 b) { return vmaxq_f32(a, b); }
975 static inline Float32x4 v_min(Float32x4 a, Float32x4 b) { return vminq_f32(a, b); }
976 static inline Int32x4 v_min_16(Int32x4 a, Int32x4 b) { return vminq_s32(a, b); }
978 static inline Int32x4 v_and(Int32x4 a, Int32x4 b) { return vandq_s32(a, b); }
980 static inline Float32x4 v_sub(Float32x4 a, Float32x4 b) { return vsubq_f32(a, b); }
981 static inline Int32x4 v_sub(Int32x4 a, Int32x4 b) { return vsubq_s32(a, b); }
983 static inline Float32x4 v_mul(Float32x4 a, Float32x4 b) { return vmulq_f32(a, b); }
// sqrt(x) as x * rsqrt(x): reciprocal-sqrt estimate refined by one
// Newton-Raphson step (vrsqrts), then multiplied back by x.
985 static inline Float32x4 v_sqrt(Float32x4 x) { Float32x4 y = vrsqrteq_f32(x); y = vmulq_f32(y, vrsqrtsq_f32(x, vmulq_f32(y, y))); return vmulq_f32(x, y); }
// Truncating float -> int conversion per lane.
987 static inline Int32x4 v_toInt(Float32x4 x) { return vcvtq_s32_f32(x); }
// Lane mask: all-ones where a >= b, zero elsewhere.
989 static inline Int32x4 v_greaterOrEqual(Float32x4 a, Float32x4 b) { return vreinterpretq_s32_u32(vcgeq_f32(a, b)); }
// Radial-gradient span fetcher: instantiates the generic SIMD template
// with the NEON vector primitives defined above.
992 const uint * QT_FASTCALL qt_fetch_radial_gradient_neon(uint *buffer, const Operator *op, const QSpanData *data,
993 int y, int x, int length)
995 return qt_fetch_radial_gradient_template<QRadialFetchSimd<QSimdNeon> >(buffer, op, data, y, x, length);
1000 #endif // QT_COMPILER_SUPPORTS_NEON