1 /****************************************************************************
3 ** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies).
4 ** Contact: http://www.qt-project.org/
6 ** This file is part of the QtGui module of the Qt Toolkit.
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** GNU Lesser General Public License Usage
10 ** This file may be used under the terms of the GNU Lesser General Public
11 ** License version 2.1 as published by the Free Software Foundation and
12 ** appearing in the file LICENSE.LGPL included in the packaging of this
13 ** file. Please review the following information to ensure the GNU Lesser
14 ** General Public License version 2.1 requirements will be met:
15 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
17 ** In addition, as a special exception, Nokia gives you certain additional
18 ** rights. These rights are described in the Nokia Qt LGPL Exception
19 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
21 ** GNU General Public License Usage
22 ** Alternatively, this file may be used under the terms of the GNU General
23 ** Public License version 3.0 as published by the Free Software Foundation
24 ** and appearing in the file LICENSE.GPL included in the packaging of this
25 ** file. Please review the following information to ensure the GNU General
26 ** Public License version 3.0 requirements will be met:
27 ** http://www.gnu.org/copyleft/gpl.html.
30 ** Alternatively, this file may be used in accordance with the terms and
31 ** conditions contained in a signed written agreement between you and Nokia.
40 ****************************************************************************/
42 #include <private/qdrawhelper_p.h>
43 #include <private/qblendfunctions_p.h>
44 #include <private/qmath_p.h>
46 #ifdef QT_COMPILER_SUPPORTS_NEON
48 #include <private/qdrawhelper_neon_p.h>
49 #include <private/qpaintengine_raster_p.h>
// Fast 32-bit fill: writes `value` into `count` words at `dest`.
// The NEON loop stores 16 words per iteration (two vst2.32 of q0/q1),
// then a Duff's-device fall-through switch writes the 0..15-word tail.
// NOTE(review): several interior lines (braces, the asm statement header,
// the switch header) are elided in this view.
54 void qt_memfill32_neon(quint32 *dest, quint32 value, int count)
// Words left over after the 16-at-a-time NEON loop.
56 const int epilogueSize = count % 16;
58 quint32 *const neonEnd = dest + count - epilogueSize;
// Pin the fill value into q0/q1 so the inline asm can name them directly.
59 register uint32x4_t valueVector1 asm ("q0") = vdupq_n_u32(value);
60 register uint32x4_t valueVector2 asm ("q1") = valueVector1;
61 while (dest != neonEnd) {
// Two post-incrementing interleaved stores of d0-d3 (= q0,q1): 16 words total.
63 "vst2.32 { d0, d1, d2, d3 }, [%[DST]] !\n\t"
64 "vst2.32 { d0, d1, d2, d3 }, [%[DST]] !\n\t"
66 : [VALUE1]"w"(valueVector1), [VALUE2]"w"(valueVector2)
// Epilogue: intentional case fall-through writes the remaining words.
74 case 15: *dest++ = value;
75 case 14: *dest++ = value;
76 case 13: *dest++ = value;
77 case 12: *dest++ = value;
78 case 11: *dest++ = value;
79 case 10: *dest++ = value;
80 case 9: *dest++ = value;
81 case 8: *dest++ = value;
82 case 7: *dest++ = value;
83 case 6: *dest++ = value;
84 case 5: *dest++ = value;
85 case 4: *dest++ = value;
86 case 3: *dest++ = value;
87 case 2: *dest++ = value;
88 case 1: *dest++ = value;
// Rounded division by 255 on eight 16-bit lanes:
// result = (x + (x >> 8) + 0x80) >> 8.  `half` must hold 0x80 per lane.
92 static inline uint16x8_t qvdiv_255_u16(uint16x8_t x, uint16x8_t half)
94 // result = (x + (x >> 8) + 0x80) >> 8
96 const uint16x8_t temp = vshrq_n_u16(x, 8); // x >> 8
97 const uint16x8_t sum_part = vaddq_u16(x, half); // x + 0x80
98 const uint16x8_t sum = vaddq_u16(temp, sum_part);
100 return vshrq_n_u16(sum, 8);
// Per-lane byte multiply: qRound(x * alpha / 255.0) for eight 16-bit lanes.
// `half` is the 0x80 rounding constant forwarded to qvdiv_255_u16.
103 static inline uint16x8_t qvbyte_mul_u16(uint16x8_t x, uint16x8_t alpha, uint16x8_t half)
105 // t = qRound(x * alpha / 255.0)
107 const uint16x8_t t = vmulq_u16(x, alpha); // t
108 return qvdiv_255_u16(t, half);
// Vector analogue of INTERPOLATE_PIXEL_255:
// result = (x*a + y*b) / 255 with rounding, per 16-bit lane.
111 static inline uint16x8_t qvinterpolate_pixel_255(uint16x8_t x, uint16x8_t a, uint16x8_t y, uint16x8_t b, uint16x8_t half)
115 const uint16x8_t ta = vmulq_u16(x, a);
116 const uint16x8_t tb = vmulq_u16(y, b);
118 return qvdiv_255_u16(vaddq_u16(ta, tb), half);
// SourceOver for two premultiplied ARGB pixels widened to 16-bit lanes:
// result = src + dst * (255 - src.alpha).  Lane 3 of each 4-lane half
// holds that pixel's alpha; `full` must hold 0xff per lane.
121 static inline uint16x8_t qvsource_over_u16(uint16x8_t src16, uint16x8_t dst16, uint16x8_t half, uint16x8_t full)
// Broadcast each pixel's alpha lane (index 3) across its own 4 lanes.
123 const uint16x4_t alpha16_high = vdup_lane_u16(vget_high_u16(src16), 3);
124 const uint16x4_t alpha16_low = vdup_lane_u16(vget_low_u16(src16), 3);
// 255 - alpha for both pixels at once.
126 const uint16x8_t alpha16 = vsubq_u16(full, vcombine_u16(alpha16_low, alpha16_high));
128 return vaddq_u16(src16, qvbyte_mul_u16(dst16, alpha16, half));
// Declarations of hand-written pixman NEON compositing routines (linked
// in as assembly).  The first argument is the width in pixels.
// NOTE(review): full parameter lists are elided in this view.
132 pixman_composite_over_8888_0565_asm_neon (int32_t w,
140 pixman_composite_over_8888_8888_asm_neon (int32_t w,
148 pixman_composite_src_0565_8888_asm_neon (int32_t w,
156 pixman_composite_over_n_8_0565_asm_neon (int32_t w,
163 int32_t mask_stride);
166 pixman_composite_scanline_over_asm_neon (int32_t w,
168 const uint32_t *src);
171 pixman_composite_src_0565_0565_asm_neon (int32_t w,
// Forward declaration of the portable const-alpha blender implemented in
// qblendfunctions.cpp.
178 // qblendfunctions.cpp
179 void qt_blend_argb32_on_rgb16_const_alpha(uchar *destPixels, int dbpl,
180 const uchar *srcPixels, int sbpl,
// Blit an RGB16 source onto an ARGB32 destination.  With partial
// const_alpha each pixel is widened to 32-bit and interpolated in C;
// at full alpha the pixman 0565->8888 routine does a converting copy.
// NOTE(review): braces and some loop lines are elided in this view.
184 void qt_blend_rgb16_on_argb32_neon(uchar *destPixels, int dbpl,
185 const uchar *srcPixels, int sbpl,
192 quint32 *dst = (quint32 *) destPixels;
193 quint16 *src = (quint16 *) srcPixels;
195 if (const_alpha != 256) {
// Rescale 0..256 alpha onto the 0..255 scale INTERPOLATE_PIXEL_255 uses.
196 quint8 a = (255 * const_alpha) >> 8;
200 for (int x=0; x<w; ++x)
201 dst[x] = INTERPOLATE_PIXEL_255(qConvertRgb16To32(src[x]), a, dst[x], ia);
// const_alpha == 256: straight format-converting copy via pixman.
208 pixman_composite_src_0565_8888_asm_neon(w, h, dst, dbpl, src, sbpl);
// Forward declaration of the portable RGB16-on-RGB16 blend from
// qblendfunctions.cpp; used below as the wide-image / partial-alpha fallback.
211 // qblendfunctions.cpp
212 void qt_blend_rgb16_on_rgb16(uchar *dst, int dbpl,
213 const uchar *src, int sbpl,
// Copy one scanline of N quint16 pixels using 32-bit word copies,
// prefetching the next destination line for writing.
// NOTE(review): N appears to be a compile-time template width whose
// declaration is elided here; the casts assume 32-bit-aligned dst/src.
218 static inline void scanLineBlit16(quint16 *dst, quint16 *src, int dstride)
221 ((quint32 *)dst)[0] = ((quint32 *)src)[0];
// Prefetch next destination scanline (write hint 1, low temporal locality 0).
222 __builtin_prefetch(dst + dstride, 1, 0);
224 for (int i = 1; i < N/2; ++i)
225 ((quint32 *)dst)[i] = ((quint32 *)src)[i];
// Blit a Width x h block of quint16 pixels: an unaligned leading column is
// handled separately (Width-1 path on dst+1/src+1), otherwise full
// scanlines are copied.  NOTE(review): this block is heavily elided —
// the alignment test and loop structure are not visible here.
231 static inline void blockBlit16(quint16 *dst, quint16 *src, int dstride, int sstride, int h)
245 scanLineBlit16<Width-1>(dst + 1, src + 1, dstride);
251 scanLineBlit16<Width>(dst, src, dstride);
// RGB16-on-RGB16 blit.  Wide images (>= 150 px) or partial alpha go to the
// portable implementation (plain memcpy wins there); narrow opaque images
// use the compile-time block blitter or the pixman 0565 copy.
// NOTE(review): the switch using BLOCKBLIT is elided in this view.
259 void qt_blend_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
260 const uchar *srcPixels, int sbpl,
264 // testing show that the default memcpy is faster for widths 150 and up
265 if (const_alpha != 256 || w >= 150) {
266 qt_blend_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
// Strides in 16-bit pixels rather than bytes from here on.
270 int dstride = dbpl / 2;
271 int sstride = sbpl / 2;
273 quint16 *dst = (quint16 *) destPixels;
274 quint16 *src = (quint16 *) srcPixels;
// Dispatch small fixed widths to the templated blitter.
277 #define BLOCKBLIT(n) case n: blockBlit16<n>(dst, src, dstride, sstride, h); return;
298 pixman_composite_src_0565_0565_asm_neon (w, h, dst, dstride, src, sstride);
// Assembly helper: blends exactly 8 ARGB32 source pixels onto 8 RGB16
// destination pixels with a constant alpha (0..256 scale).
301 extern "C" void blend_8_pixels_argb32_on_rgb16_neon(quint16 *dst, const quint32 *src, int const_alpha);
// ARGB32-over-RGB16 blend.  Partial const_alpha: 8 pixels at a time via
// the assembly helper, staging the < 8-pixel row tail through fixed stack
// buffers so the helper never reads or writes beyond the row.  Full alpha:
// a single pixman over_8888_0565 call covers the whole rectangle.
303 void qt_blend_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
304 const uchar *srcPixels, int sbpl,
308 quint16 *dst = (quint16 *) destPixels;
309 quint32 *src = (quint32 *) srcPixels;
311 if (const_alpha != 256) {
312 for (int y=0; y<h; ++y) {
313 // 8 aligned pixels per call while a full group remains.
314 for (; i < w-7; i += 8)
315 blend_8_pixels_argb32_on_rgb16_neon(&dst[i], &src[i], const_alpha);
// Tail (< 8 pixels): copy into 8-slot buffers, blend, copy back.
320 quint16 dstBuffer[8];
321 quint32 srcBuffer[8];
323 for (int j = 0; j < tail; ++j) {
324 dstBuffer[j] = dst[i + j];
325 srcBuffer[j] = src[i + j];
328 blend_8_pixels_argb32_on_rgb16_neon(dstBuffer, srcBuffer, const_alpha);
330 for (int j = 0; j < tail; ++j)
331 dst[i + j] = dstBuffer[j];
// Advance one scanline; dbpl/sbpl are byte strides.
334 dst = (quint16 *)(((uchar *) dst) + dbpl);
335 src = (quint32 *)(((uchar *) src) + sbpl);
// const_alpha == 256: pixman takes strides in pixels (dbpl/2, sbpl/4).
340 pixman_composite_over_8888_0565_asm_neon(w, h, dst, dbpl / 2, src, sbpl / 4);
// Single-scanline ARGB32 SourceOver: pixman fast path at full opacity,
// otherwise delegate to the rectangle blender on a 1-pixel-high span.
343 void qt_blend_argb32_on_argb32_scanline_neon(uint *dest, const uint *src, int length, uint const_alpha)
345 if (const_alpha == 255) {
346 pixman_composite_scanline_over_asm_neon(length, dest, src);
// Map 0..255 alpha onto the 0..256 scale the rectangle blender expects.
348 qt_blend_argb32_on_argb32_neon((uchar *)dest, 4 * length, (uchar *)src, 4 * length, length, 1, (const_alpha * 256) / 255);
// ARGB32-over-ARGB32 SourceOver of a w x h rectangle.
// const_alpha == 256: whole job handed to pixman (strides in pixels).
// Otherwise: scale src by const_alpha, then src-over, 4 pixels per NEON
// iteration, with a scalar tail; groups of four fully transparent source
// pixels are skipped entirely.
352 void qt_blend_argb32_on_argb32_neon(uchar *destPixels, int dbpl,
353 const uchar *srcPixels, int sbpl,
357 const uint *src = (const uint *) srcPixels;
358 uint *dst = (uint *) destPixels;
// Constants for rounded /255 and for the (255 - alpha) computation.
359 uint16x8_t half = vdupq_n_u16(0x80);
360 uint16x8_t full = vdupq_n_u16(0xff);
361 if (const_alpha == 256) {
362 pixman_composite_over_8888_8888_asm_neon(w, h, (uint32_t *)destPixels, dbpl / 4, (uint32_t *)srcPixels, sbpl / 4);
363 } else if (const_alpha != 0) {
// Rescale 0..256 alpha onto 0..255 for the byte multiplies.
364 const_alpha = (const_alpha * 255) >> 8;
365 uint16x8_t const_alpha16 = vdupq_n_u16(const_alpha);
366 for (int y = 0; y < h; ++y) {
368 for (; x < w-3; x += 4) {
// Skip groups that are entirely transparent (all four words zero).
369 if (src[x] | src[x+1] | src[x+2] | src[x+3]) {
370 uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
371 uint32x4_t dst32 = vld1q_u32((uint32_t *)&dst[x]);
// Widen each 16-byte vector into two 8-lane 16-bit vectors.
373 const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
374 const uint8x16_t dst8 = vreinterpretq_u8_u32(dst32);
376 const uint8x8_t src8_low = vget_low_u8(src8);
377 const uint8x8_t dst8_low = vget_low_u8(dst8);
379 const uint8x8_t src8_high = vget_high_u8(src8);
380 const uint8x8_t dst8_high = vget_high_u8(dst8);
382 const uint16x8_t src16_low = vmovl_u8(src8_low);
383 const uint16x8_t dst16_low = vmovl_u8(dst8_low);
385 const uint16x8_t src16_high = vmovl_u8(src8_high);
386 const uint16x8_t dst16_high = vmovl_u8(dst8_high);
// src *= const_alpha; premultiplied channels all scale uniformly.
388 const uint16x8_t srcalpha16_low = qvbyte_mul_u16(src16_low, const_alpha16, half);
389 const uint16x8_t srcalpha16_high = qvbyte_mul_u16(src16_high, const_alpha16, half);
391 const uint16x8_t result16_low = qvsource_over_u16(srcalpha16_low, dst16_low, half, full);
392 const uint16x8_t result16_high = qvsource_over_u16(srcalpha16_high, dst16_high, half, full);
// Narrow back to bytes and store the four blended pixels.
394 const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
395 const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));
397 vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
// Scalar tail: same math, one pixel at a time (qAlpha(~s) == 255 - alpha).
403 s = BYTE_MUL(s, const_alpha);
404 dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
// Next scanline; dbpl/sbpl are byte strides.
407 dst = (quint32 *)(((uchar *) dst) + dbpl);
408 src = (const quint32 *)(((const uchar *) src) + sbpl);
// Forward declaration of the portable RGB32 rect blend from
// qblendfunctions.cpp; used below for the fully opaque case.
413 // qblendfunctions.cpp
414 void qt_blend_rgb32_on_rgb32(uchar *destPixels, int dbpl,
415 const uchar *srcPixels, int sbpl,
// RGB32-on-RGB32 blend with constant alpha.  Partial alpha:
// dst = (src*a + dst*(255-a)) / 255 with rounding, 4 pixels per NEON
// iteration plus a scalar tail.  const_alpha == 256 falls through to the
// portable copy; const_alpha == 0 draws nothing.
419 void qt_blend_rgb32_on_rgb32_neon(uchar *destPixels, int dbpl,
420 const uchar *srcPixels, int sbpl,
424 if (const_alpha != 256) {
425 if (const_alpha != 0) {
426 const uint *src = (const uint *) srcPixels;
427 uint *dst = (uint *) destPixels;
428 uint16x8_t half = vdupq_n_u16(0x80);
// Rescale 0..256 alpha onto 0..255.
429 const_alpha = (const_alpha * 255) >> 8;
430 int one_minus_const_alpha = 255 - const_alpha;
431 uint16x8_t const_alpha16 = vdupq_n_u16(const_alpha);
432 uint16x8_t one_minus_const_alpha16 = vdupq_n_u16(255 - const_alpha);
433 for (int y = 0; y < h; ++y) {
435 for (; x < w-3; x += 4) {
436 uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
437 uint32x4_t dst32 = vld1q_u32((uint32_t *)&dst[x]);
// Widen bytes to 16-bit lanes for the interpolation arithmetic.
439 const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
440 const uint8x16_t dst8 = vreinterpretq_u8_u32(dst32);
442 const uint8x8_t src8_low = vget_low_u8(src8);
443 const uint8x8_t dst8_low = vget_low_u8(dst8);
445 const uint8x8_t src8_high = vget_high_u8(src8);
446 const uint8x8_t dst8_high = vget_high_u8(dst8);
448 const uint16x8_t src16_low = vmovl_u8(src8_low);
449 const uint16x8_t dst16_low = vmovl_u8(dst8_low);
451 const uint16x8_t src16_high = vmovl_u8(src8_high);
452 const uint16x8_t dst16_high = vmovl_u8(dst8_high);
454 const uint16x8_t result16_low = qvinterpolate_pixel_255(src16_low, const_alpha16, dst16_low, one_minus_const_alpha16, half);
455 const uint16x8_t result16_high = qvinterpolate_pixel_255(src16_high, const_alpha16, dst16_high, one_minus_const_alpha16, half);
// Narrow back to bytes and store four pixels.
457 const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
458 const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));
460 vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
// Scalar tail for the last w % 4 pixels.
464 s = BYTE_MUL(s, const_alpha);
465 dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
// Next scanline; strides are in bytes.
467 dst = (quint32 *)(((uchar *) dst) + dbpl);
468 src = (const quint32 *)(((const uchar *) src) + sbpl);
// Fully opaque: plain rect copy via the portable implementation.
472 qt_blend_rgb32_on_rgb32(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
// Blit an 8-bit alpha map filled with a solid color onto an RGB16 raster
// buffer, delegating to pixman's over_n_8_0565 routine.
476 void qt_alphamapblit_quint16_neon(QRasterBuffer *rasterBuffer,
477 int x, int y, quint32 color,
479 int mapWidth, int mapHeight, int mapStride,
482 quint16 *dest = reinterpret_cast<quint16*>(rasterBuffer->scanLine(y)) + x;
// Destination stride in 16-bit pixels.
483 const int destStride = rasterBuffer->bytesPerLine() / sizeof(quint16);
// pixman's signature takes a non-const mask pointer; presumably the mask
// is only read — TODO(review): confirm against the asm routine.
485 uchar *mask = const_cast<uchar *>(bitmap);
487 pixman_composite_over_n_8_0565_asm_neon(mapWidth, mapHeight, dest, destStride, color, 0, mask, mapStride);
// Assembly helper: blends exactly 8 RGB16 source pixels onto 8 RGB16
// destination pixels with a constant alpha.
490 extern "C" void blend_8_pixels_rgb16_on_rgb16_neon(quint16 *dst, const quint16 *src, int const_alpha);
// Pixel-writer functor for the generic scale/transform templates: buffers
// incoming SRC pixels eight at a time and hands complete groups to an
// 8-pixel NEON blender; flush() drains a partial group through stack
// buffers so the blender never touches memory outside the row.
// NOTE(review): member fields and the m_index reset are elided in this view.
492 template <typename SRC, typename BlendFunc>
493 struct Blend_on_RGB16_SourceAndConstAlpha_Neon {
494 Blend_on_RGB16_SourceAndConstAlpha_Neon(BlendFunc blender, int const_alpha)
497 , m_const_alpha(const_alpha)
// Accumulate one source pixel; once 8 are buffered, blend them onto the
// 8 destination pixels ending at `dst` (hence dst - 7).
501 inline void write(quint16 *dst, quint32 src)
503 srcBuffer[m_index++] = src;
506 m_blender(dst - 7, srcBuffer, m_const_alpha);
// Blend any remaining buffered pixels (< 8) via temporary buffers.
511 inline void flush(quint16 *dst)
514 quint16 dstBuffer[8];
515 for (int i = 0; i < m_index; ++i)
516 dstBuffer[i] = dst[i - m_index];
518 m_blender(dstBuffer, srcBuffer, m_const_alpha);
520 for (int i = 0; i < m_index; ++i)
521 dst[i - m_index] = dstBuffer[i];
// Factory function so call sites get template-argument deduction for
// BlendFunc (only SRC must be spelled out).
534 template <typename SRC, typename BlendFunc>
535 Blend_on_RGB16_SourceAndConstAlpha_Neon<SRC, BlendFunc>
536 Blend_on_RGB16_SourceAndConstAlpha_Neon_create(BlendFunc blender, int const_alpha)
538 return Blend_on_RGB16_SourceAndConstAlpha_Neon<SRC, BlendFunc>(blender, const_alpha);
// Scaled ARGB32-over-RGB16 blit: drives the generic 16-bit scaler with
// the 8-pixel ARGB32 NEON blender wrapped in the buffering functor.
541 void qt_scale_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
542 const uchar *srcPixels, int sbpl,
543 const QRectF &targetRect,
544 const QRectF &sourceRect,
// Fully transparent: nothing to draw.
548 if (const_alpha == 0)
551 qt_scale_image_16bit<quint32>(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip,
552 Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint32>(blend_8_pixels_argb32_on_rgb16_neon, const_alpha));
// Forward declaration of the portable scaled RGB16 blit; used below for
// the fully opaque case.
555 void qt_scale_image_rgb16_on_rgb16(uchar *destPixels, int dbpl,
556 const uchar *srcPixels, int sbpl,
557 const QRectF &targetRect,
558 const QRectF &sourceRect,
// Scaled RGB16-on-RGB16 blit with constant alpha.  Opaque images take the
// portable path; translucent ones use the buffered 8-pixel NEON blender.
562 void qt_scale_image_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
563 const uchar *srcPixels, int sbpl,
564 const QRectF &targetRect,
565 const QRectF &sourceRect,
// Fully transparent: nothing to draw.
569 if (const_alpha == 0)
572 if (const_alpha == 256) {
573 qt_scale_image_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip, const_alpha);
577 qt_scale_image_16bit<quint16>(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip,
578 Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint16>(blend_8_pixels_rgb16_on_rgb16_neon, const_alpha));
// Forward declaration of the portable transformed RGB16 blit; used below
// for the fully opaque case.
581 extern void qt_transform_image_rgb16_on_rgb16(uchar *destPixels, int dbpl,
582 const uchar *srcPixels, int sbpl,
583 const QRectF &targetRect,
584 const QRectF &sourceRect,
586 const QTransform &targetRectTransform,
// Transformed RGB16-on-RGB16 blit: opaque case uses the portable routine;
// translucent case drives the generic transformer with the buffered
// 8-pixel RGB16 NEON blender.
589 void qt_transform_image_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
590 const uchar *srcPixels, int sbpl,
591 const QRectF &targetRect,
592 const QRectF &sourceRect,
594 const QTransform &targetRectTransform,
// Fully transparent: nothing to draw.
597 if (const_alpha == 0)
600 if (const_alpha == 256) {
601 qt_transform_image_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip, targetRectTransform, const_alpha);
605 qt_transform_image(reinterpret_cast<quint16 *>(destPixels), dbpl,
606 reinterpret_cast<const quint16 *>(srcPixels), sbpl, targetRect, sourceRect, clip, targetRectTransform,
607 Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint16>(blend_8_pixels_rgb16_on_rgb16_neon, const_alpha));
// Transformed ARGB32-over-RGB16 blit via the generic transformer and the
// buffered 8-pixel ARGB32 NEON blender.
610 void qt_transform_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
611 const uchar *srcPixels, int sbpl,
612 const QRectF &targetRect,
613 const QRectF &sourceRect,
615 const QTransform &targetRectTransform,
// Fully transparent: nothing to draw.
618 if (const_alpha == 0)
621 qt_transform_image(reinterpret_cast<quint16 *>(destPixels), dbpl,
622 reinterpret_cast<const quint32 *>(srcPixels), sbpl, targetRect, sourceRect, clip, targetRectTransform,
623 Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint32>(blend_8_pixels_argb32_on_rgb16_neon, const_alpha));
// Expand exactly 8 RGB565 pixels to opaque ARGB32 with NEON inline asm.
// The 5/6-bit channels are widened to 8 bits by replicating their top
// bits (vsri); alpha is forced to 0xff before the interleaved store.
// NOTE(review): the asm statement header and the alpha-fill instructions
// are elided in this view (note the "r2" clobber).
626 static inline void convert_8_pixels_rgb16_to_argb32(quint32 *dst, const quint16 *src)
629 "vld1.16 { d0, d1 }, [%[SRC]]\n\t"
631 /* convert 8 r5g6b5 pixel data from {d0, d1} to planar 8-bit format
632 and put data into d4 - red, d3 - green, d2 - blue */
633 "vshrn.u16 d4, q0, #8\n\t"
634 "vshrn.u16 d3, q0, #3\n\t"
635 "vsli.u16 q0, q0, #5\n\t"
636 "vsri.u8 d4, d4, #5\n\t"
637 "vsri.u8 d3, d3, #6\n\t"
638 "vshrn.u16 d2, q0, #2\n\t"
640 /* fill d5 - alpha with 0xff */
// Interleaved store: byte order b,g,r,a == little-endian ARGB32.
644 "vst4.8 { d2, d3, d4, d5 }, [%[DST]]"
645 : : [DST]"r" (dst), [SRC]"r" (src)
646 : "memory", "r2", "d0", "d1", "d2", "d3", "d4", "d5"
// destFetch hook for RGB16 raster buffers: reads `length` 565 pixels at
// (x, y) and expands them to ARGB32 into `buffer`, 8 at a time, staging
// the < 8-pixel tail through stack buffers so the converter never reads
// past the end of the row.
650 uint * QT_FASTCALL qt_destFetchRGB16_neon(uint *buffer, QRasterBuffer *rasterBuffer, int x, int y, int length)
652 const ushort *data = (const ushort *)rasterBuffer->scanLine(y) + x;
655 for (; i < length - 7; i += 8)
656 convert_8_pixels_rgb16_to_argb32(&buffer[i], &data[i]);
// Tail: stage the last few pixels through fixed 8-slot buffers.
659 quint16 srcBuffer[8];
660 quint32 dstBuffer[8];
662 int tail = length - i;
663 for (int j = 0; j < tail; ++j)
664 srcBuffer[j] = data[i + j];
666 convert_8_pixels_rgb16_to_argb32(dstBuffer, srcBuffer);
668 for (int j = 0; j < tail; ++j)
669 buffer[i + j] = dstBuffer[j];
// Pack exactly 8 ARGB32 pixels down to RGB565 with NEON inline asm.
// vld4.8 de-interleaves into d0=blue, d1=green, d2=red, d3=alpha
// (little-endian ARGB32); alpha is discarded.
// NOTE(review): the asm statement header is elided in this view.
675 static inline void convert_8_pixels_argb32_to_rgb16(quint16 *dst, const quint32 *src)
678 "vld4.8 { d0, d1, d2, d3 }, [%[SRC]]\n\t"
680 /* convert to r5g6b5 and store it into {d28, d29} */
// Widen each channel into the top of a 16-bit lane, then
// shift-right-insert green (bit 5) and blue (bit 0) below red.
681 "vshll.u8 q14, d2, #8\n\t"
682 "vshll.u8 q8, d1, #8\n\t"
683 "vshll.u8 q9, d0, #8\n\t"
684 "vsri.u16 q14, q8, #5\n\t"
685 "vsri.u16 q14, q9, #11\n\t"
687 "vst1.16 { d28, d29 }, [%[DST]]"
688 : : [DST]"r" (dst), [SRC]"r" (src)
689 : "memory", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d28", "d29"
// destStore hook for RGB16 raster buffers: packs `length` ARGB32 pixels
// from `buffer` back into the 565 scanline at (x, y), 8 at a time, with a
// stack-buffered tail so the converter never writes past the row.
693 void QT_FASTCALL qt_destStoreRGB16_neon(QRasterBuffer *rasterBuffer, int x, int y, const uint *buffer, int length)
695 quint16 *data = (quint16*)rasterBuffer->scanLine(y) + x;
698 for (; i < length - 7; i += 8)
699 convert_8_pixels_argb32_to_rgb16(&data[i], &buffer[i]);
// Tail (< 8 pixels) staged through fixed 8-slot buffers.
702 quint32 srcBuffer[8];
703 quint16 dstBuffer[8];
705 int tail = length - i;
706 for (int j = 0; j < tail; ++j)
707 srcBuffer[j] = buffer[i + j];
709 convert_8_pixels_argb32_to_rgb16(dstBuffer, srcBuffer);
711 for (int j = 0; j < tail; ++j)
712 data[i + j] = dstBuffer[j];
// Solid-color SourceOver over a scanline.  If both the color's alpha and
// const_alpha are saturated, the span is a plain fill; otherwise the color
// is premultiplied by const_alpha once and each destination pixel becomes
// color + dst * (255 - alpha(color)), four pixels per NEON iteration.
716 void QT_FASTCALL comp_func_solid_SourceOver_neon(uint *destPixels, int length, uint color, uint const_alpha)
// Opaque fast path: (a & b) == 255 iff both alphas are 255.
718 if ((const_alpha & qAlpha(color)) == 255) {
719 QT_MEMFILL_UINT(destPixels, length, color);
721 if (const_alpha != 255)
722 color = BYTE_MUL(color, const_alpha);
// qAlpha(~color) == 255 - qAlpha(color).
724 const quint32 minusAlphaOfColor = qAlpha(~color);
727 uint32_t *dst = (uint32_t *) destPixels;
728 const uint32x4_t colorVector = vdupq_n_u32(color);
729 uint16x8_t half = vdupq_n_u16(0x80);
730 const uint16x8_t minusAlphaOfColorVector = vdupq_n_u16(minusAlphaOfColor);
// 4 pixels per pass: widen dst, scale by (255 - alpha), narrow, add color.
732 for (; x < length-3; x += 4) {
733 uint32x4_t dstVector = vld1q_u32(&dst[x]);
735 const uint8x16_t dst8 = vreinterpretq_u8_u32(dstVector);
737 const uint8x8_t dst8_low = vget_low_u8(dst8);
738 const uint8x8_t dst8_high = vget_high_u8(dst8);
740 const uint16x8_t dst16_low = vmovl_u8(dst8_low);
741 const uint16x8_t dst16_high = vmovl_u8(dst8_high);
743 const uint16x8_t result16_low = qvbyte_mul_u16(dst16_low, minusAlphaOfColorVector, half);
744 const uint16x8_t result16_high = qvbyte_mul_u16(dst16_high, minusAlphaOfColorVector, half);
746 const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
747 const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));
749 uint32x4_t blendedPixels = vcombine_u32(result32_low, result32_high);
750 uint32x4_t colorPlusBlendedPixels = vaddq_u32(colorVector, blendedPixels);
751 vst1q_u32(&dst[x], colorPlusBlendedPixels);
// Scalar tail for the last length % 4 pixels.
754 for (;x < length; ++x)
755 destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
// Plus (additive) composition: dst = saturate(dst + src), optionally
// interpolated with the original dst by const_alpha.  Full alpha uses a
// short inline-asm loop (vqadd, 4 pixels per pass); partial alpha uses
// intrinsics.  Scalar helpers handle the per-pixel tails.
759 void QT_FASTCALL comp_func_Plus_neon(uint *dst, const uint *src, int length, uint const_alpha)
761 if (const_alpha == 255) {
762 uint *const end = dst + length;
// Last full group of 4 pixels the asm loop may process.
763 uint *const neonEnd = end - 3;
765 while (dst < neonEnd) {
// Saturating per-byte add of 4 pixels; SRC and DST post-increment.
767 "vld2.8 { d0, d1 }, [%[SRC]] !\n\t"
768 "vld2.8 { d2, d3 }, [%[DST]]\n\t"
769 "vqadd.u8 q0, q0, q1\n\t"
770 "vst2.8 { d0, d1 }, [%[DST]] !\n\t"
771 : [DST]"+r" (dst), [SRC]"+r" (src)
773 : "memory", "d0", "d1", "d2", "d3", "q0", "q1"
// Scalar tail, one pixel at a time.
778 *dst = comp_func_Plus_one_pixel(*dst, *src);
// Partial alpha: result = lerp(dst, saturate(dst + src), const_alpha).
784 const int one_minus_const_alpha = 255 - const_alpha;
785 const uint16x8_t constAlphaVector = vdupq_n_u16(const_alpha);
786 const uint16x8_t oneMinusconstAlphaVector = vdupq_n_u16(one_minus_const_alpha);
788 const uint16x8_t half = vdupq_n_u16(0x80);
789 for (; x < length - 3; x += 4) {
790 const uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
791 const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
792 uint8x16_t dst8 = vld1q_u8((uint8_t *)&dst[x]);
// Saturating per-byte sum.
793 uint8x16_t result = vqaddq_u8(dst8, src8);
795 uint16x8_t result_low = vmovl_u8(vget_low_u8(result));
796 uint16x8_t result_high = vmovl_u8(vget_high_u8(result));
798 uint16x8_t dst_low = vmovl_u8(vget_low_u8(dst8));
799 uint16x8_t dst_high = vmovl_u8(vget_high_u8(dst8));
801 result_low = qvinterpolate_pixel_255(result_low, constAlphaVector, dst_low, oneMinusconstAlphaVector, half);
802 result_high = qvinterpolate_pixel_255(result_high, constAlphaVector, dst_high, oneMinusconstAlphaVector, half);
804 const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result_low));
805 const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result_high));
806 vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
// Scalar tail for the last length % 4 pixels.
809 for (; x < length; ++x)
810 dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x], const_alpha, one_minus_const_alpha);
// Tile edge (in pixels) used by the 16-bit rotation routines below for
// cache-friendly traversal.
814 static const int tileSize = 32;
// Assembly helper: rotates an 8-column strip, writing eight destination
// rows four pixels at a time (see qt_memrotate90_16_neon).
816 extern "C" void qt_rotate90_16_neon(quint16 *dst, const quint16 *src, int sstride, int dstride, int count);
// Rotate a w x h 16-bit image 90 degrees into `destPixels`, traversed in
// tileSize x tileSize tiles for cache locality.  Stages: a leading
// `unaligned` strip copied scalar so destination rows become 32-bit
// aligned; 8-column strips via the assembly helper; remaining columns
// packed two pixels per 32-bit store; and a trailing `unoptimizedY` strip
// copied scalar.  NOTE(review): several interior lines (tile loop bodies,
// the packed store itself) are elided in this view.
818 void qt_memrotate90_16_neon(const uchar *srcPixels, int w, int h, int sstride, uchar *destPixels, int dstride)
820 const ushort *src = (const ushort *)srcPixels;
821 ushort *dest = (ushort *)destPixels;
// Strides in 16-bit pixels from here on.
823 sstride /= sizeof(ushort);
824 dstride /= sizeof(ushort);
// Number of 16-bit pixels per packed 32-bit write.
826 const int pack = sizeof(quint32) / sizeof(ushort);
// Rows to copy one-by-one until `dest` is 4-byte aligned (capped at h).
827 const int unaligned =
828 qMin(uint((quintptr(dest) & (sizeof(quint32)-1)) / sizeof(ushort)), uint(h));
829 const int restX = w % tileSize;
830 const int restY = (h - unaligned) % tileSize;
// Rows at the end that cannot fill a packed word.
831 const int unoptimizedY = restY % pack;
832 const int numTilesX = w / tileSize + (restX > 0);
833 const int numTilesY = (h - unaligned) / tileSize + (restY >= pack);
835 for (int tx = 0; tx < numTilesX; ++tx) {
// Columns are walked right-to-left; (w - x - 1) is the destination row.
836 const int startx = w - tx * tileSize - 1;
837 const int stopx = qMax(startx - tileSize, 0);
// Alignment strip: scalar copies of the first `unaligned` source rows.
840 for (int x = startx; x >= stopx; --x) {
841 ushort *d = dest + (w - x - 1) * dstride;
842 for (int y = 0; y < unaligned; ++y) {
843 *d++ = src[y * sstride + x];
848 for (int ty = 0; ty < numTilesY; ++ty) {
849 const int starty = ty * tileSize + unaligned;
850 const int stopy = qMin(starty + tileSize, h - unoptimizedY);
853 // qt_rotate90_16_neon writes to eight rows, four pixels at a time
854 for (; x >= stopx + 7; x -= 8) {
855 ushort *d = dest + (w - x - 1) * dstride + starty;
856 const ushort *s = &src[starty * sstride + x - 7];
// Strides passed in bytes (hence * 2) to the asm helper.
857 qt_rotate90_16_neon(d, s, sstride * 2, dstride * 2, stopy - starty);
// Remaining columns: pack `pack` pixels into one 32-bit value per store.
860 for (; x >= stopx; --x) {
861 quint32 *d = reinterpret_cast<quint32*>(dest + (w - x - 1) * dstride + starty);
862 for (int y = starty; y < stopy; y += pack) {
863 quint32 c = src[y * sstride + x];
864 for (int i = 1; i < pack; ++i) {
// Bit position of pixel i within the packed word.
865 const int shift = (sizeof(int) * 8 / pack * i);
866 const ushort color = src[(y + i) * sstride + x];
// Trailing rows that cannot fill a packed word: scalar copy.
875 const int starty = h - unoptimizedY;
876 for (int x = startx; x >= stopx; --x) {
877 ushort *d = dest + (w - x - 1) * dstride + starty;
878 for (int y = starty; y < h; ++y) {
879 *d++ = src[y * sstride + x];
// Assembly helper for 270-degree strip rotation.  NOTE(review): the
// function below actually calls qt_rotate90_16_neon with negated strides;
// whether this symbol is used elsewhere is not visible in this view.
886 extern "C" void qt_rotate270_16_neon(quint16 *dst, const quint16 *src, int sstride, int dstride, int count);
// Rotate a w x h 16-bit image 270 degrees, mirroring the tiling strategy
// of qt_memrotate90_16_neon: alignment strip, 8-column strips via
// qt_rotate90_16_neon driven with negative strides, packed 32-bit column
// stores, and a trailing scalar strip.  NOTE(review): several interior
// lines are elided in this view.
888 void qt_memrotate270_16_neon(const uchar *srcPixels, int w, int h,
890 uchar *destPixels, int dstride)
892 const ushort *src = (const ushort *)srcPixels;
893 ushort *dest = (ushort *)destPixels;
// Strides in 16-bit pixels from here on.
895 sstride /= sizeof(ushort);
896 dstride /= sizeof(ushort);
898 const int pack = sizeof(quint32) / sizeof(ushort);
// Rows to copy one-by-one until `dest` is 4-byte aligned (capped at h).
899 const int unaligned =
900 qMin(uint((long(dest) & (sizeof(quint32)-1)) / sizeof(ushort)), uint(h));
901 const int restX = w % tileSize;
902 const int restY = (h - unaligned) % tileSize;
903 const int unoptimizedY = restY % pack;
904 const int numTilesX = w / tileSize + (restX > 0);
905 const int numTilesY = (h - unaligned) / tileSize + (restY >= pack);
907 for (int tx = 0; tx < numTilesX; ++tx) {
908 const int startx = tx * tileSize;
909 const int stopx = qMin(startx + tileSize, w);
// Alignment strip: scalar copies of the last `unaligned` source rows,
// walked bottom-up (270-degree orientation).
912 for (int x = startx; x < stopx; ++x) {
913 ushort *d = dest + x * dstride;
914 for (int y = h - 1; y >= h - unaligned; --y) {
915 *d++ = src[y * sstride + x];
920 for (int ty = 0; ty < numTilesY; ++ty) {
921 const int starty = h - 1 - unaligned - ty * tileSize;
922 const int stopy = qMax(starty - tileSize, unoptimizedY);
925 // qt_rotate90_16_neon writes to eight rows, four pixels at a time
// Reuse the 90-degree asm helper with negated byte strides.
926 for (; x < stopx - 7; x += 8) {
927 ushort *d = dest + x * dstride + h - 1 - starty;
928 const ushort *s = &src[starty * sstride + x];
929 qt_rotate90_16_neon(d + 7 * dstride, s, -sstride * 2, -dstride * 2, starty - stopy);
// Remaining columns: pack pixels into 32-bit stores, walking y downward.
932 for (; x < stopx; ++x) {
933 quint32 *d = reinterpret_cast<quint32*>(dest + x * dstride
935 for (int y = starty; y > stopy; y -= pack) {
936 quint32 c = src[y * sstride + x];
937 for (int i = 1; i < pack; ++i) {
// Bit position of pixel i within the packed word.
938 const int shift = (sizeof(int) * 8 / pack * i);
939 const ushort color = src[(y - i) * sstride + x];
// Trailing rows that cannot fill a packed word: scalar copy.
947 const int starty = unoptimizedY - 1;
948 for (int x = startx; x < stopx; ++x) {
949 ushort *d = dest + x * dstride + h - 1 - starty;
950 for (int y = starty; y >= 0; --y) {
951 *d++ = src[y * sstride + x];
// NEON lane-wise vector primitives implementing the generic SIMD
// interface (QSimdNeon) consumed by the templated radial-gradient fetcher
// below.  NOTE(review): the enclosing declaration (presumably a
// struct/class named QSimdNeon) is elided in this view.
961 typedef int32x4_t Int32x4;
962 typedef float32x4_t Float32x4;
// Unions giving scalar access to individual vector lanes.
964 union Vect_buffer_i { Int32x4 v; int i[4]; };
965 union Vect_buffer_f { Float32x4 v; float f[4]; };
967 static inline Float32x4 v_dup(float x) { return vdupq_n_f32(x); }
968 static inline Int32x4 v_dup(int x) { return vdupq_n_s32(x); }
969 static inline Int32x4 v_dup(uint x) { return vdupq_n_s32(x); }
971 static inline Float32x4 v_add(Float32x4 a, Float32x4 b) { return vaddq_f32(a, b); }
972 static inline Int32x4 v_add(Int32x4 a, Int32x4 b) { return vaddq_s32(a, b); }
974 static inline Float32x4 v_max(Float32x4 a, Float32x4 b) { return vmaxq_f32(a, b); }
975 static inline Float32x4 v_min(Float32x4 a, Float32x4 b) { return vminq_f32(a, b); }
976 static inline Int32x4 v_min_16(Int32x4 a, Int32x4 b) { return vminq_s32(a, b); }
978 static inline Int32x4 v_and(Int32x4 a, Int32x4 b) { return vandq_s32(a, b); }
980 static inline Float32x4 v_sub(Float32x4 a, Float32x4 b) { return vsubq_f32(a, b); }
981 static inline Int32x4 v_sub(Int32x4 a, Int32x4 b) { return vsubq_s32(a, b); }
983 static inline Float32x4 v_mul(Float32x4 a, Float32x4 b) { return vmulq_f32(a, b); }
// sqrt(x) as x * rsqrt(x): reciprocal-sqrt estimate refined by one
// Newton-Raphson step (vrsqrts), then multiplied back by x.
985 static inline Float32x4 v_sqrt(Float32x4 x) { Float32x4 y = vrsqrteq_f32(x); y = vmulq_f32(y, vrsqrtsq_f32(x, vmulq_f32(y, y))); return vmulq_f32(x, y); }
// Truncating float -> int conversion per lane.
987 static inline Int32x4 v_toInt(Float32x4 x) { return vcvtq_s32_f32(x); }
// Lane mask: all-ones where a >= b, zero elsewhere.
989 static inline Int32x4 v_greaterOrEqual(Float32x4 a, Float32x4 b) { return vreinterpretq_s32_u32(vcgeq_f32(a, b)); }
// Radial-gradient span fetcher: instantiates the generic SIMD template
// with the NEON vector primitives defined above.
992 const uint * QT_FASTCALL qt_fetch_radial_gradient_neon(uint *buffer, const Operator *op, const QSpanData *data,
993 int y, int x, int length)
995 return qt_fetch_radial_gradient_template<QRadialFetchSimd<QSimdNeon> >(buffer, op, data, y, x, length);
1000 #endif // QT_COMPILER_SUPPORTS_NEON