src/gui/painting/qdrawhelper_neon_asm.S

   1 /****************************************************************************
   2 **
   3 ** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
   4 ** All rights reserved.
   5 ** Contact: Nokia Corporation (qt-info@nokia.com)
   6 **
   7 ** This file is part of the QtGui module of the Qt Toolkit.
   8 **
   9 ** $QT_BEGIN_LICENSE:LGPL$
  10 ** GNU Lesser General Public License Usage
  11 ** This file may be used under the terms of the GNU Lesser General Public
  12 ** License version 2.1 as published by the Free Software Foundation and
  13 ** appearing in the file LICENSE.LGPL included in the packaging of this
  14 ** file. Please review the following information to ensure the GNU Lesser
  15 ** General Public License version 2.1 requirements will be met:
  16 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
  17 **
  18 ** In addition, as a special exception, Nokia gives you certain additional
  19 ** rights. These rights are described in the Nokia Qt LGPL Exception
  20 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
  21 **
  22 ** GNU General Public License Usage
  23 ** Alternatively, this file may be used under the terms of the GNU General
  24 ** Public License version 3.0 as published by the Free Software Foundation
  25 ** and appearing in the file LICENSE.GPL included in the packaging of this
  26 ** file. Please review the following information to ensure the GNU General
  27 ** Public License version 3.0 requirements will be met:
  28 ** http://www.gnu.org/copyleft/gpl.html.
  29 **
  30 ** Other Usage
  31 ** Alternatively, this file may be used in accordance with the terms and
  32 ** conditions contained in a signed written agreement between you and Nokia.
  33 **
  34 **
  35 **
  36 **
  37 **
  38 ** $QT_END_LICENSE$
  39 **
  40 ****************************************************************************/
  41
  /* On Linux/ELF, emit an empty .note.GNU-stack section so that this object
     does not cause the stack to become executable. */
#if defined(__ELF__) && defined(__linux__)
.section .note.GNU-stack,"",%progbits
#endif

/* Everything that follows is code, assembled for ARMv7-A with the NEON
   instruction set enabled.  .altmacro selects the assembler's alternate
   macro syntax. */
.text
.fpu neon
.arch armv7a
.altmacro
  51
  /*
   * void blend_8_pixels_argb32_on_rgb16_neon(quint16 *dst, const quint32 *src, int const_alpha)
   *
   * Blends eight 32-bit source pixels over eight r5g6b5 destination pixels
   * and writes the result back to dst.
   *
   *   r0 = dst, r1 = src, r2 = const_alpha
   *
   * The source is loaded de-interleaved: byte 0 of each pixel goes to d0
   * (blue), byte 1 to d1 (green), byte 2 to d2 (red) and byte 3 to d3
   * (alpha).  When const_alpha is 256 the source is used as is; otherwise
   * const_alpha is broadcast as a byte and every source plane, alpha
   * included, becomes (plane * const_alpha) >> 8.
   *
   * Scratch registers: d0-d7 and q8-q15.
   */

    .func blend_8_pixels_argb32_on_rgb16_neon
    .global blend_8_pixels_argb32_on_rgb16_neon
#ifdef __ELF__
    /* ELF only: keep the symbol hidden and mark it as a function */
    .hidden blend_8_pixels_argb32_on_rgb16_neon
    .type blend_8_pixels_argb32_on_rgb16_neon, %function
#endif
blend_8_pixels_argb32_on_rgb16_neon:
    /* q2 <- eight dst pixels, d0..d3 <- the four source byte planes */
    vld1.16     { d4, d5 }, [r0]
    vld4.8      { d0, d1, d2, d3 }, [r1]

    cmp         r2, #256
    beq         .Lblend_argb32_unscaled

    /* scale each source plane by const_alpha, one plane at a time */
    vdup.8      d6, r2

    vmull.u8    q8,  d6, d0
    vshrn.u16   d0,  q8,  #8
    vmull.u8    q9,  d6, d1
    vshrn.u16   d1,  q9,  #8
    vmull.u8    q10, d6, d2
    vshrn.u16   d2,  q10, #8
    vmull.u8    q11, d6, d3
    vshrn.u16   d3,  q11, #8

.Lblend_argb32_unscaled:
    /* d3 <- 255 - alpha: the weight applied to the destination */
    vmvn.8      d3, d3

    /* expand the r5g6b5 pixels in q2 to 8-bit planes:
       d6 - red, d7 - green, d30 - blue.  The low bits of each plane are
       filled by replicating the channel's top bits. */
    vshrn.u16   d6,  q2, #8
    vsri.u8     d6,  d6, #5
    vshrn.u16   d7,  q2, #3
    vsri.u8     d7,  d7, #6
    vsli.u16    q2,  q2, #5
    vshrn.u16   d30, q2, #2

    pld [r0, #128]

    /* dst plane * (255 - alpha), reduced back to 8 bits with rounding:
       d20 - red, d23 - green, d22 - blue.
       The green pass must complete before the blue pass writes d22,
       because d22 is the low half of the green product q11. */
    vmull.u8    q10, d3, d6
    vrshr.u16   q13, q10, #8
    vraddhn.u16 d20, q10, q13

    vmull.u8    q11, d3, d7
    vrshr.u16   q3,  q11, #8
    vraddhn.u16 d23, q11, q3

    vmull.u8    q12, d3, d30
    vrshr.u16   q15, q12, #8
    vraddhn.u16 d22, q12, q15

    /* saturating add of the source planes:
       d16 - red, d19 - green, d18 - blue */
    vqadd.u8    d16, d2, d20
    vqadd.u8    q9,  q0, q11

    /* pack the three planes back into r5g6b5 in {d28, d29}.
       d19 has to be widened before q9 (d18/d19) is overwritten. */
    vshll.u8    q14, d16, #8
    vshll.u8    q8,  d19, #8
    vshll.u8    q9,  d18, #8
    vsri.u16    q14, q8,  #5
    vsri.u16    q14, q9,  #11

    vst1.16     { d28, d29 }, [r0]

    bx          lr

    .endfunc
 119
 /*
  * void blend_8_pixels_rgb16_on_rgb16_neon(quint16 *dst, const quint16 *src, int const_alpha)
  *
  * Mixes eight r5g6b5 source pixels into eight r5g6b5 destination pixels.
  * Per channel, on values expanded to 8 bits:
  *
  *   dst = ((src * const_alpha) >> 8) + ((dst * (256 - const_alpha)) >> 8)
  *
  *   r0 = dst, r1 = src, r2 = const_alpha (0..256)
  *
  * Both weights are broadcast as single bytes, so a weight of 256 would
  * wrap around to 0.  The two end points are therefore handled before the
  * arithmetic: const_alpha == 0 returns with dst untouched, and
  * const_alpha == 256 copies the eight source pixels to dst.
  *
  * Scratch registers: r3, d0-d7 and d16-d30.
  */

    .func blend_8_pixels_rgb16_on_rgb16_neon
    .global blend_8_pixels_rgb16_on_rgb16_neon
    /* For ELF format also set function visibility to hidden */
#ifdef __ELF__
    .hidden blend_8_pixels_rgb16_on_rgb16_neon
    .type blend_8_pixels_rgb16_on_rgb16_neon, %function
#endif
blend_8_pixels_rgb16_on_rgb16_neon:
    /* const_alpha == 0: the source contributes nothing, keep dst as is */
    cmp         r2, #0
    beq         .Lblend_rgb16_done

    vld1.16     { d2, d3 }, [r1]

    /* const_alpha == 256: the result is exactly the source */
    cmp         r2, #256
    beq         .Lblend_rgb16_copy

    vld1.16     { d0, d1 }, [r0]

    /* d4 = const_alpha, d5 = 256 - const_alpha (both fit in a byte here) */
    rsb         r3, r2, #256
    vdup.8      d4, r2
    vdup.8      d5, r3

    /* convert 8 r5g6b5 pixel data from {d0, d1} to planar 8-bit format
       and put data into d6 - red, d7 - green, d30 - blue */
    vshrn.u16   d6,  q0,  #8
    vshrn.u16   d7,  q0,  #3
    vsli.u16    q0,  q0,  #5
    vsri.u8     d6,  d6,  #5
    vsri.u8     d7,  d7,  #6
    vshrn.u16   d30, q0,  #2

    /* same from {d2, d3} into d26 - red, d27 - green, d28 - blue */
    vshrn.u16   d26, q1,  #8
    vshrn.u16   d27, q1,  #3
    vsli.u16    q1,  q1,  #5
    vsri.u8     d26, d26, #5
    vsri.u8     d27, d27, #6
    vshrn.u16   d28, q1,  #2

    /* multiply dst by inv const_alpha */
    vmull.u8    q10, d5,  d6
    vmull.u8    q11, d5,  d7
    vmull.u8    q12, d5,  d30

    vshrn.u16   d6,  q10, #8
    vshrn.u16   d7,  q11, #8
    vshrn.u16   d30, q12, #8

    /* multiply src by const_alpha */
    vmull.u8    q10, d4,  d26
    vmull.u8    q11, d4,  d27
    vmull.u8    q12, d4,  d28

    vshrn.u16   d26, q10, #8
    vshrn.u16   d27, q11, #8
    vshrn.u16   d28, q12, #8

    /* preload dst + 128 */
    pld [r0, #128]

    /* add components, storing results in 8-bit planar format
       into d16 - red, d19 - green, d18 - blue */
    vadd.u8     d16, d26, d6
    vadd.u8     d19, d27, d7
    vadd.u8     d18, d28, d30

    /* convert the result to r5g6b5 and store it into {d28, d29};
       d19 is widened before q9 (d18/d19) is overwritten */
    vshll.u8    q14, d16, #8
    vshll.u8    q8,  d19, #8
    vshll.u8    q9,  d18, #8
    vsri.u16    q14, q8,  #5
    vsri.u16    q14, q9,  #11

    vst1.16     { d28, d29 }, [r0]

.Lblend_rgb16_done:
    bx          lr

.Lblend_rgb16_copy:
    /* q1 still holds the eight unmodified source pixels */
    vst1.16     { d2, d3 }, [r0]
    bx          lr

    .endfunc
 193
 /*
  * void qt_rotate90_16_neon(quint16 *dst, const quint16 *src, int sstride, int dstride, int count)
  *
  * Transposes `count` source rows of eight 16-bit pixels into eight
  * destination rows of `count` pixels each.  Destination row k receives
  * source column 7 - k, so dst row 0 holds the last pixel of every source row.
  *
  *   r0      = dst
  *   r1      = src
  *   r2      = sstride, added to the src pointer after each row (bytes)
  *   r3      = dstride, distance between destination rows (bytes)
  *   [sp]    = count, number of source rows
  *
  * Rows are consumed four at a time, then two at a time, then a single
  * row, so exactly `count` rows are read and exactly `count` pixels are
  * written to each destination row.  A count of zero or less does nothing.
  *
  * r4-r11 are saved and restored; q8-q11 are used as scratch.
  */
    .func qt_rotate90_16_neon
    .global qt_rotate90_16_neon
    /* For ELF format also set function visibility to hidden */
#ifdef __ELF__
    .hidden qt_rotate90_16_neon
    .type qt_rotate90_16_neon, %function
#endif
qt_rotate90_16_neon:
    push        { r4-r11, lr }
    /* count is the fifth argument: it sits just above the nine saved words */
    ldr         r5, [sp, #(9*4)]

    /* nothing to do for an empty or negative row count */
    cmp         r5, #0
    ble         .Lrotate90_16_done

    /* The preloads are the key to getting good performance */
    pld         [r1]

    /* r4 = number of four-row groups; r0, r6..r11, r3 = dst rows 0..7 */
    mov         r4, r5, asr #2
    add         r6, r0, r3
    add         r7, r6, r3

    add         r8, r7, r3
    add         r9, r8, r3

    pld         [r1, r2]

    add         r10, r9, r3
    add         r11, r10, r3

    add         r3, r3, r11
    /* r5 = rows left over after the four-row groups (0..3) */
    and         r5, r5, #3

    pld         [r1, r2, lsl #1]

    cmp         r4, #0
    beq         .Lrotate90_16_tail

.Lrotate90_16_loop:
    /* load four source rows into q8..q11 */
    vld1.16     { q8  }, [r1], r2

    pld         [r1, r2, lsl #1]

    vld1.16     { q9  }, [r1], r2
    vld1.16     { q10 }, [r1], r2
    vld1.16     { q11 }, [r1], r2

    pld         [r1]

    /* Could have used four quad-word zips instead,
       but those take three cycles as opposed to one. */
    vzip.16     d16, d20
    vzip.16     d17, d21

    vzip.16     d18, d22

    pld         [r1, r2]

    vzip.16     d19, d23

    vzip.16     d16, d18
    vzip.16     d17, d19

    pld         [r1, r2, lsl #1]

    vzip.16     d20, d22
    vzip.16     d21, d23

    /* each d register now holds one source column across the four rows;
       column 7 goes to dst row 0, column 0 to dst row 7 */
    vst1.16     { d23 }, [r0]!
    vst1.16     { d21 }, [r6]!
    vst1.16     { d19 }, [r7]!
    vst1.16     { d17 }, [r8]!
    vst1.16     { d22 }, [r9]!
    vst1.16     { d20 }, [r10]!
    vst1.16     { d18 }, [r11]!
    vst1.16     { d16 }, [r3]!

    subs        r4, r4, #1
    bne         .Lrotate90_16_loop

.Lrotate90_16_tail:
    /* two or three rows left: transpose one pair of rows */
    cmp         r5, #2
    blt         .Lrotate90_16_last_row

    vld1.16     { q8 }, [r1], r2
    vld1.16     { q9 }, [r1], r2

    vzip.16     d16, d18
    vzip.16     d17, d19

    /* every 32-bit lane holds one column of the two rows */
    vst1.32     { d19[1] }, [r0]!
    vst1.32     { d19[0] }, [r6]!
    vst1.32     { d17[1] }, [r7]!
    vst1.32     { d17[0] }, [r8]!
    vst1.32     { d18[1] }, [r9]!
    vst1.32     { d18[0] }, [r10]!
    vst1.32     { d16[1] }, [r11]!
    vst1.32     { d16[0] }, [r3]!

    sub         r5, r5, #2

.Lrotate90_16_last_row:
    /* r5 is 0 or 1 here; a single remaining row is scattered one pixel
       per destination row so nothing beyond `count` is read or written */
    cmp         r5, #0
    beq         .Lrotate90_16_done

    vld1.16     { q8 }, [r1]

    vst1.16     { d17[3] }, [r0]
    vst1.16     { d17[2] }, [r6]
    vst1.16     { d17[1] }, [r7]
    vst1.16     { d17[0] }, [r8]
    vst1.16     { d16[3] }, [r9]
    vst1.16     { d16[2] }, [r10]
    vst1.16     { d16[1] }, [r11]
    vst1.16     { d16[0] }, [r3]

.Lrotate90_16_done:
    pop         { r4-r11, pc }

    .endfunc