exynos4/libswconverter/csc_tiled_to_linear_deinterleave_crop_neon.s

   1 /*
   2  *
   3  * Copyright 2012 Samsung Electronics S.LSI Co. LTD
   4  *
   5  * Licensed under the Apache License, Version 2.0 (the "License")
   6  * you may not use this file except in compliance with the License.
   7  * You may obtain a copy of the License at
   8  *
   9  *      http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 /*
  19  * @file    csc_tiled_to_linear_deinterleave_crop_neon.s
  20  * @brief   SEC_OMX specific define
  21  * @author  ShinWon Lee (shinwon.lee@samsung.com)
  22  * @version 1.0
  23  * @history
  24  *   2012.02.01 : Create
  25  */
  26
  27 /*
  28  * Converts and Deinterleaves tiled data to linear
  29  * Crops left, top, right, buttom
  30  * 1. UV of NV12T to UV of YUV420P
  31  *
  32  * @param yuv420_u_dest
  33  *   U plane address of YUV420P[out]
  34  *
  35  * @param yuv420_v_dest
  36  *   V plane address of YUV420P[out]
  37  *
  38  * @param nv12t_src
  39  *   UV plane address of NV12T[in]
  40  *
  41  * @param yuv420_width
  42  *   Width of YUV420[in]
  43  *
  44  * @param yuv420_uv_height
  45  *   Height/2 of YUV420[in]
  46  *
  47  * @param left
  48  *   Crop size of left. It should be even.
  49  *
  50  * @param top
  51  *   Crop size of top. It should be even.
  52  *
  53  * @param right
  54  *   Crop size of right. It should be even.
  55  *
  56  * @param buttom
  57  *   Crop size of buttom. It should be even.
  58  */
  59
  60     .arch armv7-a
  61     .text
  62     .global csc_tiled_to_linear_deinterleave_crop_neon
  63     .type   csc_tiled_to_linear_deinterleave_crop_neon, %function
  64 csc_tiled_to_linear_deinterleave_crop_neon:
  65     .fnstart
  66
  67     @r0     yuv420_u_dest
  68     @r1     yuv420_v_dest
  69     @r2     nv12t_src
  70     @r3     yuv420_width
  71     @r4     yuv420_height
  72     @r5     i
  73     @r6     j
  74     @r7     tiled_offset
  75     @r8     tiled_offset1
  76     @r9     linear_offset
  77     @r10    temp1
  78     @r11    temp2
  79     @r12    temp3
  80     @r14    temp4
  81
  82     stmfd       sp!, {r4-r12,r14}       @ backup registers
  83
  84     ldr         r4, [sp, #40]           @ r4 = yuv420_height
  85
  86     ldr         r12, [sp, #52]          @ r12 = right
  87     ldr         r10, [sp, #44]          @ r10 = left
  88     sub         r12, r3, r12            @ temp3 = yuv420_width-right@
  89     sub         r10, r12, r10           @ temp1 = temp3-left@
  90     cmp         r10, #256               @ if (temp1 >= 256)
  91     blt         LOOP_HEIGHT_64_START
  92
  93     ldr         r5, [sp, #48]           @ top
  94 LOOP_HEIGHT_256:
  95     ldr         r6, [sp, #44]           @ j = left
  96     mov         r14, r5, asr #5         @ temp4 = i>>5
  97     bic         r12, r6, #0xFF          @ temp3 = (j>>8)<<8
  98     mov         r12, r12, asr #6        @ temp3 = temp3>>6
  99     and         r11, r14, #0x1          @ if (temp4 & 0x1)
 100     cmp         r11, #0x1
 101     bne         LOOP_HEIGHT_256_GET_TILED_EVEN
 102 LOOP_HEIGHT_256_GET_TILED_ODD:
 103     sub         r7, r14, #1             @ tiled_offset = temp4-1
 104     add         r10, r3, #127           @ temp1 = ((yuv420_width+127)>>7)<<7
 105     bic         r10, r10, #0x7F
 106     mov         r10, r10, asr #6        @ tiled_offset = tiled_offset*(temp1>>6)
 107     mul         r7, r7, r10
 108     add         r7, r7, r12             @ tiled_offset = tiled_offset+temp3
 109     add         r7, r7, #2              @ tiled_offset = tiled_offset+2
 110     bic         r10, r12, #0x3          @ temp1 = (temp3>>2)<<2
 111     add         r7, r7, r10             @ tiled_offset = tiled_offset+temp1
 112     mov         r7, r7, lsl #11         @ tiled_offset = tiled_offset<<11
 113     add         r8, r7, #4096           @ tiled_offset1 = tiled_offset+2048*2
 114     mov         r14, #8
 115     b           LOOP_HEIGHT_256_GET_TILED_END
 116
 117 LOOP_HEIGHT_256_GET_TILED_EVEN:
 118     add         r11, r4, #31            @ temp2 = ((yuv420_height+31)>>5)<<5
 119     bic         r11, r11, #0x1F
 120     add         r10, r5, #32            @ if ((i+32)<temp2)
 121     cmp         r10, r11
 122     bge         LOOP_HEIGHT_256_GET_TILED_EVEN1
 123     add         r10, r12, #2            @ temp1 = temp3+2
 124     bic         r10, r10, #0x3          @ temp1 = (temp1>>2)<<2
 125     add         r7, r12, r10            @ tiled_offset = temp3+temp1@
 126     add         r10, r3, #127           @ temp1 = ((yuv420_width+127)>>7)<<7
 127     bic         r10, r10, #0x7F
 128     mov         r10, r10, asr #6        @ tiled_offset = tiled_offset+temp4*(temp1>>6)
 129     mla         r7, r14, r10, r7
 130     mov         r7, r7, lsl #11         @ tiled_offset = tiled_offset<<11
 131     add         r8, r7, #12288          @ tiled_offset1 = tiled_offset+2048*6
 132     mov         r14, #8
 133     b           LOOP_HEIGHT_256_GET_TILED_END
 134
 135 LOOP_HEIGHT_256_GET_TILED_EVEN1:
 136     add         r10, r3, #127           @ temp1 = ((yuv420_width+127)>>7)<<7
 137     bic         r10, r10, #0x7F
 138     mov         r10, r10, asr #6        @ tiled_offset = temp4*(temp1>>6)
 139     mul         r7, r14, r10
 140     add         r7, r7, r12             @ tiled_offset = tiled_offset+temp3
 141     mov         r7, r7, lsl #11         @ tiled_offset = tiled_offset<<11
 142     add         r8, r7, #4096           @ tiled_offset1 = tiled_offset+2048*2
 143     mov         r14, #4
 144
 145 LOOP_HEIGHT_256_GET_TILED_END:
 146
 147     ldr         r12, [sp, #52]          @ right
 148     ldr         r9, [sp, #48]           @ top
 149     and         r10, r5, #0x1F          @ temp1 = i&0x1F
 150     add         r7, r7, r10, lsl #6     @ tiled_offset = tiled_offset+64*(temp1)
 151     add         r8, r8, r10, lsl #6     @ tiled_offset1 = tiled_offset1+64*(temp1)
 152     sub         r11, r3, r6             @ temp2 = yuv420_width-left(==j)-right
 153     sub         r11, r11, r12
 154     sub         r9, r5, r9              @ linear_offset = temp2*(i-top)/2@
 155     mul         r9, r11, r9
 156     mov         r9, r9, asr #1
 157     add         r12, r6, #256           @ temp3 = ((j+256)>>8)<<8@
 158     bic         r12, r12, #0xFF
 159     sub         r12, r12, r6            @ temp3 = temp3-j@
 160     and         r10, r6, #0x3F          @ temp1 = left(==j)&0x3F
 161
 162     cmp         r12, #192               @ if (temp3 > 192)
 163     ble         LOOP_HEIGHT_256_LEFT_192
 164     add         r11, r2, r7             @ r11 = nv12t_src+tiled_offset+temp1
 165     add         r11, r11, r10
 166     pld         [r11]
 167     add         r12, r2, r7             @ r12 = nv12t_src+tiled_offset+2048
 168     pld         [r11, #32]
 169     add         r12, r12, #2048
 170     pld         [r12]
 171     cmp         r10, #0
 172     pld         [r12, #32]
 173     stmnefd     sp!, {r8-r12, r14}      @ backup registers
 174     rsbne       r10, r10, #64
 175     blne        INTERLEAVED_MEMCOPY_UNDER_64
 176     ldmnefd     sp!, {r8-r12, r14}      @ restore registers
 177     bne         LOOP_HEIGHT_256_LEFT_256_64
 178     vld2.8      {q0, q1}, [r11]!        @ load {nv12t_src+tiled_offset+temp1, 64}
 179     vld2.8      {q2, q3}, [r11]
 180     add         r11, r0, r9             @ r11 = yuv420_u_dest+linear_offset
 181     vst1.8      {q0}, [r11]!
 182     vst1.8      {q2}, [r11]!
 183     add         r11, r1, r9             @ r11 = yuv420_v_dest+linear_offset
 184     vst1.8      {q1}, [r11]!
 185     vst1.8      {q3}, [r11]!
 186 LOOP_HEIGHT_256_LEFT_256_64:
 187     add         r11, r2, r8             @ r11 = nv12t_src+tiled_offset1
 188     pld         [r11]
 189     vld2.8      {q4, q5}, [r12]!        @ load {nv12t_src+tiled_offset+2048, 64}
 190     pld         [r11, #32]
 191     vld2.8      {q6, q7}, [r12]
 192     add         r12, r11, #2048         @ r12 = nv12t_src+tiled_offset1+2048
 193     pld         [r12]
 194     vld2.8      {q8, q9}, [r11]!        @ load {nv12t_src+tiled_offset1, 64}
 195     pld         [r12, #32]
 196     vld2.8      {q10, q11}, [r11]
 197     vld2.8      {q12, q13}, [r12]!      @ load {nv12t_src+tiled_offset1+2048, 64}
 198     vld2.8      {q14, q15}, [r12]
 199
 200     add         r11, r0, r9             @ r11 = yuv420_u_dest+linear_offset+32-temp1/2
 201     add         r11, r11, #32
 202     sub         r11, r11, r10, asr #1
 203     vst1.8      {q4}, [r11]!
 204     vst1.8      {q6}, [r11]!
 205     vst1.8      {q8}, [r11]!
 206     vst1.8      {q10}, [r11]!
 207     vst1.8      {q12}, [r11]!
 208     vst1.8      {q14}, [r11]!
 209
 210     add         r11, r1, r9             @ r11 = yuv420_v_dest+linear_offset+32-temp1/2
 211     add         r11, r11, #32
 212     sub         r11, r11, r10, asr #1
 213     vst1.8      {q5}, [r11]!
 214     vst1.8      {q7}, [r11]!
 215     vst1.8      {q9}, [r11]!
 216     vst1.8      {q11}, [r11]!
 217     vst1.8      {q13}, [r11]!
 218     vst1.8      {q15}, [r11]!
 219
 220     add         r9, r9, #128
 221     sub         r9, r9, r10, asr #1
 222     b           LOOP_HEIGHT_256_LEFT_END
 223
 224 LOOP_HEIGHT_256_LEFT_192:
 225     cmp         r12, #128               @ if (temp3 > 128)
 226     ble         LOOP_HEIGHT_256_LEFT_128
 227     add         r11, r2, r7             @ r11 = nv12t_src+tiled_offset+2048+temp1
 228     add         r11, r11, r10
 229     add         r11, r11, #2048
 230     pld         [r11]
 231     add         r12, r2, r8             @ r12 = nv12t_src+tiled_offset1
 232     pld         [r11, #32]
 233     cmp         r10, #0
 234     pld         [r12]
 235     stmnefd     sp!, {r8-r12, r14}      @ backup registers
 236     pld         [r12, #32]
 237     rsbne       r10, r10, #64
 238     blne        INTERLEAVED_MEMCOPY_UNDER_64
 239     ldmnefd     sp!, {r8-r12, r14}      @ restore registers
 240     bne         LOOP_HEIGHT_256_LEFT_192_64
 241     vld2.8      {q0, q1}, [r11]!        @ load {nv12t_src+tiled_offset+2048+temp1, 64}
 242     vld2.8      {q2, q3}, [r11]
 243     add         r11, r0, r9             @ r11 = yuv420_u_dest+linear_offset
 244     vst1.8      {q0}, [r11]!
 245     vst1.8      {q2}, [r11]!
 246     add         r11, r1, r9             @ r11 = yuv420_v_dest+linear_offset
 247     vst1.8      {q1}, [r11]!
 248     vst1.8      {q3}, [r11]!
 249 LOOP_HEIGHT_256_LEFT_192_64:
 250     add         r11, r2, r8             @ r11 = nv12t_src+tiled_offset1+2048
 251     add         r11, r11, #2048
 252     pld         [r11]
 253     vld2.8      {q4, q5}, [r12]!        @ load {nv12t_src+tiled_offset1, 64}
 254     pld         [r11, #32]
 255     vld2.8      {q6, q7}, [r12]
 256     vld2.8      {q8, q9}, [r11]!        @ load {nv12t_src+tiled_offset1+2048, 64}
 257     vld2.8      {q10, q11}, [r11]
 258
 259     add         r11, r0, r9             @ r11 = yuv420_u_dest+linear_offset+32-temp1/2
 260     add         r11, r11, #32
 261     sub         r11, r11, r10, asr #1
 262     vst1.8      {q4}, [r11]!
 263     vst1.8      {q6}, [r11]!
 264     vst1.8      {q8}, [r11]!
 265     vst1.8      {q10}, [r11]!
 266
 267     add         r11, r1, r9             @ r11 = yuv420_v_dest+linear_offset+32-temp1/2
 268     add         r11, r11, #32
 269     sub         r11, r11, r10, asr #1
 270     vst1.8      {q5}, [r11]!
 271     vst1.8      {q7}, [r11]!
 272     vst1.8      {q9}, [r11]!
 273     vst1.8      {q11}, [r11]!
 274
 275     add         r9, r9, #96
 276     sub         r9, r9, r10, asr #1
 277     b           LOOP_HEIGHT_256_LEFT_END
 278
 279 LOOP_HEIGHT_256_LEFT_128:
 280     cmp         r12, #64                @ if (temp3 > 64)
 281     ble         LOOP_HEIGHT_256_LEFT_64
 282     add         r11, r2, r8             @ r11 = nv12t_src+tiled_offset1+temp1
 283     add         r11, r11, r10
 284     pld         [r11]
 285     add         r12, r2, r8             @ r12 = nv12t_src+tiled_offset1
 286     add         r12, r12, #2048
 287     pld         [r11, #32]
 288     cmp         r10, #0
 289     pld         [r12]
 290     stmnefd     sp!, {r8-r12, r14}      @ backup registers
 291     pld         [r12, #32]
 292     rsbne       r10, r10, #64
 293     blne        INTERLEAVED_MEMCOPY_UNDER_64
 294     ldmnefd     sp!, {r8-r12, r14}      @ restore registers
 295     bne         LOOP_HEIGHT_256_LEFT_128_64
 296     vld2.8      {q0, q1}, [r11]!        @ load {nv12t_src+tiled_offset1+temp1, 64}
 297     vld2.8      {q2, q3}, [r11]
 298     add         r11, r0, r9             @ r11 = yuv420_u_dest+linear_offset
 299     vst1.8      {q0}, [r11]!
 300     vst1.8      {q2}, [r11]!
 301     add         r11, r1, r9             @ r11 = yuv420_v_dest+linear_offset
 302     vst1.8      {q1}, [r11]!
 303     vst1.8      {q3}, [r11]!
 304 LOOP_HEIGHT_256_LEFT_128_64:
 305     vld2.8      {q4, q5}, [r12]!        @ load {nv12t_src+tiled_offset1, 64}
 306     vld2.8      {q6, q7}, [r12]
 307
 308     add         r11, r0, r9             @ r11 = yuv420_u_dest+linear_offset+32-temp1/2
 309     add         r11, r11, #32
 310     sub         r11, r11, r10, asr #1
 311     vst1.8      {q4}, [r11]!
 312     vst1.8      {q6}, [r11]!
 313
 314     add         r11, r1, r9             @ r11 = yuv420_v_dest+linear_offset+32-temp1/2
 315     add         r11, r11, #32
 316     sub         r11, r11, r10, asr #1
 317     vst1.8      {q5}, [r11]!
 318     vst1.8      {q7}, [r11]!
 319
 320     add         r9, r9, #64
 321     sub         r9, r9, r10, asr #1
 322     b           LOOP_HEIGHT_256_LEFT_END
 323
 324 LOOP_HEIGHT_256_LEFT_64:
 325     add         r11, r2, r8             @ r11 = nv12t_src+tiled_offset1+2048+temp1
 326     add         r11, r11, #2048
 327     add         r11, r11, r10
 328     cmp         r10, #0
 329     pld         [r11]
 330     stmnefd     sp!, {r8-r12, r14}      @ backup registers
 331     pld         [r11, #32]
 332     rsbne       r10, r10, #64
 333     blne        INTERLEAVED_MEMCOPY_UNDER_64
 334     ldmnefd     sp!, {r8-r12, r14}      @ restore registers
 335     bne         LOOP_HEIGHT_256_LEFT_64_64
 336     vld2.8      {q0, q1}, [r11]!        @ load {nv12t_src+tiled_offset1+temp1, 64}
 337     vld2.8      {q2, q3}, [r11]
 338     add         r11, r0, r9             @ r11 = yuv420_dest+linear_offset
 339     vst1.8      {q0, q1}, [r11]!        @ store {yuv420_dest+linear_offset, 64}
 340     vst1.8      {q2, q3}, [r11]!
 341 LOOP_HEIGHT_256_LEFT_64_64:
 342     add         r9, r9, #32
 343     sub         r9, r9, r10, asr #1
 344
 345 LOOP_HEIGHT_256_LEFT_END:
 346
 347     ldr         r12, [sp, #52]          @ right
 348     add         r7, r7, r14, lsl #11    @ tiled_offset = tiled_offset+temp4*2048
 349     add         r10, r2, r7             @ r10 = nv12t_src+tiled_offset
 350     pld         [r10]
 351     bic         r6, r6, #0xFF           @ j = (left>>8)<<8
 352     pld         [r10, #32]
 353     add         r6, r6, #256            @ j = j + 256
 354     sub         r11, r3, r12            @ temp2 = yuv420_width-right-256
 355     sub         r11, r11, #256
 356     cmp         r6, r11
 357     bgt         LOOP_HEIGHT_256_WIDTH_END
 358
 359 LOOP_HEIGHT_256_WIDTH:
 360     add         r12, r10, #2048         @ r12 = nv12t_src+tiled_offset+2048
 361     pld         [r12]
 362     vld2.8      {q0, q1}, [r10]!        @ load {nv12t_src+tiled_offset, 64}
 363     pld         [r12, #32]
 364     vld2.8      {q2, q3}, [r10]
 365
 366     add         r8, r8, r14, lsl #11    @ tiled_offset1 = tiled_offset1+temp4*2048
 367     add         r10, r2, r8             @ r10 = nv12t_src+tiled_offset1
 368     pld         [r10]
 369     vld2.8      {q4, q5}, [r12]!        @ load {nv12t_src+tiled_offset+2048, 64}
 370     pld         [r10, #32]
 371     vld2.8      {q6, q7}, [r12]
 372
 373     add         r12, r10, #2048         @ r12 = nv12t_src+tiled_offset+2048
 374     pld         [r12]
 375     vld2.8      {q8, q9}, [r10]!        @ load {nv12t_src+tiled_offset+2048, 64}
 376     pld         [r12, #32]
 377     vld2.8      {q10, q11}, [r10]
 378
 379     add         r7, r7, r14, lsl #11    @ tiled_offset = tiled_offset+temp4*2048
 380     add         r10, r2, r7
 381     pld         [r10]
 382     vld2.8      {q12, q13}, [r12]!      @ load {nv12t_src+tiled_offset+2048, 64}
 383     pld         [r10, #32]
 384     vld2.8      {q14, q15}, [r12]
 385
 386     add         r12, r0, r9             @ r12 = yuv420_u_dest+linear_offset
 387     vst1.8      {q0}, [r12]!
 388     vst1.8      {q2}, [r12]!
 389     vst1.8      {q4}, [r12]!
 390     vst1.8      {q6}, [r12]!
 391     vst1.8      {q8}, [r12]!
 392     vst1.8      {q10}, [r12]!
 393     vst1.8      {q12}, [r12]!
 394     vst1.8      {q14}, [r12]!
 395     add         r12, r1, r9             @ r12 = yuv420_v_dest+linear_offset
 396     vst1.8      {q1}, [r12]!
 397     vst1.8      {q3}, [r12]!
 398     vst1.8      {q5}, [r12]!
 399     vst1.8      {q7}, [r12]!
 400     vst1.8      {q9}, [r12]!
 401     vst1.8      {q11}, [r12]!
 402     vst1.8      {q13}, [r12]!
 403     vst1.8      {q15}, [r12]!
 404     add         r9, r9, #128            @ linear_offset = linear_offset+128
 405
 406     add         r12, r10, #2048         @ r12 = nv12t_src+tiled_offset+2048
 407
 408     add         r6, r6, #256            @ j=j+256
 409     cmp         r6, r11                 @ j<=temp2
 410     ble         LOOP_HEIGHT_256_WIDTH
 411
 412 LOOP_HEIGHT_256_WIDTH_END:
 413
 414     add         r8, r8, r14, lsl #11    @ tiled_offset1 = tiled_offset1+temp4*2048
 415     ldr         r14, [sp, #52]          @ right
 416     sub         r11, r3, r6             @ temp2 = yuv420_width-right-j
 417     sub         r11, r11, r14
 418     cmp         r11, #0
 419     beq         LOOP_HEIGHT_256_RIGHT_END
 420     cmp         r11, #192
 421     ble         LOOP_HEIGHT_256_RIGHT_192
 422     add         r12, r10, #2048
 423     pld         [r12]
 424     vld2.8      {q0, q1}, [r10]!        @ load {nv12t_src+tiled_offset}
 425     pld         [r12, #32]
 426     vld2.8      {q2, q3}, [r10]
 427
 428     add         r10, r2, r8             @ r10 = nv12t_src+tiled_offset1
 429     pld         [r10]
 430     vld2.8      {q4, q5}, [r12]!        @ load {nv12t_src+tiled_offset+2048}
 431     pld         [r10, #32]
 432     vld2.8      {q6, q7}, [r12]
 433
 434     add         r14, r10, #2048         @ r10 = nv12t_src+tiled_offset1+2048
 435     pld         [r14]
 436     vld2.8      {q8, q9}, [r10]!        @ load {nv12t_src+tiled_offset1}
 437     pld         [r14, #32]
 438     vld2.8      {q10, q11}, [r10]
 439
 440     add         r12, r0, r9             @ r12 = yuv420_u_dest+linear_offset
 441     vst1.8      {q0}, [r12]!
 442     vst1.8      {q2}, [r12]!
 443     vst1.8      {q4}, [r12]!
 444     vst1.8      {q6}, [r12]!
 445     vst1.8      {q8}, [r12]!
 446     vst1.8      {q10}, [r12]!
 447     add         r12, r1, r9             @ r12 = yuv420_v_dest+linear_offset
 448     vst1.8      {q1}, [r12]!
 449     vst1.8      {q3}, [r12]!
 450     vst1.8      {q5}, [r12]!
 451     vst1.8      {q7}, [r12]!
 452     vst1.8      {q9}, [r12]!
 453     vst1.8      {q11}, [r12]!
 454     add         r9, r9, #96            @ linear_offset = linear_offset+96
 455
 456     stmfd       sp!, {r8-r12, r14}      @ backup registers
 457     sub         r10, r11, #192
 458     mov         r11, r14
 459     bl          INTERLEAVED_MEMCOPY_UNDER_64
 460     ldmfd       sp!, {r8-r12, r14}      @ restore registers
 461     b           LOOP_HEIGHT_256_RIGHT_END
 462
 463 LOOP_HEIGHT_256_RIGHT_192:
 464     cmp         r11, #128
 465     ble         LOOP_HEIGHT_256_RIGHT_128
 466     add         r12, r10, #2048
 467     pld         [r12]
 468     vld2.8      {q0, q1}, [r10]!        @ load {nv12t_src+tiled_offset}
 469     pld         [r12, #32]
 470     vld2.8      {q2, q3}, [r10]
 471
 472     add         r14, r2, r8             @ r10 = nv12t_src+tiled_offset1
 473     pld         [r14]
 474     vld2.8      {q4, q5}, [r12]!        @ load {nv12t_src+tiled_offset+2048}
 475     pld         [r14, #32]
 476     vld2.8      {q6, q7}, [r12]
 477
 478     add         r12, r0, r9             @ r12 = yuv420_u_dest+linear_offset
 479     vst1.8      {q0}, [r12]!
 480     vst1.8      {q2}, [r12]!
 481     vst1.8      {q4}, [r12]!
 482     vst1.8      {q6}, [r12]!
 483     add         r12, r1, r9             @ r12 = yuv420_v_dest+linear_offset
 484     vst1.8      {q1}, [r12]!
 485     vst1.8      {q3}, [r12]!
 486     vst1.8      {q5}, [r12]!
 487     vst1.8      {q7}, [r12]!
 488     add         r9, r9, #64            @ linear_offset = linear_offset+64
 489
 490     stmfd       sp!, {r8-r12, r14}      @ backup registers
 491     sub         r10, r11, #128
 492     mov         r11, r14
 493     bl          INTERLEAVED_MEMCOPY_UNDER_64
 494     ldmfd       sp!, {r8-r12, r14}      @ restore registers
 495     b           LOOP_HEIGHT_256_RIGHT_END
 496
 497 LOOP_HEIGHT_256_RIGHT_128:
 498     cmp         r11, #64
 499     ble         LOOP_HEIGHT_256_RIGHT_64
 500     add         r14, r10, #2048
 501     pld         [r14]
 502     vld2.8      {q0, q1}, [r10]!        @ load {nv12t_src+tiled_offset}
 503     pld         [r14, #32]
 504     vld2.8      {q2, q3}, [r10]
 505
 506     add         r12, r0, r9             @ r12 = yuv420_u_dest+linear_offset
 507     vst1.8      {q0}, [r12]!
 508     vst1.8      {q2}, [r12]!
 509     add         r12, r1, r9             @ r12 = yuv420_v_dest+linear_offset
 510     vst1.8      {q1}, [r12]!
 511     vst1.8      {q3}, [r12]!
 512     add         r9, r9, #32            @ linear_offset = linear_offset+32
 513
 514     stmfd       sp!, {r8-r12, r14}      @ backup registers
 515     sub         r10, r11, #64
 516     mov         r11, r14
 517     bl          INTERLEAVED_MEMCOPY_UNDER_64
 518     ldmfd       sp!, {r8-r12, r14}      @ restore registers
 519     b           LOOP_HEIGHT_256_RIGHT_END
 520
 521 LOOP_HEIGHT_256_RIGHT_64:
 522     stmfd       sp!, {r8-r12, r14}      @ backup registers
 523     mov         r14, r11
 524     mov         r11, r10
 525     mov         r10, r14
 526     bl          INTERLEAVED_MEMCOPY_UNDER_64
 527     ldmfd       sp!, {r8-r12, r14}      @ restore registers
 528
 529 LOOP_HEIGHT_256_RIGHT_END:
 530
 531     ldr         r14, [sp, #56]          @ buttom
 532     add         r5, r5, #1              @ i=i+1
 533     sub         r14, r4, r14            @ i<yuv420_height-buttom
 534     cmp         r5, r14
 535     blt         LOOP_HEIGHT_256
 536     b           RESTORE_REG
 537
 538 LOOP_HEIGHT_64_START:
 539     cmp         r10, #64                @ if (temp1 >= 64)
 540     blt         LOOP_HEIGHT_2_START
 541
 542     ldr         r5, [sp, #48]           @ i = top
 543 LOOP_HEIGHT_64:
 544     ldr         r6, [sp, #44]           @ j = left
 545     stmfd       sp!, {r0-r3, r12}       @ backup parameters
 546     mov         r0, r3
 547     mov         r1, r4
 548     mov         r2, r6
 549     mov         r3, r5
 550     bl          tile_4x2_read_asm
 551     mov         r7, r0
 552     ldmfd       sp!, {r0-r3, r12}       @ restore parameters
 553     ldr         r9, [sp, #48]           @ linear_offset = top
 554     ldr         r12, [sp, #52]          @ r12 = right
 555     add         r11, r6, #64            @ temp2 = ((j+64)>>6)<<6
 556     bic         r11, r11, #0x3F
 557     sub         r11, r11, r6            @ temp2 = temp2-j
 558     sub         r12, r3, r12            @ temp3 = yuv420_width-right
 559     sub         r14, r12, r6            @ temp4 = temp3-left
 560     sub         r9, r5, r9              @ linear_offset = temp4*(i-top)/2
 561     mul         r9, r9, r14
 562     mov         r9, r9, asr #1
 563     and         r14, r6, #0x3           @ temp4 = j&0x3
 564     add         r7, r7, r14             @ tiled_offset = tiled_offset+temp4
 565     stmfd       sp!, {r9-r12}           @ backup parameters
 566     mov         r10, r11
 567     add         r11, r2, r7
 568     bl          INTERLEAVED_MEMCOPY_UNDER_64
 569     ldmfd       sp!, {r9-r12}           @ restore parameters
 570     add         r9, r9, r11, asr #1     @ linear_offset = linear_offset+temp2/2
 571     add         r6, r6, r11             @ j = j+temp2@
 572
 573     add         r14, r6, #64
 574     cmp         r14, r12
 575     bgt         LOOP_HEIGHT_64_1
 576     stmfd       sp!, {r0-r3, r12}       @ backup parameters
 577     mov         r0, r3
 578     mov         r1, r4
 579     mov         r2, r6
 580     mov         r3, r5
 581     bl          tile_4x2_read_asm
 582     mov         r7, r0
 583     ldmfd       sp!, {r0-r3, r12}       @ restore parameters
 584     add         r7, r2, r7
 585     vld2.8      {q0, q1}, [r7]!
 586     vld2.8      {q2, q3}, [r7]
 587     add         r7, r0, r9
 588     vst1.8      {q0}, [r7]!
 589     vst1.8      {q2}, [r7]
 590     add         r7, r1, r9
 591     vst1.8      {q1}, [r7]!
 592     vst1.8      {q3}, [r7]
 593     add         r9, r9, #32
 594     add         r6, r6, #64
 595
 596 LOOP_HEIGHT_64_1:
 597     add         r14, r6, #64
 598     cmp         r14, r12
 599     bgt         LOOP_HEIGHT_64_2
 600     stmfd       sp!, {r0-r3, r12}       @ backup parameters
 601     mov         r0, r3
 602     mov         r1, r4
 603     mov         r2, r6
 604     mov         r3, r5
 605     bl          tile_4x2_read_asm
 606     mov         r7, r0
 607     ldmfd       sp!, {r0-r3, r12}       @ restore parameters
 608     add         r7, r2, r7
 609     vld2.8      {q0, q1}, [r7]!
 610     vld2.8      {q2, q3}, [r7]
 611     add         r7, r0, r9
 612     vst1.8      {q0}, [r7]!
 613     vst1.8      {q2}, [r7]
 614     add         r7, r1, r9
 615     vst1.8      {q1}, [r7]!
 616     vst1.8      {q3}, [r7]
 617     add         r9, r9, #32
 618     add         r6, r6, #64
 619
 620 LOOP_HEIGHT_64_2:
 621     cmp         r6, r12
 622     bge         LOOP_HEIGHT_64_3
 623     stmfd       sp!, {r0-r3, r12}       @ backup parameters
 624     mov         r0, r3
 625     mov         r1, r4
 626     mov         r2, r6
 627     mov         r3, r5
 628     bl          tile_4x2_read_asm
 629     mov         r7, r0
 630     ldmfd       sp!, {r0-r3, r12}       @ restore parameters
 631     sub         r11, r12, r6
 632     stmfd       sp!, {r9-r12}           @ backup parameters
 633     mov         r10, r11
 634     add         r11, r2, r7
 635     bl          INTERLEAVED_MEMCOPY_UNDER_64
 636     ldmfd       sp!, {r9-r12}           @ restore parameters
 637
 638 LOOP_HEIGHT_64_3:
 639
 640     ldr         r14, [sp, #56]          @ buttom
 641     add         r5, r5, #1              @ i=i+1
 642     sub         r14, r4, r14            @ i<yuv420_height-buttom
 643     cmp         r5, r14
 644     blt         LOOP_HEIGHT_64
 645     b           RESTORE_REG
 646
 647 LOOP_HEIGHT_2_START:
 648
 649     ldr         r5, [sp, #48]           @ i = top
 650 LOOP_HEIGHT_2:
 651
 652     ldr         r12, [sp, #52]          @ linear_offset = right
 653     ldr         r6, [sp, #44]           @ j = left
 654     ldr         r9, [sp, #48]           @ linear_offset = top
 655
 656     sub         r12, r3, r12            @ temp3 = yuv420_width-right
 657     sub         r14, r12, r6            @ temp4 = temp3-left@
 658     sub         r9, r5, r9              @ r9 = i-top
 659     mul         r9, r14, r9             @ temp4*(i-top)
 660     mov         r9, r9, lsr #1          @ linear_offset = temp4*(i-top)/2
 661     add         r11, r0, r9
 662     add         r12, r1, r9
 663 LOOP_HEIGHT_2_WIDTH:
 664     stmfd       sp!, {r0-r3, r12}       @ backup parameters
 665     mov         r0, r2
 666     mov         r1, r3
 667     mov         r2, r6
 668     mov         r3, r5
 669     bl          tile_4x2_read_asm
 670     mov         r7, r0
 671     ldmfd       sp!, {r0-r3, r12}       @ restore parameters
 672
 673     and         r14, r6, #0x3           @ temp4 = j&0x3@
 674     add         r7, r7, r14             @ tiled_offset = tiled_offset+temp4@
 675     add         r7, r2, r7
 676
 677     ldrh        r14, [r7]
 678     strb        r14, [r11], #1
 679     mov         r14, r14, lsr #8
 680     strb        r14, [r12], #1
 681
 682     ldr         r14, [sp, #52]          @ right
 683     add         r6, r6, #2              @ j=j+2
 684     sub         r14, r3, r14            @ j<yuv420_width-right
 685     cmp         r6, r14
 686     blt         LOOP_HEIGHT_2_WIDTH
 687
 688     ldr         r14, [sp, #56]          @ buttom
 689     add         r5, r5, #1              @ i=i+1
 690     sub         r14, r4, r14            @ i<yuv420_height-buttom
 691     cmp         r5, r14
 692     blt         LOOP_HEIGHT_2
 693
 694 RESTORE_REG:
 695     ldmfd       sp!, {r4-r12,r15}       @ restore registers
 696
 697 INTERLEAVED_MEMCOPY_UNDER_64:           @ count=r10, src=r11
 698     cmp         r10, #32
 699     blt         INTERLEAVED_MEMCOPY_UNDER_32
 700     vld2.8      {q0, q1}, [r11]!        @ load {nv12t_src+tiled_offset+temp1, 64}
 701     sub         r10, r10, #32
 702     cmp         r10, #0
 703     add         r12, r0, r9              @ r12 = yuv420_u_dest+linear_offset
 704     vst1.8      {q0}, [r12]             @ load {nv12t_src+tiled_offset+temp1, 64}
 705     add         r12, r1, r9              @ r12 = yuv420_v_dest+linear_offset
 706     vst1.8      {q1}, [r12]             @ load {nv12t_src+tiled_offset+temp1, 64}
 707     add         r9, r9, #16
 708     beq         INTERLEAVED_MEMCOPY_UNDER_END
 709 INTERLEAVED_MEMCOPY_UNDER_32:
 710     cmp         r10, #16
 711     blt         INTERLEAVED_MEMCOPY_UNDER_16
 712     vld2.8      {q0}, [r11]!            @ load {nv12t_src+tiled_offset+temp1, 64}
 713     sub         r10, r10, #16
 714     cmp         r10, #0
 715     add         r12, r0, r9              @ r12 = yuv420_u_dest+linear_offset
 716     vst1.8      {d0}, [r12]!             @ load {nv12t_src+tiled_offset+temp1, 64}
 717     add         r12, r1, r9              @ r12 = yuv420_v_dest+linear_offset
 718     vst1.8      {d1}, [r12]!             @ load {nv12t_src+tiled_offset+temp1, 64}
 719     add         r9, r9, #8
 720     beq         INTERLEAVED_MEMCOPY_UNDER_END
 721 INTERLEAVED_MEMCOPY_UNDER_16:
 722     ldrh        r12, [r11], #2
 723     add         r8, r0, r9              @ r8 = yuv420_u_dest+linear_offset
 724     strb        r12, [r8]
 725     add         r8, r1, r9              @ r8 = yuv420_v_dest+linear_offset
 726     mov         r12, r12, lsr #8
 727     strb        r12, [r8]
 728     subs        r10, r10, #2
 729     add         r9, r9, #1
 730     bne         INTERLEAVED_MEMCOPY_UNDER_16
 731
 732 INTERLEAVED_MEMCOPY_UNDER_END:
 733     and         r10, r6, #0x3F          @ temp1 = left(==j)&0x3F
 734     cmp         r10, #0
 735     mov         pc, lr
 736
 737 tile_4x2_read_asm:
 738 LFB0:
 739     add     ip, r3, #32
 740     sub     r0, r0, #1
 741     cmp     r1, ip
 742     cmple   r3, r1
 743     mov     ip, r2, asr #2
 744     mov     r0, r0, asr #7
 745     stmfd   sp!, {r4, r5, lr}
 746 LCFI0:
 747     add     r0, r0, #1
 748     bge     L2
 749     sub     r1, r1, #1
 750     tst     r1, #32
 751     bne     L2
 752     tst     r3, #32
 753     bne     L2
 754     mov     r4, r2, asr #7
 755     and     r1, r3, #31
 756     eor     r4, r4, r3, asr #5
 757     ubfx    r3, r3, #6, #8
 758     tst     r4, #1
 759     ubfx    r4, r2, #8, #6
 760     and     ip, ip, #15
 761     mov     r2, r2, asr #6
 762     mla     r3, r0, r3, r4
 763     orr     r1, ip, r1, asl #4
 764     b       L9
 765 L2:
 766     mov     r2, ip, asr #5
 767     and     r4, r3, #31
 768     eor     r1, r2, r3, asr #5
 769     and     r5, r2, #127
 770     ubfx    r3, r3, #6, #8
 771     tst     r1, #1
 772     and     r1, ip, #15
 773     mov     r2, ip, asr #4
 774     mla     r3, r0, r3, r5
 775     orr     r1, r1, r4, asl #4
 776 L9:
 777     andne   r2, r2, #1
 778     andeq   r2, r2, #1
 779     orrne   r2, r2, #2
 780     mov     r1, r1, asl #2
 781     orr     r3, r1, r3, asl #13
 782     orr     r0, r3, r2, asl #11
 783     ldmfd   sp!, {r4, r5, pc}
 784 LFE0:
 785     .fnend
 786