src/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm

   1 ;
   2 ;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
   3 ;
   4 ;  Use of this source code is governed by a BSD-style license
   5 ;  that can be found in the LICENSE file in the root of the source
   6 ;  tree. An additional intellectual property rights grant can be found
   7 ;  in the file PATENTS.  All contributing project authors may
   8 ;  be found in the AUTHORS file in the root of the source tree.
   9 ;
  10
  11 ;TODO(cd): adjust these constants to be able to use vqdmulh for faster
  12 ;          dct_const_round_shift(a * b) within butterfly calculations.
     ; Fixed-point cosine table: cospi_N_64 equals round(cos(N * pi / 64) * 2^14),
     ; e.g. cospi_16_64 = round(cos(pi / 4) * 16384) = 11585. The matching
     ; 14-bit rounding shift is applied at the end of DO_BUTTERFLY
     ; (vqrshrn.s32 ..., #14).
  13 cospi_1_64  EQU 16364
  14 cospi_2_64  EQU 16305
  15 cospi_3_64  EQU 16207
  16 cospi_4_64  EQU 16069
  17 cospi_5_64  EQU 15893
  18 cospi_6_64  EQU 15679
  19 cospi_7_64  EQU 15426
  20 cospi_8_64  EQU 15137
  21 cospi_9_64  EQU 14811
  22 cospi_10_64 EQU 14449
  23 cospi_11_64 EQU 14053
  24 cospi_12_64 EQU 13623
  25 cospi_13_64 EQU 13160
  26 cospi_14_64 EQU 12665
  27 cospi_15_64 EQU 12140
  28 cospi_16_64 EQU 11585
  29 cospi_17_64 EQU 11003
  30 cospi_18_64 EQU 10394
  31 cospi_19_64 EQU  9760
  32 cospi_20_64 EQU  9102
  33 cospi_21_64 EQU  8423
  34 cospi_22_64 EQU  7723
  35 cospi_23_64 EQU  7005
  36 cospi_24_64 EQU  6270
  37 cospi_25_64 EQU  5520
  38 cospi_26_64 EQU  4756
  39 cospi_27_64 EQU  3981
  40 cospi_28_64 EQU  3196
  41 cospi_29_64 EQU  2404
  42 cospi_30_64 EQU  1606
  43 cospi_31_64 EQU   804
  44
  45
  46     EXPORT  |vp9_idct32x32_1024_add_neon|   ; make the entry point visible to other objects
  47     ARM                                      ; assemble the following code as ARM instructions
  48     REQUIRE8                                 ; this code requires 8-byte stack alignment
  49     PRESERVE8                                ; this code preserves 8-byte stack alignment
  50
  51     AREA ||.text||, CODE, READONLY, ALIGN=2
  52
  53     AREA     Block, CODE, READONLY           ; NOTE(review): a second AREA follows ||.text|| with nothing in between, so the code below is emitted into "Block" - confirm the ||.text|| area is still wanted
  54
  55     ; --------------------------------------------------------------------------
  56     ; Load two rows from transposed_buffer (addressed through r0)
  57     ;   q14 = transposed_buffer[first_offset]
  58     ;   q13 = transposed_buffer[second_offset]
  59     ;   for proper address calculation, the last offset used when manipulating
  60     ;   transposed_buffer must be passed in. use 0 for first use.
         ;   Side effect: r0 is left pointing at transposed_buffer[second_offset].
         ;   q14/q13 are the registers DO_BUTTERFLY_STD consumes as its
         ;   regC:regD (d28, d29) and regA:regB (d26, d27) inputs.
  61     MACRO
  62     LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset
  63     ; address calculation with proper stride and loading
         ; (one row is 8 int16 values, hence the 8*2 byte stride)
  64     add r0, #($first_offset  - $prev_offset )*8*2
  65     vld1.s16        {q14}, [r0]
  66     add r0, #($second_offset - $first_offset)*8*2
  67     vld1.s16        {q13}, [r0]
  68     ; (used) two registers (q14, q13)
  69     MEND
  70     ; --------------------------------------------------------------------------
  71     ; Load from output (used as temporary storage, addressed through r1)
  72     ;   reg1 = output[first_offset]
  73     ;   reg2 = output[second_offset]
  74     ;   for proper address calculation, the last offset used when manipulating
  75     ;   output (whether reading or storing) must be passed in. use 0 for first
  76     ;   use.
         ;   Side effect: r1 is left pointing at output[second_offset].
  77     MACRO
  78     LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
  79     ; address calculation with proper stride and loading
         ; (consecutive offsets are 32*2 bytes apart; 8 int16 values are read
         ;  at each offset)
  80     add r1, #($first_offset  - $prev_offset )*32*2
  81     vld1.s16        {$reg1}, [r1]
  82     add r1, #($second_offset - $first_offset)*32*2
  83     vld1.s16        {$reg2}, [r1]
  84     ; (used) two registers ($reg1, $reg2)
  85     MEND
  86     ; --------------------------------------------------------------------------
  87     ; Store into output (sometimes used as temporary storage, addressed
         ; through r1)
  88     ;   output[first_offset] = reg1
  89     ;   output[second_offset] = reg2
  90     ;   for proper address calculation, the last offset used when manipulating
  91     ;   output (whether reading or storing) must be passed in. use 0 for first
  92     ;   use.
         ;   Side effect: r1 is left pointing at output[second_offset].
  93     MACRO
  94     STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
  95     ; address calculation with proper stride and storing
         ; (consecutive offsets are 32*2 bytes apart; 8 int16 values are
         ;  written at each offset)
  96     add r1, #($first_offset  - $prev_offset )*32*2
  97     vst1.16 {$reg1}, [r1]
  98     add r1, #($second_offset - $first_offset)*32*2
  99     vst1.16 {$reg2}, [r1]
 100     MEND
 101     ; --------------------------------------------------------------------------
 102     ; Combine-add results with current destination content
 103     ;   q6-q9 contain the results (out[j * 32 + 0-31])
         ;   Each result row (8 x int16) is rounded by 6 bits, added to 8
         ;   destination pixels, saturated to u8 and written back.
         ;   Row mapping: q6 -> [r10], q7 -> [r10 + r2],
         ;                q9 -> [r9],  q8 -> [r9 + r11]
         ;   Expects r2 = dest_stride and r11 = -dest_stride (as set up by
         ;   vp9_idct32x32_1024_add_neon). Clobbers d8-d11 (q4-q5).
         ;   On exit r10 has advanced by 2 * dest_stride and r9 has moved back
         ;   by 2 * dest_stride.
 104     MACRO
 105     STORE_COMBINE_CENTER_RESULTS
 106     ; load dest[j * dest_stride + 0-31]
 107     vld1.s16        {d8}, [r10], r2
 108     vld1.s16        {d11}, [r9], r11
 109     vld1.s16        {d9}, [r10]
 110     vld1.s16        {d10}, [r9]
 111     ; ROUND_POWER_OF_TWO
 112     vrshr.s16       q7, q7, #6
 113     vrshr.s16       q8, q8, #6
 114     vrshr.s16       q9, q9, #6
 115     vrshr.s16       q6, q6, #6
 116     ; add to dest[j * dest_stride + 0-31]
 117     vaddw.u8        q7, q7, d9
 118     vaddw.u8        q8, q8, d10
 119     vaddw.u8        q9, q9, d11
 120     vaddw.u8        q6, q6, d8
 121     ; clip pixel
 122     vqmovun.s16     d9,  q7
 123     vqmovun.s16     d10, q8
 124     vqmovun.s16     d11, q9
 125     vqmovun.s16     d8,  q6
 126     ; store back into dest[j * dest_stride + 0-31]
         ; (the post-increments undo the ones done while loading, so r10/r9
         ;  are back at their entry values after the four stores)
 127     vst1.16         {d9}, [r10], r11
 128     vst1.16         {d10}, [r9], r2
 129     vst1.16         {d8}, [r10]
 130     vst1.16         {d11}, [r9]
 131     ; update pointers (by dest_stride * 2)
 132     sub r9,  r9,  r2, lsl #1
 133     add r10, r10, r2, lsl #1
 134     MEND
 135     ; --------------------------------------------------------------------------
 136     ; Combine-add results with current destination content
 137     ;   q6-q9 contain the results (out[j * 32 + 0-31])
         ;   Same processing and row mapping as STORE_COMBINE_CENTER_RESULTS
         ;   (q6 -> [r10], q7 -> [r10 + r2], q9 -> [r9], q8 -> [r9 + r11]),
         ;   except that the last two stores use address writeback, so r10 and
         ;   r9 additionally advance by the 8 bytes just stored - presumably to
         ;   step to the next 8-pixel band; confirm against the call sites.
         ;   Expects r2 = dest_stride and r11 = -dest_stride.
         ;   Clobbers d8-d11 (q4-q5).
 138     MACRO
 139     STORE_COMBINE_CENTER_RESULTS_LAST
 140     ; load dest[j * dest_stride + 0-31]
 141     vld1.s16        {d8}, [r10], r2
 142     vld1.s16        {d11}, [r9], r11
 143     vld1.s16        {d9}, [r10]
 144     vld1.s16        {d10}, [r9]
 145     ; ROUND_POWER_OF_TWO
 146     vrshr.s16       q7, q7, #6
 147     vrshr.s16       q8, q8, #6
 148     vrshr.s16       q9, q9, #6
 149     vrshr.s16       q6, q6, #6
 150     ; add to dest[j * dest_stride + 0-31]
 151     vaddw.u8        q7, q7, d9
 152     vaddw.u8        q8, q8, d10
 153     vaddw.u8        q9, q9, d11
 154     vaddw.u8        q6, q6, d8
 155     ; clip pixel
 156     vqmovun.s16     d9,  q7
 157     vqmovun.s16     d10, q8
 158     vqmovun.s16     d11, q9
 159     vqmovun.s16     d8,  q6
 160     ; store back into dest[j * dest_stride + 0-31]
 161     vst1.16         {d9}, [r10], r11
 162     vst1.16         {d10}, [r9], r2
         ; writeback ("!") moves each pointer past the 8 bytes it just stored
 163     vst1.16         {d8}, [r10]!
 164     vst1.16         {d11}, [r9]!
 165     ; update pointers (by dest_stride * 2)
 166     sub r9,  r9,  r2, lsl #1
 167     add r10, r10, r2, lsl #1
 168     MEND
 169     ; --------------------------------------------------------------------------
 170     ; Combine-add results with current destination content
 171     ;   q4-q7 contain the results (out[j * 32 + 0-31])
         ;   Each result row (8 x int16) is rounded by 6 bits, added to 8
         ;   destination pixels, saturated to u8 and written back.
         ;   Row mapping: q4 -> [r7], q5 -> [r7 + r2],
         ;                q7 -> [r6], q6 -> [r6 + r11]
         ;   Expects r2 = dest_stride and r11 = -dest_stride (as set up by
         ;   vp9_idct32x32_1024_add_neon). Clobbers d4-d7 (q2-q3).
         ;   On exit r7 has advanced by 2 * dest_stride and r6 has moved back
         ;   by 2 * dest_stride.
 172     MACRO
 173     STORE_COMBINE_EXTREME_RESULTS
 174     ; load dest[j * dest_stride + 0-31]
 175     vld1.s16        {d4}, [r7], r2
 176     vld1.s16        {d7}, [r6], r11
 177     vld1.s16        {d5}, [r7]
 178     vld1.s16        {d6}, [r6]
 179     ; ROUND_POWER_OF_TWO
 180     vrshr.s16       q5, q5, #6
 181     vrshr.s16       q6, q6, #6
 182     vrshr.s16       q7, q7, #6
 183     vrshr.s16       q4, q4, #6
 184     ; add to dest[j * dest_stride + 0-31]
 185     vaddw.u8        q5, q5, d5
 186     vaddw.u8        q6, q6, d6
 187     vaddw.u8        q7, q7, d7
 188     vaddw.u8        q4, q4, d4
 189     ; clip pixel
 190     vqmovun.s16     d5, q5
 191     vqmovun.s16     d6, q6
 192     vqmovun.s16     d7, q7
 193     vqmovun.s16     d4, q4
 194     ; store back into dest[j * dest_stride + 0-31]
         ; (the post-increments undo the ones done while loading, so r7/r6
         ;  are back at their entry values after the four stores)
 195     vst1.16         {d5}, [r7], r11
 196     vst1.16         {d6}, [r6], r2
 197     vst1.16         {d7}, [r6]
 198     vst1.16         {d4}, [r7]
 199     ; update pointers (by dest_stride * 2)
 200     sub r6, r6, r2, lsl #1
 201     add r7, r7, r2, lsl #1
 202     MEND
 203     ; --------------------------------------------------------------------------
 204     ; Combine-add results with current destination content
 205     ;   q4-q7 contain the results (out[j * 32 + 0-31])
         ;   Same processing and row mapping as STORE_COMBINE_EXTREME_RESULTS
         ;   (q4 -> [r7], q5 -> [r7 + r2], q7 -> [r6], q6 -> [r6 + r11]),
         ;   except that the last two stores use address writeback, so r6 and
         ;   r7 additionally advance by the 8 bytes just stored - presumably to
         ;   step to the next 8-pixel band; confirm against the call sites.
         ;   Expects r2 = dest_stride and r11 = -dest_stride.
         ;   Clobbers d4-d7 (q2-q3).
 206     MACRO
 207     STORE_COMBINE_EXTREME_RESULTS_LAST
 208     ; load dest[j * dest_stride + 0-31]
 209     vld1.s16        {d4}, [r7], r2
 210     vld1.s16        {d7}, [r6], r11
 211     vld1.s16        {d5}, [r7]
 212     vld1.s16        {d6}, [r6]
 213     ; ROUND_POWER_OF_TWO
 214     vrshr.s16       q5, q5, #6
 215     vrshr.s16       q6, q6, #6
 216     vrshr.s16       q7, q7, #6
 217     vrshr.s16       q4, q4, #6
 218     ; add to dest[j * dest_stride + 0-31]
 219     vaddw.u8        q5, q5, d5
 220     vaddw.u8        q6, q6, d6
 221     vaddw.u8        q7, q7, d7
 222     vaddw.u8        q4, q4, d4
 223     ; clip pixel
 224     vqmovun.s16     d5, q5
 225     vqmovun.s16     d6, q6
 226     vqmovun.s16     d7, q7
 227     vqmovun.s16     d4, q4
 228     ; store back into dest[j * dest_stride + 0-31]
 229     vst1.16         {d5}, [r7], r11
 230     vst1.16         {d6}, [r6], r2
         ; writeback ("!") moves each pointer past the 8 bytes it just stored
 231     vst1.16         {d7}, [r6]!
 232     vst1.16         {d4}, [r7]!
 233     ; update pointers (by dest_stride * 2)
 234     sub r6, r6, r2, lsl #1
 235     add r7, r7, r2, lsl #1
 236     MEND
 237     ; --------------------------------------------------------------------------
         ; Butterfly on two 8-lane int16 vectors, C = $regC:$regD and
         ; A = $regA:$regB (low and high d halves):
         ;   $reg1:$reg2 = dct_const_round_shift(C * first  - A * second)
         ;   $reg3:$reg4 = dct_const_round_shift(C * second + A * first)
         ; Products are kept in 32 bits and narrowed back to int16 with a
         ; saturating, rounding right shift by 14.
 238     ; Touches q8-q12, q15 (q13-q14 are preserved)
         ; Also overwrites the scratch registers r8 and r12.
 239     ; valid output registers are anything but q8-q11
 240     MACRO
 241     DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
 242     ; TODO(cd): have special case to re-use constants when they are similar for
 243     ;           consecutive butterflies
 244     ; TODO(cd): have special case when both constants are the same, do the
 245     ;           additions/subtractions before the multiplies.
 246     ; generate the constants
 247     ;   generate scalar constants
         ;   (built in two steps: bits 15:8 with mov, then bits 7:0 with add -
         ;    presumably because a single immediate cannot hold an arbitrary
         ;    16-bit value; only the low 16 bits are used by vdup.16 below)
 248     mov             r8,  #$first_constant  & 0xFF00
 249     mov             r12, #$second_constant & 0xFF00
 250     add             r8,  #$first_constant  & 0x00FF
 251     add             r12, #$second_constant & 0x00FF
 252     ;   generate vector constants
         ;   d30 = first constant in every lane, d31 = second constant
 253     vdup.16         d30, r8
 254     vdup.16         d31, r12
 255     ; (used) two for inputs (regA-regD), one for constants (q15)
 256     ; do some multiplications (ordered for maximum latency hiding)
 257     vmull.s16 q8,  $regC, d30
 258     vmull.s16 q10, $regA, d31
 259     vmull.s16 q9,  $regD, d30
 260     vmull.s16 q11, $regB, d31
 261     vmull.s16 q12, $regC, d31
 262     ; (used) five for intermediate (q8-q12), one for constants (q15)
 263     ; do some addition/subtractions (to get back two register)
         ; q8:q9 = C * first - A * second
 264     vsub.s32  q8, q8, q10
 265     vsub.s32  q9, q9, q11
 266     ; do more multiplications (ordered for maximum latency hiding)
         ; (the last one overwrites q15, i.e. the constants in d30/d31, which
         ;  are not needed afterwards)
 267     vmull.s16 q10, $regD, d31
 268     vmull.s16 q11, $regA, d30
 269     vmull.s16 q15, $regB, d30
 270     ; (used) six for intermediate (q8-q12, q15)
 271     ; do more addition/subtractions
         ; q11:q10 = C * second + A * first
 272     vadd.s32  q11, q12, q11
 273     vadd.s32  q10, q10, q15
 274     ; (used) four for intermediate (q8-q11)
 275     ; dct_const_round_shift
 276     vqrshrn.s32 $reg1, q8,  #14
 277     vqrshrn.s32 $reg2, q9,  #14
 278     vqrshrn.s32 $reg3, q11, #14
 279     vqrshrn.s32 $reg4, q10, #14
 280     ; (used) two for results, well four d registers
 281     MEND
 282     ; --------------------------------------------------------------------------
         ; DO_BUTTERFLY with the standard inputs: q14 (d28, d29) as regC:regD and
         ; q13 (d26, d27) as regA:regB, i.e. the registers filled by
         ; LOAD_FROM_TRANSPOSED.
         ;   $reg1:$reg2 = dct_const_round_shift(q14 * first  - q13 * second)
         ;   $reg3:$reg4 = dct_const_round_shift(q14 * second + q13 * first)
 283     ; Touches q8-q12, q15 (q13-q14 are preserved)
         ; Also overwrites r8 and r12 (used by DO_BUTTERFLY for the constants).
 284     ; valid output registers are anything but q8-q11
 285     MACRO
 286     DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
 287     DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
 288     MEND
 289     ; --------------------------------------------------------------------------
 290
 291 ;void vp9_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
 292 ;
 293 ;   r0  int16_t *input,
 294 ;   r1  uint8_t *dest,
 295 ;   r2  int dest_stride)
 296 ; loop counters
 297 ;   r4  bands loop counter
 298 ;   r5  pass loop counter
 299 ;   r8  transpose loop counter
 300 ; combine-add pointers
 301 ;   r6  dest + 31 * dest_stride, descending (30, 29, 28, ...)
 302 ;   r7  dest +  0 * dest_stride, ascending  (1, 2, 3, ...)
 303 ;   r9  dest + 15 * dest_stride, descending (14, 13, 12, ...)
 304 ;   r10 dest + 16 * dest_stride, ascending  (17, 18, 19, ...)
 305
 306 |vp9_idct32x32_1024_add_neon| PROC
 307     ; This function does one pass of idct32x32 transform.
 308     ;
 309     ; This is done by transposing the input and then doing a 1d transform on
 310     ; columns. In the first pass, the transposed columns are the original
 311     ; rows. In the second pass, after the transposition, the columns are the
 312     ; original columns.
 313     ; The 1d transform is done by looping over bands of eight columns (the
 314     ; idct32_bands loop). For each band, the transform input transposition
 315     ; is done on demand, one band of four 8x8 matrices at a time. The four
 316     ; matrices are transposed by pairs (the idct32_transpose_pair loop).
 317     push  {r4-r11}
 318     vpush {d8-d15}
 319     ; stack operation
 320     ; internal buffer used to transpose 8 lines into before transforming them
 321     ;   int16_t transpose_buffer[32 * 8];
 322     ;   at sp + [4096, 4607]
 323     ; results of the first pass (transpose and transform rows)
 324     ;   int16_t pass1[32 * 32];
 325     ;   at sp + [0, 2047]
 326     ; results of the second pass (transpose and transform columns)
 327     ;   int16_t pass2[32 * 32];
 328     ;   at sp + [2048, 4095]
 329     sub sp, sp, #512+2048+2048
 330
 331     ; r6  = dest + 31 * dest_stride
 332     ; r7  = dest +  0 * dest_stride
 333     ; r9  = dest + 15 * dest_stride
 334     ; r10 = dest + 16 * dest_stride
 335     rsb r6,  r2, r2, lsl #5
 336     rsb r9,  r2, r2, lsl #4
 337     add r10, r1, r2, lsl #4
 338     mov r7, r1
 339     add r6, r6, r1
 340     add r9, r9, r1
 341     ; r11 = -dest_stride
 342     neg r11, r2
 343     ; r3 = input
 344     mov r3, r0
 345     ; parameters for first pass
 346       ; r0 = transpose_buffer[32 * 8]
 347     add r0, sp, #4096
 348       ; r1 = pass1[32 * 32]
 349     mov r1, sp
 350
 351     mov r5, #0          ; initialize pass loop counter
 352 idct32_pass_loop
 353     mov r4, #4          ; initialize bands loop counter
 354 idct32_bands_loop
 355     mov r8, #2          ; initialize transpose loop counter
 356 idct32_transpose_pair_loop
 357     ; Load two horizontally consecutive 8x8 16bit data matrices. The first one
 358     ; into q0-q7 and the second one into q8-q15. There is a stride of 64,
 359     ; adjusted to 32 because of the two post-increments.
 360     vld1.s16        {q8},  [r3]!
 361     vld1.s16        {q0},  [r3]!
 362     add r3, #32
 363     vld1.s16        {q9},  [r3]!
 364     vld1.s16        {q1},  [r3]!
 365     add r3, #32
 366     vld1.s16        {q10}, [r3]!
 367     vld1.s16        {q2},  [r3]!
 368     add r3, #32
 369     vld1.s16        {q11}, [r3]!
 370     vld1.s16        {q3},  [r3]!
 371     add r3, #32
 372     vld1.s16        {q12}, [r3]!
 373     vld1.s16        {q4},  [r3]!
 374     add r3, #32
 375     vld1.s16        {q13}, [r3]!
 376     vld1.s16        {q5},  [r3]!
 377     add r3, #32
 378     vld1.s16        {q14}, [r3]!
 379     vld1.s16        {q6},  [r3]!
 380     add r3, #32
 381     vld1.s16        {q15}, [r3]!
 382     vld1.s16        {q7},  [r3]!
 383
 384     ; Transpose the two 8x8 16bit data matrices.
 385     vswp            d17, d24
 386     vswp            d23, d30
 387     vswp            d21, d28
 388     vswp            d19, d26
 389     vswp            d1,  d8
 390     vswp            d7,  d14
 391     vswp            d5,  d12
 392     vswp            d3,  d10
 393     vtrn.32         q8,  q10
 394     vtrn.32         q9,  q11
 395     vtrn.32         q12, q14
 396     vtrn.32         q13, q15
 397     vtrn.32         q0,  q2
 398     vtrn.32         q1,  q3
 399     vtrn.32         q4,  q6
 400     vtrn.32         q5,  q7
 401     vtrn.16         q8,  q9
 402     vtrn.16         q10, q11
 403     vtrn.16         q12, q13
 404     vtrn.16         q14, q15
 405     vtrn.16         q0,  q1
 406     vtrn.16         q2,  q3
 407     vtrn.16         q4,  q5
 408     vtrn.16         q6,  q7
 409
 410     ; Store both matrices after each other. There is a stride of 32, which
 411     ; adjusts to nothing because of the post-increments.
 412     vst1.16        {q8},  [r0]!
 413     vst1.16        {q9},  [r0]!
 414     vst1.16        {q10}, [r0]!
 415     vst1.16        {q11}, [r0]!
 416     vst1.16        {q12}, [r0]!
 417     vst1.16        {q13}, [r0]!
 418     vst1.16        {q14}, [r0]!
 419     vst1.16        {q15}, [r0]!
 420     vst1.16        {q0},  [r0]!
 421     vst1.16        {q1},  [r0]!
 422     vst1.16        {q2},  [r0]!
 423     vst1.16        {q3},  [r0]!
 424     vst1.16        {q4},  [r0]!
 425     vst1.16        {q5},  [r0]!
 426     vst1.16        {q6},  [r0]!
 427     vst1.16        {q7},  [r0]!
 428
 429     ; increment pointers by adjusted stride (not necessary for r0/out)
 430     ;   go back by 7*32 for the seven lines moved fully by read and add
 431     ;   go back by 32 for the eighth line only read
 432     ;   advance by 16*2 to go the next pair
 433     sub r3,  r3,  #7*32*2 + 32 - 16*2
 434     ; transpose pair loop processing
 435     subs r8, r8, #1
 436     bne idct32_transpose_pair_loop
 437
 438     ; restore r0/input to its original value
 439     sub r0, r0, #32*8*2
 440
 441     ; Instead of doing the transforms stage by stage, it is done by loading
 442     ; some input values and doing as many stages as possible to minimize the
 443     ; storing/loading of intermediate results. To fit within registers, the
 444     ; final coefficients are cut into four blocks:
 445     ; BLOCK A: 16-19,28-31
 446     ; BLOCK B: 20-23,24-27
 447     ; BLOCK C: 8-10,11-15
 448     ; BLOCK D: 0-3,4-7
 449     ; Blocks A and C are straight calculation through the various stages. In
 450     ; block B, further calculations are performed using the results from
 451     ; block A. In block D, further calculations are performed using the results
 452     ; from block C and then the final calculations are done using results from
 453     ; block A and B which have been combined at the end of block B.
 454
 455     ; --------------------------------------------------------------------------
 456     ; BLOCK A: 16-19,28-31
 457     ; --------------------------------------------------------------------------
 458     ; generate 16,17,30,31
 459     ; --------------------------------------------------------------------------
 460     ; part of stage 1
 461     ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] *  cospi_1_64;
 462     ;temp2 = input[1 * 32] *  cospi_1_64 + input[31 * 32] * cospi_31_64;
 463     ;step1b[16][i] = dct_const_round_shift(temp1);
 464     ;step1b[31][i] = dct_const_round_shift(temp2);
 465     LOAD_FROM_TRANSPOSED 0, 1, 31
 466     DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5
 467     ; --------------------------------------------------------------------------
 468     ; part of stage 1
 469     ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64;
 470     ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64;
 471     ;step1b[17][i] = dct_const_round_shift(temp1);
 472     ;step1b[30][i] = dct_const_round_shift(temp2);
 473     LOAD_FROM_TRANSPOSED 31, 17, 15
 474     DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7
 475     ; --------------------------------------------------------------------------
 476     ; part of stage 2
 477     ;step2[16] =  step1b[16][i] + step1b[17][i];
 478     ;step2[17] =  step1b[16][i] - step1b[17][i];
 479     ;step2[30] = -step1b[30][i] + step1b[31][i];
 480     ;step2[31] =  step1b[30][i] + step1b[31][i];
 481     vadd.s16  q4, q0, q1
 482     vsub.s16  q13, q0, q1
 483     vadd.s16  q6, q2, q3
 484     vsub.s16  q14, q2, q3
 485     ; --------------------------------------------------------------------------
 486     ; part of stage 3
 487     ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64;
 488     ;temp2 = step1b[30][i] * cospi_4_64  + step1b[17][i] * cospi_28_64;
 489     ;step3[17] = dct_const_round_shift(temp1);
 490     ;step3[30] = dct_const_round_shift(temp2);
 491     DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15
 492     ; --------------------------------------------------------------------------
 493     ; generate 18,19,28,29
 494     ; --------------------------------------------------------------------------
 495     ; part of stage 1
 496     ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64;
 497     ;temp2 = input[9 * 32] *  cospi_9_64 + input[23 * 32] * cospi_23_64;
 498     ;step1b[18][i] = dct_const_round_shift(temp1);
 499     ;step1b[29][i] = dct_const_round_shift(temp2);
 500     LOAD_FROM_TRANSPOSED 15, 9, 23
 501     DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5
 502     ; --------------------------------------------------------------------------
 503     ; part of stage 1
 504     ;temp1 = input[25 * 32] *  cospi_7_64 - input[7 * 32] * cospi_25_64;
 505     ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64;
 506     ;step1b[19][i] = dct_const_round_shift(temp1);
 507     ;step1b[28][i] = dct_const_round_shift(temp2);
 508     LOAD_FROM_TRANSPOSED 23, 25, 7
 509     DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7
 510     ; --------------------------------------------------------------------------
 511     ; part of stage 2
 512     ;step2[18] = -step1b[18][i] + step1b[19][i];
 513     ;step2[19] =  step1b[18][i] + step1b[19][i];
 514     ;step2[28] =  step1b[28][i] + step1b[29][i];
 515     ;step2[29] =  step1b[28][i] - step1b[29][i];
 516     vsub.s16  q13, q3, q2
 517     vadd.s16  q3,  q3, q2
 518     vsub.s16  q14, q1, q0
 519     vadd.s16  q2,  q1, q0
 520     ; --------------------------------------------------------------------------
 521     ; part of stage 3
 522     ;temp1 = step1b[18][i] * (-cospi_4_64)  - step1b[29][i] * (-cospi_28_64);
 523     ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64);
 524     ;step3[29] = dct_const_round_shift(temp1);
 525     ;step3[18] = dct_const_round_shift(temp2);
 526     DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1
 527     ; --------------------------------------------------------------------------
 528     ; combine 16-19,28-31
 529     ; --------------------------------------------------------------------------
 530     ; part of stage 4
 531     ;step1[16] = step1b[16][i] + step1b[19][i];
 532     ;step1[17] = step1b[17][i] + step1b[18][i];
 533     ;step1[18] = step1b[17][i] - step1b[18][i];
 534     ;step1[29] = step1b[30][i] - step1b[29][i];
 535     ;step1[30] = step1b[30][i] + step1b[29][i];
 536     ;step1[31] = step1b[31][i] + step1b[28][i];
 537     vadd.s16  q8,  q4, q2
 538     vadd.s16  q9,  q5, q0
 539     vadd.s16  q10, q7, q1
 540     vadd.s16  q15, q6, q3
 541     vsub.s16  q13, q5, q0
 542     vsub.s16  q14, q7, q1
 543     STORE_IN_OUTPUT 0,  16, 31, q8,  q15
 544     STORE_IN_OUTPUT 31, 17, 30, q9,  q10
 545     ; --------------------------------------------------------------------------
 546     ; part of stage 5
 547     ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64;
 548     ;temp2 = step1b[29][i] * cospi_8_64  + step1b[18][i] * cospi_24_64;
 549     ;step2[18] = dct_const_round_shift(temp1);
 550     ;step2[29] = dct_const_round_shift(temp2);
 551     DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3
 552     STORE_IN_OUTPUT 30, 29, 18, q1, q0
 553     ; --------------------------------------------------------------------------
 554     ; part of stage 4
 555     ;step1[19] = step1b[16][i] - step1b[19][i];
 556     ;step1[28] = step1b[31][i] - step1b[28][i];
 557     vsub.s16  q13, q4, q2
 558     vsub.s16  q14, q6, q3
 559     ; --------------------------------------------------------------------------
 560     ; part of stage 5
 561     ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64;
 562     ;temp2 = step1b[28][i] * cospi_8_64  + step1b[19][i] * cospi_24_64;
 563     ;step2[19] = dct_const_round_shift(temp1);
 564     ;step2[28] = dct_const_round_shift(temp2);
 565     DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13
 566     STORE_IN_OUTPUT 18, 19, 28, q4, q6
 567     ; --------------------------------------------------------------------------
 568
 569
 570     ; --------------------------------------------------------------------------
 571     ; BLOCK B: 20-23,24-27
 572     ; --------------------------------------------------------------------------
 573     ; generate 20,21,26,27
 574     ; --------------------------------------------------------------------------
 575     ; part of stage 1
 576     ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64;
 577     ;temp2 = input[5 * 32] *  cospi_5_64 + input[27 * 32] * cospi_27_64;
 578     ;step1b[20][i] = dct_const_round_shift(temp1);
 579     ;step1b[27][i] = dct_const_round_shift(temp2);
 580     LOAD_FROM_TRANSPOSED 7, 5, 27
 581     DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5
 582     ; --------------------------------------------------------------------------
 583     ; part of stage 1
 584     ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64;
 585     ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64;
 586     ;step1b[21][i] = dct_const_round_shift(temp1);
 587     ;step1b[26][i] = dct_const_round_shift(temp2);
 588     LOAD_FROM_TRANSPOSED 27, 21, 11
 589     DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7
 590     ; --------------------------------------------------------------------------
 591     ; part of stage 2
 592     ;step2[20] =  step1b[20][i] + step1b[21][i];
 593     ;step2[21] =  step1b[20][i] - step1b[21][i];
 594     ;step2[26] = -step1b[26][i] + step1b[27][i];
 595     ;step2[27] =  step1b[26][i] + step1b[27][i];
 596     vsub.s16  q13, q0, q1
 597     vadd.s16  q0, q0, q1
 598     vsub.s16  q14, q2, q3
 599     vadd.s16  q2, q2, q3
 600     ; --------------------------------------------------------------------------
 601     ; part of stage 3
 602     ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64;
 603     ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64;
 604     ;step3[21] = dct_const_round_shift(temp1);
 605     ;step3[26] = dct_const_round_shift(temp2);
 606     DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
 607     ; --------------------------------------------------------------------------
 608     ; generate 22,23,24,25
 609     ; --------------------------------------------------------------------------
 610     ; part of stage 1
 611     ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64;
 612     ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64;
 613     ;step1b[22][i] = dct_const_round_shift(temp1);
 614     ;step1b[25][i] = dct_const_round_shift(temp2);
 615     LOAD_FROM_TRANSPOSED 11, 13, 19
 616     DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15
 617     ; --------------------------------------------------------------------------
 618     ; part of stage 1
 619     ;temp1 = input[29 * 32] *  cospi_3_64 - input[3 * 32] * cospi_29_64;
 620     ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64;
 621     ;step1b[23][i] = dct_const_round_shift(temp1);
 622     ;step1b[24][i] = dct_const_round_shift(temp2);
 623     LOAD_FROM_TRANSPOSED 19, 29, 3
 624     DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13
 625     ; --------------------------------------------------------------------------
 626     ; part of stage 2
 627     ;step2[22] = -step1b[22][i] + step1b[23][i];
 628     ;step2[23] =  step1b[22][i] + step1b[23][i];
 629     ;step2[24] =  step1b[24][i] + step1b[25][i];
 630     ;step2[25] =  step1b[24][i] - step1b[25][i];
 631     vsub.s16  q14, q4, q5
 632     vadd.s16  q5, q4, q5
 633     vsub.s16  q13, q6, q7
 634     vadd.s16  q6, q6, q7
 635     ; --------------------------------------------------------------------------
 636     ; part of stage 3
 637     ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64);
 638     ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64);
 639     ;step3[25] = dct_const_round_shift(temp1);
 640     ;step3[22] = dct_const_round_shift(temp2);
 641     DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15
 642     ; --------------------------------------------------------------------------
 643     ; combine 20-23,24-27
 644     ; --------------------------------------------------------------------------
 645     ; part of stage 4
 646     ;step1[22] = step1b[22][i] + step1b[21][i];
 647     ;step1[23] = step1b[23][i] + step1b[20][i];
 648     vadd.s16  q10, q7, q1
 649     vadd.s16  q11, q5, q0
 650     ;step1[24] = step1b[24][i] + step1b[27][i];
 651     ;step1[25] = step1b[25][i] + step1b[26][i];
 652     vadd.s16  q12, q6, q2
 653     vadd.s16  q15, q4, q3
 654     ; --------------------------------------------------------------------------
 655     ; part of stage 6
 656     ;step3[16] = step1b[16][i] + step1b[23][i];
 657     ;step3[17] = step1b[17][i] + step1b[22][i];
 658     ;step3[22] = step1b[17][i] - step1b[22][i];
 659     ;step3[23] = step1b[16][i] - step1b[23][i];
 660     LOAD_FROM_OUTPUT 28, 16, 17, q14, q13
 661     vadd.s16  q8,  q14, q11
 662     vadd.s16  q9,  q13, q10
 663     vsub.s16  q13, q13, q10
 664     vsub.s16  q11, q14, q11
 665     STORE_IN_OUTPUT 17, 17, 16, q9, q8
 666     ; --------------------------------------------------------------------------
 667     ; part of stage 6
 668     ;step3[24] = step1b[31][i] - step1b[24][i];
 669     ;step3[25] = step1b[30][i] - step1b[25][i];
 670     ;step3[30] = step1b[30][i] + step1b[25][i];
 671     ;step3[31] = step1b[31][i] + step1b[24][i];
 672     LOAD_FROM_OUTPUT 16, 30, 31, q14, q9
 673     vsub.s16  q8,  q9,  q12
 674     vadd.s16  q10, q14, q15
 675     vsub.s16  q14, q14, q15
 676     vadd.s16  q12, q9,  q12
 677     STORE_IN_OUTPUT 31, 30, 31, q10, q12
 678     ; --------------------------------------------------------------------------
 679     ; TODO(cd) do some register allocation change to remove these push/pop
 680     vpush {q8}  ; [24]
 681     vpush {q11} ; [23]
 682     ; --------------------------------------------------------------------------
 683     ; part of stage 7
 684     ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64;
 685     ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64;
 686     ;step1[22] = dct_const_round_shift(temp1);
 687     ;step1[25] = dct_const_round_shift(temp2);
 688     DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
 689     STORE_IN_OUTPUT 31, 25, 22, q14, q13
 690     ; --------------------------------------------------------------------------
 691     ; part of stage 7
 692     ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64;
 693     ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64;
 694     ;step1[23] = dct_const_round_shift(temp1);
 695     ;step1[24] = dct_const_round_shift(temp2);
 696     ; TODO(cd) do some register allocation change to remove these push/pop
 697     vpop  {q13} ; [23]
 698     vpop  {q14} ; [24]
 699     DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
 700     STORE_IN_OUTPUT 22, 24, 23, q14, q13
 701     ; --------------------------------------------------------------------------
 702     ; part of stage 4
 703     ;step1[20] = step1b[23][i] - step1b[20][i];
 704     ;step1[27] = step1b[24][i] - step1b[27][i];
 705     vsub.s16  q14, q5, q0
 706     vsub.s16  q13, q6, q2
 707     ; --------------------------------------------------------------------------
 708     ; part of stage 5
 709     ;temp1 = step1b[20][i] * (-cospi_8_64)  - step1b[27][i] * (-cospi_24_64);
 710     ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64);
 711     ;step2[27] = dct_const_round_shift(temp1);
 712     ;step2[20] = dct_const_round_shift(temp2);
 713     DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13
 714     ; --------------------------------------------------------------------------
 715     ; part of stage 4
 716     ;step1[21] = step1b[22][i] - step1b[21][i];
 717     ;step1[26] = step1b[25][i] - step1b[26][i];
 718     vsub.s16  q14,  q7, q1
 719     vsub.s16  q13,  q4, q3
 720     ; --------------------------------------------------------------------------
 721     ; part of stage 5
 722     ;temp1 = step1b[21][i] * (-cospi_8_64)  - step1b[26][i] * (-cospi_24_64);
 723     ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64);
 724     ;step2[26] = dct_const_round_shift(temp1);
 725     ;step2[21] = dct_const_round_shift(temp2);
 726     DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3
 727     ; --------------------------------------------------------------------------
 728     ; part of stage 6
 729     ;step3[18] = step1b[18][i] + step1b[21][i];
 730     ;step3[19] = step1b[19][i] + step1b[20][i];
 731     ;step3[20] = step1b[19][i] - step1b[20][i];
 732     ;step3[21] = step1b[18][i] - step1b[21][i];
 733     LOAD_FROM_OUTPUT 23, 18, 19, q14, q13
 734     vadd.s16  q8,  q14, q1
 735     vadd.s16  q9,  q13, q6
 736     vsub.s16  q13, q13, q6
 737     vsub.s16  q1,  q14, q1
 738     STORE_IN_OUTPUT 19, 18, 19, q8, q9
 739     ; --------------------------------------------------------------------------
 740     ; part of stage 6
 741     ;step3[27] = step1b[28][i] - step1b[27][i];
 742     ;step3[28] = step1b[28][i] + step1b[27][i];
 743     ;step3[29] = step1b[29][i] + step1b[26][i];
 744     ;step3[26] = step1b[29][i] - step1b[26][i];
 745     LOAD_FROM_OUTPUT 19, 28, 29, q8, q9
 746     vsub.s16  q14, q8, q5
 747     vadd.s16  q10, q8, q5
 748     vadd.s16  q11, q9, q0
 749     vsub.s16  q0, q9, q0
 750     STORE_IN_OUTPUT 29, 28, 29, q10, q11
 751     ; --------------------------------------------------------------------------
 752     ; part of stage 7
 753     ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64;
 754     ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64;
 755     ;step1[20] = dct_const_round_shift(temp1);
 756     ;step1[27] = dct_const_round_shift(temp2);
 757     DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
 758     STORE_IN_OUTPUT 29, 20, 27, q13, q14
 759     ; --------------------------------------------------------------------------
 760     ; part of stage 7
 761     ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64;
 762     ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64;
 763     ;step1[21] = dct_const_round_shift(temp1);
 764     ;step1[26] = dct_const_round_shift(temp2);
 765     DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1
 766     STORE_IN_OUTPUT 27, 21, 26, q1, q0
 767     ; --------------------------------------------------------------------------
 768
 769
 770     ; --------------------------------------------------------------------------
 771     ; BLOCK C: 8-10,11-15
 772     ; --------------------------------------------------------------------------
 773     ; generate 8,9,14,15
 774     ; --------------------------------------------------------------------------
 775     ; part of stage 2
 776     ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64;
 777     ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64;
 778     ;step2[8] = dct_const_round_shift(temp1);
 779     ;step2[15] = dct_const_round_shift(temp2);
 780     LOAD_FROM_TRANSPOSED 3, 2, 30
 781     DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5
 782     ; --------------------------------------------------------------------------
 783     ; part of stage 2
 784     ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64;
 785     ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64;
 786     ;step2[9] = dct_const_round_shift(temp1);
 787     ;step2[14] = dct_const_round_shift(temp2);
 788     LOAD_FROM_TRANSPOSED 30, 18, 14
 789     DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7
 790     ; --------------------------------------------------------------------------
 791     ; part of stage 3
 792     ;step3[8] = step1b[8][i] + step1b[9][i];
 793     ;step3[9] = step1b[8][i] - step1b[9][i];
 794     ;step3[14] = step1b[15][i] - step1b[14][i];
 795     ;step3[15] = step1b[15][i] + step1b[14][i];
 796     vsub.s16  q13, q0, q1
 797     vadd.s16  q0, q0, q1
 798     vsub.s16  q14, q2, q3
 799     vadd.s16  q2, q2, q3
 800     ; --------------------------------------------------------------------------
 801     ; part of stage 4
 802     ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64;
 803     ;temp2 = step1b[14][i] * cospi_8_64  + step1b[9][i] * cospi_24_64;
 804     ;step1[9]  = dct_const_round_shift(temp1);
 805     ;step1[14] = dct_const_round_shift(temp2);
 806     DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7
 807     ; --------------------------------------------------------------------------
 808     ; generate 10,11,12,13
 809     ; --------------------------------------------------------------------------
 810     ; part of stage 2
 811     ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64;
 812     ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64;
 813     ;step2[10] = dct_const_round_shift(temp1);
 814     ;step2[13] = dct_const_round_shift(temp2);
 815     LOAD_FROM_TRANSPOSED 14, 10, 22
 816     DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15
 817     ; --------------------------------------------------------------------------
 818     ; part of stage 2
 819     ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64;
 820     ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64;
 821     ;step2[11] = dct_const_round_shift(temp1);
 822     ;step2[12] = dct_const_round_shift(temp2);
 823     LOAD_FROM_TRANSPOSED 22, 26, 6
 824     DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13
 825     ; --------------------------------------------------------------------------
 826     ; part of stage 3
 827     ;step3[10] = step1b[11][i] - step1b[10][i];
 828     ;step3[11] = step1b[11][i] + step1b[10][i];
 829     ;step3[12] = step1b[12][i] + step1b[13][i];
 830     ;step3[13] = step1b[12][i] - step1b[13][i];
 831     vsub.s16  q14, q4, q5
 832     vadd.s16  q5, q4, q5
 833     vsub.s16  q13, q6, q7
 834     vadd.s16  q6, q6, q7
 835     ; --------------------------------------------------------------------------
 836     ; part of stage 4
 837     ;temp1 = step1b[10][i] * (-cospi_8_64)  - step1b[13][i] * (-cospi_24_64);
 838     ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64);
 839     ;step1[13] = dct_const_round_shift(temp1);
 840     ;step1[10] = dct_const_round_shift(temp2);
 841     DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15
 842     ; --------------------------------------------------------------------------
 843     ; combine 8-10,11-15
 844     ; --------------------------------------------------------------------------
 845     ; part of stage 5
 846     ;step2[8]  = step1b[8][i] + step1b[11][i];
 847     ;step2[9]  = step1b[9][i] + step1b[10][i];
 848     ;step2[10] = step1b[9][i] - step1b[10][i];
 849     vadd.s16  q8,  q0, q5
 850     vadd.s16  q9,  q1, q7
 851     vsub.s16  q13, q1, q7
 852     ;step2[13] = step1b[14][i] - step1b[13][i];
 853     ;step2[14] = step1b[14][i] + step1b[13][i];
 854     ;step2[15] = step1b[15][i] + step1b[12][i];
 855     vsub.s16  q14, q3, q4
 856     vadd.s16  q10, q3, q4
 857     vadd.s16  q15, q2, q6
 858     STORE_IN_OUTPUT 26, 8, 15, q8, q15
 859     STORE_IN_OUTPUT 15, 9, 14, q9, q10
 860     ; --------------------------------------------------------------------------
 861     ; part of stage 6
 862     ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64;
 863     ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64;
 864     ;step3[10] = dct_const_round_shift(temp1);
 865     ;step3[13] = dct_const_round_shift(temp2);
 866     DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
 867     STORE_IN_OUTPUT 14, 13, 10, q3, q1
 868     ; --------------------------------------------------------------------------
 869     ; part of stage 5
 870     ;step2[11] = step1b[8][i] - step1b[11][i];
 871     ;step2[12] = step1b[15][i] - step1b[12][i];
 872     vsub.s16  q13, q0, q5
 873     vsub.s16  q14,  q2, q6
 874     ; --------------------------------------------------------------------------
 875     ; part of stage 6
 876     ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64;
 877     ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64;
 878     ;step3[11] = dct_const_round_shift(temp1);
 879     ;step3[12] = dct_const_round_shift(temp2);
 880     DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
 881     STORE_IN_OUTPUT 10, 11, 12, q1, q3
 882     ; --------------------------------------------------------------------------
 883
 884
 885     ; --------------------------------------------------------------------------
 886     ; BLOCK D: 0-3,4-7
 887     ; --------------------------------------------------------------------------
 888     ; generate 4,5,6,7
 889     ; --------------------------------------------------------------------------
 890     ; part of stage 3
 891     ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64;
 892     ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64;
 893     ;step3[4] = dct_const_round_shift(temp1);
 894     ;step3[7] = dct_const_round_shift(temp2);
 895     LOAD_FROM_TRANSPOSED 6, 4, 28
 896     DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5
 897     ; --------------------------------------------------------------------------
 898     ; part of stage 3
 899     ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64;
 900     ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64;
 901     ;step3[5] = dct_const_round_shift(temp1);
 902     ;step3[6] = dct_const_round_shift(temp2);
 903     LOAD_FROM_TRANSPOSED 28, 20, 12
 904     DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
 905     ; --------------------------------------------------------------------------
 906     ; part of stage 4
 907     ;step1[4] = step1b[4][i] + step1b[5][i];
 908     ;step1[5] = step1b[4][i] - step1b[5][i];
 909     ;step1[6] = step1b[7][i] - step1b[6][i];
 910     ;step1[7] = step1b[7][i] + step1b[6][i];
 911     vsub.s16  q13, q0, q1
 912     vadd.s16  q0, q0, q1
 913     vsub.s16  q14, q2, q3
 914     vadd.s16  q2, q2, q3
 915     ; --------------------------------------------------------------------------
 916     ; part of stage 5
 917     ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64;
 918     ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64;
 919     ;step2[5] = dct_const_round_shift(temp1);
 920     ;step2[6] = dct_const_round_shift(temp2);
 921     DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
 922     ; --------------------------------------------------------------------------
 923     ; generate 0,1,2,3
 924     ; --------------------------------------------------------------------------
 925     ; part of stage 4
 926     ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64;
 927     ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64;
 928     ;step1[1] = dct_const_round_shift(temp1);
 929     ;step1[0] = dct_const_round_shift(temp2);
 930     LOAD_FROM_TRANSPOSED 12, 0, 16
 931     DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15
 932     ; --------------------------------------------------------------------------
 933     ; part of stage 4
 934     ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64;
 935     ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64;
 936     ;step1[2] = dct_const_round_shift(temp1);
 937     ;step1[3] = dct_const_round_shift(temp2);
 938     LOAD_FROM_TRANSPOSED 16, 8, 24
 939     DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13
 940     ; --------------------------------------------------------------------------
 941     ; part of stage 5
 942     ;step2[0] = step1b[0][i] + step1b[3][i];
 943     ;step2[1] = step1b[1][i] + step1b[2][i];
 944     ;step2[2] = step1b[1][i] - step1b[2][i];
 945     ;step2[3] = step1b[0][i] - step1b[3][i];
 946     vadd.s16  q4, q7, q6
 947     vsub.s16  q7, q7, q6
 948     vsub.s16  q6, q5, q14
 949     vadd.s16  q5, q5, q14
 950     ; --------------------------------------------------------------------------
 951     ; combine 0-3,4-7
 952     ; --------------------------------------------------------------------------
 953     ; part of stage 6
 954     ;step3[0] = step1b[0][i] + step1b[7][i];
 955     ;step3[1] = step1b[1][i] + step1b[6][i];
 956     ;step3[2] = step1b[2][i] + step1b[5][i];
 957     ;step3[3] = step1b[3][i] + step1b[4][i];
 958     vadd.s16  q8,  q4, q2
 959     vadd.s16  q9,  q5, q3
 960     vadd.s16  q10, q6, q1
 961     vadd.s16  q11, q7, q0
 962     ;step3[4] = step1b[3][i] - step1b[4][i];
 963     ;step3[5] = step1b[2][i] - step1b[5][i];
 964     ;step3[6] = step1b[1][i] - step1b[6][i];
 965     ;step3[7] = step1b[0][i] - step1b[7][i];
 966     vsub.s16  q12, q7, q0
 967     vsub.s16  q13, q6, q1
 968     vsub.s16  q14, q5, q3
 969     vsub.s16  q15, q4, q2
 970     ; --------------------------------------------------------------------------
 971     ; part of stage 7
 972     ;step1[0] = step1b[0][i] + step1b[15][i];
 973     ;step1[1] = step1b[1][i] + step1b[14][i];
 974     ;step1[14] = step1b[1][i] - step1b[14][i];
 975     ;step1[15] = step1b[0][i] - step1b[15][i];
 976     LOAD_FROM_OUTPUT 12, 14, 15, q0, q1
 977     vadd.s16  q2, q8, q1
 978     vadd.s16  q3, q9, q0
 979     vsub.s16  q4, q9, q0
 980     vsub.s16  q5, q8, q1
 981     ; --------------------------------------------------------------------------
 982     ; part of final stage
 983     ;output[14 * 32] = step1b[14][i] + step1b[17][i];
 984     ;output[15 * 32] = step1b[15][i] + step1b[16][i];
 985     ;output[16 * 32] = step1b[15][i] - step1b[16][i];
 986     ;output[17 * 32] = step1b[14][i] - step1b[17][i];
 987     LOAD_FROM_OUTPUT 15, 16, 17, q0, q1
 988     vadd.s16  q8, q4, q1
 989     vadd.s16  q9, q5, q0
 990     vsub.s16  q6, q5, q0
 991     vsub.s16  q7, q4, q1
 992
 993     cmp r5, #0
 994     bgt idct32_bands_end_2nd_pass
 995
 996 idct32_bands_end_1st_pass
 997     STORE_IN_OUTPUT 17, 16, 17, q6, q7
 998     STORE_IN_OUTPUT 17, 14, 15, q8, q9
 999     ; --------------------------------------------------------------------------
1000     ; part of final stage
1001     ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
1002     ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
1003     ;output[30 * 32] = step1b[1][i] - step1b[30][i];
1004     ;output[31 * 32] = step1b[0][i] - step1b[31][i];
1005     LOAD_FROM_OUTPUT 15, 30, 31, q0, q1
1006     vadd.s16  q4, q2, q1
1007     vadd.s16  q5, q3, q0
1008     vsub.s16  q6, q3, q0
1009     vsub.s16  q7, q2, q1
1010     STORE_IN_OUTPUT 31, 30, 31, q6, q7
1011     STORE_IN_OUTPUT 31,  0,  1, q4, q5
1012     ; --------------------------------------------------------------------------
1013     ; part of stage 7
1014     ;step1[2] = step1b[2][i] + step1b[13][i];
1015     ;step1[3] = step1b[3][i] + step1b[12][i];
1016     ;step1[12] = step1b[3][i] - step1b[12][i];
1017     ;step1[13] = step1b[2][i] - step1b[13][i];
1018     LOAD_FROM_OUTPUT 1, 12, 13, q0, q1
1019     vadd.s16  q2, q10, q1
1020     vadd.s16  q3, q11, q0
1021     vsub.s16  q4, q11, q0
1022     vsub.s16  q5, q10, q1
1023     ; --------------------------------------------------------------------------
1024     ; part of final stage
1025     ;output[12 * 32] = step1b[12][i] + step1b[19][i];
1026     ;output[13 * 32] = step1b[13][i] + step1b[18][i];
1027     ;output[18 * 32] = step1b[13][i] - step1b[18][i];
1028     ;output[19 * 32] = step1b[12][i] - step1b[19][i];
1029     LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
1030     vadd.s16  q8, q4, q1
1031     vadd.s16  q9, q5, q0
1032     vsub.s16  q6, q5, q0
1033     vsub.s16  q7, q4, q1
1034     STORE_IN_OUTPUT 19, 18, 19, q6, q7
1035     STORE_IN_OUTPUT 19, 12, 13, q8, q9
1036     ; --------------------------------------------------------------------------
1037     ; part of final stage
1038     ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
1039     ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
1040     ;output[28 * 32] = step1b[3][i] - step1b[28][i];
1041     ;output[29 * 32] = step1b[2][i] - step1b[29][i];
1042     LOAD_FROM_OUTPUT 13, 28, 29, q0, q1
1043     vadd.s16  q4, q2, q1
1044     vadd.s16  q5, q3, q0
1045     vsub.s16  q6, q3, q0
1046     vsub.s16  q7, q2, q1
1047     STORE_IN_OUTPUT 29, 28, 29, q6, q7
1048     STORE_IN_OUTPUT 29,  2,  3, q4, q5
1049     ; --------------------------------------------------------------------------
1050     ; part of stage 7
1051     ;step1[4] = step1b[4][i] + step1b[11][i];
1052     ;step1[5] = step1b[5][i] + step1b[10][i];
1053     ;step1[10] = step1b[5][i] - step1b[10][i];
1054     ;step1[11] = step1b[4][i] - step1b[11][i];
1055     LOAD_FROM_OUTPUT 3, 10, 11, q0, q1
1056     vadd.s16  q2, q12, q1
1057     vadd.s16  q3, q13, q0
1058     vsub.s16  q4, q13, q0
1059     vsub.s16  q5, q12, q1
1060     ; --------------------------------------------------------------------------
1061     ; part of final stage
1062     ;output[10 * 32] = step1b[10][i] + step1b[21][i];
1063     ;output[11 * 32] = step1b[11][i] + step1b[20][i];
1064     ;output[20 * 32] = step1b[11][i] - step1b[20][i];
1065     ;output[21 * 32] = step1b[10][i] - step1b[21][i];
1066     LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
1067     vadd.s16  q8, q4, q1
1068     vadd.s16  q9, q5, q0
1069     vsub.s16  q6, q5, q0
1070     vsub.s16  q7, q4, q1
1071     STORE_IN_OUTPUT 21, 20, 21, q6, q7
1072     STORE_IN_OUTPUT 21, 10, 11, q8, q9
1073     ; --------------------------------------------------------------------------
1074     ; part of final stage
1075     ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
1076     ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
1077     ;output[26 * 32] = step1b[5][i] - step1b[26][i];
1078     ;output[27 * 32] = step1b[4][i] - step1b[27][i];
1079     LOAD_FROM_OUTPUT 11, 26, 27, q0, q1
1080     vadd.s16  q4, q2, q1
1081     vadd.s16  q5, q3, q0
1082     vsub.s16  q6, q3, q0
1083     vsub.s16  q7, q2, q1
1084     STORE_IN_OUTPUT 27, 26, 27, q6, q7
1085     STORE_IN_OUTPUT 27,  4,  5, q4, q5
1086     ; --------------------------------------------------------------------------
1087     ; part of stage 7
1088     ;step1[6] = step1b[6][i] + step1b[9][i];
1089     ;step1[7] = step1b[7][i] + step1b[8][i];
1090     ;step1[8] = step1b[7][i] - step1b[8][i];
1091     ;step1[9] = step1b[6][i] - step1b[9][i];
1092     LOAD_FROM_OUTPUT 5, 8, 9, q0, q1
1093     vadd.s16  q2, q14, q1
1094     vadd.s16  q3, q15, q0
1095     vsub.s16  q4, q15, q0
1096     vsub.s16  q5, q14, q1
1097     ; --------------------------------------------------------------------------
1098     ; part of final stage
1099     ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
1100     ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
1101     ;output[22 * 32] = step1b[9][i] - step1b[22][i];
1102     ;output[23 * 32] = step1b[8][i] - step1b[23][i];
1103     LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
1104     vadd.s16  q8, q4, q1
1105     vadd.s16  q9, q5, q0
1106     vsub.s16  q6, q5, q0
1107     vsub.s16  q7, q4, q1
1108     STORE_IN_OUTPUT 23, 22, 23, q6, q7
1109     STORE_IN_OUTPUT 23, 8, 9, q8, q9
1110     ; --------------------------------------------------------------------------
1111     ; part of final stage
1112     ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
1113     ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
1114     ;output[24 * 32] = step1b[7][i] - step1b[24][i];
1115     ;output[25 * 32] = step1b[6][i] - step1b[25][i];
1116     LOAD_FROM_OUTPUT 9, 24, 25, q0, q1
1117     vadd.s16  q4, q2, q1
1118     vadd.s16  q5, q3, q0
1119     vsub.s16  q6, q3, q0
1120     vsub.s16  q7, q2, q1
1121     STORE_IN_OUTPUT 25, 24, 25, q6, q7
1122     STORE_IN_OUTPUT 25,  6,  7, q4, q5
1123
1124     ; restore r0 by removing the last offset from the last
1125     ;     operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
1126     sub r0, r0, #24*8*2
1127     ; restore r1 by removing the last offset from the last
1128     ;     operation (STORE_IN_OUTPUT 25,  6,  7) => 7*32*2
1129     ; advance by 8 columns => 8*2
1130     sub r1, r1, #7*32*2 - 8*2
1131     ;   advance by 8 lines (8*32*2)
1132     ;   go back by the two pairs from the loop (32*2)
1133     add r3, r3, #8*32*2 - 32*2
1134
1135     ; bands loop processing
1136     subs r4, r4, #1
1137     bne idct32_bands_loop
1138
1139     ; parameters for second pass
1140     ; the input of pass2 is the result of pass1. we have to remove the offset
1141     ;   of 32 columns induced by the above idct32_bands_loop
1142     sub r3, r1, #32*2
1143       ; r1 = pass2[32 * 32]
1144     add r1, sp, #2048
1145
1146     ; pass loop processing
1147     add r5, r5, #1
1148     b idct32_pass_loop
1149
1150 idct32_bands_end_2nd_pass
1151     STORE_COMBINE_CENTER_RESULTS
1152     ; --------------------------------------------------------------------------
1153     ; part of final stage
1154     ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
1155     ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
1156     ;output[30 * 32] = step1b[1][i] - step1b[30][i];
1157     ;output[31 * 32] = step1b[0][i] - step1b[31][i];
1158     LOAD_FROM_OUTPUT 17, 30, 31, q0, q1
1159     vadd.s16  q4, q2, q1
1160     vadd.s16  q5, q3, q0
1161     vsub.s16  q6, q3, q0
1162     vsub.s16  q7, q2, q1
1163     STORE_COMBINE_EXTREME_RESULTS
1164     ; --------------------------------------------------------------------------
1165     ; part of stage 7
1166     ;step1[2] = step1b[2][i] + step1b[13][i];
1167     ;step1[3] = step1b[3][i] + step1b[12][i];
1168     ;step1[12] = step1b[3][i] - step1b[12][i];
1169     ;step1[13] = step1b[2][i] - step1b[13][i];
1170     LOAD_FROM_OUTPUT 31, 12, 13, q0, q1
1171     vadd.s16  q2, q10, q1
1172     vadd.s16  q3, q11, q0
1173     vsub.s16  q4, q11, q0
1174     vsub.s16  q5, q10, q1
1175     ; --------------------------------------------------------------------------
1176     ; part of final stage
1177     ;output[12 * 32] = step1b[12][i] + step1b[19][i];
1178     ;output[13 * 32] = step1b[13][i] + step1b[18][i];
1179     ;output[18 * 32] = step1b[13][i] - step1b[18][i];
1180     ;output[19 * 32] = step1b[12][i] - step1b[19][i];
1181     LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
1182     vadd.s16  q8, q4, q1
1183     vadd.s16  q9, q5, q0
1184     vsub.s16  q6, q5, q0
1185     vsub.s16  q7, q4, q1
1186     STORE_COMBINE_CENTER_RESULTS
1187     ; --------------------------------------------------------------------------
1188     ; part of final stage
1189     ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
1190     ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
1191     ;output[28 * 32] = step1b[3][i] - step1b[28][i];
1192     ;output[29 * 32] = step1b[2][i] - step1b[29][i];
1193     LOAD_FROM_OUTPUT 19, 28, 29, q0, q1
1194     vadd.s16  q4, q2, q1
1195     vadd.s16  q5, q3, q0
1196     vsub.s16  q6, q3, q0
1197     vsub.s16  q7, q2, q1
1198     STORE_COMBINE_EXTREME_RESULTS
1199     ; --------------------------------------------------------------------------
1200     ; part of stage 7
1201     ;step1[4] = step1b[4][i] + step1b[11][i];
1202     ;step1[5] = step1b[5][i] + step1b[10][i];
1203     ;step1[10] = step1b[5][i] - step1b[10][i];
1204     ;step1[11] = step1b[4][i] - step1b[11][i];
1205     LOAD_FROM_OUTPUT 29, 10, 11, q0, q1
1206     vadd.s16  q2, q12, q1
1207     vadd.s16  q3, q13, q0
1208     vsub.s16  q4, q13, q0
1209     vsub.s16  q5, q12, q1
1210     ; --------------------------------------------------------------------------
1211     ; part of final stage
1212     ;output[10 * 32] = step1b[10][i] + step1b[21][i];
1213     ;output[11 * 32] = step1b[11][i] + step1b[20][i];
1214     ;output[20 * 32] = step1b[11][i] - step1b[20][i];
1215     ;output[21 * 32] = step1b[10][i] - step1b[21][i];
1216     LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
1217     vadd.s16  q8, q4, q1
1218     vadd.s16  q9, q5, q0
1219     vsub.s16  q6, q5, q0
1220     vsub.s16  q7, q4, q1
1221     STORE_COMBINE_CENTER_RESULTS
1222     ; --------------------------------------------------------------------------
1223     ; part of final stage
1224     ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
1225     ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
1226     ;output[26 * 32] = step1b[5][i] - step1b[26][i];
1227     ;output[27 * 32] = step1b[4][i] - step1b[27][i];
1228     LOAD_FROM_OUTPUT 21, 26, 27, q0, q1
1229     vadd.s16  q4, q2, q1
1230     vadd.s16  q5, q3, q0
1231     vsub.s16  q6, q3, q0
1232     vsub.s16  q7, q2, q1
1233     STORE_COMBINE_EXTREME_RESULTS
1234     ; --------------------------------------------------------------------------
1235     ; part of stage 7
1236     ;step1[6] = step1b[6][i] + step1b[9][i];
1237     ;step1[7] = step1b[7][i] + step1b[8][i];
1238     ;step1[8] = step1b[7][i] - step1b[8][i];
1239     ;step1[9] = step1b[6][i] - step1b[9][i];
1240     LOAD_FROM_OUTPUT 27, 8, 9, q0, q1
1241     vadd.s16  q2, q14, q1
1242     vadd.s16  q3, q15, q0
1243     vsub.s16  q4, q15, q0
1244     vsub.s16  q5, q14, q1
1245     ; --------------------------------------------------------------------------
1246     ; part of final stage
1247     ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
1248     ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
1249     ;output[22 * 32] = step1b[9][i] - step1b[22][i];
1250     ;output[23 * 32] = step1b[8][i] - step1b[23][i];
1251     LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
1252     vadd.s16  q8, q4, q1
1253     vadd.s16  q9, q5, q0
1254     vsub.s16  q6, q5, q0
1255     vsub.s16  q7, q4, q1
1256     STORE_COMBINE_CENTER_RESULTS_LAST
1257     ; --------------------------------------------------------------------------
1258     ; part of final stage
1259     ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
1260     ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
1261     ;output[24 * 32] = step1b[7][i] - step1b[24][i];
1262     ;output[25 * 32] = step1b[6][i] - step1b[25][i];
1263     LOAD_FROM_OUTPUT 23, 24, 25, q0, q1
1264     vadd.s16  q4, q2, q1
1265     vadd.s16  q5, q3, q0
1266     vsub.s16  q6, q3, q0
1267     vsub.s16  q7, q2, q1
1268     STORE_COMBINE_EXTREME_RESULTS_LAST
1269     ; --------------------------------------------------------------------------
1270     ; restore pointers to their initial indices for next band pass by
1271     ;     removing/adding dest_stride * 8. The actual increment by eight
1272     ;     is taken care of within the _LAST macros.
1273     add r6,  r6,  r2, lsl #3
1274     add r9,  r9,  r2, lsl #3
1275     sub r7,  r7,  r2, lsl #3
1276     sub r10, r10, r2, lsl #3
1277
1278     ; restore r0 by removing the last offset from the last
1279     ;     operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
1280     sub r0, r0, #24*8*2
1281     ; restore r1 by removing the last offset from the last
1282     ;     operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2
1283     ; advance by 8 columns => 8*2
1284     sub r1, r1, #25*32*2 - 8*2
1285     ;   advance by 8 lines (8*32*2)
1286     ;   go back by the two pairs from the loop (32*2)
1287     add r3, r3, #8*32*2 - 32*2
1288
1289     ; bands loop processing
1290     subs r4, r4, #1
1291     bne idct32_bands_loop
1292
1293     ; stack operation
1294     add sp, sp, #512+2048+2048
1295     vpop {d8-d15}
1296     pop  {r4-r11}
1297     bx              lr
1298     ENDP  ; |vp9_idct32x32_1024_add_neon|
1299     END