vp8/encoder/arm/neon/fastquantizeb_neon.asm

   1 ;
   2 ;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
   3 ;
   4 ;  Use of this source code is governed by a BSD-style license
   5 ;  that can be found in the LICENSE file in the root of the source
   6 ;  tree. An additional intellectual property rights grant can be found
   7 ;  in the file PATENTS.  All contributing project authors may
   8 ;  be found in the AUTHORS file in the root of the source tree.
   9 ;
  10
  11
  12     EXPORT  |vp8_fast_quantize_b_neon|         ; single-block quantizer (defined below)
  13     EXPORT  |vp8_fast_quantize_b_pair_neon|    ; two-block quantizer (defined below)
  14
  15     INCLUDE vp8_asm_enc_offsets.asm            ; presumably supplies the vp8_block_* / vp8_blockd_* offsets used below (file not visible here)
  16
  17     ARM                                        ; assemble as ARM (A32) instructions
  18     REQUIRE8                                   ; this code requires an 8-byte aligned stack
  19     PRESERVE8                                  ; this code keeps the stack 8-byte aligned
  20
  21     AREA ||.text||, CODE, READONLY, ALIGN=4    ; ALIGN=4 means 2^4 = 16-byte section alignment
  22
  23 ;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2);
     ;
     ; Quantizes two blocks of 16 signed 16-bit coefficients in one call. The
     ; instruction streams for the two blocks are interleaved.
     ;   r0 = b1, r1 = b2, r2 = d1, r3 = d2
     ; For each block: x = abs(z); y = ((x + Round) * Quant) >> 16; the sign of
     ; z is restored; the result is stored to qcoeff, result * Dequant is stored
     ; to dqcoeff, and one eob byte is stored through the BLOCKD eob pointer.
     ; eob is the largest inv_zig_zag entry whose coefficient quantized to a
     ; non-zero value (0 if none).
     ;
     ; NOTE(review): only b1's round/quant_fast tables and d1's dequant table
     ; are loaded; the same vectors are applied to block 2. This presumably
     ; relies on callers pairing blocks that share quantizer tables - confirm.
     ;
     ; r4-r9 and q4-q7 are saved on entry and restored before returning.
  24 |vp8_fast_quantize_b_pair_neon| PROC
  25
  26     stmfd           sp!, {r4-r9}        ; save core scratch registers
  27     vstmdb          sp!, {q4-q7}        ; save q4-q7, used as temporaries below
  28
  29     ldr             r4, [r0, #vp8_block_coeff]          ; r4 = b1 coeff pointer
  30     ldr             r5, [r0, #vp8_block_quant_fast]     ; r5 = b1 quant_fast pointer
  31     ldr             r6, [r0, #vp8_block_round]          ; r6 = b1 round pointer
  32
  33     vld1.16         {q0, q1}, [r4@128]  ; load z (16 coefficients of block 1)
  34
  35     ldr             r7, [r2, #vp8_blockd_qcoeff]        ; r7 = d1 qcoeff pointer
  36
  37     vabs.s16        q4, q0              ; calculate x = abs(z)
  38     vabs.s16        q5, q1
  39
  40     ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
  41     vshr.s16        q2, q0, #15         ; sz
  42     vshr.s16        q3, q1, #15
  43
  44     vld1.s16        {q6, q7}, [r6@128]  ; load round_ptr [0-15]
  45     vld1.s16        {q8, q9}, [r5@128]  ; load quant_ptr [0-15]
  46
  47     ldr             r4, [r1, #vp8_block_coeff]          ; r4 = b2 coeff pointer
  48
  49     vadd.s16        q4, q6              ; x + Round
  50     vadd.s16        q5, q7
  51
  52     vld1.16         {q0, q1}, [r4@128]  ; load z2 (16 coefficients of block 2)
  53
  54     vqdmulh.s16     q4, q8              ; high half of 2 * (Round+abs(z)) * Quant; the vshr #1 below completes y = (..) >> 16
  55     vqdmulh.s16     q5, q9
  56
  57     vabs.s16        q10, q0             ; calculate x2 = abs(z_2)
  58     vabs.s16        q11, q1
  59     vshr.s16        q12, q0, #15        ; sz2
  60     vshr.s16        q13, q1, #15
  61
  62     ;modify data to have its original sign
  63     veor.s16        q4, q2              ; y^sz (done before the >>1 here; equivalent because sz is 0 or -1)
  64     veor.s16        q5, q3
  65
  66     vadd.s16        q10, q6             ; x2 + Round (same Round vectors as block 1)
  67     vadd.s16        q11, q7
  68
  69     ldr             r8, [r2, #vp8_blockd_dequant]       ; r8 = d1 dequant pointer (applied to both blocks)
  70
  71     vqdmulh.s16     q10, q8             ; block 2: high half of 2 * (Round+abs(z2)) * Quant (same Quant vectors)
  72     vqdmulh.s16     q11, q9
  73
  74     vshr.s16        q4, #1              ; right shift 1 after vqdmulh
  75     vshr.s16        q5, #1
  76
  77     vld1.s16        {q6, q7}, [r8@128]  ;load dequant_ptr[i] (q6/q7 no longer hold Round)
  78
  79     vsub.s16        q4, q2              ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
  80     vsub.s16        q5, q3
  81
  82     vshr.s16        q10, #1             ; right shift 1 after vqdmulh
  83     vshr.s16        q11, #1
  84
  85     ldr             r9, [r2, #vp8_blockd_dqcoeff]       ; r9 = d1 dqcoeff pointer
  86
  87     veor.s16        q10, q12            ; y2^sz2
  88     veor.s16        q11, q13
  89
  90     vst1.s16        {q4, q5}, [r7]      ; store: qcoeff = x1
  91
  92
  93     vsub.s16        q10, q12            ; x2=(y2^sz2)-sz2 = (y2^sz2)-(-1) (2's complement)
  94     vsub.s16        q11, q13
  95
  96     ldr             r6, [r3, #vp8_blockd_qcoeff]        ; r6 = d2 qcoeff pointer
  97
  98     vmul.s16        q2, q6, q4          ; x1 * Dequant (q2/q3 no longer hold sz)
  99     vmul.s16        q3, q7, q5
 100
 101     adr             r0, inv_zig_zag     ; load ptr of inverse zigzag table
 102
 103     vceq.s16        q8, q8              ; set q8 to all 1 (Quant is no longer needed)
 104
 105     vst1.s16        {q10, q11}, [r6]    ; store: qcoeff = x2
 106
 107     vmul.s16        q12, q6, q10        ; x2 * Dequant
 108     vmul.s16        q13, q7, q11
 109
 110     vld1.16         {q6, q7}, [r0@128]  ; load inverse scan order (q6/q7 no longer hold Dequant)
 111
 112     vtst.16         q14, q4, q8         ; now find eob for block 1
 113     vtst.16         q15, q5, q8         ; non-zero element is set to all 1
 114
 115     vst1.s16        {q2, q3}, [r9]      ; store dqcoeff = x1 * Dequant
 116
 117     ldr             r7, [r3, #vp8_blockd_dqcoeff]       ; r7 = d2 dqcoeff pointer
 118
 119     vand            q0, q6, q14         ; get all valid numbers from scan array
 120     vand            q1, q7, q15
 121
 122     vst1.s16        {q12, q13}, [r7]    ; store dqcoeff = x2 * Dequant
 123
 124     vtst.16         q2, q10, q8         ; now find eob for block 2
 125     vtst.16         q3, q11, q8         ; non-zero element is set to all 1
 126
 127     vmax.u16        q0, q0, q1          ; find maximum value in q0, q1
 128
 129     vand            q10, q6, q2         ; get all valid numbers from scan array
 130     vand            q11, q7, q3
 131     vmax.u16        q10, q10, q11       ; find maximum value in q10, q11
 132
 133     vmax.u16        d0, d0, d1          ; block 1: reduce 8 candidates to 4
 134     vmax.u16        d20, d20, d21       ; block 2: reduce 8 candidates to 4
 135     vmovl.u16       q0, d0              ; widen the 4 candidates to 32 bits
 136     vmovl.u16       q10, d20
 137
 138     vmax.u32        d0, d0, d1          ; reduce 4 candidates to 2
 139     vmax.u32        d20, d20, d21
 140     vpmax.u32       d0, d0, d0          ; pairwise max: final maximum in lane 0
 141     vpmax.u32       d20, d20, d20
 142
 143     ldr             r4, [r2, #vp8_blockd_eob]           ; r4 = d1 eob pointer
 144     ldr             r5, [r3, #vp8_blockd_eob]           ; r5 = d2 eob pointer
 145
 146     vst1.8          {d0[0]}, [r4]       ; store eob of block 1 (one byte)
 147     vst1.8          {d20[0]}, [r5]      ; store eob of block 2 (one byte)
 148
 149     vldmia          sp!, {q4-q7}        ; restore in reverse order of the saves
 150     ldmfd           sp!, {r4-r9}
 151     bx              lr
 152
 153     ENDP
 154
 155 ;void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d)
     ;
     ; Quantizes one block of 16 signed 16-bit coefficients.
     ;   r0 = b, r1 = d
     ; x = abs(z); y = ((x + Round) * Quant) >> 16; the sign of z is restored;
     ; the result is stored to qcoeff, result * Dequant to dqcoeff, and one eob
     ; byte through the BLOCKD eob pointer (largest inv_zig_zag entry whose
     ; coefficient quantized to a non-zero value).
     ;
     ; An "all input coefficients are zero" test is interleaved with the main
     ; computation (steps 1-6). When it hits, zero_output stores eob = 0 and
     ; writes the (all-zero) input vectors to qcoeff and dqcoeff.
     ;
     ; Only q0-q3 and q8-q15 are used, so no NEON registers are saved; r4-r7
     ; are saved and restored, r2/r3 are used as scratch.
 156 |vp8_fast_quantize_b_neon| PROC
 157
 158     stmfd           sp!, {r4-r7}        ; save core scratch registers
 159
 160     ldr             r3, [r0, #vp8_block_coeff]          ; r3 = coeff pointer
 161     ldr             r4, [r0, #vp8_block_quant_fast]     ; r4 = quant_fast pointer
 162     ldr             r5, [r0, #vp8_block_round]          ; r5 = round pointer
 163
 164     vld1.16         {q0, q1}, [r3@128]  ; load z
 165     vorr.s16        q14, q0, q1         ; check if all zero (step 1)
 166     ldr             r6, [r1, #vp8_blockd_qcoeff]        ; r6 = qcoeff pointer
 167     ldr             r7, [r1, #vp8_blockd_dqcoeff]       ; r7 = dqcoeff pointer
 168     vorr.s16        d28, d28, d29       ; check if all zero (step 2)
 169
 170     vabs.s16        q12, q0             ; calculate x = abs(z)
 171     vabs.s16        q13, q1
 172
 173     ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
 174     vshr.s16        q2, q0, #15         ; sz
 175     vmov            r2, r3, d28         ; check if all zero (step 3); r3 no longer holds the coeff pointer
 176     vshr.s16        q3, q1, #15
 177
 178     vld1.s16        {q14, q15}, [r5@128]; load round_ptr [0-15]
 179     vld1.s16        {q8, q9}, [r4@128]  ; load quant_ptr [0-15]
 180
 181     vadd.s16        q12, q14            ; x + Round
 182     vadd.s16        q13, q15
 183
 184     adr             r0, inv_zig_zag     ; load ptr of inverse zigzag table
 185
 186     vqdmulh.s16     q12, q8             ; high half of 2 * (Round+abs(z)) * Quant; the vshr #1 below completes y = (..) >> 16
 187     vqdmulh.s16     q13, q9
 188
 189     vld1.16         {q10, q11}, [r0@128]; load inverse scan order
 190
 191     vceq.s16        q8, q8              ; set q8 to all 1 (Quant is no longer needed)
 192
 193     ldr             r4, [r1, #vp8_blockd_dequant]       ; r4 = dequant pointer
 194
 195     vshr.s16        q12, #1             ; right shift 1 after vqdmulh
 196     vshr.s16        q13, #1
 197
 198     ldr             r5, [r1, #vp8_blockd_eob]           ; r5 = eob pointer
 199
 200     orr             r2, r2, r3          ; check if all zero (step 4)
 201     cmp             r2, #0              ; check if all zero (step 5)
 202     beq             zero_output         ; check if all zero (step 6)
 203
 204     ;modify data to have its original sign
 205     veor.s16        q12, q2             ; y^sz
 206     veor.s16        q13, q3
 207
 208     vsub.s16        q12, q2             ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
 209     vsub.s16        q13, q3
 210
 211     vld1.s16        {q2, q3}, [r4@128]  ; load dequant_ptr[i] (q2/q3 no longer hold sz)
 212
 213     vtst.16         q14, q12, q8        ; now find eob
 214     vtst.16         q15, q13, q8        ; non-zero element is set to all 1
 215
 216     vst1.s16        {q12, q13}, [r6@128]; store: qcoeff = x1
 217
 218     vand            q10, q10, q14       ; get all valid numbers from scan array
 219     vand            q11, q11, q15
 220
 221
 222     vmax.u16        q0, q10, q11        ; find maximum value in q10, q11
 223     vmax.u16        d0, d0, d1          ; reduce 8 candidates to 4
 224     vmovl.u16       q0, d0              ; widen the 4 candidates to 32 bits
 225
 226     vmul.s16        q2, q12             ; x * Dequant
 227     vmul.s16        q3, q13
 228
 229     vmax.u32        d0, d0, d1          ; reduce 4 candidates to 2
 230     vpmax.u32       d0, d0, d0          ; pairwise max: final maximum in lane 0
 231
 232     vst1.s16        {q2, q3}, [r7@128]  ; store dqcoeff = x * Dequant
 233
 234     vst1.8          {d0[0]}, [r5]       ; store eob (one byte)
 235
 236     ldmfd           sp!, {r4-r7}
 237     bx              lr
 238
 239 zero_output
 240     strb            r2, [r5]            ; store eob = 0 (r2 is zero on this path)
 241     vst1.s16        {q0, q1}, [r6@128]  ; qcoeff = 0 (q0/q1 hold the all-zero input)
 242     vst1.s16        {q0, q1}, [r7@128]  ; dqcoeff = 0
 243
 244     ldmfd           sp!, {r4-r7}
 245     bx              lr
 246
 247     ENDP
 248
 249 ; default inverse zigzag table is defined in vp8/common/entropy.c
     ; The copy below holds 16 halfwords, one per coefficient, with the values
     ; 1..16 each appearing once (i.e. 1-based). Both quantizers above AND
     ; these entries with the per-coefficient non-zero mask and store the
     ; maximum as eob, so a block with no non-zero output yields eob = 0.
 250     ALIGN 16    ; enable use of @128 bit aligned loads
 251 inv_zig_zag
 252     DCW 0x0001, 0x0002, 0x0006, 0x0007
 253     DCW 0x0003, 0x0005, 0x0008, 0x000d
 254     DCW 0x0004, 0x0009, 0x000c, 0x000e
 255     DCW 0x000a, 0x000b, 0x000f, 0x0010
 256
 257     END
 258