src/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.asm

   1 ;
   2 ;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
   3 ;
   4 ;  Use of this source code is governed by a BSD-style license and patent
   5 ;  grant that can be found in the LICENSE file in the root of the source
   6 ;  tree. All contributing project authors may be found in the AUTHORS
   7 ;  file in the root of the source tree.
   8 ;
   9
  10     EXPORT  |vp9_idct32x32_1_add_neon|         ; make the entry point visible to the linker
  11     ARM                                        ; assemble what follows as ARM (A32) instructions
  12     REQUIRE8                                   ; this code requires 8-byte stack alignment
  13     PRESERVE8                                  ; this code preserves 8-byte stack alignment
  14
  15     AREA ||.text||, CODE, READONLY, ALIGN=2    ; read-only code section, 2^2 = 4-byte aligned
  16
  17     ;TODO(hkuang): put the following macros in a seperate
  18     ;file so other idct function could also use them.
  19     MACRO                                          ; LD_16x8: load 8 rows of 16 bytes into q8-q15
  20     LD_16x8          $ptr, $step
  21     vld1.8           {d16, d17}, [$ptr], $step     ; q8  <- row 0; $ptr advances by $step after each load
  22     vld1.8           {d18, d19}, [$ptr], $step     ; q9  <- row 1
  23     vld1.8           {d20, d21}, [$ptr], $step     ; q10 <- row 2
  24     vld1.8           {d22, d23}, [$ptr], $step     ; q11 <- row 3
  25     vld1.8           {d24, d25}, [$ptr], $step     ; q12 <- row 4
  26     vld1.8           {d26, d27}, [$ptr], $step     ; q13 <- row 5
  27     vld1.8           {d28, d29}, [$ptr], $step     ; q14 <- row 6
  28     vld1.8           {d30, d31}, [$ptr], $step     ; q15 <- row 7; $ptr now points at row 8
  29     MEND
  30
  31     MACRO                                  ; ADD_DIFF_16x8: q8-q15 += $delta, per byte, saturating
  32     ADD_DIFF_16x8    $delta
  33     vqadd.u8         q8,  q8,  $delta      ; unsigned saturating add: results clamp at 255
  34     vqadd.u8         q9,  q9,  $delta
  35     vqadd.u8         q10, q10, $delta
  36     vqadd.u8         q11, q11, $delta
  37     vqadd.u8         q12, q12, $delta
  38     vqadd.u8         q13, q13, $delta
  39     vqadd.u8         q14, q14, $delta
  40     vqadd.u8         q15, q15, $delta      ; $delta itself is left unchanged
  41     MEND
  42
  43     MACRO                                  ; SUB_DIFF_16x8: q8-q15 -= $delta, per byte, saturating
  44     SUB_DIFF_16x8    $delta
  45     vqsub.u8         q8,  q8,  $delta      ; unsigned saturating subtract: results clamp at 0
  46     vqsub.u8         q9,  q9,  $delta
  47     vqsub.u8         q10, q10, $delta
  48     vqsub.u8         q11, q11, $delta
  49     vqsub.u8         q12, q12, $delta
  50     vqsub.u8         q13, q13, $delta
  51     vqsub.u8         q14, q14, $delta
  52     vqsub.u8         q15, q15, $delta      ; $delta itself is left unchanged
  53     MEND
  54
  55     MACRO                                          ; ST_16x8: store q8-q15 as 8 rows of 16 bytes
  56     ST_16x8          $out, $step
  57     vst1.8           {d16, d17}, [$out], $step     ; row 0 <- q8; $out advances by $step after each store
  58     vst1.8           {d18, d19}, [$out], $step     ; row 1 <- q9
  59     vst1.8           {d20, d21}, [$out], $step     ; row 2 <- q10
  60     vst1.8           {d22, d23}, [$out], $step     ; row 3 <- q11
  61     vst1.8           {d24, d25}, [$out], $step     ; row 4 <- q12
  62     vst1.8           {d26, d27}, [$out], $step     ; row 5 <- q13
  63     vst1.8           {d28, d29}, [$out], $step     ; row 6 <- q14
  64     vst1.8           {d30, d31}, [$out], $step     ; row 7 <- q15; $out now points at row 8
  65     MEND
  66
  67 ;void vp9_idct32x32_1_add_neon(int16_t *input, uint8_t *dest,
  68 ;                              int dest_stride)
  69 ; DC-only path: derives one value a1 from input[0] and adds it to a 32x32 block.
  70 ; r0  int16_t *input    (only input[0] is read)
  71 ; r1  uint8_t *dest     (top-left byte of the 32x32 block, updated in place)
  72 ; r2  int dest_stride   (byte distance between consecutive rows)
  73 ; Writes r0, r1, r3, r12, q0, q8-q15 and the condition flags.
  74 |vp9_idct32x32_1_add_neon| PROC
  75     push             {lr}
  76     pld              [r1]
  77     add              r3, r1, #16               ; r3 = dest + 16: start of the right-hand 16 columns
  78     ldrsh            r0, [r0]                  ; r0 = input[0], sign-extended to 32 bits
  79
  80     ; generate cospi_16_64 = 11585
  81     mov              r12, #0x2d00
  82     add              r12, #0x41
  83
  84     ; out = dct_const_round_shift(input[0] * cospi_16_64)
  85     mul              r0, r0, r12               ; input[0] * cospi_16_64
  86     add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
  87     asr              r0, r0, #14               ; >> DCT_CONST_BITS
  88
  89     ; out = dct_const_round_shift(out * cospi_16_64)
  90     mul              r0, r0, r12               ; out * cospi_16_64
  91     mov              r12, r1                   ; r12 = store pointer; r1 remains the load pointer
  92     add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
  93     asr              r0, r0, #14               ; >> DCT_CONST_BITS
  94
  95     ; a1 = ROUND_POWER_OF_TWO(out, 6)
  96     add              r0, r0, #32               ; + (1 <<((6) - 1))
  97     asrs             r0, r0, #6                ; >> 6; updates N/Z/C only, V is left unchanged
  98     bpl              diff_positive_32_32       ; a1 >= 0: test N alone (bge would also read a stale V)
  99
 100 diff_negative_32_32
 101     neg              r0, r0                    ; r0 = -a1, the magnitude to subtract
 102     usat             r0, #8, r0                ; clamp the magnitude to 0..255
 103     vdup.u8          q0, r0                    ; q0 = magnitude replicated into all 16 bytes
 104     mov              r0, #4                    ; 4 passes, each covering 16 rows x 16 columns
 105
 106 diff_negative_32_32_loop
 107     sub              r0, #1                    ; r0 = passes left after this one
 108     LD_16x8          r1, r2                    ; q8-q15 = next 8 rows, r1 advances 8 rows
 109     SUB_DIFF_16x8    q0
 110     ST_16x8          r12, r2                   ; write the 8 rows back, r12 advances 8 rows
 111
 112     LD_16x8          r1, r2                    ; second group of 8 rows in this pass
 113     SUB_DIFF_16x8    q0
 114     ST_16x8          r12, r2
 115     cmp              r0, #2                    ; two passes done = all 32 rows of the left half
 116     moveq            r1, r3                    ; restart loads at the top of the right half
 117     moveq            r12, r3                   ; restart stores at the top of the right half
 118     cmp              r0, #0
 119     bne              diff_negative_32_32_loop
 120     pop              {pc}                      ; return through the lr value pushed on entry
 121
 122 diff_positive_32_32
 123     usat             r0, #8, r0                ; clamp a1 to 0..255
 124     vdup.u8          q0, r0                    ; q0 = a1 replicated into all 16 bytes
 125     mov              r0, #4                    ; 4 passes, each covering 16 rows x 16 columns
 126
 127 diff_positive_32_32_loop
 128     sub              r0, #1                    ; r0 = passes left after this one
 129     LD_16x8          r1, r2                    ; q8-q15 = next 8 rows, r1 advances 8 rows
 130     ADD_DIFF_16x8    q0
 131     ST_16x8          r12, r2                   ; write the 8 rows back, r12 advances 8 rows
 132
 133     LD_16x8          r1, r2                    ; second group of 8 rows in this pass
 134     ADD_DIFF_16x8    q0
 135     ST_16x8          r12, r2
 136     cmp              r0, #2                    ; two passes done = all 32 rows of the left half
 137     moveq            r1, r3                    ; restart loads at the top of the right half
 138     moveq            r12, r3                   ; restart stores at the top of the right half
 139     cmp              r0, #0
 140     bne              diff_positive_32_32_loop
 141     pop              {pc}                      ; return through the lr value pushed on entry
 142
 143     ENDP             ; |vp9_idct32x32_1_add_neon|
 144     END