vp8/common/arm/neon/loopfilterhorizontaledge_y_neon.asm

   1 ;
   2 ;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
   3 ;
   4 ;  Use of this source code is governed by a BSD-style license and patent
   5 ;  grant that can be found in the LICENSE file in the root of the source
   6 ;  tree. All contributing project authors may be found in the AUTHORS
   7 ;  file in the root of the source tree.
   8 ;
   9
  10
  11     EXPORT  |vp8_loop_filter_horizontal_edge_y_neon|   ; make the routine visible to the linker
  12     ARM                                                 ; assemble the following code as ARM (not Thumb)
  13     REQUIRE8                                            ; this file requires 8-byte stack alignment
  14     PRESERVE8                                           ; this file preserves 8-byte stack alignment
  15
  16     AREA ||.text||, CODE, READONLY, ALIGN=2             ; read-only code area, aligned to 2^2 bytes
  17 ;Note: flimit, limit, and thresh should be positive numbers. All 16 elements in flimit
  18 ;are equal, so only one byte is loaded and replicated across the vector for flimit.
  19 ;The same applies to limit and thresh. q4-q7 (d8-d15) are saved and restored here.
  20 ; r0    unsigned char *s,
  21 ; r1    int p, //pitch
  22 ; r2    const signed char *flimit,
  23 ; r3    const signed char *limit,
  24 ; stack ([sp, #0] on entry) const signed char *thresh,
  25 ; //stack   int count --unused (never read by this routine)
  26
  27 |vp8_loop_filter_horizontal_edge_y_neon| PROC
  28     sub         r0, r0, r1, lsl #2          ; r0 = s - 4 * pitch (row p3)
  29     ldr         r12, [sp, #0]               ; load thresh pointer (must precede vpush, which moves sp)
  30     vpush       {d8-d15}                    ; q4-q7 are callee-saved (AAPCS) and are clobbered below
  31     vld1.u8     {q3}, [r0], r1              ; p3
  32     vld1.s8     {d0[], d1[]}, [r2]          ; flimit (one byte replicated to all 16 lanes)
  33     vld1.u8     {q4}, [r0], r1              ; p2
  34     vld1.s8     {d2[], d3[]}, [r3]          ; limit (one byte replicated to all 16 lanes)
  35     vld1.u8     {q5}, [r0], r1              ; p1
  36     vld1.s8     {d4[], d5[]}, [r12]         ; thresh (one byte replicated to all 16 lanes)
  37     vld1.u8     {q6}, [r0], r1              ; p0
  38     ldr         r12, _lfhy_coeff_           ; r12 = address of the lfhy_coeff constant table
  39     vld1.u8     {q7}, [r0], r1              ; q0
  40
  41     ;vp8_filter_mask() function
  42     ;vp8_hevmask() function
  43     vabd.u8     q11, q3, q4                 ; abs(p3 - p2)
  44     vld1.u8     {q8}, [r0], r1              ; q1
  45     vabd.u8     q12, q4, q5                 ; abs(p2 - p1)
  46     vld1.u8     {q9}, [r0], r1              ; q2
  47     vabd.u8     q13, q5, q6                 ; abs(p1 - p0)
  48     vld1.u8     {q10}, [r0], r1             ; q3
  49     vabd.u8     q14, q8, q7                 ; abs(q1 - q0)
  50     vabd.u8     q3, q9, q8                  ; abs(q2 - q1)
  51     vabd.u8     q4, q10, q9                 ; abs(q3 - q2)
  52     vabd.u8     q9, q6, q7                  ; abs(p0 - q0)
  53
  54     vcge.u8     q15, q1, q11                ; (abs(p3 - p2) <= limit)*-1
  55     vcge.u8     q12, q1, q12                ; (abs(p2 - p1) <= limit)*-1
  56     vcge.u8     q10, q1, q13                ; (abs(p1 - p0) <= limit)*-1
  57     vcge.u8     q11, q1, q14                ; (abs(q1 - q0) <= limit)*-1
  58
  59     vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh)*-1
  60     vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh)*-1
  61
  62     vcge.u8     q3, q1, q3                  ; (abs(q2 - q1) <= limit)*-1
  63     vcge.u8     q4, q1, q4                  ; (abs(q3 - q2) <= limit)*-1
  64     vadd.u8     q0, q0, q0                  ; flimit * 2 (non-saturating byte add)
  65     vadd.u8     q0, q0, q1                  ; flimit * 2 + limit
  66
  67     vand        q15, q15, q12               ; p3/p2 and p2/p1 both within limit
  68     vand        q10, q10, q11               ; p1/p0 and q1/q0 both within limit
  69     vand        q3, q3, q4                  ; q2/q1 and q3/q2 both within limit
  70
  71     vabd.u8     q2, q5, q8                  ; abs(p1 - q1)
  72     vqadd.u8    q9, q9, q9                  ; abs(p0 - q0) * 2
  73     vshr.u8     q2, q2, #1                  ; abs(p1 - q1) / 2
  74     vqadd.u8    q9, q9, q2                  ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
  75     vcge.u8     q9, q0, q9                  ; (abs(p0 - q0)*2 + abs(p1-q1)/2 <= flimit*2 + limit)*-1
  76
  77     vld1.u8     {q0}, [r12]!                ; q0 = 0x80 in every byte (first table row)
  78
  79     vand        q15, q15, q10
  80
  81     ;vp8_filter() function
  82     veor        q7, q7, q0                  ; qs0: q0 offset to convert to a signed value
  83     veor        q6, q6, q0                  ; ps0: p0 offset to convert to a signed value
  84     veor        q5, q5, q0                  ; ps1: p1 offset to convert to a signed value
  85     veor        q8, q8, q0                  ; qs1: q1 offset to convert to a signed value
  86 ;;;;;;;;;;;;;;
  87     vld1.u8     {q10}, [r12]!               ; q10 = 0x03 in every byte (second table row)
  88
  89     ;vqsub.s8   q2, q7, q6                  ; ( qs0 - ps0)
  90     vsubl.s8    q2, d14, d12                ; ( qs0 - ps0), low 8 lanes widened to 16 bits
  91     vsubl.s8    q11, d15, d13               ; ( qs0 - ps0), high 8 lanes widened to 16 bits
  92
  93     vand        q3, q3, q9
  94     vmovl.u8    q4, d20                     ; q4 = 3 in every 16-bit lane
  95
  96     vqsub.s8    q1, q5, q8                  ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
  97     vorr        q14, q13, q14               ; q14: vp8_hevmask
  98
  99     ;vmul.i8    q2, q2, q10                 ; 3 * ( qs0 - ps0)
 100     vmul.i16    q2, q2, q4                  ; 3 * ( qs0 - ps0)
 101     vmul.i16    q11, q11, q4
 102
 103     vand        q1, q1, q14                 ; vp8_filter &= hev
 104     vand        q15, q15, q3                ; q15: vp8_filter_mask
 105     ;;
 106     ;vld1.u8        {q4}, [r12]!            ;no need 7 any more
 107
 108     ;vqadd.s8   q1, q1, q2
 109     vaddw.s8    q2, q2, d2                  ; 16-bit sum: vp8_filter + 3 * ( qs0 - ps0), low lanes
 110     vaddw.s8    q11, q11, d3                ; same for the high lanes
 111
 112     vld1.u8     {q9}, [r12]!                ; q9 = 0x04 in every byte (third table row)
 113     ;
 114     vqmovn.s16  d2, q2                      ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
 115     vqmovn.s16  d3, q11
 116     ;;
 117
 118     vand        q1, q1, q15                 ; vp8_filter &= mask
 119     ;;
 120 ;;;;;;;;;;;;
 121
 122 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7
 123 ;   vand        q2, q1, q4                  ; s = vp8_filter & 7
 124 ;   vqadd.s8    q1, q1, q9                  ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)
 125     ;;;;
 126 ;   vshr.s8     q1, q1, #3                  ; vp8_filter >>= 3
 127 ;   vceq.i8     q2, q2, q9                  ; s = (s==4)*-1
 128     ;;
 129 ;   ;calculate output
 130 ;   vqsub.s8    q10, q7, q1                 ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
 131 ;   vqadd.s8    q11, q2, q1                 ; u = vp8_signed_char_clamp(s + vp8_filter)
 132 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 133 ;; q10=3, q9=4
 134     vqadd.s8    q2, q1, q10                 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
 135     vqadd.s8    q1, q1, q9                  ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
 136     vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
 137     vshr.s8     q1, q1, #3                  ; Filter1 >>= 3
 138
 139     ;calculate output
 140     vqadd.s8    q11, q6, q2                 ; u = vp8_signed_char_clamp(ps0 + Filter2)
 141     vqsub.s8    q10, q7, q1                 ; u = vp8_signed_char_clamp(qs0 - Filter1)
 142 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 143
 144     vrshr.s8    q1, q1, #1                  ;round/shift:  vp8_filter += 1; vp8_filter >>= 1
 145
 146     sub         r0, r0, r1, lsl #2          ; the 8 row loads advanced r0 by 8 * pitch; step back 4 rows
 147     sub         r0, r0, r1, lsl #1          ; and 2 more: r0 -> row p1
 148     ;
 149
 150     vbic        q1, q1, q14                 ; vp8_filter &= ~hev
 151     ;
 152     add         r2, r1, r0                  ; r2 -> row p0
 153
 154     vqadd.s8    q13, q5, q1                 ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
 155     ;vqadd.s8   q11, q6, q11                ; u = vp8_signed_char_clamp(ps0 + u)
 156     vqsub.s8    q12, q8, q1                 ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
 157
 158     add         r3, r2, r1                  ; r3 -> row q0
 159
 160     veor        q5, q13, q0                 ; *op1 = u^0x80
 161     veor        q6, q11, q0                 ; *op0 = u^0x80
 162     veor        q7, q10, q0                 ; *oq0 = u^0x80
 163     veor        q8, q12, q0                 ; *oq1 = u^0x80
 164
 165     add         r12, r3, r1                 ; r12 -> row q1
 166
 167     vst1.u8     {q5}, [r0]                  ; store op1
 168     vst1.u8     {q6}, [r2]                  ; store op0
 169     vst1.u8     {q7}, [r3]                  ; store oq0
 170     vst1.u8     {q8}, [r12]                 ; store oq1
 171     vpop        {d8-d15}                    ; restore callee-saved q4-q7 before returning
 172     bx          lr
 173     ENDP        ; |vp8_loop_filter_horizontal_edge_y_neon|
 174
 175 ;-----------------
 176     AREA    hloopfiltery_dat, DATA, READWRITE           ;read/write by default
 177 ;Constant table used by the loop filter code in this file. _lfhy_coeff_ holds the address
 178 ;of lfhy_coeff; the code loads that address into r12 and then reads the table one 16-byte
 179 ;row at a time with post-incremented loads, so the row order below must not change.
 180 _lfhy_coeff_
 181     DCD     lfhy_coeff
 182 lfhy_coeff
 183     DCD     0x80808080, 0x80808080, 0x80808080, 0x80808080 ; 0x80 per byte: XOR bias, unsigned <-> signed
 184     DCD     0x03030303, 0x03030303, 0x03030303, 0x03030303 ; 3 per byte
 185     DCD     0x04040404, 0x04040404, 0x04040404, 0x04040404 ; 4 per byte
 186     DCD     0x01010101, 0x01010101, 0x01010101, 0x01010101 ; 1 per byte: not loaded by the code in this file
 187
 188     END