Merge "Ensure the error-concealment code is available"
[platform/upstream/libvpx.git] / vp9 / encoder / arm / neon / vp9_variance_neon.c
1 /*
2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10
11 #include <arm_neon.h>
12 #include "./vp9_rtcd.h"
13 #include "./vpx_config.h"
14
15 #include "vpx_ports/mem.h"
16 #include "vpx/vpx_integer.h"
17
18 #include "vp9/common/vp9_common.h"
19 #include "vp9/common/vp9_filter.h"
20
21 #include "vp9/encoder/vp9_variance.h"
22
// Named block dimensions (and related constants) used by the NEON variance
// kernels below.  The "PlusOne" heights cover the extra intermediate row
// needed by the two-pass bilinear sub-pixel filters.
enum {
  kWidth8 = 8,
  kHeight8 = 8,
  kHeight8PlusOne = 9,
  kWidth16 = 16,
  kHeight16 = 16,
  kHeight16PlusOne = 17,
  kWidth32 = 32,
  kHeight32 = 32,
  kHeight32PlusOne = 33,
  kWidth64 = 64,
  kHeight64 = 64,
  kHeight64PlusOne = 65,
  kPixelStepOne = 1,   // horizontal neighbour offset for the first filter pass
  kAlign16 = 16        // byte alignment for the intermediate buffers
};
37
// Sum all eight signed 16-bit lanes of |v| and return the scalar total.
static INLINE int horizontal_add_s16x8(const int16x8_t v) {
  // Pairwise-widen twice: 8 x s16 -> 4 x s32 -> 2 x s64.
  const int64x2_t pairs = vpaddlq_s32(vpaddlq_s16(v));
  // Fold the two 64-bit halves together; the total of eight s16 values
  // always fits in 32 bits.
  const int32x2_t total = vadd_s32(vreinterpret_s32_s64(vget_low_s64(pairs)),
                                   vreinterpret_s32_s64(vget_high_s64(pairs)));
  return vget_lane_s32(total, 0);
}
45
// Sum all four signed 32-bit lanes of |v| and return the scalar total
// (truncated to 32 bits, matching the callers' accumulator width).
static INLINE int horizontal_add_s32x4(const int32x4_t v) {
  const int64x2_t pairs = vpaddlq_s32(v);
  const int32x2_t total = vadd_s32(vreinterpret_s32_s64(vget_low_s64(pairs)),
                                   vreinterpret_s32_s64(vget_high_s64(pairs)));
  return vget_lane_s32(total, 0);
}
52
// Shared kernel: computes the sum of squared differences (*sse) and the sum
// of differences (*sum) between two w x h pixel blocks, eight pixels at a
// time.  |w| must be a multiple of 8 (inner loop steps by 8).
// w * h must be less than 2048 or local variable v_sum may overflow.
static void variance_neon_w8(const uint8_t *a, int a_stride,
                             const uint8_t *b, int b_stride,
                             int w, int h, uint32_t *sse, int *sum) {
  int i, j;
  int16x8_t v_sum = vdupq_n_s16(0);      // per-lane running sum of diffs
  int32x4_t v_sse_lo = vdupq_n_s32(0);   // squared diffs, low four lanes
  int32x4_t v_sse_hi = vdupq_n_s32(0);   // squared diffs, high four lanes

  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; j += 8) {
      const uint8x8_t v_a = vld1_u8(&a[j]);
      const uint8x8_t v_b = vld1_u8(&b[j]);
      // Widening subtract of u8 operands; reinterpreting the 16-bit result
      // as signed yields the correct signed difference in [-255, 255].
      const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
      const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
      v_sum = vaddq_s16(v_sum, sv_diff);
      // Multiply-accumulate squared differences into 32-bit lanes.
      v_sse_lo = vmlal_s16(v_sse_lo,
                           vget_low_s16(sv_diff),
                           vget_low_s16(sv_diff));
      v_sse_hi = vmlal_s16(v_sse_hi,
                           vget_high_s16(sv_diff),
                           vget_high_s16(sv_diff));
    }
    a += a_stride;
    b += b_stride;
  }

  // Reduce the vector accumulators to the scalar outputs.
  *sum = horizontal_add_s16x8(v_sum);
  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
}
83
// 8x8 SSE/sum: thin wrapper delegating to the shared NEON kernel.
void vp9_get8x8var_neon(const uint8_t *src_ptr, int source_stride,
                        const uint8_t *ref_ptr, int ref_stride,
                        unsigned int *sse, int *sum) {
  variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride,
                   kWidth8, kHeight8, sse, sum);
}
90
// Variance of an 8x8 block: SSE minus the mean-correction term.
unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride,
                                  const uint8_t *b, int b_stride,
                                  unsigned int *sse) {
  int sum;
  int64_t sum_sq;
  variance_neon_w8(a, a_stride, b, b_stride, kWidth8, kHeight8, sse, &sum);
  sum_sq = (int64_t)sum * sum;
  return *sse - (sum_sq >> 6);  // >> 6 == / (8 * 8)
}
98
// 16x16 SSE/sum: thin wrapper delegating to the shared NEON kernel.
void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride,
                          const uint8_t *ref_ptr, int ref_stride,
                          unsigned int *sse, int *sum) {
  variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride,
                   kWidth16, kHeight16, sse, sum);
}
105
// Variance of a 16x16 block: SSE minus the mean-correction term.
unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    unsigned int *sse) {
  int sum;
  int64_t sum_sq;
  variance_neon_w8(a, a_stride, b, b_stride, kWidth16, kHeight16, sse, &sum);
  sum_sq = (int64_t)sum * sum;
  return *sse - (sum_sq >> 8);  // >> 8 == / (16 * 16)
}
113
// Two-tap bilinear filter over an 8-pixel-wide column of rows.  Each output
// pixel blends a source pixel with its neighbour |pixel_step| bytes away
// using the taps in |vp9_filter|, with round-to-nearest at FILTER_BITS.
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                      uint8_t *output_ptr,
                                      unsigned int src_pixels_per_line,
                                      int pixel_step,
                                      unsigned int output_height,
                                      unsigned int output_width,
                                      const int16_t *vp9_filter) {
  // Broadcast the two taps across all lanes (taps fit in 8 bits).
  const uint8x8_t tap0 = vmov_n_u8((uint8_t)vp9_filter[0]);
  const uint8x8_t tap1 = vmov_n_u8((uint8_t)vp9_filter[1]);
  unsigned int row;

  for (row = 0; row < output_height; ++row) {
    const uint8x8_t s0 = vld1_u8(src_ptr);
    const uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
    // blend = s0 * tap0 + s1 * tap1, then round-shift back to 8 bits.
    const uint16x8_t blend = vmlal_u8(vmull_u8(s0, tap0), s1, tap1);
    vst1_u8(output_ptr, vrshrn_n_u16(blend, FILTER_BITS));
    src_ptr += src_pixels_per_line;
    output_ptr += output_width;
  }
}
136
// Two-tap bilinear filter over rows that are a multiple of 16 pixels wide.
// Same blend as var_filter_block2d_bil_w8(), processing 16 pixels per
// iteration by filtering the low and high halves separately.
static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
                                       uint8_t *output_ptr,
                                       unsigned int src_pixels_per_line,
                                       int pixel_step,
                                       unsigned int output_height,
                                       unsigned int output_width,
                                       const int16_t *vp9_filter) {
  const uint8x8_t tap0 = vmov_n_u8((uint8_t)vp9_filter[0]);
  const uint8x8_t tap1 = vmov_n_u8((uint8_t)vp9_filter[1]);
  unsigned int row, col;

  for (row = 0; row < output_height; ++row) {
    for (col = 0; col < output_width; col += 16) {
      const uint8x16_t s0 = vld1q_u8(&src_ptr[col]);
      const uint8x16_t s1 = vld1q_u8(&src_ptr[col + pixel_step]);
      // Blend each half, round-shift to 8 bits, then recombine.
      const uint16x8_t lo =
          vmlal_u8(vmull_u8(vget_low_u8(s0), tap0), vget_low_u8(s1), tap1);
      const uint16x8_t hi =
          vmlal_u8(vmull_u8(vget_high_u8(s0), tap0), vget_high_u8(s1), tap1);
      vst1q_u8(&output_ptr[col],
               vcombine_u8(vrshrn_n_u16(lo, FILTER_BITS),
                           vrshrn_n_u16(hi, FILTER_BITS)));
    }
    src_ptr += src_pixels_per_line;
    output_ptr += output_width;
  }
}
164
// Sub-pixel variance of an 8x8 block: a horizontal bilinear pass (selected
// by |xoffset|) writes an 8x9 intermediate, a vertical pass (|yoffset|)
// reduces it to 8x8, and the variance of that result against |dst| is
// returned with the SSE stored in |*sse|.
unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src,
                                            int src_stride,
                                            int xoffset,
                                            int yoffset,
                                            const uint8_t *dst,
                                            int dst_stride,
                                            unsigned int *sse) {
  // 16-byte-aligned scratch: temp2 holds the final 8x8 block, fdata3 the
  // 8x9 output of the first (horizontal) pass.
  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight8 * kWidth8);
  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight8PlusOne * kWidth8);

  var_filter_block2d_bil_w8(src, fdata3, src_stride, kPixelStepOne,
                            kHeight8PlusOne, kWidth8,
                            BILINEAR_FILTERS_2TAP(xoffset));
  // Second pass filters vertically: pixel_step equals the row pitch.
  var_filter_block2d_bil_w8(fdata3, temp2, kWidth8, kWidth8, kHeight8,
                            kWidth8, BILINEAR_FILTERS_2TAP(yoffset));
  return vp9_variance8x8_neon(temp2, kWidth8, dst, dst_stride, sse);
}
182
// Sub-pixel variance of a 16x16 block: horizontal bilinear pass into a
// 16x17 intermediate, vertical pass down to 16x16, then variance against
// |dst| (SSE stored in |*sse|).
unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
                                              int src_stride,
                                              int xoffset,
                                              int yoffset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  // 16-byte-aligned scratch for the two filter passes.
  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight16 * kWidth16);
  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight16PlusOne * kWidth16);

  var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne,
                             kHeight16PlusOne, kWidth16,
                             BILINEAR_FILTERS_2TAP(xoffset));
  // Second pass filters vertically: pixel_step equals the row pitch.
  var_filter_block2d_bil_w16(fdata3, temp2, kWidth16, kWidth16, kHeight16,
                             kWidth16, BILINEAR_FILTERS_2TAP(yoffset));
  return vp9_variance16x16_neon(temp2, kWidth16, dst, dst_stride, sse);
}
200
// 32x32 SSE/sum: thin wrapper delegating to the shared NEON kernel
// (32 * 32 = 1024 stays under the kernel's w * h < 2048 limit).
void vp9_get32x32var_neon(const uint8_t *src_ptr, int source_stride,
                          const uint8_t *ref_ptr, int ref_stride,
                          unsigned int *sse, int *sum) {
  variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride,
                   kWidth32, kHeight32, sse, sum);
}
207
// Variance of a 32x32 block: SSE minus the mean-correction term.
unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    unsigned int *sse) {
  int sum;
  int64_t sum_sq;
  variance_neon_w8(a, a_stride, b, b_stride, kWidth32, kHeight32, sse, &sum);
  sum_sq = (int64_t)sum * sum;
  return *sse - (sum_sq >> 10);  // >> 10 == / (32 * 32)
}
215
// Variance of a 32x64 block, split into two stacked 32x32 halves so each
// call stays under the w * h < 2048 limit of variance_neon_w8().
unsigned int vp9_variance32x64_neon(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    unsigned int *sse) {
  uint32_t sse_top, sse_bot;
  int sum_top, sum_bot, total;
  variance_neon_w8(a, a_stride, b, b_stride, kWidth32, kHeight32,
                   &sse_top, &sum_top);
  variance_neon_w8(a + kHeight32 * a_stride, a_stride,
                   b + kHeight32 * b_stride, b_stride, kWidth32, kHeight32,
                   &sse_bot, &sum_bot);
  *sse = sse_top + sse_bot;
  total = sum_top + sum_bot;
  return *sse - (((int64_t)total * total) >> 11);  // >> 11 == / (32 * 64)
}
229
// Variance of a 64x32 block, split into two stacked 64x16 strips so each
// call stays under the w * h < 2048 limit of variance_neon_w8().
unsigned int vp9_variance64x32_neon(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    unsigned int *sse) {
  uint32_t sse_top, sse_bot;
  int sum_top, sum_bot, total;
  variance_neon_w8(a, a_stride, b, b_stride, kWidth64, kHeight16,
                   &sse_top, &sum_top);
  variance_neon_w8(a + kHeight16 * a_stride, a_stride,
                   b + kHeight16 * b_stride, b_stride, kWidth64, kHeight16,
                   &sse_bot, &sum_bot);
  *sse = sse_top + sse_bot;
  total = sum_top + sum_bot;
  return *sse - (((int64_t)total * total) >> 11);  // >> 11 == / (64 * 32)
}
243
// Variance of a 64x64 block, accumulated over four stacked 64x16 strips so
// each call stays under the w * h < 2048 limit of variance_neon_w8().
unsigned int vp9_variance64x64_neon(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    unsigned int *sse) {
  uint32_t sse_strip, sse_total = 0;
  int sum_strip, sum_total = 0;
  int i;

  for (i = 0; i < 4; ++i) {
    variance_neon_w8(a + i * kHeight16 * a_stride, a_stride,
                     b + i * kHeight16 * b_stride, b_stride,
                     kWidth64, kHeight16, &sse_strip, &sum_strip);
    sse_total += sse_strip;
    sum_total += sum_strip;
  }
  *sse = sse_total;
  return *sse - (((int64_t)sum_total * sum_total) >> 12);  // >> 12 == / (64 * 64)
}
270
// Sub-pixel variance of a 32x32 block: horizontal bilinear pass into a
// 32x33 intermediate, vertical pass down to 32x32, then variance against
// |dst| (SSE stored in |*sse|).
unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
                                              int src_stride,
                                              int xoffset,
                                              int yoffset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  // 16-byte-aligned scratch for the two filter passes.
  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight32 * kWidth32);
  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight32PlusOne * kWidth32);

  var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne,
                             kHeight32PlusOne, kWidth32,
                             BILINEAR_FILTERS_2TAP(xoffset));
  // Second pass filters vertically: pixel_step equals the row pitch.
  var_filter_block2d_bil_w16(fdata3, temp2, kWidth32, kWidth32, kHeight32,
                             kWidth32, BILINEAR_FILTERS_2TAP(yoffset));
  return vp9_variance32x32_neon(temp2, kWidth32, dst, dst_stride, sse);
}
288
// Sub-pixel variance of a 64x64 block: horizontal bilinear pass into a
// 64x65 intermediate, vertical pass down to 64x64, then variance against
// |dst| (SSE stored in |*sse|).
unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src,
                                              int src_stride,
                                              int xoffset,
                                              int yoffset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  // 16-byte-aligned scratch for the two filter passes.
  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight64 * kWidth64);
  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight64PlusOne * kWidth64);

  var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne,
                             kHeight64PlusOne, kWidth64,
                             BILINEAR_FILTERS_2TAP(xoffset));
  // Second pass filters vertically: pixel_step equals the row pitch.
  var_filter_block2d_bil_w16(fdata3, temp2, kWidth64, kWidth64, kHeight64,
                             kWidth64, BILINEAR_FILTERS_2TAP(yoffset));
  return vp9_variance64x64_neon(temp2, kWidth64, dst, dst_stride, sse);
}