2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
13 #include "vp8/encoder/denoising.h"
14 #include "vpx_mem/vpx_mem.h"
15 #include "./vp8_rtcd.h"
18 * The filter function was modified to reduce the computational complexity.
21 * Instead of applying tap coefficients for each pixel, we calculated the
22 * pixel adjustments vs. pixel diff value ahead of time.
23 * adjustment = filtered_value - current_raw
24 * = (filter_coefficient * diff + 128) >> 8
26 * filter_coefficient = (255 << 8) / (256 + ((abs_diff * 330) >> 3));
27 * filter_coefficient += filter_coefficient /
28 * (3 + motion_magnitude_adjustment);
29 * filter_coefficient is clamped to 0 ~ 255.
32 * The adjustment vs. diff curve becomes flat very quickly when diff increases.
33 * This allows us to use only a few levels to approximate the curve without
34 * changing the filtering algorithm too much.
35 * The adjustments were further corrected by checking the motion magnitude.
36 * The levels used are:
37 * diff level adjustment w/o adjustment w/
38 * motion correction motion correction
48 int vp8_denoiser_filter_neon(YV12_BUFFER_CONFIG *mc_running_avg,
49 YV12_BUFFER_CONFIG *running_avg,
50 MACROBLOCK *signal, unsigned int motion_magnitude,
51 int y_offset, int uv_offset) {
52 /* If motion_magnitude is small, make the denoiser more aggressive by
53 * increasing the adjustment for each level: the level 1 adjustment is
54 * increased, while the deltas stay the same.
56 const uint8x16_t v_level1_adjustment = vdupq_n_u8(
57 (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 : 3);
58 const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
59 const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
60 const uint8x16_t v_level1_threshold = vdupq_n_u8(4);
61 const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
62 const uint8x16_t v_level3_threshold = vdupq_n_u8(16);
64 /* Local variables for array pointers and strides. */
65 unsigned char *sig = signal->thismb;
67 unsigned char *mc_running_avg_y = mc_running_avg->y_buffer + y_offset;
68 int mc_running_avg_y_stride = mc_running_avg->y_stride;
69 unsigned char *running_avg_y = running_avg->y_buffer + y_offset;
70 int running_avg_y_stride = running_avg->y_stride;
71 int64x2_t v_sum_diff_total = vdupq_n_s64(0);
75 for (i = 0; i < 16; ++i) {
77 const uint8x16_t v_sig = vld1q_u8(sig);
78 const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
80 /* Calculate absolute difference and sign masks. */
81 const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y);
82 const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y);
83 const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y);
85 /* Figure out which level that put us in. */
86 const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold,
88 const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold,
90 const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold,
93 /* Calculate absolute adjustments for level 1, 2 and 3. */
94 const uint8x16_t v_level2_adjustment = vandq_u8(v_level2_mask,
95 v_delta_level_1_and_2);
96 const uint8x16_t v_level3_adjustment = vandq_u8(v_level3_mask,
97 v_delta_level_2_and_3);
98 const uint8x16_t v_level1and2_adjustment = vaddq_u8(v_level1_adjustment,
100 const uint8x16_t v_level1and2and3_adjustment = vaddq_u8(
101 v_level1and2_adjustment, v_level3_adjustment);
103 /* Compute the absolute value of the adjustment by selecting the absolute
104 * difference if in level 0, or else the value for level 1, 2 or 3.
106 const uint8x16_t v_abs_adjustment = vbslq_u8(v_level1_mask,
107 v_level1and2and3_adjustment, v_abs_diff);
109 /* Calculate positive and negative adjustments. Apply them to the signal
110 * and accumulate them. Adjustments are less than eight and the maximum
111 * sum of them (7 * 16) can fit in a signed char.
113 const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask,
115 const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask,
118 uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment);
119 v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment);
122 vst1q_u8(running_avg_y, v_running_avg_y);
124 /* Sum all the accumulators to have the sum of all pixel differences
125 * for this macroblock.
128 const int8x16_t v_sum_diff =
129 vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment),
130 vreinterpretq_s8_u8(v_neg_adjustment));
132 const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff);
134 const int32x4_t fedc_ba98_7654_3210 =
135 vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
137 const int64x2_t fedcba98_76543210 =
138 vpaddlq_s32(fedc_ba98_7654_3210);
140 v_sum_diff_total = vqaddq_s64(v_sum_diff_total, fedcba98_76543210);
143 /* Update pointers for next iteration. */
145 mc_running_avg_y += mc_running_avg_y_stride;
146 running_avg_y += running_avg_y_stride;
149 /* Too many adjustments => copy block. */
151 const int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total),
152 vget_low_s64(v_sum_diff_total));
153 const int s0 = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
155 if (s0 > SUM_DIFF_THRESHOLD)
159 /* Tell above level that block was filtered. */
160 running_avg_y -= running_avg_y_stride * 16;
161 sig -= sig_stride * 16;
163 vp8_copy_mem16x16(running_avg_y, running_avg_y_stride, sig, sig_stride);