/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include "./vp8_rtcd.h"
#include "vp8/common/mips/msa/vp8_macros_msa.h"
14 static void temporal_filter_apply_16size_msa(
15 uint8_t *frame1_ptr, uint32_t stride, uint8_t *frame2_ptr,
16 int32_t strength_in, int32_t filter_wt_in, uint32_t *acc, uint16_t *cnt) {
18 v16i8 frame1_0_b, frame1_1_b, frame2_0_b, frame2_1_b;
19 v16u8 frame_l, frame_h;
21 v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
22 v8i16 diff0, diff1, cnt0, cnt1;
23 v4i32 const3, const16, filter_wt, strength;
24 v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
25 v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
26 v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
27 v4i32 acc0, acc1, acc2, acc3;
29 filter_wt = __msa_fill_w(filter_wt_in);
30 strength = __msa_fill_w(strength_in);
31 const3 = __msa_ldi_w(3);
32 const16 = __msa_ldi_w(16);
34 for (row = 8; row--;) {
35 frame1_0_b = LD_SB(frame1_ptr);
36 frame2_0_b = LD_SB(frame2_ptr);
39 frame1_1_b = LD_SB(frame1_ptr);
40 frame2_1_b = LD_SB(frame2_ptr);
41 LD_SW2(acc, 4, acc0, acc1);
42 LD_SW2(acc + 8, 4, acc2, acc3);
43 LD_SH2(cnt, 8, cnt0, cnt1);
44 ILVRL_B2_UB(frame1_0_b, frame2_0_b, frame_l, frame_h);
45 HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
46 UNPCK_SH_SW(diff0, diff0_r, diff0_l);
47 UNPCK_SH_SW(diff1, diff1_r, diff1_l);
48 MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
49 mod0_w, mod1_w, mod2_w, mod3_w);
50 MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w,
51 mod1_w, mod2_w, mod3_w);
52 SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
53 diff0_r = (mod0_w < const16);
54 diff0_l = (mod1_w < const16);
55 diff1_r = (mod2_w < const16);
56 diff1_l = (mod3_w < const16);
57 SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
58 mod0_w, mod1_w, mod2_w, mod3_w);
59 mod0_w = diff0_r & mod0_w;
60 mod1_w = diff0_l & mod1_w;
61 mod2_w = diff1_r & mod2_w;
62 mod3_w = diff1_l & mod3_w;
63 MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
64 filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
65 PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h)
66 ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
67 ST_SH2(mod0_h, mod1_h, cnt, 8);
69 ILVRL_B2_SH(zero, frame2_0_b, frame2_0_h, frame2_1_h);
70 UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
71 UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
72 MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3,
73 mod0_w, mod1_w, mod2_w, mod3_w);
74 ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
76 ST_SW2(mod0_w, mod1_w, acc, 4);
77 ST_SW2(mod2_w, mod3_w, acc + 8, 4);
79 LD_SW2(acc, 4, acc0, acc1);
80 LD_SW2(acc + 8, 4, acc2, acc3);
81 LD_SH2(cnt, 8, cnt0, cnt1);
82 ILVRL_B2_UB(frame1_1_b, frame2_1_b, frame_l, frame_h);
83 HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
84 UNPCK_SH_SW(diff0, diff0_r, diff0_l);
85 UNPCK_SH_SW(diff1, diff1_r, diff1_l);
86 MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
87 mod0_w, mod1_w, mod2_w, mod3_w);
88 MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w,
89 mod1_w, mod2_w, mod3_w);
90 SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
91 diff0_r = (mod0_w < const16);
92 diff0_l = (mod1_w < const16);
93 diff1_r = (mod2_w < const16);
94 diff1_l = (mod3_w < const16);
95 SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
96 mod0_w, mod1_w, mod2_w, mod3_w);
97 mod0_w = diff0_r & mod0_w;
98 mod1_w = diff0_l & mod1_w;
99 mod2_w = diff1_r & mod2_w;
100 mod3_w = diff1_l & mod3_w;
101 MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
102 filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
103 PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
104 ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
105 ST_SH2(mod0_h, mod1_h, cnt, 8);
108 UNPCK_UB_SH(frame2_1_b, frame2_0_h, frame2_1_h);
109 UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
110 UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
111 MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3,
112 mod0_w, mod1_w, mod2_w, mod3_w);
113 ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
115 ST_SW2(mod0_w, mod1_w, acc, 4);
116 ST_SW2(mod2_w, mod3_w, acc + 8, 4);
118 frame1_ptr += stride;
123 static void temporal_filter_apply_8size_msa(
124 uint8_t *frame1_ptr, uint32_t stride, uint8_t *frame2_ptr,
125 int32_t strength_in, int32_t filter_wt_in, uint32_t *acc, uint16_t *cnt) {
127 uint64_t f0, f1, f2, f3, f4, f5, f6, f7;
128 v16i8 frame1 = { 0 };
129 v16i8 frame2 = { 0 };
130 v16i8 frame3 = { 0 };
131 v16i8 frame4 = { 0 };
132 v16u8 frame_l, frame_h;
133 v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
134 v8i16 diff0, diff1, cnt0, cnt1;
135 v4i32 const3, const16;
136 v4i32 filter_wt, strength;
137 v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
138 v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
139 v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
140 v4i32 acc0, acc1, acc2, acc3;
142 filter_wt = __msa_fill_w(filter_wt_in);
143 strength = __msa_fill_w(strength_in);
144 const3 = __msa_ldi_w(3);
145 const16 = __msa_ldi_w(16);
147 for (row = 2; row--;) {
148 LD2(frame1_ptr, stride, f0, f1);
149 frame1_ptr += (2 * stride);
150 LD2(frame2_ptr, 8, f2, f3);
152 LD2(frame1_ptr, stride, f4, f5);
153 frame1_ptr += (2 * stride);
154 LD2(frame2_ptr, 8, f6, f7);
157 LD_SW2(acc, 4, acc0, acc1);
158 LD_SW2(acc + 8, 4, acc2, acc3);
159 LD_SH2(cnt, 8, cnt0, cnt1);
160 INSERT_D2_SB(f0, f1, frame1);
161 INSERT_D2_SB(f2, f3, frame2);
162 INSERT_D2_SB(f4, f5, frame3);
163 INSERT_D2_SB(f6, f7, frame4);
164 ILVRL_B2_UB(frame1, frame2, frame_l, frame_h);
165 HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
166 UNPCK_SH_SW(diff0, diff0_r, diff0_l);
167 UNPCK_SH_SW(diff1, diff1_r, diff1_l);
168 MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
169 mod0_w, mod1_w, mod2_w, mod3_w);
170 MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w,
171 mod1_w, mod2_w, mod3_w);
172 SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
173 diff0_r = (mod0_w < const16);
174 diff0_l = (mod1_w < const16);
175 diff1_r = (mod2_w < const16);
176 diff1_l = (mod3_w < const16);
177 SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
178 mod0_w, mod1_w, mod2_w, mod3_w);
179 mod0_w = diff0_r & mod0_w;
180 mod1_w = diff0_l & mod1_w;
181 mod2_w = diff1_r & mod2_w;
182 mod3_w = diff1_l & mod3_w;
183 MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
184 filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
185 PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
186 ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
187 ST_SH2(mod0_h, mod1_h, cnt, 8);
190 UNPCK_UB_SH(frame2, frame2_0_h, frame2_1_h);
191 UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
192 UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
193 MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3,
194 mod0_w, mod1_w, mod2_w, mod3_w);
195 ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
197 ST_SW2(mod0_w, mod1_w, acc, 4);
198 ST_SW2(mod2_w, mod3_w, acc + 8, 4);
201 LD_SW2(acc, 4, acc0, acc1);
202 LD_SW2(acc + 8, 4, acc2, acc3);
203 LD_SH2(cnt, 8, cnt0, cnt1);
204 ILVRL_B2_UB(frame3, frame4, frame_l, frame_h);
205 HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
206 UNPCK_SH_SW(diff0, diff0_r, diff0_l);
207 UNPCK_SH_SW(diff1, diff1_r, diff1_l);
208 MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
209 mod0_w, mod1_w, mod2_w, mod3_w);
210 MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w,
211 mod1_w, mod2_w, mod3_w);
212 SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
213 diff0_r = (mod0_w < const16);
214 diff0_l = (mod1_w < const16);
215 diff1_r = (mod2_w < const16);
216 diff1_l = (mod3_w < const16);
217 SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
218 mod0_w, mod1_w, mod2_w, mod3_w);
219 mod0_w = diff0_r & mod0_w;
220 mod1_w = diff0_l & mod1_w;
221 mod2_w = diff1_r & mod2_w;
222 mod3_w = diff1_l & mod3_w;
223 MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
224 filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
225 PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
226 ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
227 ST_SH2(mod0_h, mod1_h, cnt, 8);
230 UNPCK_UB_SH(frame4, frame2_0_h, frame2_1_h);
231 UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
232 UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
233 MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3,
234 mod0_w, mod1_w, mod2_w, mod3_w);
235 ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
237 ST_SW2(mod0_w, mod1_w, acc, 4);
238 ST_SW2(mod2_w, mod3_w, acc + 8, 4);
/* Dispatch entry point for the VP8 temporal filter (MSA build).
 *
 * Routes 8x8 and 16x16 blocks to the MSA-accelerated kernels above and
 * falls back to a scalar implementation of the same per-pixel formula for
 * any other block size:
 *   modifier = clamp(round(3 * (frame1 - frame2)^2 >> strength), 0, 16)
 *   count[k]       += (16 - modifier) * filter_weight
 *   accumulator[k] += (16 - modifier) * filter_weight * frame2 pixel
 */
void vp8_temporal_filter_apply_msa(uint8_t *frame1, uint32_t stride,
                                   uint8_t *frame2, uint32_t block_size,
                                   int32_t strength, int32_t filter_weight,
                                   uint32_t *accumulator, uint16_t *count) {
  if (8 == block_size) {
    temporal_filter_apply_8size_msa(frame1, stride, frame2, strength,
                                    filter_weight, accumulator, count);
  } else if (16 == block_size) {
    temporal_filter_apply_16size_msa(frame1, stride, frame2, strength,
                                     filter_weight, accumulator, count);
  } else {
    uint32_t i, j, k;
    int32_t modifier;
    int32_t byte = 0;
    /* Pre-shift rounding term; 0 when strength is 0 (no shift). */
    const int32_t rounding = strength > 0 ? 1 << (strength - 1) : 0;

    for (i = 0, k = 0; i < block_size; ++i) {
      for (j = 0; j < block_size; ++j, ++k) {
        int src_byte = frame1[byte];
        int pixel_value = *frame2++;

        modifier = src_byte - pixel_value;
        modifier *= modifier;
        /* Scale by 3 to match the const3 multiply in the vector kernels. */
        modifier *= 3;
        modifier += rounding;
        modifier >>= strength;

        if (modifier > 16) modifier = 16;

        modifier = 16 - modifier;
        modifier *= filter_weight;

        count[k] += modifier;
        accumulator[k] += modifier * pixel_value;

        byte++;
      }

      /* Skip the remainder of the source row. */
      byte += stride - block_size;
    }
  }
}