/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
11 #include <emmintrin.h> // SSE2
12 #include "vp9/common/vp9_loopfilter.h"
13 #include "vpx_ports/emmintrin_compat.h"
15 static INLINE __m128i abs_diff(__m128i a, __m128i b) {
16 return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
19 static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
21 const unsigned char *_blimit,
22 const unsigned char *_limit,
23 const unsigned char *_thresh) {
24 const __m128i zero = _mm_set1_epi16(0);
25 const __m128i one = _mm_set1_epi8(1);
26 const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
27 const __m128i limit = _mm_load_si128((const __m128i *)_limit);
28 const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
29 __m128i mask, hev, flat, flat2;
30 __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
33 q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
34 q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4),
35 (__m64 *)(s + 4 * p)));
36 q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
37 q3p3 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q3p3),
38 (__m64 *)(s + 3 * p)));
39 q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
40 q2p2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q2p2),
41 (__m64 *)(s + 2 * p)));
42 q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
43 q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1),
44 (__m64 *)(s + 1 * p)));
45 p1q1 = _mm_shuffle_epi32(q1p1, 78);
46 q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
47 q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0),
48 (__m64 *)(s - 0 * p)));
49 p0q0 = _mm_shuffle_epi32(q0p0, 78);
52 __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
53 abs_p1p0 = abs_diff(q1p1, q0p0);
54 abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
55 fe = _mm_set1_epi8(0xfe);
56 ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
57 abs_p0q0 = abs_diff(q0p0, p0q0);
58 abs_p1q1 = abs_diff(q1p1, p1q1);
59 flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
60 hev = _mm_subs_epu8(flat, thresh);
61 hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
63 abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
64 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
65 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
66 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
67 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
68 mask = _mm_max_epu8(abs_p1p0, mask);
69 // mask |= (abs(p1 - p0) > limit) * -1;
70 // mask |= (abs(q1 - q0) > limit) * -1;
72 work = _mm_max_epu8(abs_diff(q2p2, q1p1),
73 abs_diff(q3p3, q2p2));
74 mask = _mm_max_epu8(work, mask);
75 mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
76 mask = _mm_subs_epu8(mask, limit);
77 mask = _mm_cmpeq_epi8(mask, zero);
82 const __m128i t4 = _mm_set1_epi8(4);
83 const __m128i t3 = _mm_set1_epi8(3);
84 const __m128i t80 = _mm_set1_epi8(0x80);
85 const __m128i t1 = _mm_set1_epi16(0x1);
86 __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
87 __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
88 __m128i qs0 = _mm_xor_si128(p0q0, t80);
89 __m128i qs1 = _mm_xor_si128(p1q1, t80);
92 __m128i filter1, filter2;
93 __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
94 __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
96 filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
97 work_a = _mm_subs_epi8(qs0, qs0ps0);
98 filt = _mm_adds_epi8(filt, work_a);
99 filt = _mm_adds_epi8(filt, work_a);
100 filt = _mm_adds_epi8(filt, work_a);
101 // (vp9_filter + 3 * (qs0 - ps0)) & mask
102 filt = _mm_and_si128(filt, mask);
104 filter1 = _mm_adds_epi8(filt, t4);
105 filter2 = _mm_adds_epi8(filt, t3);
107 filter1 = _mm_unpacklo_epi8(zero, filter1);
108 filter1 = _mm_srai_epi16(filter1, 0xB);
109 filter2 = _mm_unpacklo_epi8(zero, filter2);
110 filter2 = _mm_srai_epi16(filter2, 0xB);
113 filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
114 qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
117 filt = _mm_adds_epi16(filter1, t1);
118 filt = _mm_srai_epi16(filt, 1);
119 filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
121 filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
122 qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
127 flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
128 flat = _mm_max_epu8(abs_p1p0, flat);
129 flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
130 flat = _mm_subs_epu8(flat, one);
131 flat = _mm_cmpeq_epi8(flat, zero);
132 flat = _mm_and_si128(flat, mask);
134 q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
135 q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5),
136 (__m64 *)(s + 5 * p)));
138 q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
139 q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6),
140 (__m64 *)(s + 6 * p)));
141 flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));
143 q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
144 q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7),
145 (__m64 *)(s + 7 * p)));
146 work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
147 flat2 = _mm_max_epu8(work, flat2);
148 flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
149 flat2 = _mm_subs_epu8(flat2, one);
150 flat2 = _mm_cmpeq_epi8(flat2, zero);
151 flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
154 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
155 // flat and wide flat calculations
157 const __m128i eight = _mm_set1_epi16(8);
158 const __m128i four = _mm_set1_epi16(4);
159 __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
160 __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
161 __m128i pixelFilter_p, pixelFilter_q;
162 __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
163 __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
165 p7_16 = _mm_unpacklo_epi8(q7p7, zero);;
166 p6_16 = _mm_unpacklo_epi8(q6p6, zero);
167 p5_16 = _mm_unpacklo_epi8(q5p5, zero);
168 p4_16 = _mm_unpacklo_epi8(q4p4, zero);
169 p3_16 = _mm_unpacklo_epi8(q3p3, zero);
170 p2_16 = _mm_unpacklo_epi8(q2p2, zero);
171 p1_16 = _mm_unpacklo_epi8(q1p1, zero);
172 p0_16 = _mm_unpacklo_epi8(q0p0, zero);
173 q0_16 = _mm_unpackhi_epi8(q0p0, zero);
174 q1_16 = _mm_unpackhi_epi8(q1p1, zero);
175 q2_16 = _mm_unpackhi_epi8(q2p2, zero);
176 q3_16 = _mm_unpackhi_epi8(q3p3, zero);
177 q4_16 = _mm_unpackhi_epi8(q4p4, zero);
178 q5_16 = _mm_unpackhi_epi8(q5p5, zero);
179 q6_16 = _mm_unpackhi_epi8(q6p6, zero);
180 q7_16 = _mm_unpackhi_epi8(q7p7, zero);
182 pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
183 _mm_add_epi16(p4_16, p3_16));
184 pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
185 _mm_add_epi16(q4_16, q3_16));
187 pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
188 pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
190 pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
191 pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
192 pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p,
194 pixetFilter_p2p1p0 = _mm_add_epi16(four,
195 _mm_add_epi16(pixetFilter_p2p1p0,
196 pixetFilter_q2q1q0));
197 res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
198 _mm_add_epi16(p7_16, p0_16)), 4);
199 res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
200 _mm_add_epi16(q7_16, q0_16)), 4);
201 flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
202 res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
203 _mm_add_epi16(p3_16, p0_16)), 3);
204 res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
205 _mm_add_epi16(q3_16, q0_16)), 3);
207 flat_q0p0 = _mm_packus_epi16(res_p, res_q);
209 sum_p7 = _mm_add_epi16(p7_16, p7_16);
210 sum_q7 = _mm_add_epi16(q7_16, q7_16);
211 sum_p3 = _mm_add_epi16(p3_16, p3_16);
212 sum_q3 = _mm_add_epi16(q3_16, q3_16);
214 pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
215 pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
216 res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
217 _mm_add_epi16(sum_p7, p1_16)), 4);
218 res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
219 _mm_add_epi16(sum_q7, q1_16)), 4);
220 flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
222 pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
223 pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
224 res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
225 _mm_add_epi16(sum_p3, p1_16)), 3);
226 res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
227 _mm_add_epi16(sum_q3, q1_16)), 3);
228 flat_q1p1 = _mm_packus_epi16(res_p, res_q);
230 sum_p7 = _mm_add_epi16(sum_p7, p7_16);
231 sum_q7 = _mm_add_epi16(sum_q7, q7_16);
232 sum_p3 = _mm_add_epi16(sum_p3, p3_16);
233 sum_q3 = _mm_add_epi16(sum_q3, q3_16);
235 pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
236 pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
237 res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
238 _mm_add_epi16(sum_p7, p2_16)), 4);
239 res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
240 _mm_add_epi16(sum_q7, q2_16)), 4);
241 flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
243 pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
244 pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
246 res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
247 _mm_add_epi16(sum_p3, p2_16)), 3);
248 res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
249 _mm_add_epi16(sum_q3, q2_16)), 3);
250 flat_q2p2 = _mm_packus_epi16(res_p, res_q);
252 sum_p7 = _mm_add_epi16(sum_p7, p7_16);
253 sum_q7 = _mm_add_epi16(sum_q7, q7_16);
254 pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
255 pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
256 res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
257 _mm_add_epi16(sum_p7, p3_16)), 4);
258 res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
259 _mm_add_epi16(sum_q7, q3_16)), 4);
260 flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
262 sum_p7 = _mm_add_epi16(sum_p7, p7_16);
263 sum_q7 = _mm_add_epi16(sum_q7, q7_16);
264 pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
265 pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
266 res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
267 _mm_add_epi16(sum_p7, p4_16)), 4);
268 res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
269 _mm_add_epi16(sum_q7, q4_16)), 4);
270 flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
272 sum_p7 = _mm_add_epi16(sum_p7, p7_16);
273 sum_q7 = _mm_add_epi16(sum_q7, q7_16);
274 pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
275 pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
276 res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
277 _mm_add_epi16(sum_p7, p5_16)), 4);
278 res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
279 _mm_add_epi16(sum_q7, q5_16)), 4);
280 flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
282 sum_p7 = _mm_add_epi16(sum_p7, p7_16);
283 sum_q7 = _mm_add_epi16(sum_q7, q7_16);
284 pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
285 pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
286 res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
287 _mm_add_epi16(sum_p7, p6_16)), 4);
288 res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
289 _mm_add_epi16(sum_q7, q6_16)), 4);
290 flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
293 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
295 flat = _mm_shuffle_epi32(flat, 68);
296 flat2 = _mm_shuffle_epi32(flat2, 68);
298 q2p2 = _mm_andnot_si128(flat, q2p2);
299 flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
300 q2p2 = _mm_or_si128(q2p2, flat_q2p2);
302 qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
303 flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
304 q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
306 qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
307 flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
308 q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
310 q6p6 = _mm_andnot_si128(flat2, q6p6);
311 flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
312 q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
313 _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
314 _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));
316 q5p5 = _mm_andnot_si128(flat2, q5p5);
317 flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
318 q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
319 _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
320 _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));
322 q4p4 = _mm_andnot_si128(flat2, q4p4);
323 flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
324 q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
325 _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
326 _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));
328 q3p3 = _mm_andnot_si128(flat2, q3p3);
329 flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
330 q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
331 _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
332 _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
334 q2p2 = _mm_andnot_si128(flat2, q2p2);
335 flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
336 q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
337 _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
338 _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));
340 q1p1 = _mm_andnot_si128(flat2, q1p1);
341 flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
342 q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
343 _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
344 _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));
346 q0p0 = _mm_andnot_si128(flat2, q0p0);
347 flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
348 q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
349 _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
350 _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
354 static INLINE __m128i filter_add2_sub2(const __m128i *const total,
355 const __m128i *const a1,
356 const __m128i *const a2,
357 const __m128i *const s1,
358 const __m128i *const s2) {
359 __m128i x = _mm_add_epi16(*a1, *total);
360 x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
364 static INLINE __m128i filter8_mask(const __m128i *const flat,
365 const __m128i *const other_filt,
366 const __m128i *const f8_lo,
367 const __m128i *const f8_hi) {
368 const __m128i f8 = _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3),
369 _mm_srli_epi16(*f8_hi, 3));
370 const __m128i result = _mm_and_si128(*flat, f8);
371 return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
374 static INLINE __m128i filter16_mask(const __m128i *const flat,
375 const __m128i *const other_filt,
376 const __m128i *const f_lo,
377 const __m128i *const f_hi) {
378 const __m128i f = _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4),
379 _mm_srli_epi16(*f_hi, 4));
380 const __m128i result = _mm_and_si128(*flat, f);
381 return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
384 static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
386 const unsigned char *_blimit,
387 const unsigned char *_limit,
388 const unsigned char *_thresh) {
389 const __m128i zero = _mm_set1_epi16(0);
390 const __m128i one = _mm_set1_epi8(1);
391 const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
392 const __m128i limit = _mm_load_si128((const __m128i *)_limit);
393 const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
394 __m128i mask, hev, flat, flat2;
396 __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
399 __m128i op2, op1, op0, oq0, oq1, oq2;
401 __m128i max_abs_p1p0q1q0;
403 p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
404 p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
405 p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
406 p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
407 p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
408 p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
409 p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
410 p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
411 q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
412 q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
413 q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
414 q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
415 q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
416 q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
417 q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
418 q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
421 const __m128i abs_p1p0 = abs_diff(p1, p0);
422 const __m128i abs_q1q0 = abs_diff(q1, q0);
423 const __m128i fe = _mm_set1_epi8(0xfe);
424 const __m128i ff = _mm_cmpeq_epi8(zero, zero);
425 __m128i abs_p0q0 = abs_diff(p0, q0);
426 __m128i abs_p1q1 = abs_diff(p1, q1);
428 max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
430 abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
431 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
432 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
433 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
434 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
435 mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
436 // mask |= (abs(p1 - p0) > limit) * -1;
437 // mask |= (abs(q1 - q0) > limit) * -1;
438 work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
439 mask = _mm_max_epu8(work, mask);
440 work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
441 mask = _mm_max_epu8(work, mask);
442 mask = _mm_subs_epu8(mask, limit);
443 mask = _mm_cmpeq_epi8(mask, zero);
448 work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
449 flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
450 work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
451 flat = _mm_max_epu8(work, flat);
452 work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
453 flat = _mm_subs_epu8(flat, one);
454 flat = _mm_cmpeq_epi8(flat, zero);
455 flat = _mm_and_si128(flat, mask);
456 flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
457 flat2 = _mm_max_epu8(work, flat2);
458 work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
459 flat2 = _mm_max_epu8(work, flat2);
460 work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0));
461 flat2 = _mm_max_epu8(work, flat2);
462 flat2 = _mm_subs_epu8(flat2, one);
463 flat2 = _mm_cmpeq_epi8(flat2, zero);
464 flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
467 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
470 const __m128i t4 = _mm_set1_epi8(4);
471 const __m128i t3 = _mm_set1_epi8(3);
472 const __m128i t80 = _mm_set1_epi8(0x80);
473 const __m128i te0 = _mm_set1_epi8(0xe0);
474 const __m128i t1f = _mm_set1_epi8(0x1f);
475 const __m128i t1 = _mm_set1_epi8(0x1);
476 const __m128i t7f = _mm_set1_epi8(0x7f);
477 const __m128i ff = _mm_cmpeq_epi8(t4, t4);
481 __m128i filter1, filter2;
483 op1 = _mm_xor_si128(p1, t80);
484 op0 = _mm_xor_si128(p0, t80);
485 oq0 = _mm_xor_si128(q0, t80);
486 oq1 = _mm_xor_si128(q1, t80);
488 hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);
489 hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
490 filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
492 work_a = _mm_subs_epi8(oq0, op0);
493 filt = _mm_adds_epi8(filt, work_a);
494 filt = _mm_adds_epi8(filt, work_a);
495 filt = _mm_adds_epi8(filt, work_a);
496 // (vp9_filter + 3 * (qs0 - ps0)) & mask
497 filt = _mm_and_si128(filt, mask);
498 filter1 = _mm_adds_epi8(filt, t4);
499 filter2 = _mm_adds_epi8(filt, t3);
502 work_a = _mm_cmpgt_epi8(zero, filter1);
503 filter1 = _mm_srli_epi16(filter1, 3);
504 work_a = _mm_and_si128(work_a, te0);
505 filter1 = _mm_and_si128(filter1, t1f);
506 filter1 = _mm_or_si128(filter1, work_a);
507 oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
510 work_a = _mm_cmpgt_epi8(zero, filter2);
511 filter2 = _mm_srli_epi16(filter2, 3);
512 work_a = _mm_and_si128(work_a, te0);
513 filter2 = _mm_and_si128(filter2, t1f);
514 filter2 = _mm_or_si128(filter2, work_a);
515 op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
518 filt = _mm_adds_epi8(filter1, t1);
519 work_a = _mm_cmpgt_epi8(zero, filt);
520 filt = _mm_srli_epi16(filt, 1);
521 work_a = _mm_and_si128(work_a, t80);
522 filt = _mm_and_si128(filt, t7f);
523 filt = _mm_or_si128(filt, work_a);
524 filt = _mm_andnot_si128(hev, filt);
525 op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
526 oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
529 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
532 const __m128i four = _mm_set1_epi16(4);
533 const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
534 const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
535 const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
536 const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
537 const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
538 const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
539 const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
540 const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
542 const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
543 const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
544 const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
545 const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
546 const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
547 const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
548 const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
549 const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
550 __m128i f8_lo, f8_hi;
552 f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
553 _mm_add_epi16(p3_lo, p2_lo));
554 f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
555 _mm_add_epi16(p2_lo, p1_lo));
556 f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
558 f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
559 _mm_add_epi16(p3_hi, p2_hi));
560 f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
561 _mm_add_epi16(p2_hi, p1_hi));
562 f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
564 op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
566 f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
567 f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
568 op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
570 f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
571 f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
572 op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
574 f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
575 f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
576 oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
578 f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
579 f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
580 oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
582 f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
583 f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
584 oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
587 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
588 // wide flat calculations
590 const __m128i eight = _mm_set1_epi16(8);
591 const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero);
592 const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
593 const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
594 const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
595 const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
596 const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
597 const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
598 const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
599 const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
600 const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
601 const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
602 const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
603 const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
604 const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
605 const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
606 const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero);
608 const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero);
609 const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
610 const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
611 const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
612 const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
613 const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
614 const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
615 const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
616 const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
617 const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
618 const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
619 const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
620 const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
621 const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
622 const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
623 const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero);
628 f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo); // p7 * 7
629 f_lo = _mm_add_epi16(_mm_slli_epi16(p6_lo, 1),
630 _mm_add_epi16(p4_lo, f_lo));
631 f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
632 _mm_add_epi16(p2_lo, p1_lo));
633 f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
634 f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo);
636 f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi); // p7 * 7
637 f_hi = _mm_add_epi16(_mm_slli_epi16(p6_hi, 1),
638 _mm_add_epi16(p4_hi, f_hi));
639 f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
640 _mm_add_epi16(p2_hi, p1_hi));
641 f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
642 f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);
644 p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
645 _mm_storeu_si128((__m128i *)(s - 7 * p), p6);
647 f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
648 f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
649 p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
650 _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
652 f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
653 f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
654 p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
655 _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
657 f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
658 f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
659 p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
660 _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
662 f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
663 f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
664 op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
665 _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
667 f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
668 f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
669 op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
670 _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
672 f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
673 f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
674 op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
675 _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
677 f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
678 f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
679 oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
680 _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
682 f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
683 f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
684 oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
685 _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
687 f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
688 f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
689 oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
690 _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
692 f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
693 f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
694 q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
695 _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
697 f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
698 f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
699 q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
700 _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
702 f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
703 f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
704 q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
705 _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
707 f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
708 f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
709 q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
710 _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
713 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
717 // TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly.
// Public entry point: dispatch to the 8-column kernel for a single
// 8-pixel unit (count == 1), otherwise to the 16-column kernel.
void vp9_lpf_horizontal_16_sse2(unsigned char *s, int p,
                                const unsigned char *_blimit,
                                const unsigned char *_limit,
                                const unsigned char *_thresh, int count) {
  if (count == 1)
    mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh);
  else
    mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh);
}
728 void vp9_lpf_horizontal_8_sse2(unsigned char *s, int p,
729 const unsigned char *_blimit,
730 const unsigned char *_limit,
731 const unsigned char *_thresh, int count) {
732 DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16);
733 DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16);
734 DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16);
735 DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16);
736 DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16);
737 DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16);
738 const __m128i zero = _mm_set1_epi16(0);
739 const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
740 const __m128i limit = _mm_load_si128((const __m128i *)_limit);
741 const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
742 __m128i mask, hev, flat;
743 __m128i p3, p2, p1, p0, q0, q1, q2, q3;
744 __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;
748 q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
749 _mm_loadl_epi64((__m128i *)(s + 3 * p)));
750 q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
751 _mm_loadl_epi64((__m128i *)(s + 2 * p)));
752 q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
753 _mm_loadl_epi64((__m128i *)(s + 1 * p)));
754 q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
755 _mm_loadl_epi64((__m128i *)(s - 0 * p)));
756 p1q1 = _mm_shuffle_epi32(q1p1, 78);
757 p0q0 = _mm_shuffle_epi32(q0p0, 78);
760 // filter_mask and hev_mask
761 const __m128i one = _mm_set1_epi8(1);
762 const __m128i fe = _mm_set1_epi8(0xfe);
763 const __m128i ff = _mm_cmpeq_epi8(fe, fe);
764 __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
765 abs_p1p0 = abs_diff(q1p1, q0p0);
766 abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
768 abs_p0q0 = abs_diff(q0p0, p0q0);
769 abs_p1q1 = abs_diff(q1p1, p1q1);
770 flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
771 hev = _mm_subs_epu8(flat, thresh);
772 hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
774 abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
775 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
776 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
777 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
778 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
779 mask = _mm_max_epu8(abs_p1p0, mask);
780 // mask |= (abs(p1 - p0) > limit) * -1;
781 // mask |= (abs(q1 - q0) > limit) * -1;
783 work = _mm_max_epu8(abs_diff(q2p2, q1p1),
784 abs_diff(q3p3, q2p2));
785 mask = _mm_max_epu8(work, mask);
786 mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
787 mask = _mm_subs_epu8(mask, limit);
788 mask = _mm_cmpeq_epi8(mask, zero);
792 flat = _mm_max_epu8(abs_diff(q2p2, q0p0),
793 abs_diff(q3p3, q0p0));
794 flat = _mm_max_epu8(abs_p1p0, flat);
795 flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
796 flat = _mm_subs_epu8(flat, one);
797 flat = _mm_cmpeq_epi8(flat, zero);
798 flat = _mm_and_si128(flat, mask);
802 const __m128i four = _mm_set1_epi16(4);
803 unsigned char *src = s;
805 __m128i workp_a, workp_b, workp_shft;
806 p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
807 p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
808 p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
809 p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
810 q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
811 q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
812 q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
813 q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
815 workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
816 workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
817 workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
818 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
819 _mm_storel_epi64((__m128i *)&flat_op2[0],
820 _mm_packus_epi16(workp_shft, workp_shft));
822 workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
823 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
824 _mm_storel_epi64((__m128i *)&flat_op1[0],
825 _mm_packus_epi16(workp_shft, workp_shft));
827 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
828 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
829 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
830 _mm_storel_epi64((__m128i *)&flat_op0[0],
831 _mm_packus_epi16(workp_shft, workp_shft));
833 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
834 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
835 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
836 _mm_storel_epi64((__m128i *)&flat_oq0[0],
837 _mm_packus_epi16(workp_shft, workp_shft));
839 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
840 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
841 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
842 _mm_storel_epi64((__m128i *)&flat_oq1[0],
843 _mm_packus_epi16(workp_shft, workp_shft));
845 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
846 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
847 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
848 _mm_storel_epi64((__m128i *)&flat_oq2[0],
849 _mm_packus_epi16(workp_shft, workp_shft));
854 const __m128i t4 = _mm_set1_epi8(4);
855 const __m128i t3 = _mm_set1_epi8(3);
856 const __m128i t80 = _mm_set1_epi8(0x80);
857 const __m128i t1 = _mm_set1_epi8(0x1);
858 const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
860 const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
862 const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)),
864 const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)),
868 __m128i filter1, filter2;
870 filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
871 work_a = _mm_subs_epi8(qs0, ps0);
872 filt = _mm_adds_epi8(filt, work_a);
873 filt = _mm_adds_epi8(filt, work_a);
874 filt = _mm_adds_epi8(filt, work_a);
875 // (vp9_filter + 3 * (qs0 - ps0)) & mask
876 filt = _mm_and_si128(filt, mask);
878 filter1 = _mm_adds_epi8(filt, t4);
879 filter2 = _mm_adds_epi8(filt, t3);
882 filter1 = _mm_unpacklo_epi8(zero, filter1);
883 filter1 = _mm_srai_epi16(filter1, 11);
884 filter1 = _mm_packs_epi16(filter1, filter1);
887 filter2 = _mm_unpacklo_epi8(zero, filter2);
888 filter2 = _mm_srai_epi16(filter2, 11);
889 filter2 = _mm_packs_epi16(filter2, zero);
892 filt = _mm_adds_epi8(filter1, t1);
893 filt = _mm_unpacklo_epi8(zero, filt);
894 filt = _mm_srai_epi16(filt, 9);
895 filt = _mm_packs_epi16(filt, zero);
897 filt = _mm_andnot_si128(hev, filt);
899 work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
900 q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
901 work_a = _mm_andnot_si128(flat, work_a);
902 q0 = _mm_and_si128(flat, q0);
903 q0 = _mm_or_si128(work_a, q0);
905 work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
906 q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
907 work_a = _mm_andnot_si128(flat, work_a);
908 q1 = _mm_and_si128(flat, q1);
909 q1 = _mm_or_si128(work_a, q1);
911 work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
912 q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
913 work_a = _mm_andnot_si128(flat, work_a);
914 q2 = _mm_and_si128(flat, q2);
915 q2 = _mm_or_si128(work_a, q2);
917 work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
918 p0 = _mm_loadl_epi64((__m128i *)flat_op0);
919 work_a = _mm_andnot_si128(flat, work_a);
920 p0 = _mm_and_si128(flat, p0);
921 p0 = _mm_or_si128(work_a, p0);
923 work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
924 p1 = _mm_loadl_epi64((__m128i *)flat_op1);
925 work_a = _mm_andnot_si128(flat, work_a);
926 p1 = _mm_and_si128(flat, p1);
927 p1 = _mm_or_si128(work_a, p1);
929 work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
930 p2 = _mm_loadl_epi64((__m128i *)flat_op2);
931 work_a = _mm_andnot_si128(flat, work_a);
932 p2 = _mm_and_si128(flat, p2);
933 p2 = _mm_or_si128(work_a, p2);
935 _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
936 _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
937 _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
938 _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
939 _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
940 _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
// Applies the 8-tap loop filter across a horizontal edge for two adjacent
// 8-pixel segments at once (16 pixels total).  The low 8 lanes use
// _blimit0/_limit0/_thresh0 and the high 8 lanes use _blimit1/_limit1/
// _thresh1.  s points at the first pixel of row q0; p is the row stride
// in bytes.  Reads rows s-4*p .. s+3*p and writes rows s-3*p .. s+2*p.
//
// NOTE(review): this excerpt elides some original lines (blank lines,
// scoped-block braces, the `work`/`filt`/`work_a` declarations, the
// two-iteration loop that defines `i` and advances `src` around the
// flat-filter section, and the wrapped `t80` operands of the ps/qs
// loads) — comments below describe only the visible code; confirm
// against the full file.
void vp9_lpf_horizontal_8_dual_sse2(uint8_t *s, int p,
                                    const uint8_t *_blimit0,
                                    const uint8_t *_limit0,
                                    const uint8_t *_thresh0,
                                    const uint8_t *_blimit1,
                                    const uint8_t *_limit1,
                                    const uint8_t *_thresh1) {
  // Scratch rows holding the strong ("flat") filter outputs before they
  // are blended with the 4-tap results.
  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16);
  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16);
  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16);
  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16);
  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16);
  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16);
  const __m128i zero = _mm_set1_epi16(0);
  // Per-half thresholds packed into one register: low 64 bits from the
  // *0 pointers, high 64 bits from the *1 pointers.
  const __m128i blimit =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
                         _mm_load_si128((const __m128i *)_blimit1));
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
                         _mm_load_si128((const __m128i *)_limit1));
  const __m128i thresh =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
                         _mm_load_si128((const __m128i *)_thresh1));

  __m128i mask, hev, flat;
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;

  // Load four rows above (p3..p0) and four below (q0..q3) the edge.
  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
  // |a - b| per byte via two saturating subtractions OR'd together.
  const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
                                        _mm_subs_epu8(p0, p1));
  const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
                                        _mm_subs_epu8(q0, q1));
  const __m128i one = _mm_set1_epi8(1);
  const __m128i fe = _mm_set1_epi8(0xfe);
  // All-ones constant (any value compared equal to itself).
  const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
  __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
                                  _mm_subs_epu8(q0, p0));
  __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
                                  _mm_subs_epu8(q1, p1));
  // filter_mask and hev_mask
  // hev: 0xff where max(|p1-p0|, |q1-q0|) > thresh (high-edge-variance).
  flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
  hev = _mm_subs_epu8(flat, thresh);
  hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

  abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
  abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
  mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
  mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
  mask = _mm_max_epu8(flat, mask);
  // mask |= (abs(p1 - p0) > limit) * -1;
  // mask |= (abs(q1 - q0) > limit) * -1;
  work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
                                   _mm_subs_epu8(p1, p2)),
                      _mm_or_si128(_mm_subs_epu8(p3, p2),
                                   _mm_subs_epu8(p2, p3)));
  mask = _mm_max_epu8(work, mask);
  work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
                                   _mm_subs_epu8(q1, q2)),
                      _mm_or_si128(_mm_subs_epu8(q3, q2),
                                   _mm_subs_epu8(q2, q3)));
  mask = _mm_max_epu8(work, mask);
  // mask becomes 0xff (filter this pixel) where the running max of the
  // neighbor differences is <= limit.
  mask = _mm_subs_epu8(mask, limit);
  mask = _mm_cmpeq_epi8(mask, zero);

  // flat: 0xff where all of |p2-p0|, |q2-q0|, |p3-p0|, |q3-q0| and the
  // inner differences are <= 1, and the pixel is also selected by mask;
  // such pixels take the strong filter below.
  work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
                                   _mm_subs_epu8(p0, p2)),
                      _mm_or_si128(_mm_subs_epu8(q2, q0),
                                   _mm_subs_epu8(q0, q2)));
  flat = _mm_max_epu8(work, flat);
  work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
                                   _mm_subs_epu8(p0, p3)),
                      _mm_or_si128(_mm_subs_epu8(q3, q0),
                                   _mm_subs_epu8(q0, q3)));
  flat = _mm_max_epu8(work, flat);
  flat = _mm_subs_epu8(flat, one);
  flat = _mm_cmpeq_epi8(flat, zero);
  flat = _mm_and_si128(flat, mask);

  // Strong-filter pass: widen 8 pixels per row to 16 bits and compute
  // the (sum + 4) >> 3 outputs for op2..oq2.  NOTE(review): the loop
  // header defining `i` (and advancing `src` by 8 per iteration) is
  // elided from this excerpt.
  const __m128i four = _mm_set1_epi16(4);
  unsigned char *src = s;
  __m128i workp_a, workp_b, workp_shft;
  p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
  p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
  p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
  p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
  q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
  q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
  q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
  q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);

  // Each successive output reuses the previous sums: subtract the
  // oldest tap, add the next one (sliding 8-tap window).
  workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
  workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
  workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
                   _mm_packus_epi16(workp_shft, workp_shft));

  workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
                   _mm_packus_epi16(workp_shft, workp_shft));

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
                   _mm_packus_epi16(workp_shft, workp_shft));

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
                   _mm_packus_epi16(workp_shft, workp_shft));

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
                   _mm_packus_epi16(workp_shft, workp_shft));

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
                   _mm_packus_epi16(workp_shft, workp_shft));

  // 4-tap filter.  Pixels are XOR'd with 0x80 so unsigned bytes can be
  // processed with signed-saturating arithmetic.
  const __m128i t4 = _mm_set1_epi8(4);
  const __m128i t3 = _mm_set1_epi8(3);
  const __m128i t80 = _mm_set1_epi8(0x80);
  const __m128i te0 = _mm_set1_epi8(0xe0);
  const __m128i t1f = _mm_set1_epi8(0x1f);
  const __m128i t1 = _mm_set1_epi8(0x1);
  const __m128i t7f = _mm_set1_epi8(0x7f);

  // NOTE(review): the wrapped second operand (t80) of each xor below is
  // on lines elided from this excerpt.
  const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
  const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
  const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
  const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
  __m128i filter1, filter2;

  // filt = clamp(ps1 - qs1) & hev, then add 3 * (qs0 - ps0), saturating.
  filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
  work_a = _mm_subs_epi8(qs0, ps0);
  filt = _mm_adds_epi8(filt, work_a);
  filt = _mm_adds_epi8(filt, work_a);
  filt = _mm_adds_epi8(filt, work_a);
  // (vp9_filter + 3 * (qs0 - ps0)) & mask
  filt = _mm_and_si128(filt, mask);

  filter1 = _mm_adds_epi8(filt, t4);
  filter2 = _mm_adds_epi8(filt, t3);

  // Arithmetic >> 3 on signed bytes, emulated with a 16-bit logical
  // shift plus sign fix-up (SSE2 has no 8-bit arithmetic shift).
  work_a = _mm_cmpgt_epi8(zero, filter1);
  filter1 = _mm_srli_epi16(filter1, 3);
  work_a = _mm_and_si128(work_a, te0);
  filter1 = _mm_and_si128(filter1, t1f);
  filter1 = _mm_or_si128(filter1, work_a);

  work_a = _mm_cmpgt_epi8(zero, filter2);
  filter2 = _mm_srli_epi16(filter2, 3);
  work_a = _mm_and_si128(work_a, te0);
  filter2 = _mm_and_si128(filter2, t1f);
  filter2 = _mm_or_si128(filter2, work_a);

  // filt = (filter1 + 1) >> 1 (same emulation), used only where hev off.
  filt = _mm_adds_epi8(filter1, t1);
  work_a = _mm_cmpgt_epi8(zero, filt);
  filt = _mm_srli_epi16(filt, 1);
  work_a = _mm_and_si128(work_a, t80);
  filt = _mm_and_si128(filt, t7f);
  filt = _mm_or_si128(filt, work_a);

  filt = _mm_andnot_si128(hev, filt);

  // Blend per byte: flat pixels take the strong-filter result from the
  // scratch rows, others take the 4-tap result (un-biased via xor t80).
  work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
  q0 = _mm_load_si128((__m128i *)flat_oq0);
  work_a = _mm_andnot_si128(flat, work_a);
  q0 = _mm_and_si128(flat, q0);
  q0 = _mm_or_si128(work_a, q0);

  work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
  q1 = _mm_load_si128((__m128i *)flat_oq1);
  work_a = _mm_andnot_si128(flat, work_a);
  q1 = _mm_and_si128(flat, q1);
  q1 = _mm_or_si128(work_a, q1);

  // q2/p2 change only where flat is set; elsewhere the original row is
  // kept unmodified.
  work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q2 = _mm_load_si128((__m128i *)flat_oq2);
  work_a = _mm_andnot_si128(flat, work_a);
  q2 = _mm_and_si128(flat, q2);
  q2 = _mm_or_si128(work_a, q2);

  work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
  p0 = _mm_load_si128((__m128i *)flat_op0);
  work_a = _mm_andnot_si128(flat, work_a);
  p0 = _mm_and_si128(flat, p0);
  p0 = _mm_or_si128(work_a, p0);

  work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
  p1 = _mm_load_si128((__m128i *)flat_op1);
  work_a = _mm_andnot_si128(flat, work_a);
  p1 = _mm_and_si128(flat, p1);
  p1 = _mm_or_si128(work_a, p1);

  work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p2 = _mm_load_si128((__m128i *)flat_op2);
  work_a = _mm_andnot_si128(flat, work_a);
  p2 = _mm_and_si128(flat, p2);
  p2 = _mm_or_si128(work_a, p2);

  // Write the six filtered rows back to the frame.
  _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
  _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
// Applies the simple 4-tap loop filter across a horizontal edge for two
// adjacent 8-pixel segments at once (16 pixels), with independent
// blimit/limit/thresh per half.  s points at the first pixel of row q0;
// p is the row stride.  Reads rows s-4*p .. s+3*p, writes s-2*p .. s+1*p.
//
// NOTE(review): this excerpt elides blank lines, scoped-block braces,
// the `work`/`filt`/`work_a` declarations, and the wrapped `t80`
// operands of the ps/qs loads — confirm against the full file.
void vp9_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
                                    const unsigned char *_blimit0,
                                    const unsigned char *_limit0,
                                    const unsigned char *_thresh0,
                                    const unsigned char *_blimit1,
                                    const unsigned char *_limit1,
                                    const unsigned char *_thresh1) {
  // Per-half thresholds: low 64 bits from *0, high 64 bits from *1.
  const __m128i blimit =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
                         _mm_load_si128((const __m128i *)_blimit1));
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
                         _mm_load_si128((const __m128i *)_limit1));
  const __m128i thresh =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
                         _mm_load_si128((const __m128i *)_thresh1));
  const __m128i zero = _mm_set1_epi16(0);
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
  __m128i mask, hev, flat;

  // Load four rows either side of the edge.
  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));

  // filter_mask and hev_mask
  // |a - b| per byte via saturating subtractions OR'd together.
  const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
                                        _mm_subs_epu8(p0, p1));
  const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
                                        _mm_subs_epu8(q0, q1));
  const __m128i fe = _mm_set1_epi8(0xfe);
  // All-ones constant.
  const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
  __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
                                  _mm_subs_epu8(q0, p0));
  __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
                                  _mm_subs_epu8(q1, p1));

  // hev: 0xff where max(|p1-p0|, |q1-q0|) > thresh.
  flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
  hev = _mm_subs_epu8(flat, thresh);
  hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

  abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
  abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
  mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
  mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
  mask = _mm_max_epu8(flat, mask);
  // mask |= (abs(p1 - p0) > limit) * -1;
  // mask |= (abs(q1 - q0) > limit) * -1;
  work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
                                   _mm_subs_epu8(p1, p2)),
                      _mm_or_si128(_mm_subs_epu8(p3, p2),
                                   _mm_subs_epu8(p2, p3)));
  mask = _mm_max_epu8(work, mask);
  work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
                                   _mm_subs_epu8(q1, q2)),
                      _mm_or_si128(_mm_subs_epu8(q3, q2),
                                   _mm_subs_epu8(q2, q3)));
  mask = _mm_max_epu8(work, mask);
  // mask becomes 0xff (filter) where the running max is <= limit.
  mask = _mm_subs_epu8(mask, limit);
  mask = _mm_cmpeq_epi8(mask, zero);

  // 4-tap filter constants; pixels are biased by 0x80 so unsigned bytes
  // can be handled with signed-saturating arithmetic.
  const __m128i t4 = _mm_set1_epi8(4);
  const __m128i t3 = _mm_set1_epi8(3);
  const __m128i t80 = _mm_set1_epi8(0x80);
  const __m128i te0 = _mm_set1_epi8(0xe0);
  const __m128i t1f = _mm_set1_epi8(0x1f);
  const __m128i t1 = _mm_set1_epi8(0x1);
  const __m128i t7f = _mm_set1_epi8(0x7f);

  // NOTE(review): the wrapped second operand (t80) of each xor below is
  // on lines elided from this excerpt.
  const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
  const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
  const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
  const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
  __m128i filter1, filter2;

  // filt = clamp(ps1 - qs1) & hev, plus 3 * (qs0 - ps0), saturating.
  filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
  work_a = _mm_subs_epi8(qs0, ps0);
  filt = _mm_adds_epi8(filt, work_a);
  filt = _mm_adds_epi8(filt, work_a);
  filt = _mm_adds_epi8(filt, work_a);
  // (vp9_filter + 3 * (qs0 - ps0)) & mask
  filt = _mm_and_si128(filt, mask);

  filter1 = _mm_adds_epi8(filt, t4);
  filter2 = _mm_adds_epi8(filt, t3);

  // Arithmetic >> 3 on signed bytes, emulated with a logical shift plus
  // sign fix-up (SSE2 has no 8-bit arithmetic shift).
  work_a = _mm_cmpgt_epi8(zero, filter1);
  filter1 = _mm_srli_epi16(filter1, 3);
  work_a = _mm_and_si128(work_a, te0);
  filter1 = _mm_and_si128(filter1, t1f);
  filter1 = _mm_or_si128(filter1, work_a);

  work_a = _mm_cmpgt_epi8(zero, filter2);
  filter2 = _mm_srli_epi16(filter2, 3);
  work_a = _mm_and_si128(work_a, te0);
  filter2 = _mm_and_si128(filter2, t1f);
  filter2 = _mm_or_si128(filter2, work_a);

  // filt = (filter1 + 1) >> 1, applied only where hev is off.
  filt = _mm_adds_epi8(filter1, t1);
  work_a = _mm_cmpgt_epi8(zero, filt);
  filt = _mm_srli_epi16(filt, 1);
  work_a = _mm_and_si128(work_a, t80);
  filt = _mm_and_si128(filt, t7f);
  filt = _mm_or_si128(filt, work_a);

  filt = _mm_andnot_si128(hev, filt);

  // Apply adjustments and remove the 0x80 bias.
  q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
  q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
  p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
  p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);

  // Write the four filtered rows back.
  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
// Transposes two vertically stacked 8x8 byte blocks (8 rows starting at
// in0, then 8 rows starting at in1, each with stride in_p) into one
// 8-row by 16-column block at out (stride out_p): output row r holds
// byte r of every input row, in0's rows in the low 8 bytes and in1's in
// the high 8.  Implemented as byte -> word -> dword -> qword interleaves.
static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
                                 int in_p, unsigned char *out, int out_p) {
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  __m128i x8, x9, x10, x11, x12, x13, x14, x15;

  // Load 8 bytes from each of the 8 rows of both input blocks
  // (x0..x7 from in0, x8..x15 from in1).
  x0 = _mm_loadl_epi64((__m128i *)in0);
  x8 = _mm_loadl_epi64((__m128i *)in1);
  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));
  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));
  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));
  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3*in_p));
  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3*in_p));
  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4*in_p));
  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4*in_p));
  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5*in_p));
  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5*in_p));
  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6*in_p));
  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6*in_p));
  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7*in_p));
  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7*in_p));

  // Stage 1: interleave bytes of adjacent row pairs.
  x0 = _mm_unpacklo_epi8(x0, x1);
  x1 = _mm_unpacklo_epi8(x2, x3);
  x2 = _mm_unpacklo_epi8(x4, x5);
  x3 = _mm_unpacklo_epi8(x6, x7);

  x8 = _mm_unpacklo_epi8(x8, x9);
  x9 = _mm_unpacklo_epi8(x10, x11);
  x10 = _mm_unpacklo_epi8(x12, x13);
  x11 = _mm_unpacklo_epi8(x14, x15);

  // Stage 2: interleave 16-bit units (low halves -> columns 0-3).
  x4 = _mm_unpacklo_epi16(x0, x1);
  x5 = _mm_unpacklo_epi16(x2, x3);
  x12 = _mm_unpacklo_epi16(x8, x9);
  x13 = _mm_unpacklo_epi16(x10, x11);

  // Stage 3: interleave 32-bit units, yielding whole output rows per
  // 64-bit half.
  x6 = _mm_unpacklo_epi32(x4, x5);
  x7 = _mm_unpackhi_epi32(x4, x5);
  x14 = _mm_unpacklo_epi32(x12, x13);
  x15 = _mm_unpackhi_epi32(x12, x13);

  // Store first 4-line result
  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));

  // Repeat stages 2-3 on the high halves for output rows 4-7.
  x4 = _mm_unpackhi_epi16(x0, x1);
  x5 = _mm_unpackhi_epi16(x2, x3);
  x12 = _mm_unpackhi_epi16(x8, x9);
  x13 = _mm_unpackhi_epi16(x10, x11);

  x6 = _mm_unpacklo_epi32(x4, x5);
  x7 = _mm_unpackhi_epi32(x4, x5);
  x14 = _mm_unpacklo_epi32(x12, x13);
  x15 = _mm_unpackhi_epi32(x12, x13);

  // Store second 4-line result
  _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
// Transposes num_8x8_to_transpose independent 8x8 byte blocks:
// block k is read from src[k] (stride in_p) and written transposed to
// dst[k] (stride out_p).
// NOTE(review): the loop header that declares/initializes `idx8x8` and
// opens the do-loop is elided from this excerpt; the closing
// `} while (...)` at the bottom matches it in the full file.
static INLINE void transpose(unsigned char *src[], int in_p,
                             unsigned char *dst[], int out_p,
                             int num_8x8_to_transpose) {
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
    unsigned char *in = src[idx8x8];
    unsigned char *out = dst[idx8x8];

    // Load the 8 source rows (8 bytes each).
    x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p));  // 00 01 02 03 04 05 06 07
    x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p));  // 10 11 12 13 14 15 16 17
    x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
    x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
    x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
    x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
    x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
    x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
    // Stage 1: byte interleave of row pairs.
    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    x0 = _mm_unpacklo_epi8(x0, x1);
    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    x1 = _mm_unpacklo_epi8(x2, x3);
    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
    x2 = _mm_unpacklo_epi8(x4, x5);
    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
    x3 = _mm_unpacklo_epi8(x6, x7);
    // Stage 2: 16-bit interleave (low halves -> columns 0-3).
    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    x4 = _mm_unpacklo_epi16(x0, x1);
    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
    x5 = _mm_unpacklo_epi16(x2, x3);
    // Stage 3: 32-bit interleave; each 64-bit half is an output row.
    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
    x6 = _mm_unpacklo_epi32(x4, x5);
    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
    x7 = _mm_unpackhi_epi32(x4, x5);
    // Store output rows 0-3 (low/high 64-bit halves of x6, x7).
    _mm_storel_pd((double *)(out + 0*out_p),
                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
    _mm_storeh_pd((double *)(out + 1*out_p),
                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
    _mm_storel_pd((double *)(out + 2*out_p),
                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
    _mm_storeh_pd((double *)(out + 3*out_p),
                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73
    // Repeat stages 2-3 on the high halves for columns 4-7.
    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
    x4 = _mm_unpackhi_epi16(x0, x1);
    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
    x5 = _mm_unpackhi_epi16(x2, x3);
    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
    x6 = _mm_unpacklo_epi32(x4, x5);
    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
    x7 = _mm_unpackhi_epi32(x4, x5);
    // Store output rows 4-7.
    _mm_storel_pd((double *)(out + 4*out_p),
                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
    _mm_storeh_pd((double *)(out + 5*out_p),
                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
    _mm_storel_pd((double *)(out + 6*out_p),
                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
    _mm_storeh_pd((double *)(out + 7*out_p),
                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
  } while (++idx8x8 < num_8x8_to_transpose);
// Vertical (column-edge) 4-tap dual loop filter: transposes the 16 rows
// by 8 columns straddling the vertical edge into a temp buffer, runs the
// horizontal dual filter there, then transposes the result back.
// NOTE(review): the src[0]/src[1]/dst[0] assignments are on lines elided
// from this excerpt; only the dst[1] assignment is visible below.
void vp9_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0,
                                  const uint8_t *thresh0,
                                  const uint8_t *blimit1,
                                  const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8);
  unsigned char *src[2];
  unsigned char *dst[2];

  // Transpose 16 rows of the 8 columns (4 either side of the edge) into
  // t_dst with stride 16.
  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

  // Filter the edge, which is now horizontal, 4 rows into t_dst.
  vp9_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
                                 blimit1, limit1, thresh1);

  dst[1] = s - 4 + p * 8;

  // Transpose the filtered pixels back into the frame.
  transpose(src, 16, dst, p, 2);
// Vertical 8-tap loop filter for a single 8-pixel column edge: transpose
// the 8x8 neighborhood into a temp buffer, filter it as a horizontal
// edge, and transpose back.  The horizontal filter is invoked with a
// count of 1 regardless of the `count` parameter.
// NOTE(review): the src[0]/dst[0] assignments are on lines elided from
// this excerpt.
void vp9_lpf_vertical_8_sse2(unsigned char *s, int p,
                             const unsigned char *blimit,
                             const unsigned char *limit,
                             const unsigned char *thresh, int count) {
  DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 8);
  unsigned char *src[1];
  unsigned char *dst[1];

  // 8x8 transpose into the temp buffer (stride 8).
  transpose(src, p, dst, 8, 1);

  // Filter the transposed (now horizontal) edge in the buffer middle.
  vp9_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1);

  // Transpose the filtered block back to the frame.
  transpose(src, 8, dst, p, 1);
// Vertical 8-tap dual loop filter: same transpose / horizontal-filter /
// transpose-back scheme as vp9_lpf_vertical_4_dual_sse2, but using the
// 8-tap dual horizontal filter.
// NOTE(review): the src[0]/src[1]/dst[0] assignments are on lines elided
// from this excerpt; only the dst[1] assignment is visible below.
void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0,
                                  const uint8_t *thresh0,
                                  const uint8_t *blimit1,
                                  const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8);
  unsigned char *src[2];
  unsigned char *dst[2];

  // Transpose 16 rows of the 8 columns around the edge into t_dst.
  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

  // Filter the transposed edge with per-half thresholds.
  vp9_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
                                 blimit1, limit1, thresh1);

  dst[1] = s - 4 + p * 8;

  // Transpose the filtered pixels back into the frame.
  transpose(src, 16, dst, p, 2);
// Vertical 16-wide (macroblock) loop filter for an 8-row column edge:
// the 16 columns around the edge are transposed as two 8x8 blocks into
// t_dst, the now-horizontal edge between them is filtered, and both
// halves are transposed back.
// NOTE(review): the src[0]/src[1]/dst[0] (and the post-filter dst[])
// assignments are on lines elided from this excerpt; only dst[1] and
// src[1] are visible below.
void vp9_lpf_vertical_16_sse2(unsigned char *s, int p,
                              const unsigned char *blimit,
                              const unsigned char *limit,
                              const unsigned char *thresh) {
  DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 16);
  unsigned char *src[2];
  unsigned char *dst[2];

  dst[1] = t_dst + 8 * 8;

  // Transpose two 8x8 blocks (left and right of the edge) into t_dst.
  transpose(src, p, dst, 8, 2);

  // Filter the horizontal edge between the two transposed halves.
  mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit, thresh);

  src[1] = t_dst + 8 * 8;

  // Transpose the filtered pixels back into the frame.
  transpose(src, 8, dst, p, 2);
1562 void vp9_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
1563 const uint8_t *blimit, const uint8_t *limit,
1564 const uint8_t *thresh) {
1565 DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
1568 transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
1569 transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
1572 mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit,
1576 transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
1577 transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);