2 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "libavcodec/mips/hevcdsp_mips.h"
23 #include "libavcodec/mips/hevc_macros_msa.h"
/* Byte-shuffle control patterns for the MSA VSHF instructions used by the
 * horizontal filters below: the first 16 entries pair neighbouring bytes
 * 0..8 of one source vector; the second 16 pair bytes across two source
 * vectors (indices >= 16 select from the second VSHF operand). */
25 static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
27 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
28 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
/* Combine two v8i16 prediction pairs for bi-pred output:
 * out = clip_0_255((vec + in) rounded-right-shifted by rnd_val),
 * using saturating 16-bit adds so the intermediate cannot wrap. */
31 #define HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1) \
33 ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1); \
34 SRARI_H2_SH(out0, out1, rnd_val); \
35 CLIP_SH2_0_255(out0, out1); \
/* Four-vector form of HEVC_BI_RND_CLIP2: processes (in0..in3, vec0..vec3)
 * as two independent pairs. */
38 #define HEVC_BI_RND_CLIP4(in0, in1, in2, in3, \
39 vec0, vec1, vec2, vec3, rnd_val, \
40 out0, out1, out2, out3) \
42 HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1); \
43 HEVC_BI_RND_CLIP2(in2, in3, vec2, vec3, rnd_val, out2, out3); \
/* Saturating variant of HEVC_BI_RND_CLIP2. In this revision the two
 * macros expand to the same adds/round/clip sequence; the separate name
 * is kept for the call sites that historically used the saturating
 * max-based clip. */
46 #define HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val, \
49 ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1); \
50 SRARI_H2_SH(out0, out1, rnd_val); \
51 CLIP_SH2_0_255(out0, out1); \
/* Four-vector form of HEVC_BI_RND_CLIP2_MAX_SATU: two independent pairs. */
54 #define HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2, \
55 vec3, rnd_val, out0, out1, out2, out3) \
57 HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val, out0, out1); \
58 HEVC_BI_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, rnd_val, out2, out3); \
/* Bi-prediction "copy" (no interpolation) for 4-pixel-wide blocks.
 * Widens the uint8 pels from src0_ptr to 16 bit and scales by 64 (<< 6),
 * adds the int16_t values from src1_ptr (presumably the other prediction
 * source's 14-bit intermediate — convention elsewhere in FFmpeg's HEVC
 * code; confirm against the callers), rounds by >> 7, clips to [0, 255],
 * packs back to bytes and stores 4-byte rows to dst.
 * Specialised branches handle height 4 and any multiple of 8 (the
 * height == 2 branch header falls outside this excerpt). */
61 static void hevc_bi_copy_4w_msa(const uint8_t *src0_ptr,
63 const int16_t *src1_ptr,
69 uint32_t loop_cnt, tp0, tp1, tp2, tp3;
70 uint64_t tpd0, tpd1, tpd2, tpd3;
71 v16i8 src0 = { 0 }, src1 = { 0 };
73 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
74 v8i16 dst0, dst1, dst2, dst3;
77 LW2(src0_ptr, src_stride, tp0, tp1);
78 INSERT_W2_SB(tp0, tp1, src0);
79 LD2(src1_ptr, src2_stride, tpd0, tpd1);
80 INSERT_D2_SH(tpd0, tpd1, in0);
82 dst0 = (v8i16) __msa_ilvr_b(zero, src0);
85 dst0 = __msa_srari_h(dst0, 7);
88 dst0 = (v8i16) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
89 ST_W2(dst0, 0, 1, dst, dst_stride);
90 } else if (4 == height) {
91 LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
92 INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
93 LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
94 INSERT_D2_SH(tpd0, tpd1, in0);
95 INSERT_D2_SH(tpd2, tpd3, in1);
96 ILVRL_B2_SH(zero, src0, dst0, dst1);
97 SLLI_2V(dst0, dst1, 6);
98 HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, dst0, dst1, 7, dst0, dst1);
99 dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
100 ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
101 } else if (0 == height % 8) {
/* Main loop: 8 rows per iteration, 4 rows packed per vector. */
102 for (loop_cnt = (height >> 3); loop_cnt--;) {
103 LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
104 src0_ptr += 4 * src_stride;
105 INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
106 LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
107 src0_ptr += 4 * src_stride;
108 INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
109 LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
110 src1_ptr += (4 * src2_stride);
111 INSERT_D2_SH(tpd0, tpd1, in0);
112 INSERT_D2_SH(tpd2, tpd3, in1);
113 LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
114 src1_ptr += (4 * src2_stride);
115 INSERT_D2_SH(tpd0, tpd1, in2);
116 INSERT_D2_SH(tpd2, tpd3, in3);
117 ILVRL_B2_SH(zero, src0, dst0, dst1);
118 ILVRL_B2_SH(zero, src1, dst2, dst3);
119 SLLI_4V(dst0, dst1, dst2, dst3, 6);
120 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2,
121 dst3, 7, dst0, dst1, dst2, dst3);
122 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
123 ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
124 dst += (8 * dst_stride);
/* Bi-prediction copy for 6-pixel-wide blocks: same widen/<<6/add/round/
 * clip pipeline as the 4w version. Each 6-pixel row is stored as one
 * 4-byte word plus one 2-byte halfword. The main loop handles 8 rows per
 * iteration; the tail below it (res = height & 7) emits the remaining
 * 2, 4 or 6 rows from the already-computed out0..out3 vectors. */
129 static void hevc_bi_copy_6w_msa(const uint8_t *src0_ptr,
131 const int16_t *src1_ptr,
138 uint64_t tp0, tp1, tp2, tp3;
139 int32_t res = height & 0x07;
140 v16u8 out0, out1, out2, out3;
142 v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
143 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
144 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
146 for (loop_cnt = (height >> 3); loop_cnt--;) {
147 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
148 src0_ptr += (4 * src_stride);
149 INSERT_D2_SB(tp0, tp1, src0);
150 INSERT_D2_SB(tp2, tp3, src1);
151 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
152 src0_ptr += (4 * src_stride);
153 INSERT_D2_SB(tp0, tp1, src2);
154 INSERT_D2_SB(tp2, tp3, src3);
155 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
156 src1_ptr += (8 * src2_stride);
157 ILVRL_B2_SH(zero, src0, dst0, dst1);
158 ILVRL_B2_SH(zero, src1, dst2, dst3);
159 ILVRL_B2_SH(zero, src2, dst4, dst5);
160 ILVRL_B2_SH(zero, src3, dst6, dst7);
161 SLLI_4V(dst0, dst1, dst2, dst3, 6);
162 SLLI_4V(dst4, dst5, dst6, dst7, 6);
163 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
164 7, dst0, dst1, dst2, dst3);
165 HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
166 7, dst4, dst5, dst6, dst7);
167 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
168 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
/* 6-wide store pattern: word (bytes 0-3) + halfword (bytes 4-5) per row. */
169 ST_W2(out0, 0, 2, dst, dst_stride);
170 ST_H2(out0, 2, 6, dst + 4, dst_stride);
171 ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
172 ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
173 dst += (4 * dst_stride);
174 ST_W2(out2, 0, 2, dst, dst_stride);
175 ST_H2(out2, 2, 6, dst + 4, dst_stride);
176 ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
177 ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
178 dst += (4 * dst_stride);
/* Tail: recompute one more batch, then store only 'res' rows of it. */
181 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
182 src0_ptr += (4 * src_stride);
183 INSERT_D2_SB(tp0, tp1, src0);
184 INSERT_D2_SB(tp2, tp3, src1);
185 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
186 INSERT_D2_SB(tp0, tp1, src2);
187 INSERT_D2_SB(tp2, tp3, src3);
188 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
189 ILVRL_B2_SH(zero, src0, dst0, dst1);
190 ILVRL_B2_SH(zero, src1, dst2, dst3);
191 ILVRL_B2_SH(zero, src2, dst4, dst5);
192 ILVRL_B2_SH(zero, src3, dst6, dst7);
193 SLLI_4V(dst0, dst1, dst2, dst3, 6);
194 SLLI_4V(dst4, dst5, dst6, dst7, 6);
195 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
196 7, dst0, dst1, dst2, dst3);
197 HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
198 7, dst4, dst5, dst6, dst7);
199 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
200 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
202 ST_W2(out0, 0, 2, dst, dst_stride);
203 ST_H2(out0, 2, 6, dst + 4, dst_stride);
204 } else if (res == 4) {
205 ST_W2(out0, 0, 2, dst, dst_stride);
206 ST_H2(out0, 2, 6, dst + 4, dst_stride);
207 ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
208 ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
/* else: res == 6 — store six rows. */
210 ST_W2(out0, 0, 2, dst, dst_stride);
211 ST_H2(out0, 2, 6, dst + 4, dst_stride);
212 ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
213 ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
214 dst += (4 * dst_stride);
215 ST_W2(out2, 0, 2, dst, dst_stride);
216 ST_H2(out2, 2, 6, dst + 4, dst_stride);
/* Bi-prediction copy for 8-pixel-wide blocks: widen src0 pels, << 6,
 * saturating-add src1 int16 values, round by 7, clip to [0, 255], pack
 * and store 8-byte rows. Dedicated branches for heights 2, 4, 6 and
 * multiples of 8 (the height == 2 branch header is outside this
 * excerpt). */
221 static void hevc_bi_copy_8w_msa(const uint8_t *src0_ptr,
223 const int16_t *src1_ptr,
229 uint64_t tp0, tp1, tp2, tp3;
230 v16u8 out0, out1, out2, out3;
231 v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
233 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
234 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
237 LD2(src0_ptr, src_stride, tp0, tp1);
238 INSERT_D2_SB(tp0, tp1, src0);
239 LD_SH2(src1_ptr, src2_stride, in0, in1);
240 ILVRL_B2_SH(zero, src0, dst0, dst1);
241 SLLI_2V(dst0, dst1, 6);
242 HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, dst0, dst1, 7, dst0, dst1);
243 out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
244 ST_D2(out0, 0, 1, dst, dst_stride);
245 } else if (4 == height) {
246 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
247 INSERT_D2_SB(tp0, tp1, src0);
248 INSERT_D2_SB(tp2, tp3, src1);
249 ILVRL_B2_SH(zero, src0, dst0, dst1);
250 ILVRL_B2_SH(zero, src1, dst2, dst3);
251 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
252 SLLI_4V(dst0, dst1, dst2, dst3, 6);
253 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
254 7, dst0, dst1, dst2, dst3);
255 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
256 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
257 } else if (6 == height) {
258 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
259 src0_ptr += 4 * src_stride;
260 INSERT_D2_SB(tp0, tp1, src0);
261 INSERT_D2_SB(tp2, tp3, src1);
262 LD2(src0_ptr, src_stride, tp0, tp1);
263 INSERT_D2_SB(tp0, tp1, src2);
264 ILVRL_B2_SH(zero, src0, dst0, dst1);
265 ILVRL_B2_SH(zero, src1, dst2, dst3);
266 ILVRL_B2_SH(zero, src2, dst4, dst5);
267 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
268 SLLI_4V(dst0, dst1, dst2, dst3, 6);
269 SLLI_2V(dst4, dst5, 6);
270 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
271 7, dst0, dst1, dst2, dst3);
272 HEVC_BI_RND_CLIP2_MAX_SATU(in4, in5, dst4, dst5, 7, dst4, dst5);
273 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
274 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
275 ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
276 } else if (0 == height % 8) {
/* Main loop: 8 rows per iteration, 2 rows per 128-bit vector. */
279 for (loop_cnt = (height >> 3); loop_cnt--;) {
280 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
281 src0_ptr += 4 * src_stride;
282 INSERT_D2_SB(tp0, tp1, src0);
283 INSERT_D2_SB(tp2, tp3, src1);
284 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
285 src0_ptr += 4 * src_stride;
286 INSERT_D2_SB(tp0, tp1, src2);
287 INSERT_D2_SB(tp2, tp3, src3);
288 ILVRL_B2_SH(zero, src0, dst0, dst1);
289 ILVRL_B2_SH(zero, src1, dst2, dst3);
290 ILVRL_B2_SH(zero, src2, dst4, dst5);
291 ILVRL_B2_SH(zero, src3, dst6, dst7);
292 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6,
294 src1_ptr += (8 * src2_stride);
295 SLLI_4V(dst0, dst1, dst2, dst3, 6);
296 SLLI_4V(dst4, dst5, dst6, dst7, 6);
297 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2,
298 dst3, 7, dst0, dst1, dst2, dst3);
299 HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6,
300 dst7, 7, dst4, dst5, dst6, dst7);
301 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
302 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
303 ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
304 dst += (8 * dst_stride);
/* Bi-prediction copy for 12-pixel-wide blocks: four rows per iteration.
 * The left 8 columns go through dst0..dst3, the right 4 columns of each
 * row pair are gathered with ILVL_W2 into dst4/dst5. The fixed
 * loop_cnt = 4 implies height 16 for this width — TODO confirm against
 * the dispatch table in the callers. */
309 static void hevc_bi_copy_12w_msa(const uint8_t *src0_ptr,
311 const int16_t *src1_ptr,
319 v16u8 out0, out1, out2;
320 v16i8 src0, src1, src2, src3;
321 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
322 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
324 for (loop_cnt = 4; loop_cnt--;) {
325 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
326 src0_ptr += (4 * src_stride);
328 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
329 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
330 src1_ptr += (4 * src2_stride);
331 ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
332 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, dst0, dst1,
334 SLLI_4V(dst0, dst1, dst2, dst3, 6);
335 ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
336 ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
337 SLLI_2V(dst4, dst5, 6);
338 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
339 7, dst0, dst1, dst2, dst3);
340 HEVC_BI_RND_CLIP2_MAX_SATU(in4, in5, dst4, dst5, 7, dst4, dst5);
341 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
342 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
343 ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
344 dst += (4 * dst_stride);
/* Bi-prediction copy for 16-pixel-wide blocks: four rows per iteration,
 * each 16-pel row split into right/left halves (_r/_l) by ILVRL, then
 * << 6, add, round by 7, clip, pack and store one full 16-byte vector
 * per row. */
348 static void hevc_bi_copy_16w_msa(const uint8_t *src0_ptr,
350 const int16_t *src1_ptr,
357 v16u8 out0, out1, out2, out3;
358 v16i8 src0, src1, src2, src3;
359 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
360 v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
363 for (loop_cnt = (height >> 2); loop_cnt--;) {
364 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
365 src0_ptr += (4 * src_stride);
366 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
367 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
368 src1_ptr += (4 * src2_stride);
369 ILVRL_B2_SH(zero, src0, dst0_r, dst0_l);
370 ILVRL_B2_SH(zero, src1, dst1_r, dst1_l);
371 ILVRL_B2_SH(zero, src2, dst2_r, dst2_l);
372 ILVRL_B2_SH(zero, src3, dst3_r, dst3_l);
373 SLLI_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
374 SLLI_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
375 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in4, in5, dst0_r, dst1_r, dst0_l,
376 dst1_l, 7, dst0_r, dst1_r, dst0_l, dst1_l);
377 HEVC_BI_RND_CLIP4_MAX_SATU(in2, in3, in6, in7, dst2_r, dst3_r, dst2_l,
378 dst3_l, 7, dst2_r, dst3_r, dst2_l, dst3_l);
379 PCKEV_B2_UB(dst0_l, dst0_r, dst1_l, dst1_r, out0, out1);
380 PCKEV_B2_UB(dst2_l, dst2_r, dst3_l, dst3_r, out2, out3);
381 ST_UB4(out0, out1, out2, out3, dst, dst_stride);
382 dst += (4 * dst_stride);
/* Bi-prediction copy for 24-pixel-wide blocks: four rows per iteration
 * (fixed loop_cnt = 8 implies height 32 — TODO confirm with callers).
 * Each row is a 16-byte vector plus an 8-byte remainder (src2/src3/
 * src6/src7 hold the +16 columns); stores are 16 bytes + one doubleword
 * per row. */
386 static void hevc_bi_copy_24w_msa(const uint8_t *src0_ptr,
388 const int16_t *src1_ptr,
395 v16u8 out0, out1, out2, out3, out4, out5;
396 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, zero = { 0 };
397 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
398 v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
400 for (loop_cnt = 8; loop_cnt--;) {
401 LD_SB4(src0_ptr, src_stride, src0, src1, src4, src5);
402 LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7);
403 src0_ptr += (4 * src_stride);
404 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
405 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
406 LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
407 src1_ptr += (4 * src2_stride);
409 ILVRL_B2_SH(zero, src0, dst0, dst1);
410 ILVRL_B2_SH(zero, src1, dst2, dst3);
411 ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
412 ILVRL_B2_SH(zero, src4, dst6, dst7);
413 ILVRL_B2_SH(zero, src5, dst8, dst9);
414 ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
415 SLLI_4V(dst0, dst1, dst2, dst3, 6);
416 SLLI_4V(dst4, dst5, dst6, dst7, 6);
417 SLLI_4V(dst8, dst9, dst10, dst11, 6);
418 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in4, in1, in5, dst0, dst1, dst2, dst3,
419 7, dst0, dst1, dst2, dst3);
420 HEVC_BI_RND_CLIP4_MAX_SATU(in8, in9, in2, in6, dst4, dst5, dst6, dst7,
421 7, dst4, dst5, dst6, dst7);
422 HEVC_BI_RND_CLIP4_MAX_SATU(in3, in7, in10, in11, dst8, dst9, dst10,
423 dst11, 7, dst8, dst9, dst10, dst11);
424 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
425 PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
426 ST_UB4(out0, out1, out3, out4, dst, dst_stride);
427 ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
428 dst += (4 * dst_stride);
/* Bi-prediction copy for 32-pixel-wide blocks: two rows per iteration,
 * two 16-byte vectors per row; same << 6 / add / round-7 / clip / pack
 * pipeline as the narrower widths. */
432 static void hevc_bi_copy_32w_msa(const uint8_t *src0_ptr,
434 const int16_t *src1_ptr,
441 v16u8 out0, out1, out2, out3;
442 v16i8 src0, src1, src2, src3;
444 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
445 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
447 for (loop_cnt = (height >> 1); loop_cnt--;) {
448 LD_SB2(src0_ptr, 16, src0, src1);
449 src0_ptr += src_stride;
450 LD_SB2(src0_ptr, 16, src2, src3);
451 src0_ptr += src_stride;
452 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
453 src1_ptr += src2_stride;
454 LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
455 src1_ptr += src2_stride;
457 ILVRL_B2_SH(zero, src0, dst0, dst1);
458 ILVRL_B2_SH(zero, src1, dst2, dst3);
459 ILVRL_B2_SH(zero, src2, dst4, dst5);
460 ILVRL_B2_SH(zero, src3, dst6, dst7);
461 SLLI_4V(dst0, dst1, dst2, dst3, 6);
462 SLLI_4V(dst4, dst5, dst6, dst7, 6);
463 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
464 7, dst0, dst1, dst2, dst3);
465 HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
466 7, dst4, dst5, dst6, dst7);
467 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
468 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
469 ST_UB2(out0, out1, dst, 16);
471 ST_UB2(out2, out3, dst, 16);
/* Bi-prediction copy for 48-pixel-wide blocks: two rows per iteration,
 * three 16-byte vectors per row (stored as 2x16 bytes + one 16-byte
 * tail at +32). */
476 static void hevc_bi_copy_48w_msa(const uint8_t *src0_ptr,
478 const int16_t *src1_ptr,
485 v16u8 out0, out1, out2, out3, out4, out5;
486 v16i8 src0, src1, src2, src3, src4, src5;
488 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
489 v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
491 for (loop_cnt = (height >> 1); loop_cnt--;) {
492 LD_SB3(src0_ptr, 16, src0, src1, src2);
493 src0_ptr += src_stride;
494 LD_SB3(src0_ptr, 16, src3, src4, src5);
495 src0_ptr += src_stride;
497 LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
498 src1_ptr += src2_stride;
499 LD_SH6(src1_ptr, 8, in6, in7, in8, in9, in10, in11);
500 src1_ptr += src2_stride;
502 ILVRL_B2_SH(zero, src0, dst0, dst1);
503 ILVRL_B2_SH(zero, src1, dst2, dst3);
504 ILVRL_B2_SH(zero, src2, dst4, dst5);
505 ILVRL_B2_SH(zero, src3, dst6, dst7);
506 ILVRL_B2_SH(zero, src4, dst8, dst9);
507 ILVRL_B2_SH(zero, src5, dst10, dst11);
509 SLLI_4V(dst0, dst1, dst2, dst3, 6);
510 SLLI_4V(dst4, dst5, dst6, dst7, 6);
511 SLLI_4V(dst8, dst9, dst10, dst11, 6);
513 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
514 7, dst0, dst1, dst2, dst3);
515 HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
516 7, dst4, dst5, dst6, dst7);
517 HEVC_BI_RND_CLIP4_MAX_SATU(in8, in9, in10, in11, dst8, dst9, dst10,
518 dst11, 7, dst8, dst9, dst10, dst11);
519 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
520 PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
521 ST_UB2(out0, out1, dst, 16);
522 ST_UB(out2, dst + 32);
524 ST_UB2(out3, out4, dst, 16);
525 ST_UB(out5, dst + 32);
/* Bi-prediction copy for 64-pixel-wide blocks: one full row per
 * iteration, four 16-byte vectors wide. */
530 static void hevc_bi_copy_64w_msa(const uint8_t *src0_ptr,
532 const int16_t *src1_ptr,
539 v16u8 out0, out1, out2, out3;
540 v16i8 src0, src1, src2, src3;
542 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
543 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
545 for (loop_cnt = height; loop_cnt--;) {
546 LD_SB4(src0_ptr, 16, src0, src1, src2, src3);
547 src0_ptr += src_stride;
548 LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
549 src1_ptr += src2_stride;
551 ILVRL_B2_SH(zero, src0, dst0, dst1);
552 ILVRL_B2_SH(zero, src1, dst2, dst3);
553 ILVRL_B2_SH(zero, src2, dst4, dst5);
554 ILVRL_B2_SH(zero, src3, dst6, dst7);
555 SLLI_4V(dst0, dst1, dst2, dst3, 6);
556 SLLI_4V(dst4, dst5, dst6, dst7, 6);
557 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
558 7, dst0, dst1, dst2, dst3);
559 HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
560 7, dst4, dst5, dst6, dst7);
561 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
562 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
564 ST_UB4(out0, out1, out2, out3, dst, 16);
/* Horizontal 8-tap luma filter + bi-prediction combine, 4-pixel width.
 * The eight taps are splatted from 'filter'; source bytes are biased to
 * signed with XOR 128, shuffled via the cross-vector masks from
 * ff_hevc_mask_arr[16] (two rows per vector at this width), and
 * accumulated with signed dot-product adds. The filtered result is
 * added to the int16_t second source, rounded by 7 and clipped to
 * [0, 255]. Main loop covers multiples of 8 rows; the tail after it
 * stores the remaining 2, 4 or 6 rows (res = height & 7). */
569 static void hevc_hz_bi_8t_4w_msa(const uint8_t *src0_ptr,
571 const int16_t *src1_ptr,
575 const int8_t *filter,
579 int32_t res = height & 0x07;
580 v8i16 filt0, filt1, filt2, filt3;
581 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
582 v16i8 mask1, mask2, mask3;
583 v16i8 vec0, vec1, vec2, vec3;
584 v8i16 dst0, dst1, dst2, dst3;
585 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
586 v8i16 filter_vec, const_vec;
587 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
591 /* rearranging filter */
592 filter_vec = LD_SH(filter);
593 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
599 const_vec = __msa_ldi_h(128);
602 for (loop_cnt = (height >> 3); loop_cnt--;) {
603 LD_SB8(src0_ptr, src_stride, src0, src1, src2, src3,
604 src4, src5, src6, src7);
605 src0_ptr += (8 * src_stride);
606 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
607 src1_ptr += (8 * src2_stride);
609 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
610 ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
611 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
/* One VSHF+DPADD pass per filter-tap pair (mask0..mask3). */
617 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
618 VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec2, vec3);
619 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
621 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec0, vec1);
622 VSHF_B2_SB(src4, src5, src6, src7, mask1, mask1, vec2, vec3);
623 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
625 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
626 VSHF_B2_SB(src4, src5, src6, src7, mask2, mask2, vec2, vec3);
627 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
629 VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec0, vec1);
630 VSHF_B2_SB(src4, src5, src6, src7, mask3, mask3, vec2, vec3);
631 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
634 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
635 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
637 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
638 ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
639 dst += (8 * dst_stride);
/* Tail: same filtering once more, then store only 'res' rows. */
642 LD_SB8(src0_ptr, src_stride, src0, src1, src2, src3,
643 src4, src5, src6, src7);
644 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
646 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
647 ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
648 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
654 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
655 VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec2, vec3);
656 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
658 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec0, vec1);
659 VSHF_B2_SB(src4, src5, src6, src7, mask1, mask1, vec2, vec3);
660 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
662 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
663 VSHF_B2_SB(src4, src5, src6, src7, mask2, mask2, vec2, vec3);
664 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
666 VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec0, vec1);
667 VSHF_B2_SB(src4, src5, src6, src7, mask3, mask3, vec2, vec3);
668 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
671 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
672 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
674 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
676 ST_W2(dst0, 0, 1, dst, dst_stride);
677 } else if (res == 4) {
678 ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
680 ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
681 dst += (4 * dst_stride);
682 ST_W2(dst1, 0, 1, dst, dst_stride);
/* Horizontal 8-tap filter + bi-prediction combine, 8-pixel width:
 * four rows per iteration, one vector per row, single-vector shuffle
 * masks from ff_hevc_mask_arr[0]. Same signed-bias / VSHF / DPADD /
 * round-7 / clip pipeline as the 4w variant. */
687 static void hevc_hz_bi_8t_8w_msa(const uint8_t *src0_ptr,
689 const int16_t *src1_ptr,
693 const int8_t *filter,
697 v8i16 filt0, filt1, filt2, filt3;
698 v16i8 src0, src1, src2, src3;
699 v16i8 mask1, mask2, mask3;
700 v16i8 vec0, vec1, vec2, vec3;
701 v8i16 dst0, dst1, dst2, dst3;
702 v8i16 in0, in1, in2, in3;
703 v8i16 filter_vec, const_vec;
704 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
708 const_vec = __msa_ldi_h(128);
711 filter_vec = LD_SH(filter);
712 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
718 for (loop_cnt = (height >> 2); loop_cnt--;) {
719 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
720 src0_ptr += (4 * src_stride);
721 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
722 src1_ptr += (4 * src2_stride);
723 XORI_B4_128_SB(src0, src1, src2, src3);
729 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
730 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
731 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
733 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
734 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
735 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
737 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
738 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
739 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
741 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
742 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
743 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
746 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
747 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
749 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
750 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
751 dst += (4 * dst_stride);
/* Horizontal 8-tap filter + bi-prediction combine, 12-pixel width:
 * two rows per iteration. The left 8 columns of each row use the
 * single-vector masks (mask0..3); the right 4 columns of both rows are
 * filtered together through the cross-vector masks (mask4..7 from
 * ff_hevc_mask_arr[16]) into dst2. Fixed loop_cnt = 8 implies height
 * 16 — TODO confirm with callers. Results are stored via scalar
 * copy_s_d/copy_s_w extracts (stores fall outside this excerpt). */
755 static void hevc_hz_bi_8t_12w_msa(const uint8_t *src0_ptr,
757 const int16_t *src1_ptr,
761 const int8_t *filter,
767 v16i8 src0, src1, src2, src3;
768 v16i8 vec0, vec1, vec2;
769 v8i16 filt0, filt1, filt2, filt3;
770 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
771 v8i16 dst0, dst1, dst2;
772 v8i16 in0, in1, in2, in3;
773 v8i16 filter_vec, const_vec;
776 const_vec = __msa_ldi_h(128);
779 filter_vec = LD_SH(filter);
780 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
782 mask0 = LD_SB(ff_hevc_mask_arr);
786 mask4 = LD_SB(&ff_hevc_mask_arr[16]);
791 for (loop_cnt = 8; loop_cnt--;) {
792 LD_SB2(src0_ptr, 8, src0, src1);
793 src0_ptr += src_stride;
794 LD_SB2(src0_ptr, 8, src2, src3);
795 src0_ptr += src_stride;
796 LD_SH2(src1_ptr, 8, in0, in1);
797 src1_ptr += src2_stride;
798 LD_SH2(src1_ptr, 8, in2, in3);
799 src1_ptr += src2_stride;
800 XORI_B4_128_SB(src0, src1, src2, src3);
806 VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask0, mask4, mask0,
808 DPADD_SB2_SH(vec0, vec1, filt0, filt0, dst0, dst1);
809 dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt0);
810 VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask1, mask5, mask1,
812 DPADD_SB2_SH(vec0, vec1, filt1, filt1, dst0, dst1);
813 dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt1);
814 VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask2, mask6, mask2,
816 DPADD_SB2_SH(vec0, vec1, filt2, filt2, dst0, dst1);
817 dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt2);
818 VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask3, mask7, mask3,
820 DPADD_SB2_SH(vec0, vec1, filt3, filt3, dst0, dst1);
821 dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt3);
/* Pair the two rows' right-half inputs, then combine/round/clip. */
823 in1 = (v8i16) __msa_pckev_d((v2i64) in3, (v2i64) in1);
824 HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
825 dst2 = __msa_adds_s_h(in2, dst2);
826 dst2 = __msa_srari_h(dst2, 7);
828 PCKEV_B2_SH(dst1, dst0, dst2, dst2, dst0, dst1);
830 tmp2 = __msa_copy_s_d((v2i64) dst0, 0);
831 tmp0 = __msa_copy_s_w((v4i32) dst0, 2);
832 tmp3 = __msa_copy_s_d((v2i64) dst1, 0);
833 tmp1 = __msa_copy_s_w((v4i32) dst0, 3);
/* Horizontal 8-tap filter + bi-prediction combine, 16-pixel width:
 * two rows per iteration, each row loaded as two overlapping 8-wide
 * vectors and filtered with the single-vector masks; one 16-byte store
 * per row. */
843 static void hevc_hz_bi_8t_16w_msa(const uint8_t *src0_ptr,
845 const int16_t *src1_ptr,
849 const int8_t *filter,
853 v16i8 src0, src1, src2, src3;
854 v8i16 filt0, filt1, filt2, filt3;
855 v16i8 mask1, mask2, mask3;
856 v16i8 vec0, vec1, vec2, vec3;
857 v8i16 dst0, dst1, dst2, dst3;
858 v8i16 in0, in1, in2, in3;
859 v8i16 filter_vec, const_vec;
860 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
863 const_vec = __msa_ldi_h(128);
866 filter_vec = LD_SH(filter);
867 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
873 for (loop_cnt = (height >> 1); loop_cnt--;) {
874 LD_SB2(src0_ptr, 8, src0, src1);
875 src0_ptr += src_stride;
876 LD_SB2(src0_ptr, 8, src2, src3);
877 src0_ptr += src_stride;
878 LD_SH2(src1_ptr, 8, in0, in1);
879 src1_ptr += src2_stride;
880 LD_SH2(src1_ptr, 8, in2, in3);
881 src1_ptr += src2_stride;
882 XORI_B4_128_SB(src0, src1, src2, src3);
888 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
889 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
890 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
892 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
893 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
894 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
896 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
897 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
898 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
900 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
901 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
902 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
905 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
906 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
908 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
909 ST_SH2(dst0, dst1, dst, dst_stride);
910 dst += (2 * dst_stride);
/* Horizontal 8-tap filter + bi-prediction combine, 24-pixel width:
 * one row per iteration. The 24 output pixels come from src0 (left 16,
 * via single-vector masks) and the src0/src1 seam plus src1 (right 8,
 * via the cross-vector masks 4..7); the VSHF/DPADD calls interleave
 * work for dst0/dst1/dst2 to keep all accumulators busy. Stored as a
 * 16-byte vector plus an 8-byte scalar store at +16. */
914 static void hevc_hz_bi_8t_24w_msa(const uint8_t *src0_ptr,
916 const int16_t *src1_ptr,
920 const int8_t *filter,
925 v16i8 src0, src1, tmp0, tmp1;
926 v8i16 filt0, filt1, filt2, filt3;
927 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
928 v16i8 vec0, vec1, vec2, vec3;
929 v8i16 dst0, dst1, dst2;
931 v8i16 filter_vec, const_vec;
932 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
/* Start 3 bytes left of the block: the 8-tap window needs context. */
934 src0_ptr = src0_ptr - 3;
935 const_vec = __msa_ldi_h(128);
938 filter_vec = LD_SH(filter);
939 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
949 for (loop_cnt = height; loop_cnt--;) {
950 LD_SB2(src0_ptr, 16, src0, src1);
951 src0_ptr += src_stride;
952 LD_SH2(src1_ptr, 8, in0, in1);
953 in2 = LD_SH(src1_ptr + 16);
954 src1_ptr += src2_stride;
955 XORI_B2_128_SB(src0, src1);
960 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
961 VSHF_B2_SB(src1, src1, src0, src0, mask0, mask1, vec2, vec3);
962 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt1, dst0,
964 VSHF_B2_SB(src0, src1, src1, src1, mask5, mask1, vec0, vec1);
965 VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec2, vec3);
966 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt2, filt2, dst1,
968 VSHF_B2_SB(src1, src1, src0, src0, mask2, mask3, vec0, vec1);
969 VSHF_B2_SB(src0, src1, src1, src1, mask7, mask3, vec2, vec3);
970 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt3, filt3, filt3, dst2,
973 HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
974 dst2 = __msa_adds_s_h(dst2, in2);
975 dst2 = __msa_srari_h(dst2, 7);
978 PCKEV_B2_SB(dst1, dst0, dst2, dst2, tmp0, tmp1);
979 dst_val0 = __msa_copy_u_d((v2i64) tmp1, 0);
981 SD(dst_val0, dst + 16);
/* Horizontal 8-tap filter + bi-prediction combine, 32-pixel width:
 * one row per iteration. src0/src1 carry the row's two 16-byte halves
 * and src2 (loaded at +24) provides the right-edge context bytes the
 * 8-tap window needs; masks 4..7 shuffle across the src0/src1 seam.
 * Output is two 16-byte stores per row. */
986 static void hevc_hz_bi_8t_32w_msa(const uint8_t *src0_ptr,
988 const int16_t *src1_ptr,
992 const int8_t *filter,
996 v16i8 src0, src1, src2, tmp0, tmp1;
997 v8i16 filt0, filt1, filt2, filt3;
998 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
999 v16i8 vec0, vec1, vec2, vec3;
1000 v8i16 dst0, dst1, dst2, dst3;
1001 v8i16 in0, in1, in2, in3;
1002 v8i16 filter_vec, const_vec;
1003 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1006 const_vec = __msa_ldi_h(128);
1009 filter_vec = LD_SH(filter);
1010 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1020 for (loop_cnt = height; loop_cnt--;) {
1021 LD_SB2(src0_ptr, 16, src0, src1);
1022 src2 = LD_SB(src0_ptr + 24);
1023 src0_ptr += src_stride;
1024 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
1025 src1_ptr += src2_stride;
1026 XORI_B3_128_SB(src0, src1, src2);
1032 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
1033 VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
1034 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
1036 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
1037 VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
1038 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
1040 VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
1041 VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
1042 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
1044 VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
1045 VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
1046 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
1049 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
1050 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
1052 PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
1053 ST_SB2(tmp0, tmp1, dst, 16);
/* Horizontal 8-tap bi-prediction luma filter for a 48-pixel-wide block (MSA).
 * Each row is processed as 32 pixels (first half of the loop body) plus a
 * trailing 16 pixels (second half, src2/src3).  Loop count is a fixed 64
 * iterations, i.e. this path assumes height == 64 for 48-wide blocks.
 * NOTE(review): continuation lines appear to be missing from this excerpt
 * (truncated DPADD_SB4_SH calls, missing parameters); verify upstream. */
1058 static void hevc_hz_bi_8t_48w_msa(const uint8_t *src0_ptr,
1060 const int16_t *src1_ptr,
1061 int32_t src2_stride,
1064 const int8_t *filter,
1068 v16i8 src0, src1, src2, src3;
1069 v16i8 tmp0, tmp1, tmp2;
1070 v8i16 filt0, filt1, filt2, filt3;
1071 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1072 v16i8 vec0, vec1, vec2, vec3;
1073 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
1074 v8i16 in0, in1, in2, in3, in4, in5;
1075 v8i16 filter_vec, const_vec;
1076 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1080 const_vec = __msa_ldi_h(128);
/* Replicate the four tap pairs of the 8-tap filter. */
1083 filter_vec = LD_SH(filter);
1084 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1094 for (loop_cnt = 64; loop_cnt--;) {
1095 LD_SB3(src0_ptr, 16, src0, src1, src2);
1096 src3 = LD_SB(src0_ptr + 40);
1097 src0_ptr += src_stride;
1098 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
/* Unsigned -> signed pixel conversion for the signed dot products. */
1099 XORI_B4_128_SB(src0, src1, src2, src3);
/* First 32 output pixels of the row. */
1106 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
1107 VSHF_B2_SB(src1, src1, src1, src2, mask0, mask4, vec2, vec3);
1108 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
1110 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
1111 VSHF_B2_SB(src1, src1, src1, src2, mask1, mask5, vec2, vec3);
1112 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
1114 VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
1115 VSHF_B2_SB(src1, src1, src1, src2, mask2, mask6, vec2, vec3);
1116 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
1118 VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
1119 VSHF_B2_SB(src1, src1, src1, src2, mask3, mask7, vec2, vec3);
1120 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
1122 HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
1123 HEVC_BI_RND_CLIP2(in2, in3, dst2, dst3, 7, dst2, dst3);
1124 PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
1126 ST_SB(tmp1, dst + 16);
/* Remaining 16 output pixels (columns 32..47) of the same row. */
1128 LD_SH2(src1_ptr + 32, 8, in4, in5);
1129 src1_ptr += src2_stride;
1133 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec0, vec1);
1134 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
1135 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst4,
1137 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec0, vec1);
1138 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
1139 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt3, filt3, dst4,
1142 HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
1144 tmp2 = __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
1145 ST_SB(tmp2, dst + 32);
/* Horizontal 8-tap bi-prediction luma filter for a 64-pixel-wide block (MSA).
 * Each row is handled as two independent 32-pixel halves: src0..src2 cover
 * bytes 0..39 (first 32 outputs), src3..src5 cover bytes 32..71 (second 32
 * outputs, stored at dst + 32).
 * NOTE(review): continuation lines appear to be missing from this excerpt
 * (truncated DPADD_SB4_SH calls, missing parameters); verify upstream. */
1150 static void hevc_hz_bi_8t_64w_msa(const uint8_t *src0_ptr,
1152 const int16_t *src1_ptr,
1153 int32_t src2_stride,
1156 const int8_t *filter,
1160 v16i8 src0, src1, src2, src3, src4, src5, tmp0, tmp1;
1161 v8i16 filt0, filt1, filt2, filt3;
1162 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1163 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1164 v16i8 vec0, vec1, vec2, vec3;
1165 v8i16 dst0, dst1, dst2, dst3;
1166 v8i16 in0, in1, in2, in3;
1167 v8i16 filter_vec, const_vec;
1171 const_vec = __msa_ldi_h(128);
/* Replicate the four tap pairs of the 8-tap filter. */
1174 filter_vec = LD_SH(filter);
1175 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* One iteration per output row; both 32-pixel halves are loaded up front. */
1185 for (loop_cnt = height; loop_cnt--;) {
1186 LD_SB2(src0_ptr, 16, src0, src1);
1187 src2 = LD_SB(src0_ptr + 24);
1188 LD_SB2(src0_ptr + 32, 16, src3, src4);
1189 src5 = LD_SB(src0_ptr + 56);
1190 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
1191 XORI_B3_128_SB(src0, src1, src2);
/* Left half: 32 output pixels. */
1198 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
1199 VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
1200 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
1202 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
1203 VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
1204 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
1206 VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
1207 VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
1208 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
1210 VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
1211 VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
1212 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
1215 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
1216 dst0, dst1, dst2, dst3, 7,
1217 dst0, dst1, dst2, dst3);
1219 PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
1220 ST_SB2(tmp0, tmp1, dst, 16);
/* Right half: 32 output pixels, first-pass data at src1_ptr + 32.
 * (The src0/src1/src2 registers are reused; the lines moving src3..src5
 * into them appear to be among those dropped from this excerpt.) */
1226 LD_SH4(src1_ptr + 32, 8, in0, in1, in2, in3);
1227 XORI_B3_128_SB(src0, src1, src2);
1233 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
1234 VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
1235 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
1237 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
1238 VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
1239 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
1241 VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
1242 VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
1243 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
1245 VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
1246 VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
1247 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
1249 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
1250 dst0, dst1, dst2, dst3, 7,
1251 dst0, dst1, dst2, dst3);
1252 PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
1253 ST_SB2(tmp0, tmp1, dst + 32, 16);
1254 src1_ptr += src2_stride;
1255 src0_ptr += src_stride;
/* Vertical 8-tap bi-prediction luma filter for a 4-pixel-wide block (MSA).
 * Two 4-wide rows are interleaved into one vector (e.g. src2110 holds the
 * right-interleave of rows 1/0 and 2/1), so each DPADD produces 8 output
 * samples = 2 rows.  Main loop emits 8 rows per iteration; the tail after
 * the loop handles height % 8 == 2, 4 or 6 residual rows.
 * NOTE(review): several lines (prologue parameters, closing braces, the
 * 'if (res == 2)' head of the tail) are missing from this excerpt; comments
 * describe only what is visible. */
1260 static void hevc_vt_bi_8t_4w_msa(const uint8_t *src0_ptr,
1262 const int16_t *src1_ptr,
1263 int32_t src2_stride,
1266 const int8_t *filter,
1270 int32_t res = height & 0x07;
1271 v16i8 src0, src1, src2, src3, src4, src5;
1272 v16i8 src6, src7, src8, src9, src10;
1273 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
1274 v16i8 src11, src12, src13, src14;
1275 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1276 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1277 v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
1278 v16i8 src2110, src4332, src6554, src8776, src10998;
1279 v16i8 src12111110, src14131312;
1280 v8i16 dst10, dst32, dst54, dst76;
1281 v8i16 filt0, filt1, filt2, filt3;
1282 v8i16 filter_vec, const_vec;
/* 8-tap vertical filter needs 3 rows of context above the block. */
1284 src0_ptr -= (3 * src_stride);
1286 const_vec = __msa_ldi_h(128);
1289 filter_vec = LD_SH(filter);
1290 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* Prologue: load 7 context rows, interleave row pairs, sign-convert. */
1292 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1293 src0_ptr += (7 * src_stride);
1294 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1295 src10_r, src32_r, src54_r, src21_r);
1296 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1297 ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
1298 src2110, src4332, src6554);
1299 XORI_B3_128_SB(src2110, src4332, src6554);
/* Main loop: 8 output rows per iteration. */
1301 for (loop_cnt = (height >> 3); loop_cnt--;) {
1302 LD_SB8(src0_ptr, src_stride,
1303 src7, src8, src9, src10, src11, src12, src13, src14);
1304 src0_ptr += (8 * src_stride);
1305 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
1306 src1_ptr += (8 * src2_stride);
/* Pack pairs of 4-wide first-pass rows into 8-element vectors. */
1308 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
1309 ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
1310 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1311 src76_r, src87_r, src98_r, src109_r);
1312 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1313 src1110_r, src1211_r, src1312_r, src1413_r);
1314 ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
1315 src1413_r, src1312_r,
1316 src8776, src10998, src12111110, src14131312);
1317 XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
/* Four 8-tap dot products -> rows 0..7 (two rows per result vector). */
1320 DPADD_SB4_SH(src2110, src4332, src6554, src8776,
1321 filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
1323 DPADD_SB4_SH(src4332, src6554, src8776, src10998,
1324 filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
1326 DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
1327 filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
1329 DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
1330 filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
1332 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
1333 dst10, dst32, dst54, dst76, 7,
1334 dst10, dst32, dst54, dst76);
1336 PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
1337 ST_W8(dst10, dst54, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1338 dst += (8 * dst_stride);
/* Carry the newest interleaved rows into the next iteration's context. */
1341 src4332 = src12111110;
1342 src6554 = src14131312;
/* Tail: compute up to 8 more rows, then store only the residual count. */
1346 LD_SB8(src0_ptr, src_stride,
1347 src7, src8, src9, src10, src11, src12, src13, src14);
1348 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
1350 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
1351 ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
1352 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1353 src76_r, src87_r, src98_r, src109_r);
1354 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1355 src1110_r, src1211_r, src1312_r, src1413_r);
1356 ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
1357 src1413_r, src1312_r,
1358 src8776, src10998, src12111110, src14131312);
1359 XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
1362 DPADD_SB4_SH(src2110, src4332, src6554, src8776,
1363 filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
1365 DPADD_SB4_SH(src4332, src6554, src8776, src10998,
1366 filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
1368 DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
1369 filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
1371 DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
1372 filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
1374 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
1375 dst10, dst32, dst54, dst76, 7,
1376 dst10, dst32, dst54, dst76);
1378 PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
/* Store only the residual rows: 2, 4 or 6. */
1380 ST_W2(dst10, 0, 1, dst, dst_stride);
1381 } else if (res == 4) {
1382 ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
1384 ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
1385 dst += 4 * dst_stride;
1386 ST_W2(dst54, 0, 1, dst, dst_stride);
/* Vertical 8-tap bi-prediction luma filter for an 8-pixel-wide block (MSA).
 * Keeps a 7-row sliding window of interleaved row pairs; each main-loop
 * iteration loads 4 new rows, computes 4 filtered rows, merges them with
 * the 16-bit first-pass values (round 7, clip 0..255) and stores 4 x 8
 * bytes.
 * NOTE(review): some signature/epilogue lines are missing from this
 * excerpt; comments describe only what is visible. */
1391 static void hevc_vt_bi_8t_8w_msa(const uint8_t *src0_ptr,
1393 const int16_t *src1_ptr,
1394 int32_t src2_stride,
1397 const int8_t *filter,
1401 v16i8 src0, src1, src2, src3, src4, src5;
1402 v16i8 src6, src7, src8, src9, src10;
1403 v8i16 in0, in1, in2, in3;
1404 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1405 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1406 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
1407 v8i16 filt0, filt1, filt2, filt3;
1408 v8i16 filter_vec, const_vec;
/* 3 rows of context above the block for the 8-tap vertical filter. */
1410 src0_ptr -= (3 * src_stride);
1411 const_vec = __msa_ldi_h(128);
1414 filter_vec = LD_SH(filter);
1415 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* Prologue: 7 context rows, sign-converted and pairwise interleaved. */
1417 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1418 src0_ptr += (7 * src_stride);
1419 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1420 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1421 src10_r, src32_r, src54_r, src21_r);
1422 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
/* 4 output rows per iteration. */
1424 for (loop_cnt = (height >> 2); loop_cnt--;) {
1425 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
1426 src0_ptr += (4 * src_stride);
1427 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
1428 src1_ptr += (4 * src2_stride);
1429 XORI_B4_128_SB(src7, src8, src9, src10);
1430 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1431 src76_r, src87_r, src98_r, src109_r);
/* One 8-tap dot-product accumulation per output row. */
1434 DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1435 filt0, filt1, filt2, filt3,
1436 dst0_r, dst0_r, dst0_r, dst0_r);
1438 DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1439 filt0, filt1, filt2, filt3,
1440 dst1_r, dst1_r, dst1_r, dst1_r);
1442 DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
1443 filt0, filt1, filt2, filt3,
1444 dst2_r, dst2_r, dst2_r, dst2_r);
1446 DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
1447 filt0, filt1, filt2, filt3,
1448 dst3_r, dst3_r, dst3_r, dst3_r);
1450 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
1451 dst0_r, dst1_r, dst2_r, dst3_r, 7,
1452 dst0_r, dst1_r, dst2_r, dst3_r);
1454 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1455 ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
1456 dst += (4 * dst_stride);
/* Vertical 8-tap bi-prediction luma filter for a 12-pixel-wide block (MSA).
 * The left 8 columns use right-interleaved row pairs (dst0_r..dst3_r, as in
 * the 8w path); the right 4 columns use left-interleaves packed two rows
 * per vector (src2110 etc. -> dst0_l/dst1_l).  4 rows per iteration:
 * 8 bytes stored at dst and 4 bytes at dst + 8.
 * NOTE(review): some prologue/epilogue lines are missing from this
 * excerpt; comments describe only what is visible. */
1469 static void hevc_vt_bi_8t_12w_msa(const uint8_t *src0_ptr,
1471 const int16_t *src1_ptr,
1472 int32_t src2_stride,
1475 const int8_t *filter,
1479 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1480 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
1481 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1482 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1483 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
1484 v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
1485 v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
1486 v16i8 src2110, src4332, src6554, src8776, src10998;
1487 v8i16 dst0_l, dst1_l;
1488 v8i16 filt0, filt1, filt2, filt3;
1489 v8i16 filter_vec, const_vec;
/* 3 rows of context above the block for the 8-tap vertical filter. */
1491 src0_ptr -= (3 * src_stride);
1492 const_vec = __msa_ldi_h(128);
1495 filter_vec = LD_SH(filter);
1496 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* Prologue: 7 context rows, interleaved both right (cols 0..7) and left
 * (cols 8..11, packed two rows per vector). */
1498 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1499 src0_ptr += (7 * src_stride);
1500 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1502 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1503 src10_r, src32_r, src54_r, src21_r);
1504 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1505 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1506 src10_l, src32_l, src54_l, src21_l);
1507 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1508 ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
1509 src2110, src4332, src6554);
/* 4 output rows per iteration. */
1511 for (loop_cnt = (height >> 2); loop_cnt--;) {
1512 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
1513 src0_ptr += (4 * src_stride);
/* First-pass values: 8 left-column samples, then the 4 right columns. */
1514 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
1515 LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
1516 src1_ptr += (4 * src2_stride);
1518 ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
1519 XORI_B4_128_SB(src7, src8, src9, src10);
1520 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1521 src76_r, src87_r, src98_r, src109_r);
1522 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1523 src76_l, src87_l, src98_l, src109_l);
1524 ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
/* Left 8 columns: one dot-product accumulation per row. */
1527 DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1528 filt0, filt1, filt2, filt3,
1529 dst0_r, dst0_r, dst0_r, dst0_r);
1531 DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1532 filt0, filt1, filt2, filt3,
1533 dst1_r, dst1_r, dst1_r, dst1_r);
1535 DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
1536 filt0, filt1, filt2, filt3,
1537 dst2_r, dst2_r, dst2_r, dst2_r);
1539 DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
1540 filt0, filt1, filt2, filt3,
1541 dst3_r, dst3_r, dst3_r, dst3_r);
/* Right 4 columns: two rows per accumulation vector. */
1543 DPADD_SB4_SH(src2110, src4332, src6554, src8776,
1544 filt0, filt1, filt2, filt3,
1545 dst0_l, dst0_l, dst0_l, dst0_l);
1547 DPADD_SB4_SH(src4332, src6554, src8776, src10998,
1548 filt0, filt1, filt2, filt3,
1549 dst1_l, dst1_l, dst1_l, dst1_l);
1551 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
1552 dst0_r, dst1_r, dst2_r, dst3_r, 7,
1553 dst0_r, dst1_r, dst2_r, dst3_r);
1554 HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l);
1557 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1558 dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
1559 ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
1560 ST_W4(dst0_l, 0, 1, 2, 3, dst + 8, dst_stride);
1561 dst += (4 * dst_stride);
/* Vertical 8-tap bi-prediction luma filter for blocks whose width is a
 * multiple of 16 (MSA).  Outer loop walks 16-column stripes; inner loop
 * produces 2 rows per iteration using right/left interleaves for the low
 * and high 8 columns of the stripe.  Used by the 16/24/32/48/64-wide
 * wrappers below.
 * NOTE(review): some prologue/epilogue lines (stripe-advance, window
 * rotation, closing braces) are missing from this excerpt. */
1576 static void hevc_vt_bi_8t_16multx2mult_msa(const uint8_t *src0_ptr,
1578 const int16_t *src1_ptr,
1579 int32_t src2_stride,
1582 const int8_t *filter,
1583 int32_t height, int32_t width)
1585 const uint8_t *src0_ptr_tmp;
1586 const int16_t *src1_ptr_tmp;
1590 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1591 v8i16 in0, in1, in2, in3;
1592 v16i8 src10_r, src32_r, src54_r, src76_r;
1593 v16i8 src21_r, src43_r, src65_r, src87_r;
1594 v8i16 dst0_r, dst1_r;
1595 v16i8 src10_l, src32_l, src54_l, src76_l;
1596 v16i8 src21_l, src43_l, src65_l, src87_l;
1597 v8i16 dst0_l, dst1_l;
1598 v8i16 filt0, filt1, filt2, filt3;
1599 v8i16 filter_vec, const_vec;
/* 3 rows of context above the block for the 8-tap vertical filter. */
1601 src0_ptr -= (3 * src_stride);
1602 const_vec = __msa_ldi_h(128);
1605 filter_vec = LD_SH(filter);
1606 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* One outer iteration per 16-column stripe. */
1608 for (cnt = (width >> 4); cnt--;) {
1609 src0_ptr_tmp = src0_ptr;
1610 src1_ptr_tmp = src1_ptr;
/* Prologue per stripe: 7 context rows, sign-converted, interleaved. */
1613 LD_SB7(src0_ptr_tmp, src_stride,
1614 src0, src1, src2, src3, src4, src5, src6);
1615 src0_ptr_tmp += (7 * src_stride);
1616 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1618 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1619 src10_r, src32_r, src54_r, src21_r);
1620 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1621 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1622 src10_l, src32_l, src54_l, src21_l);
1623 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
/* 2 output rows per inner iteration. */
1625 for (loop_cnt = (height >> 1); loop_cnt--;) {
1626 LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
1627 src0_ptr_tmp += (2 * src_stride);
1628 LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
1629 LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
1630 src1_ptr_tmp += (2 * src2_stride);
1631 XORI_B2_128_SB(src7, src8);
1633 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1634 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
/* Low 8 columns (right interleaves), rows 0 and 1. */
1637 DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1638 filt0, filt1, filt2, filt3,
1639 dst0_r, dst0_r, dst0_r, dst0_r);
1641 DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1642 filt0, filt1, filt2, filt3,
1643 dst1_r, dst1_r, dst1_r, dst1_r);
/* High 8 columns (left interleaves), rows 0 and 1. */
1645 DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
1646 filt0, filt1, filt2, filt3,
1647 dst0_l, dst0_l, dst0_l, dst0_l);
1649 DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
1650 filt0, filt1, filt2, filt3,
1651 dst1_l, dst1_l, dst1_l, dst1_l);
1653 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
1654 dst0_r, dst1_r, dst0_l, dst1_l, 7,
1655 dst0_r, dst1_r, dst0_l, dst1_l);
1657 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
1658 ST_SH2(dst0_r, dst1_r, dst_tmp, dst_stride);
1659 dst_tmp += (2 * dst_stride);
/* Vertical 8-tap bi-prediction filter, 16 columns wide: delegate to the
 * generic multiple-of-16 implementation with width = 16. */
static void hevc_vt_bi_8t_16w_msa(const uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  const int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr,
                                   src2_stride, dst, dst_stride, filter,
                                   height, 16);
}
/* Vertical 8-tap bi-prediction filter, 24 columns wide: the left 16
 * columns go through the generic multiple-of-16 helper, the remaining 8
 * columns through the dedicated 8-wide path at an offset of 16. */
static void hevc_vt_bi_8t_24w_msa(const uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  const int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr,
                                   src2_stride, dst, dst_stride, filter,
                                   height, 16);
    hevc_vt_bi_8t_8w_msa(src0_ptr + 16, src_stride, src1_ptr + 16,
                         src2_stride, dst + 16, dst_stride, filter, height);
}
/* Vertical 8-tap bi-prediction filter, 32 columns wide: delegate to the
 * generic multiple-of-16 implementation with width = 32. */
static void hevc_vt_bi_8t_32w_msa(const uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  const int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr,
                                   src2_stride, dst, dst_stride, filter,
                                   height, 32);
}
/* Vertical 8-tap bi-prediction filter, 48 columns wide: delegate to the
 * generic multiple-of-16 implementation with width = 48. */
static void hevc_vt_bi_8t_48w_msa(const uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  const int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr,
                                   src2_stride, dst, dst_stride, filter,
                                   height, 48);
}
/* Vertical 8-tap bi-prediction filter, 64 columns wide: delegate to the
 * generic multiple-of-16 implementation with width = 64. */
static void hevc_vt_bi_8t_64w_msa(const uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  const int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr,
                                   src2_stride, dst, dst_stride, filter,
                                   height, 64);
}
/* 2-D (horizontal + vertical) 8-tap bi-prediction luma filter for a
 * 4-pixel-wide block (MSA).  Horizontal pass: 8-tap filter on two rows at
 * once via VSHF_B4 using the second half of ff_hevc_mask_arr.  Vertical
 * pass: 8-tap filter across the interleaved horizontal results, shifted
 * right by 6, combined with the first-pass values and const_vec offset,
 * rounded by 7 and clipped to 0..255.  4 output rows per loop iteration.
 * NOTE(review): a few lines (some parameters, trailing filter-tap
 * arguments, closing braces) are missing from this excerpt. */
1749 static void hevc_hv_bi_8t_4w_msa(const uint8_t *src0_ptr,
1751 const int16_t *src1_ptr,
1752 int32_t src2_stride,
1755 const int8_t *filter_x,
1756 const int8_t *filter_y,
1762 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1763 v8i16 in0 = { 0 }, in1 = { 0 };
1764 v8i16 filt0, filt1, filt2, filt3;
1765 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1766 v16i8 mask1, mask2, mask3;
1767 v8i16 filter_vec, const_vec;
1768 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1769 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1771 v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
1772 v8i16 dst10, dst32, dst54, dst76, dst98, dst21, dst43, dst65, dst87, dst109;
1773 v4i32 dst0, dst1, dst2, dst3;
/* 4-wide blocks use the dual-row shuffle masks at offset 16. */
1774 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
/* Context: 3 rows above and 3 columns left of the block. */
1776 src0_ptr -= ((3 * src_stride) + 3);
1777 filter_vec = LD_SH(filter_x);
1778 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* Vertical taps: sign-extend the 8-bit coefficients, splat as words. */
1780 filter_vec = LD_SH(filter_y);
1781 UNPCK_R_SB_SH(filter_vec, filter_vec);
1783 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1789 const_vec = __msa_ldi_h(128);
/* Prologue: horizontal-filter the 7 context rows (two rows per vector). */
1792 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1793 src0_ptr += (7 * src_stride);
1794 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1796 /* row 0 row 1 row 2 row 3 */
1797 VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1798 VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1799 VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1800 vec8, vec9, vec10, vec11);
1801 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1802 vec12, vec13, vec14, vec15);
1804 dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1806 dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1808 dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1810 dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
/* Interleave horizontal results into vertical-filter operand pairs. */
1813 ILVRL_H2_SH(dst41, dst30, dst10, dst43);
1814 ILVRL_H2_SH(dst52, dst41, dst21, dst54);
1815 ILVRL_H2_SH(dst63, dst52, dst32, dst65);
1817 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
/* Main loop: 4 output rows per iteration. */
1819 for (loop_cnt = height >> 2; loop_cnt--;) {
1820 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
1821 src0_ptr += (4 * src_stride);
1822 XORI_B4_128_SB(src7, src8, src9, src10);
/* First-pass values: two 4-sample rows packed per v8i16. */
1824 LD2(src1_ptr, src2_stride, tp0, tp1);
1825 INSERT_D2_SH(tp0, tp1, in0);
1826 src1_ptr += (2 * src2_stride);
1827 LD2(src1_ptr, src2_stride, tp0, tp1);
1828 INSERT_D2_SH(tp0, tp1, in1);
1829 src1_ptr += (2 * src2_stride);
1831 VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
1832 vec0, vec1, vec2, vec3);
1833 VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
1834 vec4, vec5, vec6, vec7);
1835 dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1837 dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1840 dst76 = __msa_ilvr_h(dst97, dst66);
1841 ILVRL_H2_SH(dst108, dst97, dst87, dst109);
1842 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
1843 dst98 = __msa_ilvr_h(dst66, dst108);
/* Vertical 8-tap pass over the interleaved horizontal results. */
1845 dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
1847 dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
1849 dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
1851 dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
1854 SRA_4V(dst0, dst1, dst2, dst3, 6);
1855 PCKEV_H2_SH(dst1, dst0, dst3, dst2, out0, out1);
/* Bi-prediction merge: add first-pass + offset, round 7, clip, store. */
1856 ADDS_SH2_SH(out0, in0, out1, in1, out0, out1);
1857 ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1);
1858 SRARI_H2_SH(out0, out1, 7);
1859 CLIP_SH2_0_255(out0, out1);
1860 out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
1861 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1862 dst += (4 * dst_stride);
1870 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
/* 2-D (horizontal + vertical) 8-tap bi-prediction luma filter for blocks
 * whose width is a multiple of 8 (MSA).  Outer loop walks 8-column
 * stripes; per stripe the 7 context rows are horizontally filtered once,
 * then the inner loop produces 1 output row per iteration by filtering
 * one new row horizontally and running the vertical 8-tap across the
 * stored horizontal results.
 * NOTE(review): some lines (window rotation dst0=dst1..., closing braces,
 * trailing macro arguments) are missing from this excerpt. */
1874 static void hevc_hv_bi_8t_8multx1mult_msa(const uint8_t *src0_ptr,
1876 const int16_t *src1_ptr,
1877 int32_t src2_stride,
1880 const int8_t *filter_x,
1881 const int8_t *filter_y,
1882 int32_t height, int32_t width)
1886 const uint8_t *src0_ptr_tmp;
1887 const int16_t *src1_ptr_tmp;
1890 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
1892 v8i16 filt0, filt1, filt2, filt3;
1893 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
/* 8-wide blocks use the single-row shuffle masks at offset 0. */
1894 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
1895 v16i8 mask1, mask2, mask3;
1896 v8i16 filter_vec, const_vec;
1897 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1898 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1899 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1900 v4i32 dst0_r, dst0_l;
1901 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1902 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
/* Context: 3 rows above and 3 columns left of the block. */
1904 src0_ptr -= ((3 * src_stride) + 3);
1905 const_vec = __msa_ldi_h(128);
1908 filter_vec = LD_SH(filter_x);
1909 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* Vertical taps: sign-extend the 8-bit coefficients, splat as words. */
1911 filter_vec = LD_SH(filter_y);
1912 UNPCK_R_SB_SH(filter_vec, filter_vec);
1914 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
/* One outer iteration per 8-column stripe. */
1920 for (cnt = width >> 3; cnt--;) {
1921 src0_ptr_tmp = src0_ptr;
1923 src1_ptr_tmp = src1_ptr;
/* Prologue: horizontal-filter the 7 context rows -> dst0..dst6. */
1925 LD_SB7(src0_ptr_tmp, src_stride,
1926 src0, src1, src2, src3, src4, src5, src6);
1927 src0_ptr_tmp += (7 * src_stride);
1928 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1930 /* row 0 row 1 row 2 row 3 */
1931 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1932 vec0, vec1, vec2, vec3);
1933 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1934 vec4, vec5, vec6, vec7);
1935 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1936 vec8, vec9, vec10, vec11);
1937 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1938 vec12, vec13, vec14, vec15);
1939 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1941 dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1943 dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1945 dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1948 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1949 vec0, vec1, vec2, vec3);
1950 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1951 vec4, vec5, vec6, vec7);
1952 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1953 vec8, vec9, vec10, vec11);
1954 dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1956 dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1958 dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
/* Inner loop: 1 output row per iteration. */
1961 for (loop_cnt = height; loop_cnt--;) {
1962 src7 = LD_SB(src0_ptr_tmp);
1963 src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
1964 src0_ptr_tmp += src_stride;
1966 in0 = LD_SH(src1_ptr_tmp);
1967 src1_ptr_tmp += src2_stride;
1969 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1970 vec0, vec1, vec2, vec3);
1971 dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
/* Vertical 8-tap over the interleaved horizontal results. */
1973 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
1974 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
1975 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
1976 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1977 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1978 filt_h0, filt_h1, filt_h2, filt_h3);
1979 dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1980 filt_h0, filt_h1, filt_h2, filt_h3);
/* Bi-prediction merge: add first-pass + offset, round 7, store 8B. */
1984 tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
1985 ADDS_SH2_SH(tmp, in0, tmp, const_vec, tmp, tmp);
1986 tmp = __msa_srari_h(tmp, 7);
1988 out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
1989 ST_D1(out, 0, dst_tmp);
1990 dst_tmp += dst_stride;
/* 2-D 8-tap bi-prediction filter, 8 columns wide: delegate to the generic
 * multiple-of-8 implementation with width = 8. */
static void hevc_hv_bi_8t_8w_msa(const uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 const int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter_x,
                                 const int8_t *filter_y,
                                 int32_t height)
{
    hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr,
                                  src2_stride, dst, dst_stride,
                                  filter_x, filter_y, height, 8);
}
2022 static void hevc_hv_bi_8t_12w_msa(const uint8_t *src0_ptr,
2024 const int16_t *src1_ptr,
2025 int32_t src2_stride,
2028 const int8_t *filter_x,
2029 const int8_t *filter_y,
2033 const uint8_t *src0_ptr_tmp;
2035 const int16_t *src1_ptr_tmp;
2038 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2039 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
2040 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2041 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2042 v8i16 in0, in1 = { 0 }, out0, out1, tmp, filter_vec, const_vec;
2043 v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
2044 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2045 v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
2046 v8i16 dst10, dst32, dst54, dst76, dst98, dst21, dst43, dst65, dst87, dst109;
2047 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
2048 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
2049 v4i32 dst0_r, dst0_l, tmp0, tmp1, tmp2, tmp3;
2051 src0_ptr -= ((3 * src_stride) + 3);
2053 const_vec = __msa_ldi_h(128);
2056 filter_vec = LD_SH(filter_x);
2057 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2059 filter_vec = LD_SH(filter_y);
2060 UNPCK_R_SB_SH(filter_vec, filter_vec);
2062 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2064 mask0 = LD_SB(ff_hevc_mask_arr);
2069 src0_ptr_tmp = src0_ptr;
2071 src1_ptr_tmp = src1_ptr;
2073 LD_SB7(src0_ptr_tmp, src_stride, src0, src1, src2, src3, src4, src5,
2075 src0_ptr_tmp += (7 * src_stride);
2076 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2078 /* row 0 row 1 row 2 row 3 */
2079 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2081 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6,
2083 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2085 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
2087 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2089 dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2091 dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2093 dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
2095 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2097 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6,
2099 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2101 dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2103 dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2105 dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2108 for (loop_cnt = 16; loop_cnt--;) {
2109 src7 = LD_SB(src0_ptr_tmp);
2110 src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
2111 src0_ptr_tmp += src_stride;
2113 in0 = LD_SH(src1_ptr_tmp);
2114 src1_ptr_tmp += src2_stride;
2116 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2118 dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
2120 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
2121 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
2122 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
2123 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
2124 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
2125 filt_h1, filt_h2, filt_h3);
2126 dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
2127 filt_h1, filt_h2, filt_h3);
2131 tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
2132 ADDS_SH2_SH(tmp, in0, tmp, const_vec, tmp, tmp);
2133 tmp = __msa_srari_h(tmp, 7);
2135 out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
2136 ST_D1(out, 0, dst_tmp);
2137 dst_tmp += dst_stride;
2152 mask4 = LD_SB(ff_hevc_mask_arr + 16);
2157 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
2158 src0_ptr += (7 * src_stride);
2159 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2161 /* row 0 row 1 row 2 row 3 */
2162 VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
2163 VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
2164 VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7,
2165 vec8, vec9, vec10, vec11);
2166 VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7,
2167 vec12, vec13, vec14, vec15);
2168 dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2170 dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2172 dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2174 dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
2177 ILVRL_H2_SH(dst41, dst30, dst10, dst43);
2178 ILVRL_H2_SH(dst52, dst41, dst21, dst54);
2179 ILVRL_H2_SH(dst63, dst52, dst32, dst65);
2181 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2183 for (loop_cnt = 4; loop_cnt--;) {
2184 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
2185 src0_ptr += (4 * src_stride);
2186 XORI_B4_128_SB(src7, src8, src9, src10);
2188 LD2(src1_ptr, src2_stride, tp0, tp1);
2189 INSERT_D2_SH(tp0, tp1, in0);
2190 src1_ptr += (2 * src2_stride);
2191 LD2(src1_ptr, src2_stride, tp0, tp1);
2192 INSERT_D2_SH(tp0, tp1, in1);
2193 src1_ptr += (2 * src2_stride);
2195 VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
2197 VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
2199 dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2201 dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2204 dst76 = __msa_ilvr_h(dst97, dst66);
2205 ILVRL_H2_SH(dst108, dst97, dst87, dst109);
2206 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2207 dst98 = __msa_ilvr_h(dst66, dst108);
2209 tmp0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
2211 tmp1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
2213 tmp2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
2215 tmp3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
2217 SRA_4V(tmp0, tmp1, tmp2, tmp3, 6);
2218 PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, out0, out1);
2219 ADDS_SH2_SH(out0, in0, out1, in1, out0, out1);
2220 ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1);
2221 SRARI_H2_SH(out0, out1, 7);
2222 CLIP_SH2_0_255(out0, out1);
2223 out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
2224 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2225 dst += (4 * dst_stride);
2233 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
/* HEVC bi-pred 2-D (horizontal+vertical) 8-tap luma MC, width 16 (MIPS MSA).
 * Thin wrapper: delegates to the generic multiple-of-8-columns kernel
 * hevc_hv_bi_8t_8multx1mult_msa, presumably with width = 16.
 * NOTE(review): garbled dump — original line numbers are fused into each
 * line and the trailing argument line / closing brace are elided here. */
2237 static void hevc_hv_bi_8t_16w_msa(const uint8_t *src0_ptr,
2239                                   const int16_t *src1_ptr,
2240                                   int32_t src2_stride,
2243                                   const int8_t *filter_x,
2244                                   const int8_t *filter_y,
2247     hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2248                                   dst, dst_stride, filter_x, filter_y,
/* HEVC bi-pred 2-D 8-tap luma MC, width 24 (MIPS MSA).
 * Wrapper over the multiple-of-8-columns kernel (presumably width = 24).
 * NOTE(review): garbled dump — trailing argument line / brace elided. */
2252 static void hevc_hv_bi_8t_24w_msa(const uint8_t *src0_ptr,
2254                                   const int16_t *src1_ptr,
2255                                   int32_t src2_stride,
2258                                   const int8_t *filter_x,
2259                                   const int8_t *filter_y,
2262     hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2263                                   dst, dst_stride, filter_x, filter_y,
/* HEVC bi-pred 2-D 8-tap luma MC, width 32 (MIPS MSA).
 * Wrapper over the multiple-of-8-columns kernel (presumably width = 32).
 * NOTE(review): garbled dump — trailing argument line / brace elided. */
2267 static void hevc_hv_bi_8t_32w_msa(const uint8_t *src0_ptr,
2269                                   const int16_t *src1_ptr,
2270                                   int32_t src2_stride,
2273                                   const int8_t *filter_x,
2274                                   const int8_t *filter_y,
2277     hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2278                                   dst, dst_stride, filter_x, filter_y,
/* HEVC bi-pred 2-D 8-tap luma MC, width 48 (MIPS MSA).
 * Wrapper over the multiple-of-8-columns kernel (presumably width = 48).
 * NOTE(review): garbled dump — trailing argument line / brace elided. */
2282 static void hevc_hv_bi_8t_48w_msa(const uint8_t *src0_ptr,
2284                                   const int16_t *src1_ptr,
2285                                   int32_t src2_stride,
2288                                   const int8_t *filter_x,
2289                                   const int8_t *filter_y,
2292     hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2293                                   dst, dst_stride, filter_x, filter_y,
/* HEVC bi-pred 2-D 8-tap luma MC, width 64 (MIPS MSA).
 * Wrapper over the multiple-of-8-columns kernel (presumably width = 64).
 * NOTE(review): garbled dump — trailing argument line / brace elided. */
2297 static void hevc_hv_bi_8t_64w_msa(const uint8_t *src0_ptr,
2299                                   const int16_t *src1_ptr,
2300                                   int32_t src2_stride,
2303                                   const int8_t *filter_x,
2304                                   const int8_t *filter_y,
2307     hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2308                                   dst, dst_stride, filter_x, filter_y,
/* HEVC bi-pred horizontal 4-tap (chroma) MC, 4x2 block (MIPS MSA).
 * src0_ptr/src_stride: 8-bit reference pixels; src1_ptr/src2_stride:
 * 16-bit intermediate samples of the other prediction; dst/dst_stride:
 * 8-bit averaged output.  Filters two 4-pel rows horizontally, adds the
 * co-located 16-bit samples, rounds (>>7 with rounding) and clips to 8 bit.
 * NOTE(review): garbled dump — line numbers fused, several declaration/
 * continuation lines (in0/in1, tmp0, mask1, src0_ptr adjust) elided. */
2312 static void hevc_hz_bi_4t_4x2_msa(const uint8_t *src0_ptr,
2314                                   const int16_t *src1_ptr,
2315                                   int32_t src2_stride,
2318                                   const int8_t *filter,
2322     v16i8 src0, src1, dst0, vec0, vec1;
2324     v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2327     v8i16 filter_vec, const_vec;
/* const_vec: presumably the DC bias compensating the signed-range shift
 * below — the lines that consume it are elided in this dump; confirm. */
2331     const_vec = __msa_ldi_h(128);
/* Broadcast the two 4-tap filter coefficient pairs into filt0/filt1. */
2334     filter_vec = LD_SH(filter);
2335     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2339     LD_SB2(src0_ptr, src_stride, src0, src1);
2340     LD_SH2(src1_ptr, src2_stride, in0, in1);
/* Pack both 4-sample intermediate rows into one vector. */
2341     in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
/* xor 128: map unsigned pixels into signed range for signed dot products. */
2342     XORI_B2_128_SB(src0, src1);
2343     VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2345     DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0);
2347     tmp0 = __msa_adds_s_h(tmp0, in0);
/* Bi-pred rounding: arithmetic shift right by 7 with round-to-nearest. */
2348     tmp0 = __msa_srari_h(tmp0, 7);
2349     CLIP_SH_0_255(tmp0);
2350     dst0 = __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
2352     ST_W2(dst0, 0, 1, dst, dst_stride);
/* HEVC bi-pred horizontal 4-tap MC, 4x4 block (MIPS MSA).
 * Same scheme as the 4x2 variant but processes four rows: filter, add the
 * 16-bit intermediate prediction, round (>>7) and clip to [0,255].
 * NOTE(review): garbled dump — line numbers fused, some declarations
 * (tmp0/tmp1, vec2/vec3, mask1) and continuation lines elided. */
2355 static void hevc_hz_bi_4t_4x4_msa(const uint8_t *src0_ptr,
2357                                   const int16_t *src1_ptr,
2358                                   int32_t src2_stride,
2361                                   const int8_t *filter,
2365     v16i8 src0, src1, src2, src3, dst0, vec0, vec1;
2366     v8i16 in0, in1, in2, in3;
2368     v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2371     v8i16 filter_vec, const_vec;
2375     const_vec = __msa_ldi_h(128);
2378     filter_vec = LD_SH(filter);
2379     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2383     LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2384     LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
/* Pair up the four 4-sample intermediate rows into two vectors. */
2386     ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2387     XORI_B4_128_SB(src0, src1, src2, src3);
2391     VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
2392     VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
2393     DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, tmp0, tmp1,
/* Add intermediate samples, round by 7 and clip to 8-bit range. */
2395     HEVC_BI_RND_CLIP2(in0, in1, tmp0, tmp1, 7, tmp0, tmp1);
2396     dst0 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2398     ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
/* HEVC bi-pred horizontal 4-tap MC, width 4, height a multiple of 8
 * (MIPS MSA).  Per iteration: 8 source rows are filtered, the eight
 * 16-bit intermediate rows are added, then round/clip/pack to 8 bit.
 * NOTE(review): garbled dump — line numbers fused, some declarations
 * (loop_cnt, dst0/dst1) and continuation lines elided. */
2401 static void hevc_hz_bi_4t_4x8multiple_msa(const uint8_t *src0_ptr,
2403                                           const int16_t *src1_ptr,
2404                                           int32_t src2_stride,
2407                                           const int8_t *filter,
2412     v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2414     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2415     v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2416     v16i8 mask1, vec0, vec1, vec2, vec3;
2417     v8i16 tmp0, tmp1, tmp2, tmp3;
2418     v8i16 filter_vec, const_vec;
2422     const_vec = __msa_ldi_h(128);
2425     filter_vec = LD_SH(filter);
2426     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Main loop: 8 output rows per iteration. */
2430     for (loop_cnt = (height >> 3); loop_cnt--;) {
2431         LD_SB8(src0_ptr, src_stride,
2432                src0, src1, src2, src3, src4, src5, src6, src7);
2433         src0_ptr += (8 * src_stride);
2434         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2435         src1_ptr += (4 * src2_stride);
2436         LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
2437         src1_ptr += (4 * src2_stride);
/* Pack the eight 4-sample intermediate rows pairwise into four vectors. */
2438         ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2439         ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
2440         XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
/* First filter tap pair (filt0), then second pair (filt1) accumulated. */
2446         VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
2447         VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec2, vec3);
2448         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0,
2450         VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec0, vec1);
2451         VSHF_B2_SB(src4, src5, src6, src7, mask1, mask1, vec2, vec3);
2452         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, tmp0,
2455         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2456                           tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
2458         PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
2459         ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
2460         dst += (8 * dst_stride);
/* Dispatcher for width-4 horizontal 4-tap bi-pred MC: selects the
 * specialized 4x2 / 4x4 / 4x8-multiple kernel by block height.
 * NOTE(review): garbled dump — the leading "if (2 == height)" line and
 * the closing braces are elided. */
2464 static void hevc_hz_bi_4t_4w_msa(const uint8_t *src0_ptr,
2466                                  const int16_t *src1_ptr,
2467                                  int32_t src2_stride,
2470                                  const int8_t *filter,
2474     hevc_hz_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2475                           dst, dst_stride, filter, height);
2476     } else if (4 == height) {
2477     hevc_hz_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2478                           dst, dst_stride, filter, height);
2479     } else if (8 == height || 16 == height) {
2480     hevc_hz_bi_4t_4x8multiple_msa(src0_ptr, src_stride,
2481                                   src1_ptr, src2_stride,
2482                                   dst, dst_stride, filter, height);
/* HEVC bi-pred horizontal 4-tap MC, width 6 (MIPS MSA).
 * Main loop handles 4 rows at a time; `res = height & 3` presumably
 * drives a 2-row tail (the guarding `if` lines are elided in this dump).
 * Each 6-pel row is stored as one 4-byte word plus one 2-byte halfword.
 * NOTE(review): garbled dump — line numbers fused, several declarations
 * (loop_cnt, mask1, filt0/filt1) and continuation lines elided. */
2486 static void hevc_hz_bi_4t_6w_msa(const uint8_t *src0_ptr,
2488                                  const int16_t *src1_ptr,
2489                                  int32_t src2_stride,
2492                                  const int8_t *filter,
/* Leftover rows after the 4-row main loop (tail handling below). */
2496     int32_t res = height & 0x03;
2498     v16i8 src0, src1, src2, src3;
2499     v8i16 in0, in1, in2, in3;
2500     v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2502     v16i8 vec0, vec1, vec2, vec3;
2503     v8i16 dst0, dst1, dst2, dst3;
2504     v8i16 filter_vec, const_vec;
2508     const_vec = __msa_ldi_h(128);
2511     filter_vec = LD_SH(filter);
2512     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Main loop: 4 rows per iteration. */
2516     for (loop_cnt = (height >> 2); loop_cnt--;) {
2517         LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2518         src0_ptr += (4 * src_stride);
2519         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2520         src1_ptr += (4 * src2_stride);
2521         XORI_B4_128_SB(src0, src1, src2, src3);
2527         VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2528         VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
2529         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2531         VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
2532         VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
2533         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2536         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2537                           dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2539         PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
/* 6-wide store: word (bytes 0-3) then halfword (bytes 4-5) per row. */
2540         ST_W2(dst0, 0, 2, dst, dst_stride);
2541         ST_H2(dst0, 2, 6, dst + 4, dst_stride);
2542         ST_W2(dst1, 0, 2, dst + 2 * dst_stride, dst_stride);
2543         ST_H2(dst1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2544         dst += (4 * dst_stride);
/* Tail: presumably guarded by `if (res)` (guard line elided) — 2 rows. */
2547     LD_SB2(src0_ptr, src_stride, src0, src1);
2548     LD_SH2(src1_ptr, src2_stride, in0, in1);
2549     XORI_B2_128_SB(src0, src1);
2553     VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2554     DPADD_SB2_SH(vec0, vec1, filt0, filt0, dst0, dst1);
2555     VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
2556     DPADD_SB2_SH(vec0, vec1, filt1, filt1, dst0, dst1);
2558     HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
2560     dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2561     ST_W2(dst0, 0, 2, dst, dst_stride);
2562     ST_H2(dst0, 2, 6, dst + 4, dst_stride);
/* HEVC bi-pred horizontal 4-tap MC, 8x2 block (MIPS MSA).
 * Filters two 8-pel rows, adds the 16-bit intermediate prediction rows,
 * rounds (>>7), clips to [0,255] and stores two 8-byte rows.
 * NOTE(review): garbled dump — line numbers fused; src0/src1, in0/in1,
 * dst0/dst1 declarations and some continuation lines elided. */
2566 static void hevc_hz_bi_4t_8x2_msa(const uint8_t *src0_ptr,
2568                                   const int16_t *src1_ptr,
2569                                   int32_t src2_stride,
2572                                   const int8_t *filter,
2578     v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2579     v16i8 mask1, vec0, vec1, vec2, vec3;
2581     v8i16 filter_vec, const_vec;
2585     const_vec = __msa_ldi_h(128);
2588     filter_vec = LD_SH(filter);
2589     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2593     LD_SB2(src0_ptr, src_stride, src0, src1);
2594     LD_SH2(src1_ptr, src2_stride, in0, in1);
2595     XORI_B2_128_SB(src0, src1);
2599     VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2600     VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
2601     DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst0, dst1,
2603     HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
2605     dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2606     ST_D2(dst0, 0, 1, dst, dst_stride);
/* HEVC bi-pred horizontal 4-tap MC, 8x6 block (MIPS MSA).
 * Processes rows 0-3 with the quad macros, rows 4-5 separately, then
 * rounds/clips/packs and stores six 8-byte rows.
 * NOTE(review): garbled dump — line numbers fused, a few declaration/
 * continuation lines (filt0/filt1, mask1, trailing args) elided. */
2609 static void hevc_hz_bi_4t_8x6_msa(const uint8_t *src0_ptr,
2611                                   const int16_t *src1_ptr,
2612                                   int32_t src2_stride,
2615                                   const int8_t *filter,
2619     v16i8 src0, src1, src2, src3, src4, src5;
2620     v8i16 in0, in1, in2, in3, in4, in5;
2621     v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2623     v16i8 vec0, vec1, vec2, vec3;
2624     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2625     v8i16 filter_vec, const_vec;
2629     const_vec = __msa_ldi_h(128);
2632     filter_vec = LD_SH(filter);
2633     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2637     LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
2638     LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2639     src1_ptr += (4 * src2_stride);
2640     LD_SH2(src1_ptr, src2_stride, in4, in5);
2641     XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
/* Rows 0-3: tap pair 0 then tap pair 1 accumulated into dst0..dst3. */
2647     VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2648     VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
2649     DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, dst1,
2651     VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
2652     VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
2653     DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, dst1,
/* Rows 4-5 into dst4/dst5. */
2658     VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
2659     VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec2, vec3);
2660     DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst4, dst5,
2663     HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2664                       dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2665     HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
2667     PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2668     dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
2669     ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
2670     ST_D2(dst2, 0, 1, dst + 4 * dst_stride, dst_stride);
/* HEVC bi-pred horizontal 4-tap MC, width 8, height a multiple of 4
 * (MIPS MSA).  4 rows per loop iteration: filter, add intermediate
 * prediction, round (>>7), clip, pack and store 8-byte rows.
 * NOTE(review): garbled dump — line numbers fused, loop_cnt/filt/mask1
 * declarations and continuation lines elided. */
2673 static void hevc_hz_bi_4t_8x4multiple_msa(const uint8_t *src0_ptr,
2675                                           const int16_t *src1_ptr,
2676                                           int32_t src2_stride,
2679                                           const int8_t *filter,
2684     v16i8 src0, src1, src2, src3;
2685     v8i16 in0, in1, in2, in3;
2686     v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2688     v16i8 vec0, vec1, vec2, vec3;
2689     v8i16 dst0, dst1, dst2, dst3;
2690     v8i16 filter_vec, const_vec;
2694     const_vec = __msa_ldi_h(128);
2697     filter_vec = LD_SH(filter);
2698     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2702     for (loop_cnt = (height >> 2); loop_cnt--;) {
2703         LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2704         src0_ptr += (4 * src_stride);
2705         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2706         src1_ptr += (4 * src2_stride);
2707         XORI_B4_128_SB(src0, src1, src2, src3);
2713         VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2714         VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
2715         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2717         VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
2718         VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
2719         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2722         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2723                           dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2725         PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2726         ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
2727         dst += (4 * dst_stride);
/* Dispatcher for width-8 horizontal 4-tap bi-pred MC: picks the 8x2,
 * 8x6 or generic multiple-of-4-rows kernel by height.
 * NOTE(review): garbled dump — leading "if (2 == height)" and closing
 * braces elided. */
2731 static void hevc_hz_bi_4t_8w_msa(const uint8_t *src0_ptr,
2733                                  const int16_t *src1_ptr,
2734                                  int32_t src2_stride,
2737                                  const int8_t *filter,
2741     hevc_hz_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2742                           dst, dst_stride, filter, height);
2743     } else if (6 == height) {
2744     hevc_hz_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2745                           dst, dst_stride, filter, height);
2746     } else if (0 == (height % 4)) {
2747     hevc_hz_bi_4t_8x4multiple_msa(src0_ptr, src_stride,
2748                                   src1_ptr, src2_stride,
2749                                   dst, dst_stride, filter, height);
/* HEVC bi-pred horizontal 4-tap MC, width 12 (MIPS MSA).
 * Splits each row into an 8-pel left part (mask0/mask1) and a 4-pel
 * right part shuffled across two source rows (mask2/mask3), processing
 * 4 rows per iteration.
 * NOTE(review): garbled dump — line numbers fused; the mask2 declaration
 * head is elided, leaving only its initializer list below; other
 * declarations/continuations also elided. */
2753 static void hevc_hz_bi_4t_12w_msa(const uint8_t *src0_ptr,
2755                                   const int16_t *src1_ptr,
2756                                   int32_t src2_stride,
2759                                   const int8_t *filter,
2764     v16i8 src0, src1, src2, src3;
2765     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2766     v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
/* Shuffle indices for the right 4 columns across two input vectors
 * (initializer of mask2; the declaration line is elided in this dump). */
2768         8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
2771     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
2772     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2773     v8i16 filter_vec, const_vec;
2777     const_vec = __msa_ldi_h(128);
2780     filter_vec = LD_SH(filter);
2781     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2786     for (loop_cnt = (height >> 2); loop_cnt--;) {
2787         LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2788         src0_ptr += (4 * src_stride);
/* Intermediate prediction: left 8 samples and right 4 samples per row. */
2789         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2790         LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
2791         src1_ptr += (4 * src2_stride);
2793         ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
2794         XORI_B4_128_SB(src0, src1, src2, src3);
/* Left 8 columns (dst0..dst3) and right 4 columns (dst4/dst5). */
2802         VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2803         VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
2804         VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
2805         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2807         DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5);
2808         VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
2809         VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
2810         VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec4, vec5);
2811         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2813         DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5);
2815         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2816                           dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2817         HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
2819         PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2820         dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
/* Store: 8 bytes at dst, 4 bytes at dst+8, for each of the 4 rows. */
2821         ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
2822         ST_W4(dst2, 0, 1, 2, 3, dst + 8, dst_stride);
2823         dst += (4 * dst_stride);
/* HEVC bi-pred horizontal 4-tap MC, width 16 (MIPS MSA).
 * Two rows per iteration; each 16-pel row is loaded as two overlapping
 * halves (offset 0 and +8) so the 4-tap window never crosses a vector.
 * NOTE(review): garbled dump — line numbers fused, loop_cnt/filt/mask1
 * declarations and continuation lines elided. */
2827 static void hevc_hz_bi_4t_16w_msa(const uint8_t *src0_ptr,
2829                                   const int16_t *src1_ptr,
2830                                   int32_t src2_stride,
2833                                   const int8_t *filter,
2837     v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3;
2838     v8i16 in0, in1, in2, in3, dst0, dst1, dst2, dst3;
2840     v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2842     v8i16 filter_vec, const_vec;
2846     const_vec = __msa_ldi_h(128);
2849     filter_vec = LD_SH(filter);
2850     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2854     for (loop_cnt = (height >> 1); loop_cnt--;) {
/* Left and right halves of two consecutive rows. */
2855         LD_SB2(src0_ptr, src_stride, src0, src2);
2856         LD_SB2(src0_ptr + 8, src_stride, src1, src3);
2857         src0_ptr += (2 * src_stride);
2858         LD_SH2(src1_ptr, src2_stride, in0, in2);
2859         LD_SH2(src1_ptr + 8, src2_stride, in1, in3);
2860         src1_ptr += (2 * src2_stride);
2862         XORI_B4_128_SB(src0, src1, src2, src3);
2869         VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2870         VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
2871         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2873         VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
2874         VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
2875         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2878         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2879                           dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2881         PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2882         ST_SH2(dst0, dst1, dst, dst_stride);
2883         dst += (2 * dst_stride);
/* HEVC bi-pred horizontal 4-tap MC, width 24 (MIPS MSA).
 * Per 4-row iteration: first the left 16 columns (even/odd source vector
 * pairs, mask0..mask3 so the filter window crosses the 16-byte boundary
 * via mask2/mask3), then the remaining right 8 columns through
 * src1_ptr_tmp/dst_tmp.
 * NOTE(review): garbled dump — line numbers fused; dst_tmp declaration,
 * mask1..mask3 setup and various continuation lines elided. */
2887 static void hevc_hz_bi_4t_24w_msa(const uint8_t *src0_ptr,
2889                                   const int16_t *src1_ptr,
2890                                   int32_t src2_stride,
2893                                   const int8_t *filter,
2896     const int16_t *src1_ptr_tmp;
2899     v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2900     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2902     v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2903     v16i8 mask1, mask2, mask3;
2904     v16i8 vec0, vec1, vec2, vec3;
2905     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2906     v8i16 filter_vec, const_vec;
2910     const_vec = __msa_ldi_h(128);
2913     filter_vec = LD_SH(filter);
2914     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Secondary pointer for the right-hand 8 columns (columns 16..23). */
2921     src1_ptr_tmp = src1_ptr + 16;
2923     for (loop_cnt = (height >> 2); loop_cnt--;) {
/* Even vectors: columns 0-15; odd vectors: columns 16-31 of each row. */
2924         LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
2925         LD_SB4(src0_ptr + 16, src_stride, src1, src3, src5, src7);
2926         src0_ptr += (4 * src_stride);
2927         LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
2928         LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
2929         src1_ptr += (4 * src2_stride);
2930         XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
/* Rows 0-1, left 16 columns. */
2936         VSHF_B2_SB(src0, src0, src0, src1, mask0, mask2, vec0, vec1);
2937         VSHF_B2_SB(src2, src2, src2, src3, mask0, mask2, vec2, vec3);
2938         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2940         VSHF_B2_SB(src0, src0, src0, src1, mask1, mask3, vec0, vec1);
2941         VSHF_B2_SB(src2, src2, src2, src3, mask1, mask3, vec2, vec3);
2942         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
/* Rows 2-3, left 16 columns. */
2949         VSHF_B2_SB(src4, src4, src4, src5, mask0, mask2, vec0, vec1);
2950         VSHF_B2_SB(src6, src6, src6, src7, mask0, mask2, vec2, vec3);
2951         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst4,
2953         VSHF_B2_SB(src4, src4, src4, src5, mask1, mask3, vec0, vec1);
2954         VSHF_B2_SB(src6, src6, src6, src7, mask1, mask3, vec2, vec3);
2955         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst4,
2958         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2959                           dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2960         HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
2961                           dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);
2963         PCKEV_B4_SH(dst1, dst0, dst3, dst2,
2964                     dst5, dst4, dst7, dst6, dst0, dst1, dst2, dst3);
2965         ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
2966         dst += (4 * dst_stride);
/* Right 8 columns (16..23) of the same 4 rows. */
2968         LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
2969         src1_ptr_tmp += (4 * src2_stride);
2975         VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
2976         VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
2977         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2979         VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec0, vec1);
2980         VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec2, vec3);
2981         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2984         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2985                           dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2987         PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2988         ST_D4(dst0, dst1, 0, 1, 0, 1, dst_tmp, dst_stride);
2989         dst_tmp += (4 * dst_stride);
/* HEVC bi-pred horizontal 4-tap MC, width 32 (MIPS MSA).
 * One full 32-pel row per iteration: loaded as two 16-byte vectors plus
 * an unaligned third load at +24 so all four 8-pel filter groups have
 * their taps available (mask2/mask3 cross the src0/src1 boundary).
 * NOTE(review): garbled dump — line numbers fused, loop_cnt/filt and
 * mask1..mask3 setup lines elided. */
2993 static void hevc_hz_bi_4t_32w_msa(const uint8_t *src0_ptr,
2995                                   const int16_t *src1_ptr,
2996                                   int32_t src2_stride,
2999                                   const int8_t *filter,
3003     v16i8 src0, src1, src2;
3004     v8i16 in0, in1, in2, in3;
3006     v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3007     v16i8 mask1, mask2, mask3;
3008     v8i16 dst0, dst1, dst2, dst3;
3009     v16i8 vec0, vec1, vec2, vec3;
3010     v8i16 filter_vec, const_vec;
3014     const_vec = __msa_ldi_h(128);
3017     filter_vec = LD_SH(filter);
3018     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3024     for (loop_cnt = height; loop_cnt--;) {
3025         LD_SB2(src0_ptr, 16, src0, src1);
/* Extra load at +24 covers the filter tail of the last 8 columns. */
3026         src2 = LD_SB(src0_ptr + 24);
3027         src0_ptr += src_stride;
3028         LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
3029         src1_ptr += src2_stride;
3030         XORI_B3_128_SB(src0, src1, src2);
3036         VSHF_B2_SB(src0, src0, src0, src1, mask0, mask2, vec0, vec1);
3037         VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
3038         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
3040         VSHF_B2_SB(src0, src0, src0, src1, mask1, mask3, vec0, vec1);
3041         VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
3042         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
3045         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3046                           dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
3048         PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3049         ST_SH2(dst0, dst1, dst, 16);
/* HEVC bi-pred vertical 4-tap MC, 4x2 block (MIPS MSA).
 * Reads 3 rows above the block (src0_ptr -= src_stride, 4-tap needs one
 * row of context), interleaves consecutive rows so the vertical filter
 * becomes a byte dot product, then adds the 16-bit intermediate rows,
 * rounds (>>7) and clips to [0,255].
 * NOTE(review): garbled dump — line numbers fused; in0/in1, dst10,
 * filt0/filt1 declarations and some continuation lines elided. */
3054 static void hevc_vt_bi_4t_4x2_msa(const uint8_t *src0_ptr,
3056                                   const int16_t *src1_ptr,
3057                                   int32_t src2_stride,
3060                                   const int8_t *filter,
3063     v16i8 src0, src1, src2, src3, src4;
3065     v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
3068     v8i16 filter_vec, const_vec;
/* Back up one row: the 4-tap vertical window starts one row above. */
3070     src0_ptr -= src_stride;
3072     const_vec = __msa_ldi_h(128);
3075     filter_vec = LD_SH(filter);
3076     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3078     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3079     src0_ptr += (3 * src_stride);
/* Interleave row pairs, then fold both pairs into one 16-byte vector. */
3081     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3082     src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
/* xor 128: unsigned pixels -> signed range for the signed dot product. */
3083     src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3085     LD_SB2(src0_ptr, src_stride, src3, src4);
3086     LD_SH2(src1_ptr, src2_stride, in0, in1);
3087     in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
3088     ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3089     src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
3090     src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
3093     DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
3094     dst10 = __msa_adds_s_h(dst10, in0);
/* Bi-pred rounding shift (>>7, round to nearest) and 8-bit clamp. */
3095     dst10 = __msa_srari_h(dst10, 7);
3096     CLIP_SH_0_255(dst10);
3098     dst10 = (v8i16) __msa_pckev_b((v16i8) dst10, (v16i8) dst10);
3099     ST_W2(dst10, 0, 1, dst, dst_stride);
/* HEVC bi-pred vertical 4-tap MC, 4x4 block (MIPS MSA).
 * Same row-interleave scheme as the 4x2 variant, extended to four output
 * rows (two dot-product accumulations dst10/dst32).
 * NOTE(review): garbled dump — line numbers fused; dst10/dst32 and
 * filt0/filt1 declarations elided. */
3102 static void hevc_vt_bi_4t_4x4_msa(const uint8_t *src0_ptr,
3104                                   const int16_t *src1_ptr,
3105                                   int32_t src2_stride,
3108                                   const int8_t *filter,
3111     v16i8 src0, src1, src2, src3, src4, src5, src6;
3112     v8i16 in0, in1, in2, in3;
3113     v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
3114     v16i8 src2110, src4332, src6554;
3117     v8i16 filter_vec, const_vec;
/* One row of context above the block for the 4-tap window. */
3119     src0_ptr -= src_stride;
3121     const_vec = __msa_ldi_h(128);
3124     filter_vec = LD_SH(filter);
3125     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3127     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3128     src0_ptr += (3 * src_stride);
3129     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3130     src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3131     src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3133     LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
3134     LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3135     ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
3136     ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3137                src32_r, src43_r, src54_r, src65_r);
3138     ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
3139     XORI_B2_128_SB(src4332, src6554);
/* Rows 0-1 into dst10, rows 2-3 into dst32 (sliding vertical window). */
3142     DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
3144     DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
3145     HEVC_BI_RND_CLIP2(in0, in1, dst10, dst32, 7, dst10, dst32);
3147     dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
3148     ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
/* HEVC bi-pred vertical 4-tap MC, width 4, height a multiple of 8
 * (MIPS MSA).  Maintains a sliding window of interleaved row pairs
 * (src2110/src4332/src6554/src8776) across iterations; 8 output rows per
 * loop pass.
 * NOTE(review): garbled dump — line numbers fused; loop_cnt/filt
 * declarations and some continuation lines elided.  Note src2/src2110
 * are deliberately recycled at the loop tail to carry context into the
 * next iteration. */
3151 static void hevc_vt_bi_4t_4x8multiple_msa(const uint8_t *src0_ptr,
3153                                           const int16_t *src1_ptr,
3154                                           int32_t src2_stride,
3157                                           const int8_t *filter,
3161     v16i8 src0, src1, src2, src3, src4, src5;
3162     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3163     v16i8 src6, src7, src8, src9;
3164     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
3165     v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
3166     v16i8 src2110, src4332, src6554, src8776;
3167     v8i16 dst10, dst32, dst54, dst76;
3169     v8i16 filter_vec, const_vec;
3171     src0_ptr -= src_stride;
3173     const_vec = __msa_ldi_h(128);
3176     filter_vec = LD_SH(filter);
3177     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Prologue: 3 context rows establish the initial interleaved window. */
3179     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3180     src0_ptr += (3 * src_stride);
3181     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3182     src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3183     src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3185     for (loop_cnt = (height >> 3); loop_cnt--;) {
3186         LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3187         src0_ptr += (6 * src_stride);
3188         LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
3189         src1_ptr += (8 * src2_stride);
3190         ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
3191         ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
3192         ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3193                    src32_r, src43_r, src54_r, src65_r);
3194         ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3195         ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
3196                    src4332, src6554, src8776);
3197         XORI_B3_128_SB(src4332, src6554, src8776);
/* Rows 0-5 of this pass: three sliding dot products. */
3200         DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
3202         DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
3204         DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
/* Last two rows; src2/src2110 reloaded here become next iteration's
 * context (src2110 reused as the newest interleaved pair). */
3206         LD_SB2(src0_ptr, src_stride, src9, src2);
3207         src0_ptr += (2 * src_stride);
3208         ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
3209         src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
3210         src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3212         DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);
3214         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3215                           dst10, dst32, dst54, dst76, 7,
3216                           dst10, dst32, dst54, dst76);
3218         PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
3219         ST_W8(dst10, dst54, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3220         dst += (8 * dst_stride);
/* Dispatcher for width-4 vertical 4-tap bi-pred MC: 4x2, 4x4 or the
 * multiple-of-8-rows kernel by height.
 * NOTE(review): garbled dump — leading "if (2 == height)", the final
 * "else" line and closing braces elided. */
3224 static void hevc_vt_bi_4t_4w_msa(const uint8_t *src0_ptr,
3226                                  const int16_t *src1_ptr,
3227                                  int32_t src2_stride,
3230                                  const int8_t *filter,
3234     hevc_vt_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3235                           dst, dst_stride, filter, height);
3236     } else if (4 == height) {
3237     hevc_vt_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3238                           dst, dst_stride, filter, height);
3240     hevc_vt_bi_4t_4x8multiple_msa(src0_ptr, src_stride,
3241                                   src1_ptr, src2_stride,
3242                                   dst, dst_stride, filter, height);
/* HEVC bi-pred vertical 4-tap MC, width 6 (MIPS MSA).
 * Fully unrolled for 8 output rows (eleven source rows src0..src10):
 * two 4-row passes of interleave + dot product + round/clip, each row
 * stored as one word (4 bytes) plus one halfword (2 bytes).
 * NOTE(review): garbled dump — line numbers fused; filt declarations
 * and some continuation lines elided.  src32_r/src43_r/src54_r/src65_r
 * are reused as scratch in the second pass. */
3246 static void hevc_vt_bi_4t_6w_msa(const uint8_t *src0_ptr,
3248                                  const int16_t *src1_ptr,
3249                                  int32_t src2_stride,
3252                                  const int8_t *filter,
3255     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3256     v8i16 in0, in1, in2, in3;
3257     v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
3258     v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3260     v8i16 filter_vec, const_vec;
3262     src0_ptr -= src_stride;
3264     const_vec = __msa_ldi_h(128);
3267     filter_vec = LD_SH(filter);
3268     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Load all 11 source rows up front (3 context + 8 output rows). */
3270     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3271     src0_ptr += (3 * src_stride);
3272     LD_SB2(src0_ptr, src_stride, src3, src4);
3273     src0_ptr += (2 * src_stride);
3274     LD_SB2(src0_ptr, src_stride, src5, src6);
3275     src0_ptr += (2 * src_stride);
3276     LD_SB2(src0_ptr, src_stride, src7, src8);
3277     src0_ptr += (2 * src_stride);
3278     LD_SB2(src0_ptr, src_stride, src9, src10);
3279     src0_ptr += (2 * src_stride);
3281     LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3282     src1_ptr += (4 * src2_stride);
3284     XORI_B3_128_SB(src0, src1, src2);
3285     XORI_B2_128_SB(src3, src4);
3286     XORI_B2_128_SB(src5, src6);
3287     XORI_B2_128_SB(src7, src8);
3288     XORI_B2_128_SB(src9, src10);
/* First 4 output rows. */
3290     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3291     ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3294     DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3296     DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3298     ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
3301     DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
3303     DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
3305     HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3306                       dst0_r, dst1_r, dst2_r, dst3_r, 7,
3307                       dst0_r, dst1_r, dst2_r, dst3_r);
3309     PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
/* 6-wide store: word + halfword per row. */
3310     ST_W2(dst0_r, 0, 2, dst, dst_stride);
3311     ST_H2(dst0_r, 2, 6, dst + 4, dst_stride);
3312     ST_W2(dst1_r, 0, 2, dst + 2 * dst_stride, dst_stride);
3313     ST_H2(dst1_r, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
3314     dst += (4 * dst_stride);
/* Second 4 output rows; interleave registers recycled as scratch. */
3316     LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3317     src1_ptr += (4 * src2_stride);
3318     ILVR_B2_SB(src7, src6, src8, src7, src32_r, src43_r);
3321     DPADD_SB2_SH(src54_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3323     DPADD_SB2_SH(src65_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3325     ILVR_B2_SB(src9, src8, src10, src9, src54_r, src65_r);
3328     DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
3330     DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
3332     HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3333                       dst0_r, dst1_r, dst2_r, dst3_r, 7,
3334                       dst0_r, dst1_r, dst2_r, dst3_r);
3336     PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3337     ST_W2(dst0_r, 0, 2, dst, dst_stride);
3338     ST_H2(dst0_r, 2, 6, dst + 4, dst_stride);
3339     ST_W2(dst1_r, 0, 2, dst + 2 * dst_stride, dst_stride);
3340     ST_H2(dst1_r, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
3341     dst += (4 * dst_stride);
/* HEVC bi-pred vertical 4-tap MC, 8x2 block (MIPS MSA).
 * Three context rows + two new rows; per output row one interleaved-pair
 * dot product, then add intermediate prediction, round (>>7), clip and
 * store two 8-byte rows.
 * NOTE(review): garbled dump — line numbers fused; filt0/filt1
 * declaration and some continuation lines elided. */
3344 static void hevc_vt_bi_4t_8x2_msa(const uint8_t *src0_ptr,
3346                                   const int16_t *src1_ptr,
3347                                   int32_t src2_stride,
3350                                   const int8_t *filter,
3353     v16i8 src0, src1, src2, src3, src4;
3354     v8i16 in0, in1, dst0_r, dst1_r;
3355     v16i8 src10_r, src32_r, src21_r, src43_r;
3357     v8i16 filter_vec, const_vec;
3359     src0_ptr -= src_stride;
3361     const_vec = __msa_ldi_h(128);
3364     filter_vec = LD_SH(filter);
3365     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3367     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3368     src0_ptr += (3 * src_stride);
3369     XORI_B3_128_SB(src0, src1, src2);
3370     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3372     LD_SB2(src0_ptr, src_stride, src3, src4);
3373     LD_SH2(src1_ptr, src2_stride, in0, in1);
3374     XORI_B2_128_SB(src3, src4);
3375     ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3378     DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3380     DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3382     HEVC_BI_RND_CLIP2(in0, in1, dst0_r, dst1_r, 7, dst0_r, dst1_r);
3383     dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
3385     ST_D2(dst0_r, 0, 1, dst, dst_stride);
/* Vertical bi-predictive 4-tap interpolation, 8x6 block.
 * Same scheme as the 8x2 variant, unrolled for six output rows: six
 * interleaved row pairs feed six dot-product accumulators, the six
 * 16-bit bi-pred inputs are added/rounded/clipped, and the results are
 * packed to bytes and stored as 4 + 2 rows of 8. */
3388 static void hevc_vt_bi_4t_8x6_msa(const uint8_t *src0_ptr,
3390 const int16_t *src1_ptr,
3391 int32_t src2_stride,
3394 const int8_t *filter,
3397 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3398 v8i16 in0, in1, in2, in3, in4, in5;
3399 v16i8 src10_r, src32_r, src54_r, src76_r;
3400 v16i8 src21_r, src43_r, src65_r, src87_r;
3401 v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
3403 v8i16 filter_vec, const_vec;
/* one row of context above the first output row */
3405 src0_ptr -= src_stride;
3407 const_vec = __msa_ldi_h(128);
3410 filter_vec = LD_SH(filter);
3411 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3413 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3414 src0_ptr += (3 * src_stride);
3415 XORI_B3_128_SB(src0, src1, src2);
3416 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3418 LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3419 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
3420 XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
/* interleave each consecutive row pair for the two-tap-pair dot products */
3421 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3422 src32_r, src43_r, src54_r, src65_r);
3423 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3426 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3428 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3430 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
3432 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
3434 DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, dst4_r, dst4_r);
3436 DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, dst5_r, dst5_r);
/* add bi-pred input, round >>7, clip to [0,255] */
3437 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3438 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3439 dst0_r, dst1_r, dst2_r, dst3_r);
3440 HEVC_BI_RND_CLIP2(in4, in5, dst4_r, dst5_r, 7, dst4_r, dst5_r);
3442 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3443 dst2_r = (v8i16) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst4_r);
3444 ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
3445 ST_D2(dst2_r, 0, 1, dst + 4 * dst_stride, dst_stride);
/* Vertical bi-predictive 4-tap interpolation, 8 wide, height a multiple
 * of 4.  After priming three context rows, each loop iteration consumes
 * four new source rows and produces four output rows; the last two
 * interleaved row pairs are recycled into src10_r/src21_r (and src2 kept)
 * as context for the next iteration. */
3448 static void hevc_vt_bi_4t_8x4multiple_msa(const uint8_t *src0_ptr,
3450 const int16_t *src1_ptr,
3451 int32_t src2_stride,
3454 const int8_t *filter,
3458 v16i8 src0, src1, src2, src3, src4, src5;
3459 v8i16 in0, in1, in2, in3;
3460 v16i8 src10_r, src32_r, src21_r, src43_r;
3461 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3463 v8i16 filter_vec, const_vec;
3465 src0_ptr -= src_stride;
3467 const_vec = __msa_ldi_h(128);
3470 filter_vec = LD_SH(filter);
3471 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* prime the 3-row vertical context */
3473 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3474 src0_ptr += (3 * src_stride);
3475 XORI_B3_128_SB(src0, src1, src2);
3476 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3478 for (loop_cnt = (height >> 2); loop_cnt--;) {
3479 LD_SB2(src0_ptr, src_stride, src3, src4);
3480 src0_ptr += (2 * src_stride);
3481 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3482 src1_ptr += (4 * src2_stride);
3483 XORI_B2_128_SB(src3, src4);
3484 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3487 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3489 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
/* note: src2 reloaded here becomes the bottom context row for the next
 * iteration */
3491 LD_SB2(src0_ptr, src_stride, src5, src2);
3492 src0_ptr += (2 * src_stride);
3493 XORI_B2_128_SB(src5, src2);
3494 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3497 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
3499 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
/* add second prediction, round >>7, clip to 8 bits */
3500 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3501 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3502 dst0_r, dst1_r, dst2_r, dst3_r);
3504 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3505 ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
3506 dst += (4 * dst_stride);
/* Vertical bi-predictive 4-tap interpolation, 8-wide entry point:
 * dispatches on height to the specialised 8x2 / 8x6 / 8x(4*n) kernels. */
3510 static void hevc_vt_bi_4t_8w_msa(const uint8_t *src0_ptr,
3512 const int16_t *src1_ptr,
3513 int32_t src2_stride,
3516 const int8_t *filter,
3520 hevc_vt_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3521 dst, dst_stride, filter, height);
3522 } else if (6 == height) {
3523 hevc_vt_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3524 dst, dst_stride, filter, height);
3526 hevc_vt_bi_4t_8x4multiple_msa(src0_ptr, src_stride,
3527 src1_ptr, src2_stride,
3528 dst, dst_stride, filter, height);
/* Vertical bi-predictive 4-tap interpolation, 12 wide.
 * Columns 0-7 use the right-interleaved (…_r) row pairs; columns 8-11
 * use the left-interleaved halves packed two-rows-per-vector (src2110,
 * src4332, src6554) so one dot product covers two output rows of the
 * 4-wide tail.  The tail's bi-pred inputs (in4..in7, loaded at +8) are
 * likewise compacted with ILVR_D2_SH.  4 rows per loop iteration;
 * 8-byte stores for the main part, 4-byte stores at dst + 8. */
3532 static void hevc_vt_bi_4t_12w_msa(const uint8_t *src0_ptr,
3534 const int16_t *src1_ptr,
3535 int32_t src2_stride,
3538 const int8_t *filter,
3542 v16i8 src0, src1, src2, src3, src4, src5, src6;
3543 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3544 v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
3545 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3546 v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
3547 v16i8 src2110, src4332, src6554;
3548 v8i16 dst0_l, dst1_l, filt0, filt1;
3549 v8i16 filter_vec, const_vec;
3551 src0_ptr -= (1 * src_stride);
3553 const_vec = __msa_ldi_h(128);
3556 filter_vec = LD_SH(filter);
3557 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3559 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3560 src0_ptr += (3 * src_stride);
3561 XORI_B3_128_SB(src0, src1, src2);
3562 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3563 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
/* pack the two left halves into one vector: two tail rows per vector */
3564 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
3566 for (loop_cnt = (height >> 2); loop_cnt--;) {
3567 LD_SB2(src0_ptr, src_stride, src3, src4);
3568 src0_ptr += (2 * src_stride);
3569 LD_SB2(src0_ptr, src_stride, src5, src6);
3570 src0_ptr += (2 * src_stride);
3571 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3572 LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
3573 src1_ptr += (4 * src2_stride);
/* compact the four 4-wide tail inputs into two vectors */
3574 ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
3575 XORI_B2_128_SB(src3, src4);
3576 XORI_B2_128_SB(src5, src6);
3578 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3579 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3580 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
3581 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
3582 ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
3583 src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
3586 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3588 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3590 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l);
3592 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
3594 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
3596 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst1_l, dst1_l);
/* add bi-pred input, round >>7, clip, then pack and store 8+4 columns */
3597 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3598 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3599 dst0_r, dst1_r, dst2_r, dst3_r);
3600 HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l);
3602 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3603 dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
3604 ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
3605 ST_W4(dst0_l, 0, 1, 2, 3, dst + 8, dst_stride);
3606 dst += (4 * dst_stride);
/* Vertical bi-predictive 4-tap interpolation, 16 wide.
 * Full 16-byte rows: the right- (…_r) and left- (…_l) interleaves cover
 * bytes 0-7 and 8-15 respectively.  Each loop iteration handles four
 * output rows in two 2-row stages, re-using the interleaved pairs as
 * vertical context between stages (src2 reloaded in the second stage
 * becomes the next iteration's bottom context row). */
3615 static void hevc_vt_bi_4t_16w_msa(const uint8_t *src0_ptr,
3617 const int16_t *src1_ptr,
3618 int32_t src2_stride,
3621 const int8_t *filter,
3625 v16i8 src0, src1, src2, src3, src4, src5;
3626 v8i16 in0, in1, in2, in3;
3627 v16i8 src10_r, src32_r, src21_r, src43_r;
3628 v16i8 src10_l, src32_l, src21_l, src43_l;
3629 v8i16 dst0_r, dst1_r, dst0_l, dst1_l;
3631 v8i16 filter_vec, const_vec;
3633 src0_ptr -= src_stride;
3635 const_vec = __msa_ldi_h(128);
3638 filter_vec = LD_SH(filter);
3639 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3641 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3642 src0_ptr += (3 * src_stride);
3643 XORI_B3_128_SB(src0, src1, src2);
3644 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3645 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3647 for (loop_cnt = (height >> 2); loop_cnt--;) {
/* first two rows of the group of four */
3648 LD_SB2(src0_ptr, src_stride, src3, src4);
3649 src0_ptr += (2 * src_stride);
3650 LD_SH2(src1_ptr, src2_stride, in0, in1);
3651 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3652 src1_ptr += (2 * src2_stride);
3653 XORI_B2_128_SB(src3, src4);
3654 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3655 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3658 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3660 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3662 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3664 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
3665 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3666 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3667 dst0_r, dst1_r, dst0_l, dst1_l);
3669 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3670 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3671 dst += (2 * dst_stride);
/* last two rows; src2 reloaded as context for the next iteration */
3673 LD_SB2(src0_ptr, src_stride, src5, src2);
3674 src0_ptr += (2 * src_stride);
3675 LD_SH2(src1_ptr, src2_stride, in0, in1);
3676 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3677 src1_ptr += (2 * src2_stride);
3678 XORI_B2_128_SB(src5, src2);
3679 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3680 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
3683 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
3685 DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
3687 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
3689 DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
3690 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3691 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3692 dst0_r, dst1_r, dst0_l, dst1_l);
3694 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3695 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3696 dst += (2 * dst_stride);
/* Vertical bi-predictive 4-tap interpolation, 24 wide.
 * Handled as a 16-wide part (right + left interleaves of src0..src5)
 * plus an 8-wide part at column 16 (src6..src11, right interleaves
 * only).  Four rows per loop iteration, processed in two 2-row stages
 * with the interleaved pairs rolled forward as vertical context. */
3700 static void hevc_vt_bi_4t_24w_msa(const uint8_t *src0_ptr,
3702 const int16_t *src1_ptr,
3703 int32_t src2_stride,
3706 const int8_t *filter,
3710 v16i8 src0, src1, src2, src3, src4, src5;
3711 v16i8 src6, src7, src8, src9, src10, src11;
3712 v8i16 in0, in1, in2, in3, in4, in5;
3713 v16i8 src10_r, src32_r, src76_r, src98_r;
3714 v16i8 src21_r, src43_r, src87_r, src109_r;
3715 v16i8 src10_l, src32_l, src21_l, src43_l;
3716 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3717 v8i16 dst0_l, dst1_l;
3719 v8i16 filter_vec, const_vec;
3721 src0_ptr -= src_stride;
3723 const_vec = __msa_ldi_h(128);
3726 filter_vec = LD_SH(filter);
3727 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* prime vertical context for columns 0-15 */
3730 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3731 XORI_B3_128_SB(src0, src1, src2);
3732 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3733 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
/* prime vertical context for columns 16-23 */
3735 LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
3736 src0_ptr += (3 * src_stride);
3737 XORI_B3_128_SB(src6, src7, src8);
3738 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3740 for (loop_cnt = (height >> 2); loop_cnt--;) {
3742 LD_SB2(src0_ptr, src_stride, src3, src4);
3743 LD_SH2(src1_ptr, src2_stride, in0, in1);
3744 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3745 LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
3746 src1_ptr += (2 * src2_stride);
3747 XORI_B2_128_SB(src3, src4);
3748 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3749 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3751 LD_SB2(src0_ptr + 16, src_stride, src9, src10);
3752 src0_ptr += (2 * src_stride);
3753 XORI_B2_128_SB(src9, src10);
3754 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3757 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3759 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3761 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3763 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
3766 DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
3768 DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
/* add bi-pred input, round >>7, clip; 16-wide part then 8-wide part */
3770 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3771 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3772 dst0_r, dst1_r, dst0_l, dst1_l);
3774 HEVC_BI_RND_CLIP2(in4, in5, dst2_r, dst3_r, 7, dst2_r, dst3_r);
3776 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3777 dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
3778 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3779 ST_D2(dst2_r, 0, 1, dst + 16, dst_stride);
3780 dst += (2 * dst_stride);
/* second pair of rows; src2/src8 reloaded as next-iteration context */
3783 LD_SB2(src0_ptr, src_stride, src5, src2);
3784 LD_SH2(src1_ptr, src2_stride, in0, in1);
3785 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3786 LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
3787 src1_ptr += (2 * src2_stride);
3788 XORI_B2_128_SB(src5, src2);
3789 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3790 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
3792 LD_SB2(src0_ptr + 16, src_stride, src11, src8);
3793 src0_ptr += (2 * src_stride);
3794 XORI_B2_128_SB(src11, src8);
3795 ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
3798 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
3800 DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
3802 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
3804 DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
3807 DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
3809 DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
3811 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3812 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3813 dst0_r, dst1_r, dst0_l, dst1_l);
3814 HEVC_BI_RND_CLIP2(in4, in5, dst2_r, dst3_r, 7, dst2_r, dst3_r);
3816 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3817 dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
3818 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3819 ST_D2(dst2_r, 0, 1, dst + 16, dst_stride);
3820 dst += (2 * dst_stride);
/* Vertical bi-predictive 4-tap interpolation, 32 wide.
 * Two independent 16-wide halves: columns 0-15 go to dst, columns 16-31
 * to dst_tmp (= dst + 16).  Each half keeps its own right/left
 * interleaved context; two output rows per loop iteration. */
3824 static void hevc_vt_bi_4t_32w_msa(const uint8_t *src0_ptr,
3826 const int16_t *src1_ptr,
3827 int32_t src2_stride,
3830 const int8_t *filter,
3834 uint8_t *dst_tmp = dst + 16;
3835 v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
3836 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3837 v16i8 src10_r, src32_r, src76_r, src98_r;
3838 v16i8 src21_r, src43_r, src87_r, src109_r;
3839 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3840 v16i8 src10_l, src32_l, src76_l, src98_l;
3841 v16i8 src21_l, src43_l, src87_l, src109_l;
3842 v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
3844 v8i16 filter_vec, const_vec;
3846 src0_ptr -= src_stride;
3848 const_vec = __msa_ldi_h(128);
3851 filter_vec = LD_SH(filter);
3852 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* prime context for the left 16 columns */
3855 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3856 XORI_B3_128_SB(src0, src1, src2);
3857 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3858 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
/* prime context for the right 16 columns */
3861 LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
3862 src0_ptr += (3 * src_stride);
3863 XORI_B3_128_SB(src6, src7, src8);
3864 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3865 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
3867 for (loop_cnt = (height >> 1); loop_cnt--;) {
/* left half: two rows */
3869 LD_SB2(src0_ptr, src_stride, src3, src4);
3870 LD_SH2(src1_ptr, src2_stride, in0, in1);
3871 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3872 LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
3873 LD_SH2((src1_ptr + 24), src2_stride, in6, in7);
3874 src1_ptr += (2 * src2_stride);
3875 XORI_B2_128_SB(src3, src4);
3876 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3877 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3880 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3882 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3884 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3886 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
3888 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3889 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3890 dst0_r, dst1_r, dst0_l, dst1_l);
3898 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3899 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3900 dst += (2 * dst_stride);
/* right half: two rows, stored at dst_tmp (dst + 16) */
3903 LD_SB2(src0_ptr + 16, src_stride, src9, src10);
3904 src0_ptr += (2 * src_stride);
3905 XORI_B2_128_SB(src9, src10);
3906 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3907 ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
3910 DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
3912 DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l);
3914 DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
3916 DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l);
3918 HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
3919 dst2_r, dst3_r, dst2_l, dst3_l, 7,
3920 dst2_r, dst3_r, dst2_l, dst3_l);
3922 PCKEV_B2_SH(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);
3923 ST_SH2(dst2_r, dst3_r, dst_tmp, dst_stride);
3924 dst_tmp += (2 * dst_stride);
/* Horizontal+vertical (2-D) bi-predictive 4-tap interpolation, 4x2 block.
 * Stage 1: VSHF_B2_SB gathers horizontal 4-tap windows (mask0/mask1 from
 * ff_hevc_mask_arr + 16 pack two 4-wide rows per vector) and
 * HEVC_FILT_4TAP_SH produces 16-bit horizontal results.
 * Stage 2: the horizontal results are interleaved and vertically
 * filtered at 32-bit precision (HEVC_FILT_4TAP), then packed back to
 * 16 bits, combined with the bi-pred input (in0 pre-offset by
 * const_vec = 128) and rounded >> 7 before the 4-byte stores. */
3934 static void hevc_hv_bi_4t_4x2_msa(const uint8_t *src0_ptr,
3936 const int16_t *src1_ptr,
3937 int32_t src2_stride,
3940 const int8_t *filter_x,
3941 const int8_t *filter_y)
3946 v16i8 src0, src1, src2, src3, src4;
3948 v8i16 filt_h0, filt_h1;
3949 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3951 v8i16 filter_vec, const_vec;
3952 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3953 v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43, tmp;
/* back up one row and one column: 4-tap window context in both axes */
3956 src0_ptr -= (src_stride + 1);
3958 filter_vec = LD_SH(filter_x);
3959 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* sign-extend the y filter taps, then splat 32-bit tap pairs */
3961 filter_vec = LD_SH(filter_y);
3962 UNPCK_R_SB_SH(filter_vec, filter_vec);
3964 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3968 const_vec = __msa_ldi_h(128);
3971 LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
3972 XORI_B5_128_SB(src0, src1, src2, src3, src4);
3974 LD2(src1_ptr, src2_stride, tp0, tp1);
3975 INSERT_D2_SH(tp0, tp1, in0);
/* pre-bias the bi-pred input (offset folded in here rather than into
 * the filter accumulators) */
3976 in0 = __msa_adds_s_h(in0, const_vec);
3978 VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
3979 VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
3980 VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
3982 dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3983 dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3984 dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3986 ILVRL_H2_SH(dst31, dst20, dst10, dst32);
3987 ILVRL_H2_SH(dst42, dst31, dst21, dst43);
3989 dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3990 dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
3993 tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
3994 tmp = __msa_adds_s_h(tmp, in0);
3995 tmp = __msa_srari_h(tmp, 7);
3997 out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
3998 ST_W2(out, 0, 1, dst, dst_stride);
/* Horizontal+vertical bi-predictive 4-tap interpolation, 4x4 block.
 * Same two-stage scheme as the 4x2 variant: horizontal 4-tap via
 * VSHF + HEVC_FILT_4TAP_SH (two rows packed per vector), vertical 4-tap
 * at 32-bit precision, >> 6, then the const_vec-biased bi-pred inputs
 * are added, rounded >> 7 and clipped to [0, 255] for four 4-byte rows. */
4001 static void hevc_hv_bi_4t_4x4_msa(const uint8_t *src0_ptr,
4003 const int16_t *src1_ptr,
4004 int32_t src2_stride,
4007 const int8_t *filter_x,
4008 const int8_t *filter_y)
4012 v16i8 src0, src1, src2, src3, src4, src5, src6;
4014 v8i16 filt_h0, filt_h1;
4015 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4017 v8i16 filter_vec, const_vec;
4018 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4020 v8i16 in0 = { 0 }, in1 = { 0 };
4021 v8i16 dst30, dst41, dst52, dst63;
4022 v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
4023 v4i32 dst0, dst1, dst2, dst3;
4025 src0_ptr -= (src_stride + 1);
4027 filter_vec = LD_SH(filter_x);
4028 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4030 filter_vec = LD_SH(filter_y);
4031 UNPCK_R_SB_SH(filter_vec, filter_vec);
4033 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4037 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
4038 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
4040 const_vec = __msa_ldi_h(128);
/* gather four 64-bit bi-pred rows into two vectors, then pre-bias */
4043 LD2(src1_ptr, src2_stride, tp0, tp1);
4044 src1_ptr += 2 * src2_stride;
4045 INSERT_D2_SH(tp0, tp1, in0);
4046 LD2(src1_ptr, src2_stride, tp0, tp1);
4047 INSERT_D2_SH(tp0, tp1, in1);
4049 ADDS_SH2_SH(in0, const_vec, in1, const_vec, in0, in1);
/* horizontal stage: rows (0,3) (1,4) (2,5) (3,6) packed pairwise */
4051 VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
4052 VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
4053 VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
4054 VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
4056 dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4057 dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4058 dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4059 dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
/* vertical stage on interleaved horizontal results */
4061 ILVRL_H2_SH(dst41, dst30, dst10, dst43);
4062 ILVRL_H2_SH(dst52, dst41, dst21, dst54);
4063 ILVRL_H2_SH(dst63, dst52, dst32, dst65);
4064 dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
4065 dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
4066 dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
4067 dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
4068 SRA_4V(dst0, dst1, dst2, dst3, 6);
4069 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
4070 ADDS_SH2_SH(tmp0, in0, tmp1, in1, tmp0, tmp1);
4071 SRARI_H2_SH(tmp0, tmp1, 7);
4072 CLIP_SH2_0_255(tmp0, tmp1);
4073 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
4074 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
/* Horizontal+vertical bi-predictive 4-tap interpolation, 4 wide,
 * height a multiple of 8.  Primes the two-row horizontal context
 * (dst10/dst21, with dst22 holding the last row replicated), then per
 * iteration horizontally filters 8 new rows (packed two per vector:
 * (3,7) (4,8) (5,9) (6,10)), vertically filters 8 outputs at 32-bit
 * precision, >> 6, adds the const_vec-biased bi-pred rows, rounds >> 7,
 * clips and stores eight 4-byte rows. */
4077 static void hevc_hv_bi_4t_4multx8mult_msa(const uint8_t *src0_ptr,
4079 const int16_t *src1_ptr,
4080 int32_t src2_stride,
4083 const int8_t *filter_x,
4084 const int8_t *filter_y,
4090 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4092 v8i16 filt_h0, filt_h1;
4093 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4095 v8i16 filter_vec, const_vec;
4096 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4097 v8i16 tmp0, tmp1, tmp2, tmp3;
4098 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
4099 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
4100 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
4101 v8i16 dst98_r, dst109_r;
4102 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4103 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
4105 src0_ptr -= (src_stride + 1);
4107 filter_vec = LD_SH(filter_x);
4108 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4110 filter_vec = LD_SH(filter_y);
4111 UNPCK_R_SB_SH(filter_vec, filter_vec);
4113 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4117 const_vec = __msa_ldi_h(128);
/* prime the horizontal-result context rows */
4120 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4121 src0_ptr += (3 * src_stride);
4122 XORI_B3_128_SB(src0, src1, src2);
4124 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
4125 VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
4126 dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4127 dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4128 ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
/* dst22 = latest context row replicated in both halves */
4129 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
4132 for (loop_cnt = height >> 3; loop_cnt--;) {
4133 LD_SB8(src0_ptr, src_stride,
4134 src3, src4, src5, src6, src7, src8, src9, src10);
4135 src0_ptr += (8 * src_stride);
4136 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
4137 VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
4138 VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
4139 VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
4140 VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
4142 dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4143 dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4144 dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4145 dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
/* build the vertical interleaves, bridging across the dst22 context */
4147 dst32_r = __msa_ilvr_h(dst73, dst22);
4148 ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
4149 ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
4150 ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
4151 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
4152 dst76_r = __msa_ilvr_h(dst22, dst106);
/* gather and pre-bias eight bi-pred rows (two per vector) */
4154 LD2(src1_ptr, src2_stride, tp0, tp1);
4155 src1_ptr += 2 * src2_stride;
4156 INSERT_D2_SH(tp0, tp1, in0);
4157 LD2(src1_ptr, src2_stride, tp0, tp1);
4158 src1_ptr += 2 * src2_stride;
4159 INSERT_D2_SH(tp0, tp1, in1);
4161 LD2(src1_ptr, src2_stride, tp0, tp1);
4162 src1_ptr += 2 * src2_stride;
4163 INSERT_D2_SH(tp0, tp1, in2);
4164 LD2(src1_ptr, src2_stride, tp0, tp1);
4165 src1_ptr += 2 * src2_stride;
4166 INSERT_D2_SH(tp0, tp1, in3);
4168 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
4169 const_vec, in0, in1, in2, in3);
4170 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4171 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4172 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4173 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4174 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4175 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4176 dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4177 dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4178 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
4179 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
4180 PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
4181 dst5_r, dst4_r, dst7_r, dst6_r, tmp0, tmp1, tmp2, tmp3);
4182 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1,
4184 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
4185 CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
4186 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4187 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
4188 dst += (8 * dst_stride);
/* roll the last horizontal result forward as next-iteration context */
4192 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
/* 2-D bi-predictive 4-tap interpolation, 4-wide entry point:
 * dispatches on height to the 4x2 / 4x4 / 4x(8*n) kernels. */
4196 static void hevc_hv_bi_4t_4w_msa(const uint8_t *src0_ptr,
4198 const int16_t *src1_ptr,
4199 int32_t src2_stride,
4202 const int8_t *filter_x,
4203 const int8_t *filter_y,
4207 hevc_hv_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4208 dst, dst_stride, filter_x, filter_y);
4209 } else if (4 == height) {
4210 hevc_hv_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4211 dst, dst_stride, filter_x, filter_y);
4212 } else if (0 == (height % 8)) {
4213 hevc_hv_bi_4t_4multx8mult_msa(src0_ptr, src_stride,
4214 src1_ptr, src2_stride,
4216 filter_x, filter_y, height);
/* Horizontal+vertical bi-predictive 4-tap interpolation, 6 wide
 * (height appears fixed at 8 rows: all loads/stores are unrolled).
 * Horizontal stage filters each of the 11 source rows with mask0/mask1
 * (ff_hevc_mask_arr, full-width pattern).  The vertical stage splits the
 * work: columns 0-3 use the right interleaves (dst*_r), columns 4-5 use
 * the left interleaves packed two rows per vector (dst1021_l etc.).
 * Bi-pred combine is the usual const_vec bias + add + SRARI 7 + clip;
 * the 4-wide part is stored with ST_W8 and the 2-column tail with
 * ST_H8 at dst + 4 (tail inputs loaded as 32-bit words at src1_ptr+4). */
4220 static void hevc_hv_bi_4t_6w_msa(const uint8_t *src0_ptr,
4222 const int16_t *src1_ptr,
4223 int32_t src2_stride,
4226 const int8_t *filter_x,
4227 const int8_t *filter_y,
4230 uint32_t tpw0, tpw1, tpw2, tpw3;
4232 v16u8 out0, out1, out2;
4233 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4234 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4236 v8i16 filt_h0, filt_h1;
4237 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4239 v8i16 filter_vec, const_vec;
4240 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
4241 v8i16 dsth10, tmp4, tmp5;
4242 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4243 v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
4244 v8i16 tmp0, tmp1, tmp2, tmp3;
4245 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4246 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
4247 v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r;
4248 v8i16 dst54_l, dst76_l, dst98_l, dst65_l, dst87_l, dst109_l;
4249 v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
4250 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4251 v8i16 in4 = { 0 }, in5 = { 0 };
4253 src0_ptr -= (src_stride + 1);
4255 filter_vec = LD_SH(filter_x);
4256 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4258 filter_vec = LD_SH(filter_y);
4259 UNPCK_R_SB_SH(filter_vec, filter_vec);
4261 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4265 const_vec = __msa_ldi_h(128);
/* horizontal stage: three context rows… */
4268 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4269 src0_ptr += (3 * src_stride);
4270 XORI_B3_128_SB(src0, src1, src2);
4272 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4273 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4274 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4276 dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4277 dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4278 dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4280 ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
4281 ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
/* …then eight output rows */
4283 LD_SB8(src0_ptr, src_stride,
4284 src3, src4, src5, src6, src7, src8, src9, src10);
4285 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
4287 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4288 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4289 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4290 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4292 dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4293 dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4294 dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4295 dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4297 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
4298 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
4299 VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
4300 VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
4302 dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4303 dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4304 dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4305 dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
/* vertical interleaves for both column groups */
4307 ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
4308 ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
4309 ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
4310 ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
4311 ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
4312 ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
4313 ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
4314 ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
/* pack left halves two-rows-per-vector for the 2-column tail */
4315 PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
4316 PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
4317 dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
4319 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4320 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4321 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4322 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4323 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4324 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4325 dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4326 dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4327 dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
4328 dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
4329 dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
4330 dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
4331 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
4332 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
4333 SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
4334 PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
4335 PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
4336 PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
/* bi-pred combine for the 4-wide part (columns 0-3) */
4338 LD2(src1_ptr, src2_stride, tp0, tp1);
4339 INSERT_D2_SH(tp0, tp1, in0);
4340 LD2(src1_ptr + 2 * src2_stride, src2_stride, tp0, tp1);
4341 INSERT_D2_SH(tp0, tp1, in1);
4343 LD2(src1_ptr + 4 * src2_stride, src2_stride, tp0, tp1);
4344 INSERT_D2_SH(tp0, tp1, in2);
4345 LD2(src1_ptr + 6 * src2_stride, src2_stride, tp0, tp1);
4346 INSERT_D2_SH(tp0, tp1, in3);
4348 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, const_vec,
4349 in0, in1, in2, in3);
4350 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1, tmp2,
4352 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
4353 CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
4354 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4355 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
/* bi-pred combine for the 2-column tail (columns 4-5) */
4357 LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
4358 src1_ptr += (4 * src2_stride);
4359 INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in4);
4360 LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
4361 INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in5);
4362 ADDS_SH2_SH(in4, const_vec, in5, const_vec, in4, in5);
4363 ADDS_SH2_SH(in4, tmp4, in5, tmp5, tmp4, tmp5);
4364 SRARI_H2_SH(tmp4, tmp5, 7);
4365 CLIP_SH2_0_255(tmp4, tmp5);
4366 out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
4367 ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
/* HEVC bi-prediction: 4-tap horizontal + 4-tap vertical (hv, "epel")
 * interpolation of one 8x2 block.
 * The horizontal pass gathers bytes with VSHF (mask0/mask1) and filters with
 * HEVC_FILT_4TAP_SH; the vertical pass interleaves adjacent row results and
 * filters in 32-bit precision with HEVC_FILT_4TAP.  The 16-bit samples of
 * the second reference (src1_ptr) are biased by const_vec, added in, rounded
 * (shift 7) and clipped to [0, 255] before storing two 8-byte rows to dst.
 * NOTE(review): several lines fall in gaps of this excerpt (src_stride, dst,
 * dst_stride parameters; declarations of out, in0/in1, filt0/filt1, mask1,
 * tmp0/tmp1; and presumably a "const_vec <<= 6;" after the ldi) -- confirm
 * against the full file. */
4370 static void hevc_hv_bi_4t_8x2_msa(const uint8_t *src0_ptr,
4372                                   const int16_t *src1_ptr,
4373                                   int32_t src2_stride,
4376                                   const int8_t *filter_x,
4377                                   const int8_t *filter_y)
4380 v16i8 src0, src1, src2, src3, src4;
4382 v8i16 filt_h0, filt_h1;
4383 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4385 v8i16 filter_vec, const_vec;
4386 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
4387 v8i16 dst0, dst1, dst2, dst3, dst4;
4388 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
4389 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4390 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
/* Step back one row and one column so the 4-tap window is centred. */
4394 src0_ptr -= (src_stride + 1);
/* Splat the two horizontal filter taps (pairs of coefficients). */
4396 filter_vec = LD_SH(filter_x);
4397 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Sign-extend and splat the two vertical filter taps. */
4399 filter_vec = LD_SH(filter_y);
4400 UNPCK_R_SB_SH(filter_vec, filter_vec);
4402 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
/* Bias used to undo the -128 offset folded into the src1 samples. */
4406 const_vec = __msa_ldi_h(128);
/* 2 output rows need 2 + 3 = 5 input rows for the 4-tap vertical filter. */
4409 LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
4410 XORI_B5_128_SB(src0, src1, src2, src3, src4);
/* Load and bias the two 8-sample rows of the second reference. */
4412 LD_SH2(src1_ptr, src2_stride, in0, in1);
4413 ADDS_SH2_SH(in0, const_vec, in1, const_vec, in0, in1);
/* Horizontal 4-tap filter of each of the 5 rows. */
4415 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4416 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4417 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4418 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
4419 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
4421 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4422 dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4423 dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4424 dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4425 dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
/* Interleave consecutive row results (lo/hi halves) for the vertical MAC. */
4427 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4428 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
4429 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4430 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4431 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4432 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4433 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4434 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
/* Scale the 32-bit vertical results back down to 16 bits (>> 6). */
4435 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4436 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
/* Bi-prediction: add second reference, round (>> 7 with rounding), clip. */
4437 ADDS_SH2_SH(in0, tmp0, in1, tmp1, tmp0, tmp1);
4438 SRARI_H2_SH(tmp0, tmp1, 7);
4439 CLIP_SH2_0_255(tmp0, tmp1);
4440 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
/* Store 2 rows of 8 bytes. */
4441 ST_D2(out, 0, 1, dst, dst_stride);
/* HEVC bi-prediction hv 4-tap filter for blocks of (8 * width8mult) x 4.
 * Processes the block as width8mult independent 8-column stripes; each
 * stripe filters 7 input rows (4 output + 3 extra for the vertical 4-tap)
 * horizontally, then runs the vertical 4-tap on interleaved row pairs,
 * adds the biased second reference (src1_ptr), rounds by 7 and clips.
 * Called with width8mult == 1 for 8-wide and == 2 for 16x4 blocks.
 * NOTE(review): parameter lines for src_stride, dst, dst_stride, height(?)
 * and width8mult, plus mask1 setup and the per-stripe pointer advances at
 * the loop tail, fall in gaps of this excerpt. */
4444 static void hevc_hv_bi_4t_8multx4_msa(const uint8_t *src0_ptr,
4446                                       const int16_t *src1_ptr,
4447                                       int32_t src2_stride,
4450                                       const int8_t *filter_x,
4451                                       const int8_t *filter_y,
4456 v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
4457 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4458 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, const_vec;
4459 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
4460 v8i16 in0, in1, in2, in3;
4461 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4462 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
4463 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
/* Centre the 4-tap window: back one row and one column. */
4465 src0_ptr -= (src_stride + 1);
/* Horizontal taps. */
4467 filter_vec = LD_SH(filter_x);
4468 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Vertical taps (sign-extended then splatted as 32-bit pairs). */
4470 filter_vec = LD_SH(filter_y);
4471 UNPCK_R_SB_SH(filter_vec, filter_vec);
4473 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4475 mask0 = LD_SB(ff_hevc_mask_arr);
/* Bias to undo the -128 offset in the second-reference samples. */
4478 const_vec = __msa_ldi_h(128);
/* One iteration per 8-column stripe. */
4481 for (cnt = width8mult; cnt--;) {
4482     LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
4484     XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
/* Load and bias the 4 rows of the second reference for this stripe. */
4486     LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
4488     ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
4489                 const_vec, in0, in1, in2, in3);
/* Horizontal filter of the first 3 rows (vertical-filter history). */
4491     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4492     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4493     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4495     dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4496     dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4497     dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4499     ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4500     ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
/* Horizontal filter of the remaining 4 rows. */
4502     VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4503     VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4504     VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4505     VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4507     dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4508     dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4509     dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4510     dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
/* Interleave row pairs and run the vertical 4-tap in 32-bit precision. */
4512     ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4513     ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4514     ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
4515     ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
4517     dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4518     dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4519     dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4520     dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4521     dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4522     dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
4523     dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4524     dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
/* Scale back to 16 bits, pack, add reference, round (>> 7), clip, store. */
4526     SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4527     SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4528     PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
4529                 dst3_r, tmp0, tmp1, tmp2, tmp3);
4530     ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
4531                 tmp0, tmp1, tmp2, tmp3);
4532     SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
4533     CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
4534     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4535     ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
/* HEVC bi-prediction hv 4-tap filter for a single 8x6 block.
 * Fully unrolled: loads all 9 input rows (6 output + 3 for the vertical
 * 4-tap window), filters each horizontally, then produces 6 output rows
 * via the vertical 4-tap, adds the biased second reference, rounds by 7,
 * clips and stores 6 rows of 8 bytes.
 * NOTE(review): parameter lines for src_stride, dst, dst_stride and the
 * declarations/setup of out-of-view names (filt0/filt1, mask1, closing
 * braces) fall in gaps of this excerpt. */
4540 static void hevc_hv_bi_4t_8x6_msa(const uint8_t *src0_ptr,
4542                                   const int16_t *src1_ptr,
4543                                   int32_t src2_stride,
4546                                   const int8_t *filter_x,
4547                                   const int8_t *filter_y)
4549 v16u8 out0, out1, out2;
4550 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4551 v8i16 in0, in1, in2, in3, in4, in5;
4553 v8i16 filt_h0, filt_h1;
4554 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4556 v8i16 filter_vec, const_vec;
4557 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
4558 v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
4559 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4560 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
4561 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4562 v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
4563 v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
4564 v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
4565 v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
4566 v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
/* Centre the 4-tap window. */
4568 src0_ptr -= (src_stride + 1);
/* Horizontal taps. */
4570 filter_vec = LD_SH(filter_x);
4571 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Vertical taps. */
4573 filter_vec = LD_SH(filter_y);
4574 UNPCK_R_SB_SH(filter_vec, filter_vec);
4576 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
/* Bias to undo the -128 offset in the second-reference samples. */
4580 const_vec = __msa_ldi_h(128);
/* 6 output rows need 9 input rows for the 4-tap vertical filter. */
4583 LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
4584 src0_ptr += (5 * src_stride);
4585 LD_SB4(src0_ptr, src_stride, src5, src6, src7, src8);
4587 XORI_B5_128_SB(src0, src1, src2, src3, src4);
4588 XORI_B4_128_SB(src5, src6, src7, src8);
/* Load and bias the 6 second-reference rows. */
4590 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
4591 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, const_vec,
4592             in0, in1, in2, in3);
4593 ADDS_SH2_SH(in4, const_vec, in5, const_vec, in4, in5);
/* Horizontal 4-tap filter of all 9 rows. */
4595 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4596 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4597 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4598 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
4599 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
4600 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
4601 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
4602 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
4603 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
4605 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4606 dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4607 dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4608 dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4609 dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
4610 dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
4611 dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
4612 dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
4613 dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
/* Interleave consecutive rows for the vertical MAC. */
4615 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4616 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
4617 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4618 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4619 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
4620 ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
4621 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
4622 ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
/* Vertical 4-tap filter, 6 output rows in 32-bit precision. */
4624 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4625 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4626 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4627 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4628 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4629 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
4630 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4631 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
4632 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4633 dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
4634 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4635 dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
/* Scale to 16 bits, pack, add second reference, round (>> 7), clip. */
4637 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4638 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4639 SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
4640 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
4641             tmp0, tmp1, tmp2, tmp3);
4642 PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5);
4643 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
4644             tmp0, tmp1, tmp2, tmp3);
4645 ADDS_SH2_SH(in4, tmp4, in5, tmp5, tmp4, tmp5);
4646 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
4647 SRARI_H2_SH(tmp4, tmp5, 7);
4648 CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
4649 CLIP_SH2_0_255(tmp4, tmp5);
/* Store 6 rows of 8 bytes (4 + 2). */
4650 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4651 out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
4652 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
4653 ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
/* HEVC bi-prediction hv 4-tap filter for width x height blocks where width
 * is a multiple of 8 and height a multiple of 4.
 * Outer loop: one pass per 8-column stripe (width >> 3).  Per stripe, the
 * first 3 rows are filtered horizontally once to prime the vertical 4-tap
 * history; the inner loop then produces 4 output rows per iteration,
 * reusing the history (dst10/dst21 rotate via dst54/dst65 in lines not
 * visible here).  Second-reference rows from src1_ptr are biased, added,
 * rounded by 7 and clipped before the 8-byte stores.
 * NOTE(review): parameter lines (src_stride, dst, dst_stride, height,
 * width), the dst_tmp declaration, mask1 setup, the history-rotation and
 * per-stripe pointer advances at the loop tails fall in gaps of this
 * excerpt -- confirm against the full file. */
4656 static void hevc_hv_bi_4t_8multx4mult_msa(const uint8_t *src0_ptr,
4658                                           const int16_t *src1_ptr,
4659                                           int32_t src2_stride,
4662                                           const int8_t *filter_x,
4663                                           const int8_t *filter_y,
4667 uint32_t loop_cnt, cnt;
4668 const uint8_t *src0_ptr_tmp;
4669 const int16_t *src1_ptr_tmp;
4672 v16i8 src0, src1, src2, src3, src4, src5, src6;
4673 v8i16 in0, in1, in2, in3;
4675 v8i16 filt_h0, filt_h1;
4676 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4678 v8i16 filter_vec, const_vec;
4679 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4680 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
4681 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4682 v8i16 tmp0, tmp1, tmp2, tmp3;
4683 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4684 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
4685 v8i16 dst54_r, dst54_l, dst65_r, dst65_l, dst6;
/* Centre the 4-tap window. */
4687 src0_ptr -= (src_stride + 1);
/* Horizontal taps. */
4689 filter_vec = LD_SH(filter_x);
4690 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Vertical taps. */
4692 filter_vec = LD_SH(filter_y);
4693 UNPCK_R_SB_SH(filter_vec, filter_vec);
4695 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
/* Bias to undo the -128 offset in the second-reference samples. */
4699 const_vec = __msa_ldi_h(128);
/* One pass per 8-column stripe. */
4702 for (cnt = width >> 3; cnt--;) {
4703     src0_ptr_tmp = src0_ptr;
4705     src1_ptr_tmp = src1_ptr;
/* Prime the vertical-filter history with the first 3 rows. */
4707     LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
4708     src0_ptr_tmp += (3 * src_stride);
4709     XORI_B3_128_SB(src0, src1, src2);
4711     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4712     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4713     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4715     dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4716     dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4717     dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4719     ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4720     ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
/* 4 output rows per inner iteration. */
4722     for (loop_cnt = height >> 2; loop_cnt--;) {
4723         LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
4724         src0_ptr_tmp += (4 * src_stride);
4725         LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
4726         src1_ptr_tmp += (4 * src2_stride);
4727         XORI_B4_128_SB(src3, src4, src5, src6);
/* Bias the 4 second-reference rows. */
4729         ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
4730                     const_vec, in0, in1, in2, in3);
/* Horizontal filter of the 4 new rows. */
4732         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4733         VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4734         VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4735         VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4737         dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4738         dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4739         dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4740         dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4742         ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4743         ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4744         ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
4745         ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
/* Vertical 4-tap filter in 32-bit precision. */
4747         dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4748         dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4749         dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4750         dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4751         dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4752         dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
4753         dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4754         dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
/* Scale, pack, add second reference, round (>> 7), clip, store 4 rows. */
4756         SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4757         SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4758         PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
4759                     dst3_r, tmp0, tmp1, tmp2, tmp3);
4760         ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
4761                     tmp0, tmp1, tmp2, tmp3);
4762         SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
4763         CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
4764         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4765         ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
4766         dst_tmp += (4 * dst_stride);
/* Dispatcher for 8-wide hv bi-prediction 4-tap filtering: selects the
 * specialized routine for height 2, 4 or 6, and falls back to the generic
 * 8multx4mult implementation (width argument 8) for other heights.
 * NOTE(review): the "if (2 == height) {" header and some parameter lines
 * fall in gaps of this excerpt. */
4781 static void hevc_hv_bi_4t_8w_msa(const uint8_t *src0_ptr,
4783                                  const int16_t *src1_ptr,
4784                                  int32_t src2_stride,
4787                                  const int8_t *filter_x,
4788                                  const int8_t *filter_y,
4792     hevc_hv_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4793                           dst, dst_stride, filter_x, filter_y);
4794 } else if (4 == height) {
/* width8mult == 1: a single 8-column stripe. */
4795     hevc_hv_bi_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4796                               dst, dst_stride, filter_x, filter_y, 1);
4797 } else if (6 == height) {
4798     hevc_hv_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4799                           dst, dst_stride, filter_x, filter_y);
4801     hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride,
4802                                   src1_ptr, src2_stride,
4804                                   filter_x, filter_y, height, 8);
/* HEVC bi-prediction hv 4-tap filter for 12-wide blocks.
 * Split into two passes: the left 8 columns use the full-width path
 * (loop_cnt = 4, i.e. 4 iterations x 4 rows -- this appears to assume
 * height == 16, TODO confirm); the remaining 4 columns use mask2/mask3
 * gathers that pack two rows per vector so that 8 rows are produced per
 * iteration (loop_cnt = 2).
 * NOTE(review): parameter lines (src_stride, dst, dst_stride, height),
 * declarations of loop_cnt, tp0/tp1, tpw*, dst_tmp, out0/out1, and the
 * mask1/mask3 setup plus the second-part output-pointer bookkeeping fall
 * in gaps of this excerpt. */
4808 static void hevc_hv_bi_4t_12w_msa(const uint8_t *src0_ptr,
4810                                   const int16_t *src1_ptr,
4811                                   int32_t src2_stride,
4814                                   const int8_t *filter_x,
4815                                   const int8_t *filter_y,
4820 const uint8_t *src0_ptr_tmp;
4822 const int16_t *src1_ptr_tmp;
4824 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4825 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4826 v16i8 mask0, mask1, mask2, mask3;
4827 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
4828 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, const_vec;
4829 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
4830 v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
4831 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4832 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
4833 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
4834 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4835 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
/* Centre the 4-tap window. */
4837 src0_ptr -= (src_stride + 1);
/* Horizontal taps. */
4839 filter_vec = LD_SH(filter_x);
4840 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Vertical taps. */
4842 filter_vec = LD_SH(filter_y);
4843 UNPCK_R_SB_SH(filter_vec, filter_vec);
4845 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4847 mask0 = LD_SB(ff_hevc_mask_arr);
/* Bias to undo the -128 offset in the second-reference samples. */
4850 const_vec = __msa_ldi_h(128);
/* -------- Part 1: left 8 columns (standard 8-wide path). -------- */
4853 src0_ptr_tmp = src0_ptr;
4855 src1_ptr_tmp = src1_ptr;
/* Prime the vertical history with the first 3 rows. */
4857 LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
4858 src0_ptr_tmp += (3 * src_stride);
4860 XORI_B3_128_SB(src0, src1, src2);
4862 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4863 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4864 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4866 dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4867 dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4868 dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4870 ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
4871 ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
/* 4 iterations x 4 rows each. */
4873 for (loop_cnt = 4; loop_cnt--;) {
4874     LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
4875     src0_ptr_tmp += (4 * src_stride);
4876     XORI_B4_128_SB(src3, src4, src5, src6);
4878     LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
4879     src1_ptr_tmp += (4 * src2_stride);
4880     ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
4881                 const_vec, in0, in1, in2, in3);
4883     VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4884     VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4885     VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4886     VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4888     dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4889     dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4890     dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4891     dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4893     ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
4894     ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
4895     ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
4896     ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
4898     dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4899     dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4900     dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4901     dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4902     dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4903     dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
4904     dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4905     dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
/* Scale, pack, add second reference, round (>> 7), clip, store 4x8 bytes. */
4907     SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4908     SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4909     PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
4910                 dst3_r, tmp0, tmp1, tmp2, tmp3);
4911     ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
4912                 tmp0, tmp1, tmp2, tmp3);
4913     SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
4914     CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
4915     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4916     ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
4917     dst_tmp += (4 * dst_stride);
/* -------- Part 2: rightmost 4 columns, two rows packed per vector. ----- */
4930 mask2 = LD_SB(ff_hevc_mask_arr + 16);
/* Prime the 4-wide vertical history with the first 3 rows. */
4933 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4934 src0_ptr += (3 * src_stride);
4935 XORI_B3_128_SB(src0, src1, src2);
4936 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
4937 VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
4939 dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4940 dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4942 ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
/* dst22 = row 2 result, duplicated into the low half for the next ilvr. */
4943 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
/* 2 iterations x 8 rows each. */
4945 for (loop_cnt = 2; loop_cnt--;) {
4946     LD_SB8(src0_ptr, src_stride,
4947            src3, src4, src5, src6, src7, src8, src9, src10);
4948     src0_ptr += (8 * src_stride);
4949     XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
/* Each VSHF packs two rows (n and n+4) into one vector before filtering. */
4950     VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
4951     VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
4952     VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
4953     VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
4955     dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4956     dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4957     dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4958     dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
/* Build all the interleaved row pairs needed by the vertical filter. */
4960     dst32_r = __msa_ilvr_h(dst73, dst22);
4961     ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
4962     ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
4963     ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
4964     dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
4965     dst76_r = __msa_ilvr_h(dst22, dst106);
/* Load 8 rows of 4 second-reference samples (two rows per 64-bit load). */
4967     LD2(src1_ptr, src2_stride, tp0, tp1);
4968     src1_ptr += 2 * src2_stride;
4969     INSERT_D2_SH(tp0, tp1, in0);
4970     LD2(src1_ptr, src2_stride, tp0, tp1);
4971     src1_ptr += 2 * src2_stride;
4972     INSERT_D2_SH(tp0, tp1, in1);
4974     LD2(src1_ptr, src2_stride, tp0, tp1);
4975     src1_ptr += 2 * src2_stride;
4976     INSERT_D2_SH(tp0, tp1, in2);
4977     LD2(src1_ptr, src2_stride, tp0, tp1);
4978     src1_ptr += 2 * src2_stride;
4979     INSERT_D2_SH(tp0, tp1, in3);
4981     ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
4982                 const_vec, in0, in1, in2, in3);
/* Vertical 4-tap filter, 8 output rows of 4 samples. */
4984     dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4985     dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4986     dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4987     dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4988     dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4989     dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4990     dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4991     dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
/* Scale, pack, add second reference, round (>> 7), clip, store 8x4 bytes. */
4993     SRA_4V(dst0, dst1, dst2, dst3, 6);
4994     SRA_4V(dst4, dst5, dst6, dst7, 6);
4995     PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
4996                 tmp0, tmp1, tmp2, tmp3);
4997     ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
4998                 tmp0, tmp1, tmp2, tmp3);
4999     SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
5000     CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
5001     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5002     ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
5003     dst += (8 * dst_stride);
/* Rotate the vertical history for the next 8-row iteration. */
5007     dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
/* Dispatcher for 16-wide hv bi-prediction 4-tap filtering: height 4 uses
 * the unrolled 8multx4 routine with two 8-column stripes; other heights
 * use the generic 8multx4mult routine with width 16.
 * NOTE(review): the "if (4 == height)" header and some parameter lines
 * fall in gaps of this excerpt. */
5011 static void hevc_hv_bi_4t_16w_msa(const uint8_t *src0_ptr,
5013                                   const int16_t *src1_ptr,
5014                                   int32_t src2_stride,
5017                                   const int8_t *filter_x,
5018                                   const int8_t *filter_y,
5022     hevc_hv_bi_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
5023                               dst, dst_stride, filter_x, filter_y, 2);
5025     hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr,
5026                                   src2_stride, dst, dst_stride, filter_x,
5027                                   filter_y, height, 16);
/* 24-wide hv bi-prediction 4-tap filter: thin wrapper around the generic
 * 8multx4mult routine (width argument, 24, falls in a gap of this excerpt
 * -- confirm against the full file). */
5031 static void hevc_hv_bi_4t_24w_msa(const uint8_t *src0_ptr,
5033                                   const int16_t *src1_ptr,
5034                                   int32_t src2_stride,
5037                                   const int8_t *filter_x,
5038                                   const int8_t *filter_y,
5041     hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
5042                                   dst, dst_stride, filter_x, filter_y,
/* 32-wide hv bi-prediction 4-tap filter: thin wrapper around the generic
 * 8multx4mult routine (width argument, 32, falls in a gap of this excerpt
 * -- confirm against the full file). */
5046 static void hevc_hv_bi_4t_32w_msa(const uint8_t *src0_ptr,
5048                                   const int16_t *src1_ptr,
5049                                   int32_t src2_stride,
5052                                   const int8_t *filter_x,
5053                                   const int8_t *filter_y,
5056     hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
5057                                   dst, dst_stride, filter_x, filter_y,
/* Generates the public entry point ff_hevc_put_hevc_bi_pel_pixels<W>_8_msa,
 * which forwards to the internal hevc_bi_copy_<W>w_msa copy/average routine
 * with the fixed MAX_PB_SIZE stride for the 16-bit source.
 * (Comment kept outside the #define so the line continuations stay intact;
 * some macro body lines fall in gaps of this excerpt.) */
5061 #define BI_MC_COPY(WIDTH) \
5062 void ff_hevc_put_hevc_bi_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
5063 ptrdiff_t dst_stride, \
5064 const uint8_t *src, \
5065 ptrdiff_t src_stride, \
5066 const int16_t *src_16bit, \
5072 hevc_bi_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE, \
5073 dst, dst_stride, height); \
/* Generates ff_hevc_put_hevc_bi_<pel>_<dir><W>_8_msa: looks up the fractional
 * filter from ff_hevc_<pel>_filters using the mx/my phase (FILT_DIR - 1) and
 * forwards to the internal hevc_<dir1>_bi_<tap>t_<W>w_msa routine with the
 * fixed MAX_PB_SIZE stride for the 16-bit source.
 * (Comment kept outside the #define so the line continuations stay intact;
 * some macro body lines fall in gaps of this excerpt.) */
5088 #define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
5089 void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
5090 ptrdiff_t dst_stride, \
5091 const uint8_t *src, \
5092 ptrdiff_t src_stride, \
5093 const int16_t *src_16bit, \
5099 const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
5101 hevc_##DIR1##_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \
5102 MAX_PB_SIZE, dst, dst_stride, \
/* 8-tap luma (qpel), horizontal-only, all supported widths. */
5106 BI_MC(qpel, h, 4, 8, hz, mx);
5107 BI_MC(qpel, h, 8, 8, hz, mx);
5108 BI_MC(qpel, h, 12, 8, hz, mx);
5109 BI_MC(qpel, h, 16, 8, hz, mx);
5110 BI_MC(qpel, h, 24, 8, hz, mx);
5111 BI_MC(qpel, h, 32, 8, hz, mx);
5112 BI_MC(qpel, h, 48, 8, hz, mx);
5113 BI_MC(qpel, h, 64, 8, hz, mx);
/* 8-tap luma (qpel), vertical-only. */
5115 BI_MC(qpel, v, 4, 8, vt, my);
5116 BI_MC(qpel, v, 8, 8, vt, my);
5117 BI_MC(qpel, v, 12, 8, vt, my);
5118 BI_MC(qpel, v, 16, 8, vt, my);
5119 BI_MC(qpel, v, 24, 8, vt, my);
5120 BI_MC(qpel, v, 32, 8, vt, my);
5121 BI_MC(qpel, v, 48, 8, vt, my);
5122 BI_MC(qpel, v, 64, 8, vt, my);
/* 4-tap chroma (epel), horizontal-only. */
5124 BI_MC(epel, h, 4, 4, hz, mx);
5125 BI_MC(epel, h, 8, 4, hz, mx);
5126 BI_MC(epel, h, 6, 4, hz, mx);
5127 BI_MC(epel, h, 12, 4, hz, mx);
5128 BI_MC(epel, h, 16, 4, hz, mx);
5129 BI_MC(epel, h, 24, 4, hz, mx);
5130 BI_MC(epel, h, 32, 4, hz, mx);
/* 4-tap chroma (epel), vertical-only. */
5132 BI_MC(epel, v, 4, 4, vt, my);
5133 BI_MC(epel, v, 8, 4, vt, my);
5134 BI_MC(epel, v, 6, 4, vt, my);
5135 BI_MC(epel, v, 12, 4, vt, my);
5136 BI_MC(epel, v, 16, 4, vt, my);
5137 BI_MC(epel, v, 24, 4, vt, my);
5138 BI_MC(epel, v, 32, 4, vt, my);
/* Generates ff_hevc_put_hevc_bi_<pel>_hv<W>_8_msa: looks up both the
 * horizontal (mx) and vertical (my) fractional filters from
 * ff_hevc_<pel>_filters and forwards to the internal 2-D routine
 * hevc_hv_bi_<tap>t_<W>w_msa with the fixed MAX_PB_SIZE stride for the
 * 16-bit source.  (Comment kept outside the #define so the line
 * continuations stay intact; some macro body lines fall in gaps of this
 * excerpt.) */
5142 #define BI_MC_HV(PEL, WIDTH, TAP) \
5143 void ff_hevc_put_hevc_bi_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst, \
5144 ptrdiff_t dst_stride, \
5145 const uint8_t *src, \
5146 ptrdiff_t src_stride, \
5147 const int16_t *src_16bit, \
5153 const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
5154 const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
5156 hevc_hv_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \
5157 MAX_PB_SIZE, dst, dst_stride, \
5158 filter_x, filter_y, height); \
/* 8-tap luma (qpel), combined horizontal+vertical, all supported widths. */
5161 BI_MC_HV(qpel, 4, 8);
5162 BI_MC_HV(qpel, 8, 8);
5163 BI_MC_HV(qpel, 12, 8);
5164 BI_MC_HV(qpel, 16, 8);
5165 BI_MC_HV(qpel, 24, 8);
5166 BI_MC_HV(qpel, 32, 8);
5167 BI_MC_HV(qpel, 48, 8);
5168 BI_MC_HV(qpel, 64, 8);
/* 4-tap chroma (epel), combined horizontal+vertical. */
5170 BI_MC_HV(epel, 4, 4);
5171 BI_MC_HV(epel, 8, 4);
5172 BI_MC_HV(epel, 6, 4);
5173 BI_MC_HV(epel, 12, 4);
5174 BI_MC_HV(epel, 16, 4);
5175 BI_MC_HV(epel, 24, 4);
5176 BI_MC_HV(epel, 32, 4);