2 * Copyright (C) 2022 Loongson Technology Corporation Limited
3 * Contributed by Hao Chen(chenhao@loongson.cn)
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 #include "swscale_loongarch.h"
23 #include "libavutil/loongarch/loongson_intrinsics.h"
24 #include "libavutil/intreadwrite.h"
/*
 * SCALE_8_16(_sh): 8-tap horizontal filter step covering 16 outputs.
 *
 * Reads one 8-byte source window at each of src + filterPos[0..15] and eight
 * 32-byte coefficient vectors from filter (byte offsets 0..224), builds the
 * per-output dot products, shifts them right by _sh, clamps them from above
 * to vmax and stores one 32-byte vector to dst.
 *
 * Uses names from the expanding scope: src, filterPos, filter, dst, vmax,
 * shuf, src0..src15 and filter0..filter7.
 *
 * NOTE(review): the enclosing braces and any pointer-advance statements of
 * this macro are not visible in this view - confirm against the full file.
 */
26 #define SCALE_8_16(_sh) \
/* Broadcast-load the source window of each of the 16 outputs. */ \
28 src0 = __lasx_xvldrepl_d(src + filterPos[0], 0); \
29 src1 = __lasx_xvldrepl_d(src + filterPos[1], 0); \
30 src2 = __lasx_xvldrepl_d(src + filterPos[2], 0); \
31 src3 = __lasx_xvldrepl_d(src + filterPos[3], 0); \
32 src4 = __lasx_xvldrepl_d(src + filterPos[4], 0); \
33 src5 = __lasx_xvldrepl_d(src + filterPos[5], 0); \
34 src6 = __lasx_xvldrepl_d(src + filterPos[6], 0); \
35 src7 = __lasx_xvldrepl_d(src + filterPos[7], 0); \
36 src8 = __lasx_xvldrepl_d(src + filterPos[8], 0); \
37 src9 = __lasx_xvldrepl_d(src + filterPos[9], 0); \
38 src10 = __lasx_xvldrepl_d(src + filterPos[10], 0); \
39 src11 = __lasx_xvldrepl_d(src + filterPos[11], 0); \
40 src12 = __lasx_xvldrepl_d(src + filterPos[12], 0); \
41 src13 = __lasx_xvldrepl_d(src + filterPos[13], 0); \
42 src14 = __lasx_xvldrepl_d(src + filterPos[14], 0); \
43 src15 = __lasx_xvldrepl_d(src + filterPos[15], 0); \
/* Load the coefficient vectors filter0..filter7. */ \
44 DUP4_ARG2(__lasx_xvld, filter, 0, filter, 32, filter, 64, \
45 filter, 96, filter0, filter1, filter2, filter3); \
46 DUP4_ARG2(__lasx_xvld, filter, 128, filter, 160, \
47 filter, 192, filter, 224, filter4, \
48 filter5, filter6, filter7); \
/* Pair neighbouring source windows, then widen the bytes to 16 bit. */ \
49 DUP4_ARG2(__lasx_xvilvl_d, src1, src0, src3, src2, \
50 src5, src4, src7, src6, src0, src2, src4, src6); \
51 DUP4_ARG2(__lasx_xvilvl_d, src9, src8, src11, src10, \
52 src13, src12, src15, src14, src8, src10, src12, src14); \
53 DUP4_ARG1(__lasx_vext2xv_hu_bu, src0, src2, src4, src6, \
54 src0, src2, src4, src6); \
55 DUP4_ARG1(__lasx_vext2xv_hu_bu, src8, src10, src12, \
56 src14, src8, src10, src12, src14); \
/* Multiply by the coefficients and reduce the partial sums per output. */ \
57 DUP4_ARG2(__lasx_xvdp2_w_h, filter0, src0, filter1, src2, \
58 filter2, src4, filter3, src6, src0, src1, src2, src3); \
59 DUP4_ARG2(__lasx_xvdp2_w_h, filter4, src8, filter5, src10, \
60 filter6, src12, filter7, src14, src4, src5, src6, src7);\
61 src0 = __lasx_xvhaddw_d_w(src0, src0); \
62 src1 = __lasx_xvhaddw_d_w(src1, src1); \
63 src2 = __lasx_xvhaddw_d_w(src2, src2); \
64 src3 = __lasx_xvhaddw_d_w(src3, src3); \
65 src4 = __lasx_xvhaddw_d_w(src4, src4); \
66 src5 = __lasx_xvhaddw_d_w(src5, src5); \
67 src6 = __lasx_xvhaddw_d_w(src6, src6); \
68 src7 = __lasx_xvhaddw_d_w(src7, src7); \
69 DUP4_ARG2(__lasx_xvpickev_w, src1, src0, src3, src2, \
70 src5, src4, src7, src6, src0, src1, src2, src3); \
71 src0 = __lasx_xvhaddw_d_w(src0, src0); \
72 src1 = __lasx_xvhaddw_d_w(src1, src1); \
73 src2 = __lasx_xvhaddw_d_w(src2, src2); \
74 src3 = __lasx_xvhaddw_d_w(src3, src3); \
75 src0 = __lasx_xvpickev_w(src1, src0); \
76 src1 = __lasx_xvpickev_w(src3, src2); \
/* Scale down by _sh and clamp from above to vmax. */ \
77 src0 = __lasx_xvsrai_w(src0, _sh); \
78 src1 = __lasx_xvsrai_w(src1, _sh); \
79 src0 = __lasx_xvmin_w(src0, vmax); \
80 src1 = __lasx_xvmin_w(src1, vmax); \
/* Reorder with shuf, narrow to 16 bit and store the result vector. */ \
81 src0 = __lasx_xvperm_w(src0, shuf); \
82 src1 = __lasx_xvperm_w(src1, shuf); \
83 src0 = __lasx_xvpickev_h(src1, src0); \
84 src0 = __lasx_xvpermi_d(src0, 0xd8); \
85 __lasx_xvst(src0, dst, 0); \
/*
 * SCALE_8_8(_sh): 8-tap horizontal filter step covering 8 outputs.
 *
 * Reads one 8-byte source window at each of src + filterPos[0..7] and four
 * 32-byte coefficient vectors from filter (byte offsets 0..96). The shifted
 * (by _sh), clamped (to vmax) and shuf-permuted 32-bit results are left in
 * src0; no store is visible inside the macro, so the expanding code is
 * expected to write src0 out.
 *
 * Uses names from the expanding scope: src, filterPos, filter, vmax, shuf,
 * src0..src7 and filter0..filter3.
 *
 * NOTE(review): the enclosing braces and any pointer-advance statements of
 * this macro are not visible in this view - confirm against the full file.
 */
91 #define SCALE_8_8(_sh) \
/* Broadcast-load the source window of each of the 8 outputs. */ \
93 src0 = __lasx_xvldrepl_d(src + filterPos[0], 0); \
94 src1 = __lasx_xvldrepl_d(src + filterPos[1], 0); \
95 src2 = __lasx_xvldrepl_d(src + filterPos[2], 0); \
96 src3 = __lasx_xvldrepl_d(src + filterPos[3], 0); \
97 src4 = __lasx_xvldrepl_d(src + filterPos[4], 0); \
98 src5 = __lasx_xvldrepl_d(src + filterPos[5], 0); \
99 src6 = __lasx_xvldrepl_d(src + filterPos[6], 0); \
100 src7 = __lasx_xvldrepl_d(src + filterPos[7], 0); \
101 DUP4_ARG2(__lasx_xvld, filter, 0, filter, 32, filter, 64, \
102 filter, 96, filter0, filter1, filter2, filter3); \
/* Pair neighbouring source windows, then widen the bytes to 16 bit. */ \
105 DUP4_ARG2(__lasx_xvilvl_d, src1, src0, src3, src2, \
106 src5, src4, src7, src6, src0, src2, src4, src6); \
107 DUP4_ARG1(__lasx_vext2xv_hu_bu, src0, src2, src4, src6, \
108 src0, src2, src4, src6); \
/* Multiply by the coefficients and reduce the partial sums per output. */ \
109 DUP4_ARG2(__lasx_xvdp2_w_h, filter0, src0, filter1, src2, \
110 filter2, src4, filter3, src6, src0, src1, src2,src3); \
111 src0 = __lasx_xvhaddw_d_w(src0, src0); \
112 src1 = __lasx_xvhaddw_d_w(src1, src1); \
113 src2 = __lasx_xvhaddw_d_w(src2, src2); \
114 src3 = __lasx_xvhaddw_d_w(src3, src3); \
115 src0 = __lasx_xvpickev_w(src1, src0); \
116 src1 = __lasx_xvpickev_w(src3, src2); \
117 src0 = __lasx_xvhaddw_d_w(src0, src0); \
118 src1 = __lasx_xvhaddw_d_w(src1, src1); \
119 src0 = __lasx_xvpickev_w(src1, src0); \
/* Scale down by _sh, clamp to vmax and reorder with shuf. */ \
120 src0 = __lasx_xvsrai_w(src0, _sh); \
121 src0 = __lasx_xvmin_w(src0, vmax); \
122 src0 = __lasx_xvperm_w(src0, shuf); \
/*
 * SCALE_8_4(_sh): 8-tap horizontal filter step covering 4 outputs.
 *
 * Reads one 8-byte source window at each of src + filterPos[0..3] and two
 * 32-byte coefficient vectors from filter (byte offsets 0 and 32). The
 * shifted (by _sh), clamped (to vmax) and shuf-permuted 32-bit results are
 * left in src0; no store is visible inside the macro.
 *
 * Uses names from the expanding scope: src, filterPos, filter, vmax, shuf,
 * src0..src3, filter0 and filter1.
 *
 * NOTE(review): the enclosing braces and any pointer-advance statements of
 * this macro are not visible in this view - confirm against the full file.
 */
125 #define SCALE_8_4(_sh) \
127 src0 = __lasx_xvldrepl_d(src + filterPos[0], 0); \
128 src1 = __lasx_xvldrepl_d(src + filterPos[1], 0); \
129 src2 = __lasx_xvldrepl_d(src + filterPos[2], 0); \
130 src3 = __lasx_xvldrepl_d(src + filterPos[3], 0); \
131 filter0 = __lasx_xvld(filter, 0); \
132 filter1 = __lasx_xvld(filter, 32); \
/* Pair neighbouring source windows, then widen the bytes to 16 bit. */ \
135 src0 = __lasx_xvilvl_d(src1, src0); \
136 src2 = __lasx_xvilvl_d(src3, src2); \
137 src0 = __lasx_vext2xv_hu_bu(src0); \
138 src2 = __lasx_vext2xv_hu_bu(src2); \
/* Operand order (source first) differs from the sibling macros here. */ \
139 src0 = __lasx_xvdp2_w_h(src0, filter0); \
140 src1 = __lasx_xvdp2_w_h(src2, filter1); \
141 src0 = __lasx_xvhaddw_d_w(src0, src0); \
142 src1 = __lasx_xvhaddw_d_w(src1, src1); \
143 src0 = __lasx_xvpickev_w(src1, src0); \
144 src0 = __lasx_xvhaddw_d_w(src0, src0); \
145 src0 = __lasx_xvpickev_w(src0, src0); \
/* Scale down by _sh, clamp to vmax and reorder with shuf. */ \
146 src0 = __lasx_xvsrai_w(src0, _sh); \
147 src0 = __lasx_xvmin_w(src0, vmax); \
148 src0 = __lasx_xvperm_w(src0, shuf); \
/*
 * SCALE_8_2(_sh): 8-tap horizontal filter step covering 2 outputs.
 *
 * Reads one 8-byte source window at src + filterPos[0] and filterPos[1] and
 * one 32-byte coefficient vector from filter, reduces each half to a single
 * sum, shifts right by _sh, clamps to vmax and writes dst[0] and dst[1].
 *
 * Uses names from the expanding scope: src, filterPos, filter, dst, vmax,
 * src0, src1 and filter0.
 *
 * NOTE(review): the enclosing braces and any pointer-advance statements of
 * this macro are not visible in this view - confirm against the full file.
 */
151 #define SCALE_8_2(_sh) \
153 src0 = __lasx_xvldrepl_d(src + filterPos[0], 0); \
154 src1 = __lasx_xvldrepl_d(src + filterPos[1], 0); \
155 filter0 = __lasx_xvld(filter, 0); \
156 src0 = __lasx_xvilvl_d(src1, src0); \
157 src0 = __lasx_vext2xv_hu_bu(src0); \
158 src0 = __lasx_xvdp2_w_h(filter0, src0); \
159 src0 = __lasx_xvhaddw_d_w(src0, src0); \
160 src0 = __lasx_xvhaddw_q_d(src0, src0); \
161 src0 = __lasx_xvsrai_w(src0, _sh); \
162 src0 = __lasx_xvmin_w(src0, vmax); \
/* The two results are taken from 32-bit elements 0 and 4. */ \
163 dst[0] = __lasx_xvpickve2gr_w(src0, 0); \
164 dst[1] = __lasx_xvpickve2gr_w(src0, 4); \
/*
 * SCALE_4_16(_sh): 4-tap horizontal filter step covering 16 outputs.
 *
 * Reads one 4-byte source window at each of src + filterPos[0..15] and four
 * 32-byte coefficient vectors from filter (byte offsets 0..96), builds the
 * per-output dot products, shifts them right by _sh, clamps them from above
 * to vmax and stores one 32-byte vector to dst.
 *
 * Uses names from the expanding scope: src, filterPos, filter, dst, vmax,
 * shuf, src0..src15 and filter0..filter3.
 *
 * NOTE(review): the enclosing braces and any pointer-advance statements of
 * this macro are not visible in this view - confirm against the full file.
 */
170 #define SCALE_4_16(_sh) \
/* Broadcast-load the source window of each of the 16 outputs. */ \
172 src0 = __lasx_xvldrepl_w(src + filterPos[0], 0); \
173 src1 = __lasx_xvldrepl_w(src + filterPos[1], 0); \
174 src2 = __lasx_xvldrepl_w(src + filterPos[2], 0); \
175 src3 = __lasx_xvldrepl_w(src + filterPos[3], 0); \
176 src4 = __lasx_xvldrepl_w(src + filterPos[4], 0); \
177 src5 = __lasx_xvldrepl_w(src + filterPos[5], 0); \
178 src6 = __lasx_xvldrepl_w(src + filterPos[6], 0); \
179 src7 = __lasx_xvldrepl_w(src + filterPos[7], 0); \
180 src8 = __lasx_xvldrepl_w(src + filterPos[8], 0); \
181 src9 = __lasx_xvldrepl_w(src + filterPos[9], 0); \
182 src10 = __lasx_xvldrepl_w(src + filterPos[10], 0); \
183 src11 = __lasx_xvldrepl_w(src + filterPos[11], 0); \
184 src12 = __lasx_xvldrepl_w(src + filterPos[12], 0); \
185 src13 = __lasx_xvldrepl_w(src + filterPos[13], 0); \
186 src14 = __lasx_xvldrepl_w(src + filterPos[14], 0); \
187 src15 = __lasx_xvldrepl_w(src + filterPos[15], 0); \
188 DUP4_ARG2(__lasx_xvld, filter, 0, filter, 32, filter, 64, \
189 filter, 96, filter0, filter1, filter2, filter3); \
/* Gather four source windows per vector, then widen the bytes to 16 bit. */ \
190 DUP4_ARG2(__lasx_xvilvl_w, src1, src0, src3, src2, src5, \
191 src4, src7, src6, src0, src2, src4, src6); \
192 DUP4_ARG2(__lasx_xvilvl_w, src9, src8, src11, src10, src13, \
193 src12, src15, src14, src8, src10, src12, src14); \
194 DUP4_ARG2(__lasx_xvilvl_d, src2, src0, src6, src4, src10, \
195 src8, src14, src12, src0, src1, src2, src3); \
196 DUP4_ARG1(__lasx_vext2xv_hu_bu, src0, src1, src2, src3, \
197 src0, src1, src2, src3); \
/* Multiply by the coefficients and reduce the partial sums per output. */ \
198 DUP4_ARG2(__lasx_xvdp2_w_h, filter0, src0, filter1, src1, \
199 filter2, src2, filter3, src3, src0, src1, src2, src3); \
200 src0 = __lasx_xvhaddw_d_w(src0, src0); \
201 src1 = __lasx_xvhaddw_d_w(src1, src1); \
202 src2 = __lasx_xvhaddw_d_w(src2, src2); \
203 src3 = __lasx_xvhaddw_d_w(src3, src3); \
204 src0 = __lasx_xvpickev_w(src1, src0); \
205 src1 = __lasx_xvpickev_w(src3, src2); \
/* Scale down by _sh, clamp to vmax, narrow to 16 bit, reorder and store. */ \
206 src0 = __lasx_xvsrai_w(src0, _sh); \
207 src1 = __lasx_xvsrai_w(src1, _sh); \
208 src0 = __lasx_xvmin_w(src0, vmax); \
209 src1 = __lasx_xvmin_w(src1, vmax); \
210 src0 = __lasx_xvpickev_h(src1, src0); \
211 src0 = __lasx_xvperm_w(src0, shuf); \
212 __lasx_xvst(src0, dst, 0); \
/*
 * SCALE_4_8(_sh): 4-tap horizontal filter step covering 8 outputs.
 *
 * Reads one 4-byte source window at each of src + filterPos[0..7] and two
 * 32-byte coefficient vectors from filter (byte offsets 0 and 32). The
 * shifted (by _sh) and clamped (to vmax) 32-bit results are left in src0;
 * neither a permute nor a store is visible inside the macro.
 *
 * Uses names from the expanding scope: src, filterPos, filter, vmax,
 * src0..src7, filter0 and filter1.
 *
 * NOTE(review): the enclosing braces and any pointer-advance statements of
 * this macro are not visible in this view - confirm against the full file.
 */
218 #define SCALE_4_8(_sh) \
220 src0 = __lasx_xvldrepl_w(src + filterPos[0], 0); \
221 src1 = __lasx_xvldrepl_w(src + filterPos[1], 0); \
222 src2 = __lasx_xvldrepl_w(src + filterPos[2], 0); \
223 src3 = __lasx_xvldrepl_w(src + filterPos[3], 0); \
224 src4 = __lasx_xvldrepl_w(src + filterPos[4], 0); \
225 src5 = __lasx_xvldrepl_w(src + filterPos[5], 0); \
226 src6 = __lasx_xvldrepl_w(src + filterPos[6], 0); \
227 src7 = __lasx_xvldrepl_w(src + filterPos[7], 0); \
228 filter0 = __lasx_xvld(filter, 0); \
229 filter1 = __lasx_xvld(filter, 32); \
/* Gather four source windows per vector, then widen the bytes to 16 bit. */ \
232 DUP4_ARG2(__lasx_xvilvl_w, src1, src0, src3, src2, src5, \
233 src4, src7, src6, src0, src2, src4, src6); \
234 src0 = __lasx_xvilvl_d(src2, src0); \
235 src1 = __lasx_xvilvl_d(src6, src4); \
237 src0 = __lasx_vext2xv_hu_bu(src0); \
238 src1 = __lasx_vext2xv_hu_bu(src1); \
/* Multiply by the coefficients and reduce the partial sums per output. */ \
239 src0 = __lasx_xvdp2_w_h(filter0, src0); \
240 src1 = __lasx_xvdp2_w_h(filter1, src1); \
241 src0 = __lasx_xvhaddw_d_w(src0, src0); \
242 src1 = __lasx_xvhaddw_d_w(src1, src1); \
243 src0 = __lasx_xvpickev_w(src1, src0); \
/* Scale down by _sh and clamp from above to vmax. */ \
244 src0 = __lasx_xvsrai_w(src0, _sh); \
245 src0 = __lasx_xvmin_w(src0, vmax); \
/*
 * SCALE_4_4(_sh): 4-tap horizontal filter step covering 4 outputs.
 *
 * Reads one 4-byte source window at each of src + filterPos[0..3] and one
 * 32-byte coefficient vector from filter. The shifted (by _sh), clamped (to
 * vmax) and repacked 32-bit results are left in src0; no store is visible
 * inside the macro.
 *
 * Uses names from the expanding scope: src, filterPos, filter, vmax,
 * src0..src3 and filter0.
 *
 * NOTE(review): the enclosing braces and any pointer-advance statements of
 * this macro are not visible in this view - confirm against the full file.
 */
248 #define SCALE_4_4(_sh) \
250 src0 = __lasx_xvldrepl_w(src + filterPos[0], 0); \
251 src1 = __lasx_xvldrepl_w(src + filterPos[1], 0); \
252 src2 = __lasx_xvldrepl_w(src + filterPos[2], 0); \
253 src3 = __lasx_xvldrepl_w(src + filterPos[3], 0); \
254 filter0 = __lasx_xvld(filter, 0); \
/* Gather the four source windows into one vector and widen to 16 bit. */ \
257 src0 = __lasx_xvilvl_w(src1, src0); \
258 src1 = __lasx_xvilvl_w(src3, src2); \
260 src0 = __lasx_xvilvl_d(src1, src0); \
261 src0 = __lasx_vext2xv_hu_bu(src0); \
262 src0 = __lasx_xvdp2_w_h(filter0, src0); \
263 src0 = __lasx_xvhaddw_d_w(src0, src0); \
/* Scale down by _sh, clamp to vmax, then pack the results together. */ \
264 src0 = __lasx_xvsrai_w(src0, _sh); \
265 src0 = __lasx_xvmin_w(src0, vmax); \
266 src0 = __lasx_xvpickev_w(src0, src0); \
267 src0 = __lasx_xvpermi_d(src0, 0xd8); \
/*
 * SCALE_4_2(_sh): 4-tap horizontal filter step covering 2 outputs.
 *
 * Reads one 4-byte source window at src + filterPos[0] and filterPos[1] and
 * one 32-byte coefficient vector from filter, reduces each output to a
 * single sum, shifts right by _sh, clamps to vmax and writes dst[0] and
 * dst[1].
 *
 * Uses names from the expanding scope: src, filterPos, filter, dst, vmax,
 * src0, src1 and filter0.
 *
 * NOTE(review): the enclosing braces and any pointer-advance statements of
 * this macro are not visible in this view - confirm against the full file.
 */
270 #define SCALE_4_2(_sh) \
272 src0 = __lasx_xvldrepl_w(src + filterPos[0], 0); \
273 src1 = __lasx_xvldrepl_w(src + filterPos[1], 0); \
274 filter0 = __lasx_xvld(filter, 0); \
275 src0 = __lasx_xvilvl_w(src1, src0); \
276 src0 = __lasx_vext2xv_hu_bu(src0); \
277 src0 = __lasx_xvdp2_w_h(filter0, src0); \
278 src0 = __lasx_xvhaddw_d_w(src0, src0); \
279 src0 = __lasx_xvsrai_w(src0, _sh); \
280 src0 = __lasx_xvmin_w(src0, vmax); \
/* The two results are taken from 32-bit elements 0 and 2. */ \
281 dst[0] = __lasx_xvpickve2gr_w(src0, 0); \
282 dst[1] = __lasx_xvpickve2gr_w(src0, 2); \
/* NOTE(review): body of a helper macro whose #define line is not visible */ \
/* in this view. It accumulates one 8-tap slice for four outputs at once: */ \
/* 8 source bytes from srcPos1..srcPos4 at tap offset j and coefficients  */ \
/* from filterStart1..filterStart4 at offset dex are multiplied and the   */ \
/* four partial sums are added into out. Uses j, dex, zero, out, out0,    */ \
/* out1, src0..src3 and filter0..filter3 from the expanding scope; the    */ \
/* definition of dex is not visible here.                                 */ \
291 src0 = __lasx_xvldrepl_d((srcPos1 + j), 0); \
292 src1 = __lasx_xvldrepl_d((srcPos2 + j), 0); \
293 src2 = __lasx_xvldrepl_d((srcPos3 + j), 0); \
294 src3 = __lasx_xvldrepl_d((srcPos4 + j), 0); \
295 DUP4_ARG2(__lasx_xvldx, filterStart1, dex, filterStart2, dex, \
296 filterStart3, dex, filterStart4, dex, filter0, \
297 filter1, filter2, filter3); \
/* Combine the data of two outputs into each 256-bit vector. */ \
298 src0 = __lasx_xvpermi_q(src0, src1, 0x02); \
299 src1 = __lasx_xvpermi_q(src2, src3, 0x02); \
300 filter0 = __lasx_xvpermi_q(filter0, filter1, 0x02); \
301 filter1 = __lasx_xvpermi_q(filter2, filter3, 0x02); \
/* Widen the source bytes to 16 bit by interleaving with zero. */ \
302 src0 = __lasx_xvilvl_b(zero, src0); \
303 src1 = __lasx_xvilvl_b(zero, src1); \
304 out0 = __lasx_xvdp2_w_h(filter0, src0); \
305 out1 = __lasx_xvdp2_w_h(filter1, src1); \
306 src0 = __lasx_xvhaddw_d_w(out0, out0); \
307 src1 = __lasx_xvhaddw_d_w(out1, out1); \
308 out0 = __lasx_xvpackev_d(src1, src0); \
309 out1 = __lasx_xvpackod_d(src1, src0); \
310 out0 = __lasx_xvadd_w(out0, out1); \
/* Add this slice to the running per-output sums. */ \
311 out = __lasx_xvadd_w(out, out0); \
/*
 * Horizontal scaling of 8-bit source samples into int16_t outputs.
 *
 * Each output is the sum of filterSize products of source bytes, starting at
 * src + filterPos[i], with int16_t coefficients; the sum is shifted right by
 * 7 and clamped from above to (1 << 15) - 1.
 *
 * @param c           scaler context (not referenced in the visible lines)
 * @param dst         destination buffer of int16_t results
 * @param dstW        number of outputs to produce
 * @param src         8-bit source samples
 * @param filter      coefficients, filterSize consecutive values per output
 * @param filterPos   per-output start offset into src
 * @param filterSize  number of taps; 8, 4 and > 8 have dedicated paths, any
 *                    other value uses the scalar loop at the end
 *
 * NOTE(review): several short lines of this function are not visible in
 * this view (braces, the declarations of i, len, res, val and out, and
 * presumably the SCALE_* invocations); the comments below only describe what
 * is shown.
 */
314 void ff_hscale_8_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
315 const uint8_t *src, const int16_t *filter,
316 const int32_t *filterPos, int filterSize)
/* Upper clamp applied to every output. */
319 int max = (1 << 15) - 1;
/* 8-tap path. */
321 if (filterSize == 8) {
322 __m256i src0, src1, src2, src3, src4, src5, src6, src7;
323 __m256i src8, src9, src10, src11, src12, src13, src14, src15;
324 __m256i filter0, filter1, filter2, filter3;
325 __m256i filter4, filter5, filter6, filter7;
326 __m256i vmax = __lasx_xvreplgr2vr_w(max);
327 __m256i shuf = {0x0000000400000000, 0x0000000500000001,
328 0x0000000600000002, 0x0000000700000003};
/* Narrow the 32-bit results in src0 to 16 bit and store 8 outputs. */
336 src0 = __lasx_xvpickev_h(src0, src0);
337 __lasx_xvstelm_d(src0, dst, 0, 0);
338 __lasx_xvstelm_d(src0, dst, 8, 2);
/* Narrow the 32-bit results in src0 to 16 bit and store 4 outputs. */
343 src0 = __lasx_xvpickev_h(src0, src0);
344 __lasx_xvstelm_d(src0, dst, 0, 0);
/* Single output: one 8-tap dot product, reduced to a scalar. */
352 src0 = __lasx_xvldrepl_d(src + filterPos[0], 0);
353 filter0 = __lasx_xvld(filter, 0);
354 src0 = __lasx_vext2xv_hu_bu(src0);
355 src0 = __lasx_xvdp2_w_h(filter0, src0);
356 src0 = __lasx_xvhaddw_d_w(src0, src0);
357 src0 = __lasx_xvhaddw_q_d(src0, src0);
358 val = __lasx_xvpickve2gr_w(src0, 0);
359 dst[0] = FFMIN(val >> 7, max);
/* 4-tap path. */
361 } else if (filterSize == 4) {
362 __m256i src0, src1, src2, src3, src4, src5, src6, src7;
363 __m256i src8, src9, src10, src11, src12, src13, src14, src15;
364 __m256i filter0, filter1, filter2, filter3;
365 __m256i vmax = __lasx_xvreplgr2vr_w(max);
366 __m256i shuf = {0x0000000400000000, 0x0000000500000001,
367 0x0000000600000002, 0x0000000700000003};
/* Narrow to 16 bit, reorder with shuf and store 8 outputs. */
375 src0 = __lasx_xvpickev_h(src1, src0);
376 src0 = __lasx_xvperm_w(src0, shuf);
377 __lasx_xvstelm_d(src0, dst, 0, 0);
378 __lasx_xvstelm_d(src0, dst, 8, 1);
/* Narrow to 16 bit and store 4 outputs. */
383 src0 = __lasx_xvpickev_h(src0, src0);
384 __lasx_xvstelm_d(src0, dst, 0, 0);
/* Single output computed in scalar code. */
392 const uint8_t *srcPos = src + filterPos[0];
394 for (int j = 0; j < filterSize; j++) {
395 val += ((int)srcPos[j]) * filter[j];
397 dst[0] = FFMIN(val >> 7, max);
/* Long filters: vector code handles taps in steps of 8 while */
/* j < filterSize - 7, the remaining taps are added in scalar code. */
399 } else if (filterSize > 8) {
400 int filterlen = filterSize - 7;
403 __m256i zero = __lasx_xvldi(0);
406 __m256i src0, src1, src2, src3;
407 __m256i filter0, filter1, filter2, filter3, out0, out1;
/* Four outputs are processed together; each has its own source window */
/* and its own run of filterSize coefficients. */
409 const uint8_t *srcPos1 = src + filterPos[0];
410 const uint8_t *srcPos2 = src + filterPos[1];
411 const uint8_t *srcPos3 = src + filterPos[2];
412 const uint8_t *srcPos4 = src + filterPos[3];
413 const int16_t *filterStart1 = filter;
414 const int16_t *filterStart2 = filterStart1 + filterSize;
415 const int16_t *filterStart3 = filterStart2 + filterSize;
416 const int16_t *filterStart4 = filterStart3 + filterSize;
417 int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
419 for (j = 0; j < filterlen; j += 8) {
/* The sums of outputs 1..4 sit in 32-bit elements 0, 4, 2 and 6 of out. */
422 val1 = __lasx_xvpickve2gr_w(out, 0);
423 val2 = __lasx_xvpickve2gr_w(out, 4);
424 val3 = __lasx_xvpickve2gr_w(out, 2);
425 val4 = __lasx_xvpickve2gr_w(out, 6);
/* Remaining taps not covered by the vector loop. */
426 for (; j < filterSize; j++) {
427 val1 += ((int)srcPos1[j]) * filterStart1[j];
428 val2 += ((int)srcPos2[j]) * filterStart2[j];
429 val3 += ((int)srcPos3[j]) * filterStart3[j];
430 val4 += ((int)srcPos4[j]) * filterStart4[j];
432 dst[0] = FFMIN(val1 >> 7, max);
433 dst[1] = FFMIN(val2 >> 7, max);
434 dst[2] = FFMIN(val3 >> 7, max);
435 dst[3] = FFMIN(val4 >> 7, max);
/* Continue with the coefficients that follow the fourth output. */
438 filter = filterStart4 + filterSize;
/* Leftover outputs, one at a time. */
440 for(i = 0; i < res; i++) {
442 const uint8_t *srcPos = src + filterPos[i];
443 __m256i src1, filter0, out0;
445 for (j = 0; j < filterlen; j += 8) {
446 src1 = __lasx_xvldrepl_d((srcPos + j), 0);
447 filter0 = __lasx_xvld(filter + j, 0);
448 src1 = __lasx_xvilvl_b(zero, src1);
449 out0 = __lasx_xvdp2_w_h(filter0, src1);
450 out0 = __lasx_xvhaddw_d_w(out0, out0);
451 out0 = __lasx_xvhaddw_q_d(out0, out0);
452 val += __lasx_xvpickve2gr_w(out0, 0);
454 for (; j < filterSize; j++) {
455 val += ((int)srcPos[j]) * filter[j];
457 dst[i] = FFMIN(val >> 7, max);
458 filter += filterSize;
/* Scalar loop over all outputs (reached for the remaining filterSize */
/* values). */
461 for (i = 0; i < dstW; i++) {
463 const uint8_t *srcPos = src + filterPos[i];
465 for (int j = 0; j < filterSize; j++) {
466 val += ((int)srcPos[j]) * filter[j];
468 dst[i] = FFMIN(val >> 7, max);
469 filter += filterSize;
/*
 * Horizontal scaling of 8-bit source samples into int32_t outputs.
 *
 * Each output is the sum of filterSize products of source bytes, starting at
 * src + filterPos[i], with int16_t coefficients; the sum is shifted right by
 * 3 and clamped from above to (1 << 19) - 1.
 *
 * @param c           scaler context (not referenced in the visible lines)
 * @param _dst        destination buffer; declared int16_t * but written as
 *                    int32_t through the dst alias below
 * @param dstW        number of outputs to produce
 * @param src         8-bit source samples
 * @param filter      coefficients, filterSize consecutive values per output
 * @param filterPos   per-output start offset into src
 * @param filterSize  number of taps; 8, 4 and > 8 have dedicated paths, any
 *                    other value uses the scalar loop at the end
 *
 * NOTE(review): several short lines of this function are not visible in
 * this view (braces, the declarations of i, len, res, val and out, and
 * presumably the SCALE_* invocations); the comments below only describe what
 * is shown.
 */
474 void ff_hscale_8_to_19_lasx(SwsContext *c, int16_t *_dst, int dstW,
475 const uint8_t *src, const int16_t *filter,
476 const int32_t *filterPos, int filterSize)
/* Upper clamp applied to every output. */
479 int max = (1 << 19) - 1;
/* Outputs are 32 bit wide. */
480 int32_t *dst = (int32_t *) _dst;
/* 8-tap path. */
482 if (filterSize == 8) {
483 __m256i src0, src1, src2, src3, src4, src5, src6, src7;
484 __m256i filter0, filter1, filter2, filter3;
485 __m256i vmax = __lasx_xvreplgr2vr_w(max);
486 __m256i shuf = {0x0000000400000000, 0x0000000500000001,
487 0x0000000600000002, 0x0000000700000003};
/* Store a full vector of 32-bit results. */
492 __lasx_xvst(src0, dst, 0);
/* Store two 64-bit elements of src0 (four 32-bit results). */
497 __lasx_xvstelm_d(src0, dst, 0, 0);
498 __lasx_xvstelm_d(src0, dst, 8, 1);
/* Single output: one 8-tap dot product, reduced to a scalar. */
506 __m256i src0, filter0, out0;
508 src0 = __lasx_xvldrepl_d(src + filterPos[0], 0);
509 filter0 = __lasx_xvld(filter, 0);
510 src0 = __lasx_vext2xv_hu_bu(src0);
511 out0 = __lasx_xvdp2_w_h(filter0, src0);
512 out0 = __lasx_xvhaddw_d_w(out0, out0);
513 out0 = __lasx_xvhaddw_q_d(out0, out0);
514 val = __lasx_xvpickve2gr_w(out0, 0);
515 dst[0] = FFMIN(val >> 3, max);
/* 4-tap path; note that shuf differs from the one in the 8-tap path. */
517 } else if (filterSize == 4) {
518 __m256i src0, src1, src2, src3, src4, src5, src6, src7;
519 __m256i filter0, filter1;
520 __m256i vmax = __lasx_xvreplgr2vr_w(max);
521 __m256i shuf = {0x0000000100000000, 0x0000000500000004,
522 0x0000000300000002, 0x0000000700000006};
/* Reorder with shuf and store a full vector of 32-bit results. */
527 src0 = __lasx_xvperm_w(src0, shuf);
528 __lasx_xvst(src0, dst, 0);
/* Store two 64-bit elements of src0 (four 32-bit results). */
533 __lasx_xvstelm_d(src0, dst, 0, 0);
534 __lasx_xvstelm_d(src0, dst, 8, 1);
/* Single output computed in scalar code. */
542 const uint8_t *srcPos = src + filterPos[0];
544 for (int j = 0; j < filterSize; j++) {
545 val += ((int)srcPos[j]) * filter[j];
547 dst[0] = FFMIN(val >> 3, max);
/* Long filters: vector code handles taps in steps of 8 while */
/* j < filterSize - 7, the remaining taps are added in scalar code. */
549 } else if (filterSize > 8) {
552 int filterlen = filterSize - 7;
553 __m256i zero = __lasx_xvldi(0);
556 __m256i src0, src1, src2, src3;
557 __m256i filter0, filter1, filter2, filter3, out0, out1;
/* Four outputs are processed together; each has its own source window */
/* and its own run of filterSize coefficients. */
559 const uint8_t *srcPos1 = src + filterPos[0];
560 const uint8_t *srcPos2 = src + filterPos[1];
561 const uint8_t *srcPos3 = src + filterPos[2];
562 const uint8_t *srcPos4 = src + filterPos[3];
563 const int16_t *filterStart1 = filter;
564 const int16_t *filterStart2 = filterStart1 + filterSize;
565 const int16_t *filterStart3 = filterStart2 + filterSize;
566 const int16_t *filterStart4 = filterStart3 + filterSize;
567 int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
569 for (j = 0; j < filterlen; j += 8) {
/* The sums of outputs 1..4 sit in 32-bit elements 0, 4, 2 and 6 of out. */
572 val1 = __lasx_xvpickve2gr_w(out, 0);
573 val2 = __lasx_xvpickve2gr_w(out, 4);
574 val3 = __lasx_xvpickve2gr_w(out, 2);
575 val4 = __lasx_xvpickve2gr_w(out, 6);
/* Remaining taps not covered by the vector loop. */
576 for (; j < filterSize; j++) {
577 val1 += ((int)srcPos1[j]) * filterStart1[j];
578 val2 += ((int)srcPos2[j]) * filterStart2[j];
579 val3 += ((int)srcPos3[j]) * filterStart3[j];
580 val4 += ((int)srcPos4[j]) * filterStart4[j];
582 dst[0] = FFMIN(val1 >> 3, max);
583 dst[1] = FFMIN(val2 >> 3, max);
584 dst[2] = FFMIN(val3 >> 3, max);
585 dst[3] = FFMIN(val4 >> 3, max);
/* Continue with the coefficients that follow the fourth output. */
588 filter = filterStart4 + filterSize;
/* Leftover outputs, one at a time. */
590 for (i = 0; i < res; i++) {
592 const uint8_t *srcPos = src + filterPos[i];
593 __m256i src1, filter0, out0;
595 for (j = 0; j < filterlen; j += 8) {
596 src1 = __lasx_xvldrepl_d((srcPos + j), 0);
597 filter0 = __lasx_xvld(filter + j, 0);
598 src1 = __lasx_xvilvl_b(zero, src1);
599 out0 = __lasx_xvdp2_w_h(filter0, src1);
600 out0 = __lasx_xvhaddw_d_w(out0, out0);
601 out0 = __lasx_xvhaddw_q_d(out0, out0);
602 val += __lasx_xvpickve2gr_w(out0, 0);
604 for (; j < filterSize; j++) {
605 val += ((int)srcPos[j]) * filter[j];
607 dst[i] = FFMIN(val >> 3, max);
608 filter += filterSize;
/* Scalar loop over all outputs (reached for the remaining filterSize */
/* values). */
611 for (i = 0; i < dstW; i++) {
613 const uint8_t *srcPos = src + filterPos[i];
615 for (int j = 0; j < filterSize; j++) {
616 val += ((int)srcPos[j]) * filter[j];
618 dst[i] = FFMIN(val >> 3, max);
619 filter += filterSize;
/* NOTE(review): body of a helper macro whose #define line is not visible */ \
/* in this view. It computes four outputs of an 8-tap filter: source      */ \
/* vectors are loaded at src + filterPos[0..3], multiplied with two       */ \
/* 32-byte coefficient vectors, shifted right by shift, clamped to v_max  */ \
/* and written to dst[0..3]. Uses src, filterPos, filter, dst, shift and  */ \
/* v_max from the expanding scope. The tail of the first DUP4_ARG2 call   */ \
/* is also not visible here.                                              */ \
628 __m256i src0, src1, src2, src3, filter0, filter1, out0, out1; \
629 DUP4_ARG2(__lasx_xvld, src + filterPos[0], 0, src + filterPos[1], 0, \
630 src + filterPos[2], 0, src + filterPos[3], 0, src0, src1, src2,\
632 filter0 = __lasx_xvld(filter, 0); \
633 filter1 = __lasx_xvld(filter, 32); \
/* Combine the source data of two outputs into each 256-bit vector. */ \
634 src0 = __lasx_xvpermi_q(src0, src1, 0x02); \
635 src2 = __lasx_xvpermi_q(src2, src3, 0x02); \
/* Unsigned source times signed coefficient, accumulated to 32 bit. */ \
636 out0 = __lasx_xvdp2_w_hu_h(src0, filter0); \
637 out1 = __lasx_xvdp2_w_hu_h(src2, filter1); \
638 src0 = __lasx_xvhaddw_d_w(out0, out0); \
639 src1 = __lasx_xvhaddw_d_w(out1, out1); \
640 out0 = __lasx_xvpackev_d(src1, src0); \
641 out1 = __lasx_xvpackod_d(src1, src0); \
642 out0 = __lasx_xvadd_w(out0, out1); \
643 out0 = __lasx_xvsra_w(out0, shift); \
644 out0 = __lasx_xvmin_w(out0, v_max); \
/* Outputs 0..3 are taken from 32-bit elements 0, 4, 2 and 6. */ \
645 dst[0] = __lasx_xvpickve2gr_w(out0, 0); \
646 dst[1] = __lasx_xvpickve2gr_w(out0, 4); \
647 dst[2] = __lasx_xvpickve2gr_w(out0, 2); \
648 dst[3] = __lasx_xvpickve2gr_w(out0, 6); \
/* NOTE(review): body of a helper macro whose #define line is not visible */ \
/* in this view. It accumulates one slice of taps for four outputs at     */ \
/* once: source data from srcPos1..srcPos4 and coefficients from          */ \
/* filterStart1..filterStart4, both at offset dex, are multiplied and the */ \
/* four partial sums are added into out. Uses dex, out, out0, out1,       */ \
/* src0..src3 and filter0..filter3 from the expanding scope; the          */ \
/* definition of dex is not visible here.                                 */ \
657 DUP4_ARG2(__lasx_xvldx, srcPos1, dex, srcPos2, dex, srcPos3, dex, \
658 srcPos4, dex, src0, src1, src2, src3); \
659 DUP4_ARG2(__lasx_xvldx, filterStart1, dex, filterStart2, dex, \
660 filterStart3, dex, filterStart4, dex, filter0, \
661 filter1, filter2, filter3); \
/* Combine the data of two outputs into each 256-bit vector. */ \
662 src0 = __lasx_xvpermi_q(src0, src1, 0x02); \
663 src1 = __lasx_xvpermi_q(src2, src3, 0x02); \
664 filter0 = __lasx_xvpermi_q(filter0, filter1, 0x02); \
665 filter1 = __lasx_xvpermi_q(filter2, filter3, 0x02); \
/* Unsigned source times signed coefficient, accumulated to 32 bit. */ \
666 out0 = __lasx_xvdp2_w_hu_h(src0, filter0); \
667 out1 = __lasx_xvdp2_w_hu_h(src1, filter1); \
668 src0 = __lasx_xvhaddw_d_w(out0, out0); \
669 src1 = __lasx_xvhaddw_d_w(out1, out1); \
670 out0 = __lasx_xvpackev_d(src1, src0); \
671 out1 = __lasx_xvpackod_d(src1, src0); \
672 out0 = __lasx_xvadd_w(out0, out1); \
/* Add this slice to the running per-output sums. */ \
673 out = __lasx_xvadd_w(out, out0); \
/*
 * Horizontal scaling of 16-bit source samples into int16_t outputs.
 *
 * Each output is the sum of filterSize products of uint16_t source samples,
 * starting at src + filterPos[i], with int16_t coefficients; the sum is
 * shifted right by sh and clamped from above to (1 << 15) - 1.
 *
 * @param c           scaler context; c->srcFormat selects the shift amount
 * @param dst         destination buffer of int16_t results
 * @param dstW        number of outputs to produce
 * @param _src        source samples, read as uint16_t through the src alias
 * @param filter      coefficients, filterSize consecutive values per output
 * @param filterPos   per-output start offset into src (in samples)
 * @param filterSize  number of taps; 8, 4 and > 8 have dedicated paths, any
 *                    other value uses the scalar loop at the end
 *
 * Fix: the float-format test used the logical operator
 * (desc->flags && AV_PIX_FMT_FLAG_FLOAT), which is true for every format
 * with any flag set. It now tests the flag bit with the bitwise operator,
 * as ff_hscale_16_to_19_lasx does.
 *
 * NOTE(review): several short lines of this function are not visible in
 * this view (braces, the declarations of i, len, res, val, out and shift,
 * the bodies of the sh selection branches and presumably the SCALE_*
 * invocations); the comments below only describe what is shown.
 */
676 void ff_hscale_16_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
677 const uint8_t *_src, const int16_t *filter,
678 const int32_t *filterPos, int filterSize)
680 const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
682 const uint16_t *src = (const uint16_t *) _src;
/* Default shift derived from the bit depth of the first component. */
683 int sh = desc->comp[0].depth - 1;
/* Upper clamp applied to every output. */
684 int max = (1 << 15) - 1;
688 __m256i zero = __lasx_xvldi(0);
/* RGB and PAL8 input use a fixed shift of 13, other formats keep depth - 1. */
691 sh = isAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8 ? 13 :
692 (desc->comp[0].depth - 1);
/* Bitwise test of the FLOAT flag bit (was a logical && by mistake). */
693 } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) {
/* Vector copy of the shift amount for the SIMD paths. */
696 shift = __lasx_xvreplgr2vr_w(sh);
/* 8-tap path. */
698 if (filterSize == 8) {
699 __m256i v_max = __lasx_xvreplgr2vr_w(max);
700 for (i = 0; i < len; i++) {
/* Leftover outputs: one 8-tap dot product each, reduced to a scalar. */
703 for (i = 0; i < res; i++) {
705 __m256i src0, filter0, out0;
707 src0 = __lasx_xvld(src + filterPos[i], 0);
708 filter0 = __lasx_xvld(filter, 0);
709 out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
710 out0 = __lasx_xvhaddw_d_w(out0, out0);
711 out0 = __lasx_xvhaddw_q_d(out0, out0);
712 val = __lasx_xvpickve2gr_w(out0, 0);
713 dst[i] = FFMIN(val >> sh, max);
/* 4-tap path: four outputs per iteration. */
716 } else if (filterSize == 4) {
717 __m256i v_max = __lasx_xvreplgr2vr_w(max);
718 for (i = 0; i < len; i++) {
719 __m256i src1, src2, src3, src4, src0, filter0, out0;
721 src1 = __lasx_xvldrepl_d(src + filterPos[0], 0);
722 src2 = __lasx_xvldrepl_d(src + filterPos[1], 0);
723 src3 = __lasx_xvldrepl_d(src + filterPos[2], 0);
724 src4 = __lasx_xvldrepl_d(src + filterPos[3], 0);
725 filter0 = __lasx_xvld(filter, 0);
/* Gather the four source windows into one 256-bit vector. */
726 src1 = __lasx_xvextrins_d(src1, src2, 0x10);
727 src3 = __lasx_xvextrins_d(src3, src4, 0x10);
728 src0 = __lasx_xvpermi_q(src1, src3, 0x02);
729 out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
730 out0 = __lasx_xvhaddw_d_w(out0, out0);
731 out0 = __lasx_xvsra_w(out0, shift);
732 out0 = __lasx_xvmin_w(out0, v_max);
/* Outputs 0..3 are taken from 32-bit elements 0, 2, 4 and 6. */
733 dst[0] = __lasx_xvpickve2gr_w(out0, 0);
734 dst[1] = __lasx_xvpickve2gr_w(out0, 2);
735 dst[2] = __lasx_xvpickve2gr_w(out0, 4);
736 dst[3] = __lasx_xvpickve2gr_w(out0, 6);
/* Leftover outputs computed in scalar code. */
741 for (i = 0; i < res; i++) {
743 const uint16_t *srcPos = src + filterPos[i];
745 for (int j = 0; j < filterSize; j++) {
746 val += ((int)srcPos[j]) * filter[j];
748 dst[i] = FFMIN(val >> sh, max);
/* Long filters: vector code handles taps in steps of 8 while */
/* j < filterSize - 7, the remaining taps are added in scalar code. */
751 } else if (filterSize > 8) {
752 int filterlen = filterSize - 7;
754 for (i = 0; i < len; i++) {
755 __m256i src0, src1, src2, src3;
756 __m256i filter0, filter1, filter2, filter3, out0, out1;
/* Four outputs are processed together; each has its own source window */
/* and its own run of filterSize coefficients. */
758 const uint16_t *srcPos1 = src + filterPos[0];
759 const uint16_t *srcPos2 = src + filterPos[1];
760 const uint16_t *srcPos3 = src + filterPos[2];
761 const uint16_t *srcPos4 = src + filterPos[3];
762 const int16_t *filterStart1 = filter;
763 const int16_t *filterStart2 = filterStart1 + filterSize;
764 const int16_t *filterStart3 = filterStart2 + filterSize;
765 const int16_t *filterStart4 = filterStart3 + filterSize;
766 int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
768 for (j = 0; j < filterlen; j += 8) {
/* The sums of outputs 1..4 sit in 32-bit elements 0, 4, 2 and 6 of out. */
771 val1 = __lasx_xvpickve2gr_w(out, 0);
772 val2 = __lasx_xvpickve2gr_w(out, 4);
773 val3 = __lasx_xvpickve2gr_w(out, 2);
774 val4 = __lasx_xvpickve2gr_w(out, 6);
/* Remaining taps not covered by the vector loop. */
775 for (; j < filterSize; j++) {
776 val1 += ((int)srcPos1[j]) * filterStart1[j];
777 val2 += ((int)srcPos2[j]) * filterStart2[j];
778 val3 += ((int)srcPos3[j]) * filterStart3[j];
779 val4 += ((int)srcPos4[j]) * filterStart4[j];
781 dst[0] = FFMIN(val1 >> sh, max);
782 dst[1] = FFMIN(val2 >> sh, max);
783 dst[2] = FFMIN(val3 >> sh, max);
784 dst[3] = FFMIN(val4 >> sh, max);
/* Continue with the coefficients that follow the fourth output. */
787 filter = filterStart4 + filterSize;
/* Leftover outputs, one at a time. */
789 for (i = 0; i < res; i++) {
791 const uint16_t *srcPos = src + filterPos[i];
792 __m256i src0, filter0, out0;
794 for (j = 0; j < filterlen; j += 8) {
796 src0 = __lasx_xvldx(srcPos, dex);
797 filter0 = __lasx_xvldx(filter, dex);
798 out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
799 out0 = __lasx_xvhaddw_d_w(out0, out0);
800 out0 = __lasx_xvhaddw_q_d(out0, out0);
801 val += __lasx_xvpickve2gr_w(out0, 0);
803 for (; j < filterSize; j++) {
804 val += ((int)srcPos[j]) * filter[j];
806 dst[i] = FFMIN(val >> sh, max);
807 filter += filterSize;
/* Scalar loop over all outputs (reached for the remaining filterSize */
/* values). */
810 for (i = 0; i < dstW; i++) {
812 const uint16_t *srcPos = src + filterPos[i];
814 for (int j = 0; j < filterSize; j++) {
815 val += ((int)srcPos[j]) * filter[j];
817 dst[i] = FFMIN(val >> sh, max);
818 filter += filterSize;
/*
 * Horizontal scaling of 16-bit source samples into int32_t outputs.
 *
 * Each output is the sum of filterSize products of uint16_t source samples,
 * starting at src + filterPos[i], with int16_t coefficients; the sum is
 * shifted right by sh and clamped from above to (1 << 19) - 1.
 *
 * @param c           scaler context; c->srcFormat selects the shift amount
 * @param _dst        destination buffer; declared int16_t * but written as
 *                    int32_t through the dst alias below
 * @param dstW        number of outputs to produce
 * @param _src        source samples, read as uint16_t through the src alias
 * @param filter      coefficients, filterSize consecutive values per output
 * @param filterPos   per-output start offset into src (in samples)
 * @param filterSize  number of taps; 8, 4 and > 8 have dedicated paths, any
 *                    other value uses the scalar loop at the end
 *
 * NOTE(review): several short lines of this function are not visible in
 * this view (braces, the declarations of i, len, res, val, out and shift,
 * the bodies of the sh selection branches and presumably the SCALE_*
 * invocations); the comments below only describe what is shown.
 */
823 void ff_hscale_16_to_19_lasx(SwsContext *c, int16_t *_dst, int dstW,
824 const uint8_t *_src, const int16_t *filter,
825 const int32_t *filterPos, int filterSize)
827 const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
/* Outputs are 32 bit wide. */
829 int32_t *dst = (int32_t *) _dst;
830 const uint16_t *src = (const uint16_t *) _src;
/* Default shift derived from the bit depth of the first component. */
831 int sh = desc->comp[0].depth - 5;
/* Upper clamp applied to every output. */
832 int max = (1 << 19) - 1;
836 __m256i zero = __lasx_xvldi(0);
/* RGB/PAL8 input below 16 bit and float input get their own shift; the */
/* assignments themselves are not visible in this view. */
838 if ((isAnyRGB(c->srcFormat) || c->srcFormat == AV_PIX_FMT_PAL8)
839 && desc->comp[0].depth<16) {
841 } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) {
/* Vector copy of the shift amount for the SIMD paths. */
844 shift = __lasx_xvreplgr2vr_w(sh);
/* 8-tap path. */
846 if (filterSize == 8) {
847 __m256i v_max = __lasx_xvreplgr2vr_w(max);
848 for (i = 0; i < len; i++) {
/* Leftover outputs: one 8-tap dot product each, reduced to a scalar. */
851 for (i = 0; i < res; i++) {
853 __m256i src0, filter0, out0;
855 src0 = __lasx_xvld(src + filterPos[i], 0);
856 filter0 = __lasx_xvld(filter, 0);
857 out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
858 out0 = __lasx_xvhaddw_d_w(out0, out0);
859 out0 = __lasx_xvhaddw_q_d(out0, out0);
860 val = __lasx_xvpickve2gr_w(out0, 0);
861 dst[i] = FFMIN(val >> sh, max);
/* 4-tap path: four outputs per iteration. */
864 } else if (filterSize == 4) {
865 __m256i v_max = __lasx_xvreplgr2vr_w(max);
866 for (i = 0; i < len; i++) {
867 __m256i src1, src2, src3, src4, src0, filter0, out0;
869 src1 = __lasx_xvldrepl_d(src + filterPos[0], 0);
870 src2 = __lasx_xvldrepl_d(src + filterPos[1], 0);
871 src3 = __lasx_xvldrepl_d(src + filterPos[2], 0);
872 src4 = __lasx_xvldrepl_d(src + filterPos[3], 0);
873 filter0 = __lasx_xvld(filter, 0);
/* Gather the four source windows into one 256-bit vector. */
874 src1 = __lasx_xvextrins_d(src1, src2, 0x10);
875 src3 = __lasx_xvextrins_d(src3, src4, 0x10);
876 src0 = __lasx_xvpermi_q(src1, src3, 0x02);
877 out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
878 out0 = __lasx_xvhaddw_d_w(out0, out0);
879 out0 = __lasx_xvsra_w(out0, shift);
880 out0 = __lasx_xvmin_w(out0, v_max);
/* Outputs 0..3 are taken from 32-bit elements 0, 2, 4 and 6. */
881 dst[0] = __lasx_xvpickve2gr_w(out0, 0);
882 dst[1] = __lasx_xvpickve2gr_w(out0, 2);
883 dst[2] = __lasx_xvpickve2gr_w(out0, 4);
884 dst[3] = __lasx_xvpickve2gr_w(out0, 6);
/* Leftover outputs computed in scalar code. */
889 for (i = 0; i < res; i++) {
891 const uint16_t *srcPos = src + filterPos[i];
893 for (int j = 0; j < filterSize; j++) {
894 val += ((int)srcPos[j]) * filter[j];
896 dst[i] = FFMIN(val >> sh, max);
/* Long filters: vector code handles taps in steps of 8 while */
/* j < filterSize - 7, the remaining taps are added in scalar code. */
899 } else if (filterSize > 8) {
900 int filterlen = filterSize - 7;
902 for (i = 0; i < len; i ++) {
903 __m256i src0, src1, src2, src3;
904 __m256i filter0, filter1, filter2, filter3, out0, out1;
/* Four outputs are processed together; each has its own source window */
/* and its own run of filterSize coefficients. */
906 const uint16_t *srcPos1 = src + filterPos[0];
907 const uint16_t *srcPos2 = src + filterPos[1];
908 const uint16_t *srcPos3 = src + filterPos[2];
909 const uint16_t *srcPos4 = src + filterPos[3];
910 const int16_t *filterStart1 = filter;
911 const int16_t *filterStart2 = filterStart1 + filterSize;
912 const int16_t *filterStart3 = filterStart2 + filterSize;
913 const int16_t *filterStart4 = filterStart3 + filterSize;
914 int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
916 for (j = 0; j < filterlen; j += 8) {
/* The sums of outputs 1..4 sit in 32-bit elements 0, 4, 2 and 6 of out. */
919 val1 = __lasx_xvpickve2gr_w(out, 0);
920 val2 = __lasx_xvpickve2gr_w(out, 4);
921 val3 = __lasx_xvpickve2gr_w(out, 2);
922 val4 = __lasx_xvpickve2gr_w(out, 6);
/* Remaining taps not covered by the vector loop. */
923 for (; j < filterSize; j++) {
924 val1 += ((int)srcPos1[j]) * filterStart1[j];
925 val2 += ((int)srcPos2[j]) * filterStart2[j];
926 val3 += ((int)srcPos3[j]) * filterStart3[j];
927 val4 += ((int)srcPos4[j]) * filterStart4[j];
929 dst[0] = FFMIN(val1 >> sh, max);
930 dst[1] = FFMIN(val2 >> sh, max);
931 dst[2] = FFMIN(val3 >> sh, max);
932 dst[3] = FFMIN(val4 >> sh, max);
/* Continue with the coefficients that follow the fourth output. */
935 filter = filterStart4 + filterSize;
/* Leftover outputs, one at a time. */
937 for (i = 0; i < res; i++) {
939 const uint16_t *srcPos = src + filterPos[i];
940 __m256i src0, filter0, out0;
942 for (j = 0; j < filterlen; j += 8) {
944 src0 = __lasx_xvldx(srcPos, dex);
945 filter0 = __lasx_xvldx(filter, dex);
946 out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
947 out0 = __lasx_xvhaddw_d_w(out0, out0);
948 out0 = __lasx_xvhaddw_q_d(out0, out0);
949 val += __lasx_xvpickve2gr_w(out0, 0);
951 for (; j < filterSize; j++) {
952 val += ((int)srcPos[j]) * filter[j];
954 dst[i] = FFMIN(val >> sh, max);
955 filter += filterSize;
/* Scalar loop over all outputs (reached for the remaining filterSize */
/* values). */
958 for (i = 0; i < dstW; i++) {
960 const uint16_t *srcPos = src + filterPos[i];
962 for (int j = 0; j < filterSize; j++) {
963 val += ((int)srcPos[j]) * filter[j];
965 dst[i] = FFMIN(val >> sh, max);
966 filter += filterSize;