Imported Upstream version 6.1
[platform/upstream/ffmpeg.git] / libswscale / loongarch / swscale_lasx.c
1 /*
2  * Copyright (C) 2022 Loongson Technology Corporation Limited
3  * Contributed by Hao Chen(chenhao@loongson.cn)
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21
22 #include "swscale_loongarch.h"
23 #include "libavutil/loongarch/loongson_intrinsics.h"
24 #include "libavutil/intreadwrite.h"
25
/*
 * SCALE_8_16(_sh): produce 16 outputs for the filterSize == 8 case and store
 * them to dst.
 *
 * Names taken from the expanding scope: src, filterPos, filter, dst, the
 * vector temporaries src0..src15 and filter0..filter7, plus vmax and shuf.
 *
 * Visible steps: one replicated 64-bit load per output at
 * src + filterPos[0..15]; eight vector loads from filter at byte offsets
 * 0..224; source bytes widened to halfwords; dot products with the
 * coefficients reduced through horizontal adds and even-element picks;
 * arithmetic right shift by _sh; minimum with vmax; word permute through
 * shuf; narrowing to halfwords; a single vector store at dst.
 *
 * Side effects: filterPos += 16, filter += 128, dst += 16.
 */
26 #define SCALE_8_16(_sh)                                               \
27 {                                                                     \
28     src0    = __lasx_xvldrepl_d(src + filterPos[0], 0);               \
29     src1    = __lasx_xvldrepl_d(src + filterPos[1], 0);               \
30     src2    = __lasx_xvldrepl_d(src + filterPos[2], 0);               \
31     src3    = __lasx_xvldrepl_d(src + filterPos[3], 0);               \
32     src4    = __lasx_xvldrepl_d(src + filterPos[4], 0);               \
33     src5    = __lasx_xvldrepl_d(src + filterPos[5], 0);               \
34     src6    = __lasx_xvldrepl_d(src + filterPos[6], 0);               \
35     src7    = __lasx_xvldrepl_d(src + filterPos[7], 0);               \
36     src8    = __lasx_xvldrepl_d(src + filterPos[8], 0);               \
37     src9    = __lasx_xvldrepl_d(src + filterPos[9], 0);               \
38     src10   = __lasx_xvldrepl_d(src + filterPos[10], 0);              \
39     src11   = __lasx_xvldrepl_d(src + filterPos[11], 0);              \
40     src12   = __lasx_xvldrepl_d(src + filterPos[12], 0);              \
41     src13   = __lasx_xvldrepl_d(src + filterPos[13], 0);              \
42     src14   = __lasx_xvldrepl_d(src + filterPos[14], 0);              \
43     src15   = __lasx_xvldrepl_d(src + filterPos[15], 0);              \
44     DUP4_ARG2(__lasx_xvld, filter, 0, filter, 32, filter, 64,         \
45               filter, 96, filter0, filter1, filter2, filter3);        \
46     DUP4_ARG2(__lasx_xvld, filter, 128, filter, 160,                  \
47               filter, 192, filter, 224, filter4,                      \
48               filter5, filter6, filter7);                             \
49     DUP4_ARG2(__lasx_xvilvl_d, src1, src0, src3, src2,                \
50               src5, src4, src7, src6, src0, src2, src4, src6);        \
51     DUP4_ARG2(__lasx_xvilvl_d, src9, src8, src11, src10,              \
52               src13, src12, src15, src14, src8, src10, src12, src14); \
53     DUP4_ARG1(__lasx_vext2xv_hu_bu, src0, src2, src4, src6,           \
54               src0, src2, src4, src6);                                \
55     DUP4_ARG1(__lasx_vext2xv_hu_bu, src8, src10, src12,               \
56               src14, src8, src10, src12, src14);                      \
57     DUP4_ARG2(__lasx_xvdp2_w_h, filter0, src0, filter1, src2,         \
58               filter2, src4, filter3, src6, src0, src1, src2, src3);  \
59     DUP4_ARG2(__lasx_xvdp2_w_h, filter4, src8, filter5, src10,        \
60               filter6, src12, filter7, src14, src4, src5, src6, src7);\
61     src0 = __lasx_xvhaddw_d_w(src0, src0);                            \
62     src1 = __lasx_xvhaddw_d_w(src1, src1);                            \
63     src2 = __lasx_xvhaddw_d_w(src2, src2);                            \
64     src3 = __lasx_xvhaddw_d_w(src3, src3);                            \
65     src4 = __lasx_xvhaddw_d_w(src4, src4);                            \
66     src5 = __lasx_xvhaddw_d_w(src5, src5);                            \
67     src6 = __lasx_xvhaddw_d_w(src6, src6);                            \
68     src7 = __lasx_xvhaddw_d_w(src7, src7);                            \
69     DUP4_ARG2(__lasx_xvpickev_w, src1, src0, src3, src2,              \
70               src5, src4, src7, src6, src0, src1, src2, src3);        \
71     src0 = __lasx_xvhaddw_d_w(src0, src0);                            \
72     src1 = __lasx_xvhaddw_d_w(src1, src1);                            \
73     src2 = __lasx_xvhaddw_d_w(src2, src2);                            \
74     src3 = __lasx_xvhaddw_d_w(src3, src3);                            \
75     src0 = __lasx_xvpickev_w(src1, src0);                             \
76     src1 = __lasx_xvpickev_w(src3, src2);                             \
77     src0 = __lasx_xvsrai_w(src0, _sh);                                \
78     src1 = __lasx_xvsrai_w(src1, _sh);                                \
79     src0 = __lasx_xvmin_w(src0, vmax);                                \
80     src1 = __lasx_xvmin_w(src1, vmax);                                \
81     src0 = __lasx_xvperm_w(src0, shuf);                               \
82     src1 = __lasx_xvperm_w(src1, shuf);                               \
83     src0 = __lasx_xvpickev_h(src1, src0);                             \
84     src0 = __lasx_xvpermi_d(src0, 0xd8);                              \
85     __lasx_xvst(src0, dst, 0);                                        \
86     filterPos += 16;                                                  \
87     filter    += 128;                                                 \
88     dst       += 16;                                                  \
89 }
90
/*
 * SCALE_8_8(_sh): compute 8 outputs for the filterSize == 8 case and leave
 * them in src0 as 32-bit words; nothing is stored by the macro itself.
 *
 * Names taken from the expanding scope: src, filterPos, filter, the vector
 * temporaries src0..src7 and filter0..filter3, plus vmax and shuf.
 *
 * Visible steps: replicated 64-bit loads at src + filterPos[0..7]; four
 * vector loads from filter at byte offsets 0..96; widening, dot products and
 * horizontal reductions; arithmetic right shift by _sh; minimum with vmax;
 * final word permute through shuf.
 *
 * Side effects: filterPos += 8, filter += 64. dst is not touched, so the
 * user of the macro has to store src0 and advance dst.
 */
91 #define SCALE_8_8(_sh)                                                \
92 {                                                                     \
93     src0    = __lasx_xvldrepl_d(src + filterPos[0], 0);               \
94     src1    = __lasx_xvldrepl_d(src + filterPos[1], 0);               \
95     src2    = __lasx_xvldrepl_d(src + filterPos[2], 0);               \
96     src3    = __lasx_xvldrepl_d(src + filterPos[3], 0);               \
97     src4    = __lasx_xvldrepl_d(src + filterPos[4], 0);               \
98     src5    = __lasx_xvldrepl_d(src + filterPos[5], 0);               \
99     src6    = __lasx_xvldrepl_d(src + filterPos[6], 0);               \
100     src7    = __lasx_xvldrepl_d(src + filterPos[7], 0);               \
101     DUP4_ARG2(__lasx_xvld, filter, 0, filter, 32, filter, 64,         \
102               filter, 96, filter0, filter1, filter2, filter3);        \
103     filterPos += 8;                                                   \
104     filter    += 64;                                                  \
105     DUP4_ARG2(__lasx_xvilvl_d, src1, src0, src3, src2,                \
106               src5, src4, src7, src6, src0, src2, src4, src6);        \
107     DUP4_ARG1(__lasx_vext2xv_hu_bu, src0, src2, src4, src6,           \
108               src0, src2, src4, src6);                                \
109     DUP4_ARG2(__lasx_xvdp2_w_h, filter0, src0, filter1, src2,         \
110               filter2, src4, filter3, src6, src0, src1, src2,src3);   \
111     src0 = __lasx_xvhaddw_d_w(src0, src0);                            \
112     src1 = __lasx_xvhaddw_d_w(src1, src1);                            \
113     src2 = __lasx_xvhaddw_d_w(src2, src2);                            \
114     src3 = __lasx_xvhaddw_d_w(src3, src3);                            \
115     src0 = __lasx_xvpickev_w(src1, src0);                             \
116     src1 = __lasx_xvpickev_w(src3, src2);                             \
117     src0 = __lasx_xvhaddw_d_w(src0, src0);                            \
118     src1 = __lasx_xvhaddw_d_w(src1, src1);                            \
119     src0 = __lasx_xvpickev_w(src1, src0);                             \
120     src0 = __lasx_xvsrai_w(src0, _sh);                                \
121     src0 = __lasx_xvmin_w(src0, vmax);                                \
122     src0 = __lasx_xvperm_w(src0, shuf);                               \
123 }
124
/*
 * SCALE_8_4(_sh): compute 4 outputs for the filterSize == 8 case and leave
 * them in src0; nothing is stored by the macro itself.
 *
 * Names taken from the expanding scope: src, filterPos, filter, the vector
 * temporaries src0..src3, filter0 and filter1, plus vmax and shuf.
 *
 * Visible steps: replicated 64-bit loads at src + filterPos[0..3]; two
 * vector loads from filter at byte offsets 0 and 32; widening, dot products
 * and horizontal reductions; arithmetic right shift by _sh; minimum with
 * vmax; final word permute through shuf.
 *
 * Side effects: filterPos += 4, filter += 32. dst is not touched.
 *
 * NOTE(review): __lasx_xvdp2_w_h is called here as (src, filter) while the
 * sibling macros call it as (filter, src); presumably harmless for a
 * multiply-accumulate, but confirm against the intrinsic's definition.
 */
125 #define SCALE_8_4(_sh)                                                    \
126 {                                                                         \
127     src0    = __lasx_xvldrepl_d(src + filterPos[0], 0);                   \
128     src1    = __lasx_xvldrepl_d(src + filterPos[1], 0);                   \
129     src2    = __lasx_xvldrepl_d(src + filterPos[2], 0);                   \
130     src3    = __lasx_xvldrepl_d(src + filterPos[3], 0);                   \
131     filter0 = __lasx_xvld(filter, 0);                                     \
132     filter1 = __lasx_xvld(filter, 32);                                    \
133     filterPos += 4;                                                       \
134     filter    += 32;                                                      \
135     src0    = __lasx_xvilvl_d(src1, src0);                                \
136     src2    = __lasx_xvilvl_d(src3, src2);                                \
137     src0    = __lasx_vext2xv_hu_bu(src0);                                 \
138     src2    = __lasx_vext2xv_hu_bu(src2);                                 \
139     src0    = __lasx_xvdp2_w_h(src0, filter0);                            \
140     src1    = __lasx_xvdp2_w_h(src2, filter1);                            \
141     src0 = __lasx_xvhaddw_d_w(src0, src0);                                \
142     src1 = __lasx_xvhaddw_d_w(src1, src1);                                \
143     src0 = __lasx_xvpickev_w(src1, src0);                                 \
144     src0 = __lasx_xvhaddw_d_w(src0, src0);                                \
145     src0 = __lasx_xvpickev_w(src0, src0);                                 \
146     src0 = __lasx_xvsrai_w(src0, _sh);                                    \
147     src0 = __lasx_xvmin_w(src0, vmax);                                    \
148     src0 = __lasx_xvperm_w(src0, shuf);                                   \
149 }
150
/*
 * SCALE_8_2(_sh): compute 2 outputs for the filterSize == 8 case and write
 * them to dst[0] and dst[1].
 *
 * Names taken from the expanding scope: src, filterPos, filter, dst, the
 * vector temporaries src0, src1 and filter0, plus vmax.
 *
 * Visible steps: replicated 64-bit loads at src + filterPos[0..1]; one
 * vector load from filter; widening, dot product and two horizontal adds;
 * arithmetic right shift by _sh; minimum with vmax; the two results are read
 * from 32-bit elements 0 and 4 of src0.
 *
 * Side effects: filterPos += 2, filter += 16, dst += 2.
 */
151 #define SCALE_8_2(_sh)                                                \
152 {                                                                     \
153     src0    = __lasx_xvldrepl_d(src + filterPos[0], 0);               \
154     src1    = __lasx_xvldrepl_d(src + filterPos[1], 0);               \
155     filter0 = __lasx_xvld(filter, 0);                                 \
156     src0 = __lasx_xvilvl_d(src1, src0);                               \
157     src0 = __lasx_vext2xv_hu_bu(src0);                                \
158     src0 = __lasx_xvdp2_w_h(filter0, src0);                           \
159     src0 = __lasx_xvhaddw_d_w(src0, src0);                            \
160     src0 = __lasx_xvhaddw_q_d(src0, src0);                            \
161     src0 = __lasx_xvsrai_w(src0, _sh);                                \
162     src0 = __lasx_xvmin_w(src0, vmax);                                \
163     dst[0] = __lasx_xvpickve2gr_w(src0, 0);                           \
164     dst[1] = __lasx_xvpickve2gr_w(src0, 4);                           \
165     filterPos += 2;                                                   \
166     filter    += 16;                                                  \
167     dst       += 2;                                                   \
168 }
169
/*
 * SCALE_4_16(_sh): produce 16 outputs for the filterSize == 4 case and store
 * them to dst.
 *
 * Names taken from the expanding scope: src, filterPos, filter, dst, the
 * vector temporaries src0..src15 and filter0..filter3, plus vmax and shuf.
 *
 * Visible steps: one replicated 32-bit load per output at
 * src + filterPos[0..15]; four vector loads from filter at byte offsets
 * 0..96; word and doubleword interleaves gather the source bytes into four
 * vectors; widening to halfwords; dot products and one horizontal add;
 * arithmetic right shift by _sh; minimum with vmax; narrowing to halfwords;
 * word permute through shuf; a single vector store at dst.
 *
 * Side effects: filterPos += 16, filter += 64, dst += 16.
 */
170 #define SCALE_4_16(_sh)                                               \
171 {                                                                     \
172     src0    = __lasx_xvldrepl_w(src + filterPos[0], 0);               \
173     src1    = __lasx_xvldrepl_w(src + filterPos[1], 0);               \
174     src2    = __lasx_xvldrepl_w(src + filterPos[2], 0);               \
175     src3    = __lasx_xvldrepl_w(src + filterPos[3], 0);               \
176     src4    = __lasx_xvldrepl_w(src + filterPos[4], 0);               \
177     src5    = __lasx_xvldrepl_w(src + filterPos[5], 0);               \
178     src6    = __lasx_xvldrepl_w(src + filterPos[6], 0);               \
179     src7    = __lasx_xvldrepl_w(src + filterPos[7], 0);               \
180     src8    = __lasx_xvldrepl_w(src + filterPos[8], 0);               \
181     src9    = __lasx_xvldrepl_w(src + filterPos[9], 0);               \
182     src10   = __lasx_xvldrepl_w(src + filterPos[10], 0);              \
183     src11   = __lasx_xvldrepl_w(src + filterPos[11], 0);              \
184     src12   = __lasx_xvldrepl_w(src + filterPos[12], 0);              \
185     src13   = __lasx_xvldrepl_w(src + filterPos[13], 0);              \
186     src14   = __lasx_xvldrepl_w(src + filterPos[14], 0);              \
187     src15   = __lasx_xvldrepl_w(src + filterPos[15], 0);              \
188     DUP4_ARG2(__lasx_xvld, filter, 0, filter, 32, filter, 64,         \
189               filter, 96, filter0, filter1, filter2, filter3);        \
190     DUP4_ARG2(__lasx_xvilvl_w, src1, src0, src3, src2, src5,          \
191               src4, src7, src6, src0, src2, src4, src6);              \
192     DUP4_ARG2(__lasx_xvilvl_w, src9, src8, src11, src10, src13,       \
193               src12, src15, src14, src8, src10, src12, src14);        \
194     DUP4_ARG2(__lasx_xvilvl_d, src2, src0, src6, src4, src10,         \
195                    src8, src14, src12, src0, src1, src2, src3);       \
196     DUP4_ARG1(__lasx_vext2xv_hu_bu, src0, src1, src2, src3,           \
197               src0, src1, src2, src3);                                \
198     DUP4_ARG2(__lasx_xvdp2_w_h, filter0, src0, filter1, src1,         \
199               filter2, src2, filter3, src3, src0, src1, src2, src3);  \
200     src0 = __lasx_xvhaddw_d_w(src0, src0);                            \
201     src1 = __lasx_xvhaddw_d_w(src1, src1);                            \
202     src2 = __lasx_xvhaddw_d_w(src2, src2);                            \
203     src3 = __lasx_xvhaddw_d_w(src3, src3);                            \
204     src0 = __lasx_xvpickev_w(src1, src0);                             \
205     src1 = __lasx_xvpickev_w(src3, src2);                             \
206     src0 = __lasx_xvsrai_w(src0, _sh);                                \
207     src1 = __lasx_xvsrai_w(src1, _sh);                                \
208     src0 = __lasx_xvmin_w(src0, vmax);                                \
209     src1 = __lasx_xvmin_w(src1, vmax);                                \
210     src0 = __lasx_xvpickev_h(src1, src0);                             \
211     src0 = __lasx_xvperm_w(src0, shuf);                               \
212     __lasx_xvst(src0, dst, 0);                                        \
213     filterPos += 16;                                                  \
214     filter    += 64;                                                  \
215     dst       += 16;                                                  \
216 }
217
/*
 * SCALE_4_8(_sh): compute 8 outputs for the filterSize == 4 case and leave
 * them in src0 as 32-bit words; nothing is stored by the macro itself.
 *
 * Names taken from the expanding scope: src, filterPos, filter, the vector
 * temporaries src0..src7, filter0 and filter1, plus vmax.
 *
 * Visible steps: replicated 32-bit loads at src + filterPos[0..7]; two
 * vector loads from filter at byte offsets 0 and 32; word and doubleword
 * interleaves; widening; dot products and one horizontal add; arithmetic
 * right shift by _sh; minimum with vmax.
 *
 * Side effects: filterPos += 8, filter += 32. dst is not touched, and no
 * permute through shuf is applied here; the users visible in this file do
 * their own narrowing/permute before storing.
 */
218 #define SCALE_4_8(_sh)                                                    \
219 {                                                                         \
220     src0    = __lasx_xvldrepl_w(src + filterPos[0], 0);                   \
221     src1    = __lasx_xvldrepl_w(src + filterPos[1], 0);                   \
222     src2    = __lasx_xvldrepl_w(src + filterPos[2], 0);                   \
223     src3    = __lasx_xvldrepl_w(src + filterPos[3], 0);                   \
224     src4    = __lasx_xvldrepl_w(src + filterPos[4], 0);                   \
225     src5    = __lasx_xvldrepl_w(src + filterPos[5], 0);                   \
226     src6    = __lasx_xvldrepl_w(src + filterPos[6], 0);                   \
227     src7    = __lasx_xvldrepl_w(src + filterPos[7], 0);                   \
228     filter0 = __lasx_xvld(filter, 0);                                     \
229     filter1 = __lasx_xvld(filter, 32);                                    \
230     filterPos += 8;                                                       \
231     filter    += 32;                                                      \
232     DUP4_ARG2(__lasx_xvilvl_w, src1, src0, src3, src2, src5,              \
233               src4, src7, src6, src0, src2, src4, src6);                  \
234     src0 = __lasx_xvilvl_d(src2, src0);                                   \
235     src1 = __lasx_xvilvl_d(src6, src4);                                   \
236                                                                           \
237     src0 = __lasx_vext2xv_hu_bu(src0);                                    \
238     src1 = __lasx_vext2xv_hu_bu(src1);                                    \
239     src0 = __lasx_xvdp2_w_h(filter0, src0);                               \
240     src1 = __lasx_xvdp2_w_h(filter1, src1);                               \
241     src0 = __lasx_xvhaddw_d_w(src0, src0);                                \
242     src1 = __lasx_xvhaddw_d_w(src1, src1);                                \
243     src0 = __lasx_xvpickev_w(src1, src0);                                 \
244     src0 = __lasx_xvsrai_w(src0, _sh);                                    \
245     src0 = __lasx_xvmin_w(src0, vmax);                                    \
246 }
247
/*
 * SCALE_4_4(_sh): compute 4 outputs for the filterSize == 4 case and leave
 * them in src0; nothing is stored by the macro itself.
 *
 * Names taken from the expanding scope: src, filterPos, filter, the vector
 * temporaries src0..src3 and filter0, plus vmax.
 *
 * Visible steps: replicated 32-bit loads at src + filterPos[0..3]; one
 * vector load from filter; word and doubleword interleaves; widening; dot
 * product and one horizontal add; arithmetic right shift by _sh; minimum
 * with vmax; even-word pick followed by a doubleword permute (0xd8).
 *
 * Side effects: filterPos += 4, filter += 16. dst is not touched.
 */
248 #define SCALE_4_4(_sh)                                                \
249 {                                                                     \
250     src0    = __lasx_xvldrepl_w(src + filterPos[0], 0);               \
251     src1    = __lasx_xvldrepl_w(src + filterPos[1], 0);               \
252     src2    = __lasx_xvldrepl_w(src + filterPos[2], 0);               \
253     src3    = __lasx_xvldrepl_w(src + filterPos[3], 0);               \
254     filter0 = __lasx_xvld(filter, 0);                                 \
255     filterPos += 4;                                                   \
256     filter    += 16;                                                  \
257     src0    = __lasx_xvilvl_w(src1, src0);                            \
258     src1    = __lasx_xvilvl_w(src3, src2);                            \
259                                                                       \
260     src0 = __lasx_xvilvl_d(src1, src0);                               \
261     src0 = __lasx_vext2xv_hu_bu(src0);                                \
262     src0 = __lasx_xvdp2_w_h(filter0, src0);                           \
263     src0 = __lasx_xvhaddw_d_w(src0, src0);                            \
264     src0 = __lasx_xvsrai_w(src0, _sh);                                \
265     src0 = __lasx_xvmin_w(src0, vmax);                                \
266     src0 = __lasx_xvpickev_w(src0, src0);                             \
267     src0 = __lasx_xvpermi_d(src0, 0xd8);                              \
268 }
269
/*
 * SCALE_4_2(_sh): compute 2 outputs for the filterSize == 4 case and write
 * them to dst[0] and dst[1].
 *
 * Names taken from the expanding scope: src, filterPos, filter, dst, the
 * vector temporaries src0, src1 and filter0, plus vmax.
 *
 * Visible steps: replicated 32-bit loads at src + filterPos[0..1]; one
 * vector load from filter; word interleave; widening; dot product and one
 * horizontal add; arithmetic right shift by _sh; minimum with vmax; the two
 * results are read from 32-bit elements 0 and 2 of src0.
 *
 * Side effects: filterPos += 2, filter += 8, dst += 2.
 */
270 #define SCALE_4_2(_sh)                                                \
271 {                                                                     \
272     src0    = __lasx_xvldrepl_w(src + filterPos[0], 0);               \
273     src1    = __lasx_xvldrepl_w(src + filterPos[1], 0);               \
274     filter0 = __lasx_xvld(filter, 0);                                 \
275     src0 = __lasx_xvilvl_w(src1, src0);                               \
276     src0 = __lasx_vext2xv_hu_bu(src0);                                \
277     src0 = __lasx_xvdp2_w_h(filter0, src0);                           \
278     src0 = __lasx_xvhaddw_d_w(src0, src0);                            \
279     src0 = __lasx_xvsrai_w(src0, _sh);                                \
280     src0 = __lasx_xvmin_w(src0, vmax);                                \
281     dst[0] = __lasx_xvpickve2gr_w(src0, 0);                           \
282     dst[1] = __lasx_xvpickve2gr_w(src0, 2);                           \
283     filterPos += 2;                                                   \
284     filter    += 8;                                                   \
285     dst       += 2;                                                   \
286 }
287
/*
 * SCALE_16: one accumulation step of the filterSize > 8 path. It handles the
 * taps starting at index j for four outputs at once and adds the partial
 * sums into the vector accumulator `out`.
 *
 * Names taken from the expanding scope: j, srcPos1..srcPos4,
 * filterStart1..filterStart4, zero, out, out0, out1, src0..src3 and
 * filter0..filter3. The macro takes no arguments and is written as a braced
 * block, so it is used as a statement without a trailing semicolon.
 *
 * dex = j << 1 is passed as the index of the indexed filter loads;
 * presumably this is the byte offset of 16-bit coefficient j - confirm
 * against the __lasx_xvldx definition.
 *
 * The users visible in this file read the four accumulated sums from 32-bit
 * elements 0, 4, 2 and 6 of `out` (outputs 1..4 respectively).
 */
288 #define SCALE_16                                                      \
289 {                                                                     \
290     int dex  = j << 1;                                                \
291     src0     = __lasx_xvldrepl_d((srcPos1 + j), 0);                   \
292     src1     = __lasx_xvldrepl_d((srcPos2 + j), 0);                   \
293     src2     = __lasx_xvldrepl_d((srcPos3 + j), 0);                   \
294     src3     = __lasx_xvldrepl_d((srcPos4 + j), 0);                   \
295     DUP4_ARG2(__lasx_xvldx, filterStart1, dex, filterStart2, dex,     \
296               filterStart3, dex, filterStart4, dex, filter0,          \
297               filter1, filter2, filter3);                             \
298     src0     = __lasx_xvpermi_q(src0, src1, 0x02);                    \
299     src1     = __lasx_xvpermi_q(src2, src3, 0x02);                    \
300     filter0  = __lasx_xvpermi_q(filter0, filter1, 0x02);              \
301     filter1  = __lasx_xvpermi_q(filter2, filter3, 0x02);              \
302     src0     = __lasx_xvilvl_b(zero, src0);                           \
303     src1     = __lasx_xvilvl_b(zero, src1);                           \
304     out0     = __lasx_xvdp2_w_h(filter0, src0);                       \
305     out1     = __lasx_xvdp2_w_h(filter1, src1);                       \
306     src0     = __lasx_xvhaddw_d_w(out0, out0);                        \
307     src1     = __lasx_xvhaddw_d_w(out1, out1);                        \
308     out0     = __lasx_xvpackev_d(src1, src0);                         \
309     out1     = __lasx_xvpackod_d(src1, src0);                         \
310     out0     = __lasx_xvadd_w(out0, out1);                            \
311     out      = __lasx_xvadd_w(out, out0);                             \
312 }
313
/*
 * Filter 8-bit source samples into int16_t outputs clamped to (1 << 15) - 1.
 *
 * The scalar fallback at the end of the function spells out the computation
 * for every output i in [0, dstW):
 *
 *     val    = sum over j in [0, filterSize) of
 *              src[filterPos[i] + j] * filter[i * filterSize + j]
 *     dst[i] = FFMIN(val >> 7, (1 << 15) - 1)
 *
 * The filterSize == 8, filterSize == 4 and filterSize > 8 branches use LASX
 * intrinsics (through the SCALE_* macros defined above) and are presumably
 * meant to produce the same values - NOTE(review): equivalence not verified
 * here.
 *
 * c           not referenced in the body
 * dst         output samples, dstW entries
 * dstW        number of outputs to produce
 * src         8-bit input samples
 * filter      coefficients, filterSize consecutive entries per output
 * filterPos   per-output start index into src
 * filterSize  number of taps per output; selects the branch taken
 *
 * NOTE(review): in the single-output tail of the filterSize == 8 branch a
 * full vector is loaded from filter although only 8 coefficients belong to
 * that output; presumably the buffer is padded by the caller - confirm.
 */
314 void ff_hscale_8_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
315                             const uint8_t *src, const int16_t *filter,
316                             const int32_t *filterPos, int filterSize)
317 {
318     int i;
    /* Upper clamp applied to every output. */
319     int max = (1 << 15) - 1;
320
321     if (filterSize == 8) {
322         __m256i src0, src1, src2, src3, src4, src5, src6, src7;
323         __m256i src8, src9, src10, src11, src12, src13, src14, src15;
324         __m256i filter0, filter1, filter2, filter3;
325         __m256i filter4, filter5, filter6, filter7;
326         __m256i vmax = __lasx_xvreplgr2vr_w(max);
327         __m256i shuf = {0x0000000400000000, 0x0000000500000001,
328                         0x0000000600000002, 0x0000000700000003};
        /* 16 outputs per iteration; the bits of res then select the 8-, 4-,
         * 2- and 1-output tails below. */
329         int len = dstW >> 4;
330         int res = dstW & 15;
331         while (len--) {
332             SCALE_8_16(7);
333         }
334         if (res & 8) {
            /* SCALE_8_8 leaves 32-bit results in src0; narrow to halfwords
             * and store two doublewords (byte offsets 0 and 8). */
335             SCALE_8_8(7);
336             src0 = __lasx_xvpickev_h(src0, src0);
337             __lasx_xvstelm_d(src0, dst, 0, 0);
338             __lasx_xvstelm_d(src0, dst, 8, 2);
339             dst       += 8;
340         }
341         if (res & 4) {
342             SCALE_8_4(7);
343             src0 = __lasx_xvpickev_h(src0, src0);
344             __lasx_xvstelm_d(src0, dst, 0, 0);
345             dst       += 4;
346         }
347         if (res & 2) {
            /* SCALE_8_2 writes dst[0..1] and advances dst itself. */
348             SCALE_8_2(7);
349         }
350         if (res & 1) {
            /* Last output: vector dot product, then shift and clamp in
             * scalar code. */
351             int val = 0;
352             src0    = __lasx_xvldrepl_d(src + filterPos[0], 0);
353             filter0 = __lasx_xvld(filter, 0);
354             src0 = __lasx_vext2xv_hu_bu(src0);
355             src0 = __lasx_xvdp2_w_h(filter0, src0);
356             src0    = __lasx_xvhaddw_d_w(src0, src0);
357             src0    = __lasx_xvhaddw_q_d(src0, src0);
358             val     = __lasx_xvpickve2gr_w(src0, 0);
359             dst[0]  = FFMIN(val >> 7, max);
360         }
361     } else if (filterSize == 4) {
362         __m256i src0, src1, src2, src3, src4, src5, src6, src7;
363         __m256i src8, src9, src10, src11, src12, src13, src14, src15;
364         __m256i filter0, filter1, filter2, filter3;
365         __m256i vmax = __lasx_xvreplgr2vr_w(max);
366         __m256i shuf = {0x0000000400000000, 0x0000000500000001,
367                         0x0000000600000002, 0x0000000700000003};
        /* Same 16/8/4/2/1 split as the 8-tap branch, using the 4-tap
         * macros. */
368         int len = dstW >> 4;
369         int res = dstW & 15;
370         while (len--) {
371             SCALE_4_16(7);
372         }
373         if (res & 8) {
            /* SCALE_4_8 does not permute; narrow, permute through shuf and
             * store two doublewords here. */
374             SCALE_4_8(7);
375             src0 = __lasx_xvpickev_h(src1, src0);
376             src0 = __lasx_xvperm_w(src0, shuf);
377             __lasx_xvstelm_d(src0, dst, 0, 0);
378             __lasx_xvstelm_d(src0, dst, 8, 1);
379             dst       += 8;
380         }
381         if (res & 4) {
382             SCALE_4_4(7);
383             src0 = __lasx_xvpickev_h(src0, src0);
384             __lasx_xvstelm_d(src0, dst, 0, 0);
385             dst       += 4;
386         }
387         if (res & 2) {
388             SCALE_4_2(7);
389         }
390         if (res & 1) {
            /* Last output is computed entirely in scalar code. */
391             int val = 0;
392             const uint8_t *srcPos = src + filterPos[0];
393
394             for (int j = 0; j < filterSize; j++) {
395                 val += ((int)srcPos[j]) * filter[j];
396             }
397             dst[0] = FFMIN(val >> 7, max);
398         }
399     } else if (filterSize > 8) {
        /* j < filterlen is equivalent to j + 8 <= filterSize, so the vector
         * loops below only start a step when 8 whole taps remain. */
400         int filterlen = filterSize - 7;
401         int len = dstW >> 2;
402         int res = dstW & 3;
403         __m256i zero = __lasx_xvldi(0);
404
        /* Four outputs per iteration. */
405         while (len--) {
406             __m256i src0, src1, src2, src3;
407             __m256i filter0, filter1, filter2, filter3, out0, out1;
408             __m256i out = zero;
409             const uint8_t *srcPos1 = src + filterPos[0];
410             const uint8_t *srcPos2 = src + filterPos[1];
411             const uint8_t *srcPos3 = src + filterPos[2];
412             const uint8_t *srcPos4 = src + filterPos[3];
413             const int16_t *filterStart1 = filter;
414             const int16_t *filterStart2 = filterStart1 + filterSize;
415             const int16_t *filterStart3 = filterStart2 + filterSize;
416             const int16_t *filterStart4 = filterStart3 + filterSize;
417             int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
418
419             for (j = 0; j < filterlen; j += 8) {
420                 SCALE_16
421             }
            /* Pull the four partial sums out of the accumulator; outputs
             * 1..4 sit in word elements 0, 4, 2 and 6. */
422             val1 = __lasx_xvpickve2gr_w(out, 0);
423             val2 = __lasx_xvpickve2gr_w(out, 4);
424             val3 = __lasx_xvpickve2gr_w(out, 2);
425             val4 = __lasx_xvpickve2gr_w(out, 6);
            /* Scalar loop finishes the taps the vector loop did not cover. */
426             for (; j < filterSize; j++) {
427                 val1 += ((int)srcPos1[j]) * filterStart1[j];
428                 val2 += ((int)srcPos2[j]) * filterStart2[j];
429                 val3 += ((int)srcPos3[j]) * filterStart3[j];
430                 val4 += ((int)srcPos4[j]) * filterStart4[j];
431             }
432             dst[0] = FFMIN(val1 >> 7, max);
433             dst[1] = FFMIN(val2 >> 7, max);
434             dst[2] = FFMIN(val3 >> 7, max);
435             dst[3] = FFMIN(val4 >> 7, max);
436             dst       += 4;
437             filterPos += 4;
438             filter     = filterStart4 + filterSize;
439         }
        /* Remaining 0..3 outputs, one at a time; dst, filterPos and filter
         * already point past the outputs handled above. */
440         for(i = 0; i < res; i++) {
441             int j, val = 0;
442             const uint8_t *srcPos = src + filterPos[i];
443             __m256i src1, filter0, out0;
444
445             for (j = 0; j < filterlen; j += 8) {
446                 src1   = __lasx_xvldrepl_d((srcPos + j), 0);
447                 filter0 = __lasx_xvld(filter + j, 0);
448                 src1 = __lasx_xvilvl_b(zero, src1);
449                 out0 = __lasx_xvdp2_w_h(filter0, src1);
450                 out0 = __lasx_xvhaddw_d_w(out0, out0);
451                 out0 = __lasx_xvhaddw_q_d(out0, out0);
452                 val += __lasx_xvpickve2gr_w(out0, 0);
453             }
454             for (; j < filterSize; j++) {
455                 val += ((int)srcPos[j]) * filter[j];
456             }
457             dst[i]  = FFMIN(val >> 7, max);
458             filter += filterSize;
459         }
460     } else {
        /* Scalar fallback for every filterSize not handled above. */
461         for (i = 0; i < dstW; i++) {
462             int val = 0;
463             const uint8_t *srcPos = src + filterPos[i];
464
465             for (int j = 0; j < filterSize; j++) {
466                 val += ((int)srcPos[j]) * filter[j];
467             }
468             dst[i]  = FFMIN(val >> 7, max);
469             filter += filterSize;
470         }
471     }
472 }
473
474 void ff_hscale_8_to_19_lasx(SwsContext *c, int16_t *_dst, int dstW,
475                             const uint8_t *src, const int16_t *filter,
476                             const int32_t *filterPos, int filterSize)
477 {
478     int i;
479     int max = (1 << 19) - 1;
480     int32_t *dst = (int32_t *) _dst;
481
482     if (filterSize == 8) {
483         __m256i src0, src1, src2, src3, src4, src5, src6, src7;
484         __m256i filter0, filter1, filter2, filter3;
485         __m256i vmax = __lasx_xvreplgr2vr_w(max);
486         __m256i shuf = {0x0000000400000000, 0x0000000500000001,
487                         0x0000000600000002, 0x0000000700000003};
488         int len = dstW >> 3;
489         int res = dstW & 7;
490         while (len--) {
491             SCALE_8_8(3);
492             __lasx_xvst(src0, dst, 0);
493             dst       += 8;
494         }
495         if (res & 4) {
496             SCALE_8_4(3);
497             __lasx_xvstelm_d(src0, dst, 0, 0);
498             __lasx_xvstelm_d(src0, dst, 8, 1);
499             dst       += 4;
500         }
501         if (res & 2) {
502             SCALE_8_2(3);
503         }
504         if (res & 1) {
505             int val = 0;
506             __m256i src0, filter0, out0;
507
508             src0    = __lasx_xvldrepl_d(src + filterPos[0], 0);
509             filter0 = __lasx_xvld(filter, 0);
510             src0 = __lasx_vext2xv_hu_bu(src0);
511             out0 = __lasx_xvdp2_w_h(filter0, src0);
512             out0    = __lasx_xvhaddw_d_w(out0, out0);
513             out0    = __lasx_xvhaddw_q_d(out0, out0);
514             val     = __lasx_xvpickve2gr_w(out0, 0);
515             dst[0]  = FFMIN(val >> 3, max);
516         }
517     } else if (filterSize == 4) {
518         __m256i src0, src1, src2, src3, src4, src5, src6, src7;
519         __m256i filter0, filter1;
520         __m256i vmax = __lasx_xvreplgr2vr_w(max);
521         __m256i shuf = {0x0000000100000000, 0x0000000500000004,
522                         0x0000000300000002, 0x0000000700000006};
523         int len = dstW >> 3;
524         int res = dstW & 7;
525         while (len--) {
526             SCALE_4_8(3);
527             src0 = __lasx_xvperm_w(src0, shuf);
528             __lasx_xvst(src0, dst, 0);
529             dst       += 8;
530         }
531         if (res & 4) {
532             SCALE_4_4(3);
533             __lasx_xvstelm_d(src0, dst, 0, 0);
534             __lasx_xvstelm_d(src0, dst, 8, 1);
535             dst       += 4;
536         }
537         if (res & 2) {
538             SCALE_4_2(3);
539         }
540         if (res & 1) {
541             int val = 0;
542             const uint8_t *srcPos = src + filterPos[0];
543
544             for (int j = 0; j < filterSize; j++) {
545                 val += ((int)srcPos[j]) * filter[j];
546             }
547             dst[0] = FFMIN(val >> 3, max);
548         }
549     } else if (filterSize > 8) {
550         int len = dstW >> 2;
551         int res = dstW & 3;
552         int filterlen = filterSize - 7;
553         __m256i zero = __lasx_xvldi(0);
554
555         while (len--) {
556             __m256i src0, src1, src2, src3;
557             __m256i filter0, filter1, filter2, filter3, out0, out1;
558             __m256i out = zero;
559             const uint8_t *srcPos1 = src + filterPos[0];
560             const uint8_t *srcPos2 = src + filterPos[1];
561             const uint8_t *srcPos3 = src + filterPos[2];
562             const uint8_t *srcPos4 = src + filterPos[3];
563             const int16_t *filterStart1 = filter;
564             const int16_t *filterStart2 = filterStart1 + filterSize;
565             const int16_t *filterStart3 = filterStart2 + filterSize;
566             const int16_t *filterStart4 = filterStart3 + filterSize;
567             int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
568
569             for (j = 0; j < filterlen; j += 8) {
570                 SCALE_16
571             }
572             val1 = __lasx_xvpickve2gr_w(out, 0);
573             val2 = __lasx_xvpickve2gr_w(out, 4);
574             val3 = __lasx_xvpickve2gr_w(out, 2);
575             val4 = __lasx_xvpickve2gr_w(out, 6);
576             for (; j < filterSize; j++) {
577                 val1 += ((int)srcPos1[j]) * filterStart1[j];
578                 val2 += ((int)srcPos2[j]) * filterStart2[j];
579                 val3 += ((int)srcPos3[j]) * filterStart3[j];
580                 val4 += ((int)srcPos4[j]) * filterStart4[j];
581             }
582             dst[0] = FFMIN(val1 >> 3, max);
583             dst[1] = FFMIN(val2 >> 3, max);
584             dst[2] = FFMIN(val3 >> 3, max);
585             dst[3] = FFMIN(val4 >> 3, max);
586             dst       += 4;
587             filterPos += 4;
588             filter     = filterStart4 + filterSize;
589         }
590         for (i = 0; i < res; i++) {
591             int j, val = 0;
592             const uint8_t *srcPos = src + filterPos[i];
593             __m256i src1, filter0, out0;
594
595             for (j = 0; j < filterlen; j += 8) {
596                 src1   = __lasx_xvldrepl_d((srcPos + j), 0);
597                 filter0 = __lasx_xvld(filter + j, 0);
598                 src1 = __lasx_xvilvl_b(zero, src1);
599                 out0 = __lasx_xvdp2_w_h(filter0, src1);
600                 out0 = __lasx_xvhaddw_d_w(out0, out0);
601                 out0 = __lasx_xvhaddw_q_d(out0, out0);
602                 val += __lasx_xvpickve2gr_w(out0, 0);
603             }
604             for (; j < filterSize; j++) {
605                 val += ((int)srcPos[j]) * filter[j];
606             }
607             dst[i] = FFMIN(val >> 3, max);
608             filter += filterSize;
609         }
610     } else {
611         for (i = 0; i < dstW; i++) {
612             int val = 0;
613             const uint8_t *srcPos = src + filterPos[i];
614
615             for (int j = 0; j < filterSize; j++) {
616                 val += ((int)srcPos[j]) * filter[j];
617             }
618             dst[i]  = FFMIN(val >> 3, max);
619             filter += filterSize;
620         }
621     }
622 }
623
624 #undef SCALE_16
625
/*
 * Produce four consecutive output samples for the 8-tap, 16-bit-input case.
 *
 * Names taken from the expansion site:
 *   src       - source samples (16-bit elements)
 *   filterPos - start index into src for each output sample
 *   filter    - coefficients, 8 per output sample (32 are consumed here)
 *   dst       - output pointer, four elements are written
 *   shift     - vector holding the right-shift count in every 32-bit element
 *   v_max     - vector holding the upper clamp in every 32-bit element
 *
 * On exit dst and filterPos have advanced by 4 and filter by 32.
 */
#define SCALE_8                                                              \
{                                                                            \
    __m256i pix0, pix1, pix2, pix3, coef0, coef1;                            \
    __m256i dot0, dot1, pair0, pair1, sum_ev, sum_od, quad;                  \
                                                                             \
    /* one vector load per output sample, then the coefficient vectors */    \
    pix0   = __lasx_xvld(src + filterPos[0], 0);                             \
    pix1   = __lasx_xvld(src + filterPos[1], 0);                             \
    pix2   = __lasx_xvld(src + filterPos[2], 0);                             \
    pix3   = __lasx_xvld(src + filterPos[3], 0);                             \
    coef0  = __lasx_xvld(filter, 0);                                         \
    coef1  = __lasx_xvld(filter, 32);                                        \
                                                                             \
    /* merge two sample windows per register to line up with coef0/coef1 */  \
    pix0   = __lasx_xvpermi_q(pix0, pix1, 0x02);                             \
    pix2   = __lasx_xvpermi_q(pix2, pix3, 0x02);                             \
                                                                             \
    /* unsigned sample * signed coefficient, then horizontal reduction */    \
    dot0   = __lasx_xvdp2_w_hu_h(pix0, coef0);                               \
    dot1   = __lasx_xvdp2_w_hu_h(pix2, coef1);                               \
    pair0  = __lasx_xvhaddw_d_w(dot0, dot0);                                 \
    pair1  = __lasx_xvhaddw_d_w(dot1, dot1);                                 \
    sum_ev = __lasx_xvpackev_d(pair1, pair0);                                \
    sum_od = __lasx_xvpackod_d(pair1, pair0);                                \
    quad   = __lasx_xvadd_w(sum_ev, sum_od);                                 \
                                                                             \
    /* scale down and apply the upper clamp */                               \
    quad   = __lasx_xvsra_w(quad, shift);                                    \
    quad   = __lasx_xvmin_w(quad, v_max);                                    \
                                                                             \
    /* the four results are read from 32-bit elements 0, 4, 2 and 6 */       \
    dst[0] = __lasx_xvpickve2gr_w(quad, 0);                                  \
    dst[1] = __lasx_xvpickve2gr_w(quad, 4);                                  \
    dst[2] = __lasx_xvpickve2gr_w(quad, 2);                                  \
    dst[3] = __lasx_xvpickve2gr_w(quad, 6);                                  \
                                                                             \
    dst       += 4;                                                          \
    filterPos += 4;                                                          \
    filter    += 32;                                                         \
}
653
/*
 * Accumulate one group of eight filter taps, starting at tap j, for four
 * output samples at once and add the partial sums into `out`.
 *
 * Names taken from the expansion site:
 *   j                          - index of the first tap of this group
 *   srcPos1 .. srcPos4         - source window of each of the four outputs
 *   filterStart1 .. filterStart4 - coefficient row of each of the four outputs
 *   src0 .. src3, filter0 .. filter3, out0, out1 - scratch vectors
 *   out                        - running accumulator vector
 */
#define SCALE_16                                                             \
{                                                                            \
    /* samples and coefficients are 16-bit: tap j lives 2 * j bytes in */    \
    const int byte_off = j << 1;                                             \
                                                                             \
    src0     = __lasx_xvldx(srcPos1, byte_off);                              \
    src1     = __lasx_xvldx(srcPos2, byte_off);                              \
    src2     = __lasx_xvldx(srcPos3, byte_off);                              \
    src3     = __lasx_xvldx(srcPos4, byte_off);                              \
    filter0  = __lasx_xvldx(filterStart1, byte_off);                         \
    filter1  = __lasx_xvldx(filterStart2, byte_off);                         \
    filter2  = __lasx_xvldx(filterStart3, byte_off);                         \
    filter3  = __lasx_xvldx(filterStart4, byte_off);                         \
                                                                             \
    /* outputs 1 and 2: merge, then unsigned * signed dot product */         \
    src0     = __lasx_xvpermi_q(src0, src1, 0x02);                           \
    filter0  = __lasx_xvpermi_q(filter0, filter1, 0x02);                     \
    out0     = __lasx_xvdp2_w_hu_h(src0, filter0);                           \
                                                                             \
    /* outputs 3 and 4 */                                                    \
    src1     = __lasx_xvpermi_q(src2, src3, 0x02);                           \
    filter1  = __lasx_xvpermi_q(filter2, filter3, 0x02);                     \
    out1     = __lasx_xvdp2_w_hu_h(src1, filter1);                           \
                                                                             \
    /* horizontal reduction; src0/src1 are reused as scratch */              \
    src0     = __lasx_xvhaddw_d_w(out0, out0);                               \
    src1     = __lasx_xvhaddw_d_w(out1, out1);                               \
    out0     = __lasx_xvpackev_d(src1, src0);                                \
    out1     = __lasx_xvpackod_d(src1, src0);                                \
    out0     = __lasx_xvadd_w(out0, out1);                                   \
                                                                             \
    out      = __lasx_xvadd_w(out, out0);                                    \
}
675
676 void ff_hscale_16_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
677                              const uint8_t *_src, const int16_t *filter,
678                              const int32_t *filterPos, int filterSize)
679 {
680     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
681     int i;
682     const uint16_t *src = (const uint16_t *) _src;
683     int sh              = desc->comp[0].depth - 1;
684     int max = (1 << 15) - 1;
685     int len = dstW >> 2;
686     int res = dstW & 3;
687     __m256i shift;
688     __m256i zero = __lasx_xvldi(0);
689
690     if (sh < 15) {
691         sh = isAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8 ? 13 :
692                       (desc->comp[0].depth - 1);
693     } else if (desc->flags && AV_PIX_FMT_FLAG_FLOAT) {
694         sh = 15;
695     }
696     shift = __lasx_xvreplgr2vr_w(sh);
697
698     if (filterSize == 8) {
699         __m256i v_max = __lasx_xvreplgr2vr_w(max);
700         for (i = 0; i < len; i++) {
701             SCALE_8
702         }
703         for (i = 0; i < res; i++) {
704             int val = 0;
705             __m256i src0, filter0, out0;
706
707             src0    = __lasx_xvld(src + filterPos[i], 0);
708             filter0 = __lasx_xvld(filter, 0);
709             out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
710             out0    = __lasx_xvhaddw_d_w(out0, out0);
711             out0    = __lasx_xvhaddw_q_d(out0, out0);
712             val     = __lasx_xvpickve2gr_w(out0, 0);
713             dst[i]  = FFMIN(val >> sh, max);
714             filter += 8;
715         }
716     } else if (filterSize == 4) {
717         __m256i v_max = __lasx_xvreplgr2vr_w(max);
718         for (i = 0; i < len; i++) {
719             __m256i src1, src2, src3, src4, src0, filter0, out0;
720
721             src1 = __lasx_xvldrepl_d(src + filterPos[0], 0);
722             src2 = __lasx_xvldrepl_d(src + filterPos[1], 0);
723             src3 = __lasx_xvldrepl_d(src + filterPos[2], 0);
724             src4 = __lasx_xvldrepl_d(src + filterPos[3], 0);
725             filter0 = __lasx_xvld(filter, 0);
726             src1 = __lasx_xvextrins_d(src1, src2, 0x10);
727             src3 = __lasx_xvextrins_d(src3, src4, 0x10);
728             src0 = __lasx_xvpermi_q(src1, src3, 0x02);
729             out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
730             out0 = __lasx_xvhaddw_d_w(out0, out0);
731             out0 = __lasx_xvsra_w(out0, shift);
732             out0 = __lasx_xvmin_w(out0, v_max);
733             dst[0] = __lasx_xvpickve2gr_w(out0, 0);
734             dst[1] = __lasx_xvpickve2gr_w(out0, 2);
735             dst[2] = __lasx_xvpickve2gr_w(out0, 4);
736             dst[3] = __lasx_xvpickve2gr_w(out0, 6);
737             dst       += 4;
738             filterPos += 4;
739             filter    += 16;
740         }
741         for (i = 0; i < res; i++) {
742             int val = 0;
743             const uint16_t *srcPos = src + filterPos[i];
744
745             for (int j = 0; j < filterSize; j++) {
746                 val += ((int)srcPos[j]) * filter[j];
747             }
748             dst[i]  = FFMIN(val >> sh, max);
749             filter += 4;
750         }
751     } else if (filterSize > 8) {
752         int filterlen = filterSize - 7;
753
754         for (i = 0; i < len; i++) {
755             __m256i src0, src1, src2, src3;
756             __m256i filter0, filter1, filter2, filter3, out0, out1;
757             __m256i out = zero;
758             const uint16_t *srcPos1 = src + filterPos[0];
759             const uint16_t *srcPos2 = src + filterPos[1];
760             const uint16_t *srcPos3 = src + filterPos[2];
761             const uint16_t *srcPos4 = src + filterPos[3];
762             const int16_t *filterStart1 = filter;
763             const int16_t *filterStart2 = filterStart1 + filterSize;
764             const int16_t *filterStart3 = filterStart2 + filterSize;
765             const int16_t *filterStart4 = filterStart3 + filterSize;
766             int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
767
768             for (j = 0; j < filterlen; j += 8) {
769                 SCALE_16
770             }
771             val1 = __lasx_xvpickve2gr_w(out, 0);
772             val2 = __lasx_xvpickve2gr_w(out, 4);
773             val3 = __lasx_xvpickve2gr_w(out, 2);
774             val4 = __lasx_xvpickve2gr_w(out, 6);
775             for (; j < filterSize; j++) {
776                 val1 += ((int)srcPos1[j]) * filterStart1[j];
777                 val2 += ((int)srcPos2[j]) * filterStart2[j];
778                 val3 += ((int)srcPos3[j]) * filterStart3[j];
779                 val4 += ((int)srcPos4[j]) * filterStart4[j];
780             }
781             dst[0] = FFMIN(val1 >> sh, max);
782             dst[1] = FFMIN(val2 >> sh, max);
783             dst[2] = FFMIN(val3 >> sh, max);
784             dst[3] = FFMIN(val4 >> sh, max);
785             dst       += 4;
786             filterPos += 4;
787             filter     = filterStart4 + filterSize;
788         }
789         for (i = 0; i < res; i++) {
790             int j, val = 0;
791             const uint16_t *srcPos      = src + filterPos[i];
792             __m256i src0, filter0, out0;
793
794             for (j = 0; j < filterlen; j += 8) {
795                 int dex = j << 1;
796                 src0    = __lasx_xvldx(srcPos, dex);
797                 filter0 = __lasx_xvldx(filter, dex);
798                 out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
799                 out0    = __lasx_xvhaddw_d_w(out0, out0);
800                 out0    = __lasx_xvhaddw_q_d(out0, out0);
801                 val    += __lasx_xvpickve2gr_w(out0, 0);
802             }
803             for (; j < filterSize; j++) {
804                 val += ((int)srcPos[j]) * filter[j];
805             }
806             dst[i]  = FFMIN(val >> sh, max);
807             filter += filterSize;
808         }
809     } else {
810         for (i = 0; i < dstW; i++) {
811             int val = 0;
812             const uint16_t *srcPos = src + filterPos[i];
813
814             for (int j = 0; j < filterSize; j++) {
815                 val += ((int)srcPos[j]) * filter[j];
816             }
817             dst[i]  = FFMIN(val >> sh, max);
818             filter += filterSize;
819         }
820     }
821 }
822
823 void ff_hscale_16_to_19_lasx(SwsContext *c, int16_t *_dst, int dstW,
824                              const uint8_t *_src, const int16_t *filter,
825                              const int32_t *filterPos, int filterSize)
826 {
827     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
828     int i;
829     int32_t *dst        = (int32_t *) _dst;
830     const uint16_t *src = (const uint16_t *) _src;
831     int sh              = desc->comp[0].depth - 5;
832     int max = (1 << 19) - 1;
833     int len = dstW >> 2;
834     int res = dstW & 3;
835     __m256i shift;
836     __m256i zero = __lasx_xvldi(0);
837
838     if ((isAnyRGB(c->srcFormat) || c->srcFormat == AV_PIX_FMT_PAL8)
839          && desc->comp[0].depth<16) {
840         sh = 9;
841     } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) {
842         sh = 11;
843     }
844     shift = __lasx_xvreplgr2vr_w(sh);
845
846     if (filterSize == 8) {
847         __m256i v_max = __lasx_xvreplgr2vr_w(max);
848         for (i = 0; i < len; i++) {
849             SCALE_8
850         }
851         for (i = 0; i < res; i++) {
852             int val = 0;
853             __m256i src0, filter0, out0;
854
855             src0 = __lasx_xvld(src + filterPos[i], 0);
856             filter0 = __lasx_xvld(filter, 0);
857             out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
858             out0 = __lasx_xvhaddw_d_w(out0, out0);
859             out0 = __lasx_xvhaddw_q_d(out0, out0);
860             val  = __lasx_xvpickve2gr_w(out0, 0);
861             dst[i] = FFMIN(val >> sh, max);
862             filter += 8;
863         }
864     } else if (filterSize == 4) {
865         __m256i v_max = __lasx_xvreplgr2vr_w(max);
866         for (i = 0; i < len; i++) {
867             __m256i src1, src2, src3, src4, src0, filter0, out0;
868
869             src1 = __lasx_xvldrepl_d(src + filterPos[0], 0);
870             src2 = __lasx_xvldrepl_d(src + filterPos[1], 0);
871             src3 = __lasx_xvldrepl_d(src + filterPos[2], 0);
872             src4 = __lasx_xvldrepl_d(src + filterPos[3], 0);
873             filter0 = __lasx_xvld(filter, 0);
874             src1 = __lasx_xvextrins_d(src1, src2, 0x10);
875             src3 = __lasx_xvextrins_d(src3, src4, 0x10);
876             src0 = __lasx_xvpermi_q(src1, src3, 0x02);
877             out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
878             out0 = __lasx_xvhaddw_d_w(out0, out0);
879             out0 = __lasx_xvsra_w(out0, shift);
880             out0 = __lasx_xvmin_w(out0, v_max);
881             dst[0] = __lasx_xvpickve2gr_w(out0, 0);
882             dst[1] = __lasx_xvpickve2gr_w(out0, 2);
883             dst[2] = __lasx_xvpickve2gr_w(out0, 4);
884             dst[3] = __lasx_xvpickve2gr_w(out0, 6);
885             dst       += 4;
886             filterPos += 4;
887             filter    += 16;
888         }
889         for (i = 0; i < res; i++) {
890             int val = 0;
891             const uint16_t *srcPos = src + filterPos[i];
892
893             for (int j = 0; j < filterSize; j++) {
894                 val += ((int)srcPos[j]) * filter[j];
895             }
896             dst[i]  = FFMIN(val >> sh, max);
897             filter += 4;
898         }
899     } else if (filterSize > 8) {
900         int filterlen = filterSize - 7;
901
902         for (i = 0; i < len; i ++) {
903             __m256i src0, src1, src2, src3;
904             __m256i filter0, filter1, filter2, filter3, out0, out1;
905             __m256i out = zero;
906             const uint16_t *srcPos1 = src + filterPos[0];
907             const uint16_t *srcPos2 = src + filterPos[1];
908             const uint16_t *srcPos3 = src + filterPos[2];
909             const uint16_t *srcPos4 = src + filterPos[3];
910             const int16_t *filterStart1 = filter;
911             const int16_t *filterStart2 = filterStart1 + filterSize;
912             const int16_t *filterStart3 = filterStart2 + filterSize;
913             const int16_t *filterStart4 = filterStart3 + filterSize;
914             int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
915
916             for (j = 0; j < filterlen; j += 8) {
917                 SCALE_16
918             }
919             val1 = __lasx_xvpickve2gr_w(out, 0);
920             val2 = __lasx_xvpickve2gr_w(out, 4);
921             val3 = __lasx_xvpickve2gr_w(out, 2);
922             val4 = __lasx_xvpickve2gr_w(out, 6);
923             for (; j < filterSize; j++) {
924                 val1 += ((int)srcPos1[j]) * filterStart1[j];
925                 val2 += ((int)srcPos2[j]) * filterStart2[j];
926                 val3 += ((int)srcPos3[j]) * filterStart3[j];
927                 val4 += ((int)srcPos4[j]) * filterStart4[j];
928             }
929             dst[0] = FFMIN(val1 >> sh, max);
930             dst[1] = FFMIN(val2 >> sh, max);
931             dst[2] = FFMIN(val3 >> sh, max);
932             dst[3] = FFMIN(val4 >> sh, max);
933             dst       += 4;
934             filterPos += 4;
935             filter     = filterStart4 + filterSize;
936         }
937         for (i = 0; i < res; i++) {
938             int j, val = 0;
939             const uint16_t *srcPos      = src + filterPos[i];
940             __m256i src0, filter0, out0;
941
942             for (j = 0; j < filterlen; j += 8) {
943                 int dex = j << 1;
944                 src0 = __lasx_xvldx(srcPos, dex);
945                 filter0 = __lasx_xvldx(filter, dex);
946                 out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
947                 out0    = __lasx_xvhaddw_d_w(out0, out0);
948                 out0    = __lasx_xvhaddw_q_d(out0, out0);
949                 val    += __lasx_xvpickve2gr_w(out0, 0);
950             }
951             for (; j < filterSize; j++) {
952                 val += ((int)srcPos[j]) * filter[j];
953             }
954             dst[i]  = FFMIN(val >> sh, max);
955             filter += filterSize;
956         }
957     } else {
958         for (i = 0; i < dstW; i++) {
959             int val = 0;
960             const uint16_t *srcPos = src + filterPos[i];
961
962             for (int j = 0; j < filterSize; j++) {
963                 val += ((int)srcPos[j]) * filter[j];
964             }
965             dst[i]  = FFMIN(val >> sh, max);
966             filter += filterSize;
967         }
968     }
969 }
970
971 #undef SCALE_8
972 #undef SCALE_16