/*
2 * Loongson LSX optimized swscale
4 * Copyright (c) 2023 Loongson Technology Corporation Limited
5 * Contributed by Lu Wang <wanglu@loongson.cn>
7 * This file is part of FFmpeg.
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
24 #include "libavcodec/loongarch/loongson_asm.S"
26 /* void planar_rgb_to_y_lsx(uint8_t *_dst, const uint8_t *src[4],
27 * int width, int32_t *rgb2yuv) */
29 function planar_rgb_to_y_lsx
// NOTE(review): this is an incomplete view of the function — the original
// line numbers fused into each source line (29, 56, 57, ...) jump, so the
// prologue (coefficient/bias/shift setup into vr1-vr5/vr4, source-pointer
// loads into vr8-vr10, loop labels, stores between the two paths) and the
// epilogue/endfunc are stripped from this chunk. Comments below describe
// only the visible instructions.
//
// Visible wide path: widen the three 8-bit planar channels (vr8/vr9/vr10)
// to 32-bit lanes via interleave with vr7 (presumably an all-zero vector
// used for zero-extension — TODO confirm against the unseen prologue),
// then multiply-accumulate with the luma coefficients (presumably splatted
// into vr1/vr2/vr3 from the rgb2yuv table — verify in stripped setup code).
56 vilvl.b vr11, vr7, vr8 // low bytes -> 16-bit lanes
57 vilvl.b vr12, vr7, vr9
58 vilvl.b vr13, vr7, vr10
59 vilvl.h vr14, vr7, vr11 // low halfwords -> 32-bit lanes
60 vilvl.h vr15, vr7, vr12
61 vilvl.h vr16, vr7, vr13
62 vilvh.h vr17, vr7, vr11 // high halfwords -> 32-bit lanes
63 vilvh.h vr18, vr7, vr12
64 vilvh.h vr19, vr7, vr13
65 vmul.w vr20, vr1, vr16 // acc_lo  = vr1 * widened chan3 (low lanes)
66 vmul.w vr21, vr1, vr19 // acc_hi  = vr1 * widened chan3 (high lanes)
67 vmadd.w vr20, vr2, vr14 // acc_lo += vr2 * widened chan1
68 vmadd.w vr20, vr3, vr15 // acc_lo += vr3 * widened chan2
69 vmadd.w vr21, vr2, vr17 // acc_hi += vr2 * widened chan1
70 vmadd.w vr21, vr3, vr18 // acc_hi += vr3 * widened chan2
71 vadd.w vr20, vr20, vr5 // add vr5 (presumably a rounding bias — confirm)
72 vadd.w vr21, vr21, vr5
73 vsra.w vr20, vr20, vr4 // arithmetic shift right by vr4 lane counts
74 vsra.w vr21, vr21, vr4
75 vpickev.h vr20, vr21, vr20 // pack even halfwords: 32-bit -> 16-bit results
// Visible narrow path: identical computation on the low lanes only,
// followed by a 64-bit store of four 16-bit results.
91 vilvl.b vr11, vr7, vr8
92 vilvl.b vr12, vr7, vr9
93 vilvl.b vr13, vr7, vr10
94 vilvl.h vr14, vr7, vr11
95 vilvl.h vr15, vr7, vr12
96 vilvl.h vr16, vr7, vr13
97 vmul.w vr17, vr1, vr16
98 vmadd.w vr17, vr2, vr14
99 vmadd.w vr17, vr3, vr15
100 vadd.w vr17, vr17, vr5
101 vsra.w vr17, vr17, vr4
102 vpickev.h vr17, vr17, vr17
103 vstelm.d vr17, a0, 0, 0 // store low 64 bits (4 x int16) to dst (a0)
134 /* void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
135 * int width, int32_t *rgb2yuv) */
137 function planar_rgb_to_uv_lsx
// NOTE(review): incomplete view — the fused original line numbers (137,
// 146, 165, ..., 278) show gaps where the prologue (s-register saves,
// splatting t1-t3/s1-s3 into vector coefficients, the zero vector vr0,
// bias vr8 and shift amount vr7 — TODO confirm), the per-channel loads,
// loop labels, stores of the 8-pixel path, and endfunc are stripped.
// Comments below describe only the visible instructions.
//
// Load the six chroma coefficients from the rgb2yuv table (a4).
// Byte offsets 12..32 select int32 entries 3..8 of the table.
146 ld.w t1, a4, 12 // ru
147 ld.w t2, a4, 16 // gu
148 ld.w t3, a4, 20 // bu
149 ld.w s1, a4, 24 // rv
150 ld.w s2, a4, 28 // gv
151 ld.w s3, a4, 32 // bv
// Dispatch on remaining pixel count; t8/t7 presumably hold the 8 and 4
// thresholds (set in stripped code). This first dispatch tests a2 while
// the later ones test a3 — the width copy between registers happens in
// the unseen lines; verify before changing either.
165 bge a2, t8, .LOOP_WIDTH8
166 bge a2, t7, .LOOP_WIDTH4
167 blt zero, a2, .LOOP_WIDTH
// Visible 8-pixel body: widen the three planar channels (vr9/vr10/vr11)
// to 32-bit lanes by interleaving with vr0 (presumably all-zero — confirm),
// then compute U with one coefficient set (vr1-vr3) and V with the other
// (vr4-vr6) in parallel.
174 vilvl.b vr9, vr0, vr9 // low bytes -> 16-bit lanes
175 vilvl.b vr10, vr0, vr10
176 vilvl.b vr11, vr0, vr11
177 vilvl.h vr12, vr0, vr9 // low halfwords -> 32-bit lanes
178 vilvl.h vr13, vr0, vr10
179 vilvl.h vr14, vr0, vr11
180 vilvh.h vr15, vr0, vr9 // high halfwords -> 32-bit lanes
181 vilvh.h vr16, vr0, vr10
182 vilvh.h vr17, vr0, vr11
183 vmul.w vr18, vr1, vr14 // U acc (low)  = vr1 * chan3
184 vmul.w vr19, vr1, vr17 // U acc (high) = vr1 * chan3
185 vmul.w vr20, vr4, vr14 // V acc (low)  = vr4 * chan3
186 vmul.w vr21, vr4, vr17 // V acc (high) = vr4 * chan3
187 vmadd.w vr18, vr2, vr12 // U acc (low)  += vr2 * chan1
188 vmadd.w vr18, vr3, vr13 // U acc (low)  += vr3 * chan2
189 vmadd.w vr19, vr2, vr15 // U acc (high) += vr2 * chan1
190 vmadd.w vr19, vr3, vr16 // U acc (high) += vr3 * chan2
191 vmadd.w vr20, vr5, vr12 // V acc (low)  += vr5 * chan1
192 vmadd.w vr20, vr6, vr13 // V acc (low)  += vr6 * chan2
193 vmadd.w vr21, vr5, vr15 // V acc (high) += vr5 * chan1
194 vmadd.w vr21, vr6, vr16 // V acc (high) += vr6 * chan2
195 vadd.w vr18, vr18, vr8 // add vr8 (presumably rounding bias — confirm)
196 vadd.w vr19, vr19, vr8
197 vadd.w vr20, vr20, vr8
198 vadd.w vr21, vr21, vr8
199 vsra.w vr18, vr18, vr7 // arithmetic shift right by vr7 lane counts
200 vsra.w vr19, vr19, vr7
201 vsra.w vr20, vr20, vr7
202 vsra.w vr21, vr21, vr7
203 vpickev.h vr18, vr19, vr18 // pack U: 32-bit -> 16-bit results
204 vpickev.h vr20, vr21, vr20 // pack V: 32-bit -> 16-bit results
// Loop-tail dispatch on the remaining width in a3.
213 bge a3, t8, .LOOP_WIDTH8
214 bge a3, t7, .LOOP_WIDTH4
215 blt zero, a3, .LOOP_WIDTH
// Visible 4-pixel body: same computation on the low lanes only, then a
// 64-bit store (4 x int16) per destination plane.
222 vilvl.b vr9, vr0, vr9
223 vilvl.b vr10, vr0, vr10
224 vilvl.b vr11, vr0, vr11
225 vilvl.h vr12, vr0, vr9
226 vilvl.h vr13, vr0, vr10
227 vilvl.h vr14, vr0, vr11
228 vmul.w vr18, vr1, vr14 // U accumulator
229 vmul.w vr19, vr4, vr14 // V accumulator
230 vmadd.w vr18, vr2, vr12
231 vmadd.w vr18, vr3, vr13
232 vmadd.w vr19, vr5, vr12
233 vmadd.w vr19, vr6, vr13
234 vadd.w vr18, vr18, vr8
235 vadd.w vr19, vr19, vr8
236 vsra.w vr18, vr18, vr7
237 vsra.w vr19, vr19, vr7
238 vpickev.h vr18, vr18, vr18
239 vpickev.h vr19, vr19, vr19
240 vstelm.d vr18, a0, 0, 0 // store 4 U results to dstU (a0)
241 vstelm.d vr19, a1, 0, 0 // store 4 V results to dstV (a1)
// Remaining-width dispatch after the 4-pixel path.
248 bge a3, t7, .LOOP_WIDTH4
249 blt zero, a3, .LOOP_WIDTH
// Scalar-tail loop back-edge (body stripped between lines 249 and 278).
278 blt zero, a3, .LOOP_WIDTH