/*
 * Copyright (c) 2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
 * Copyright (c) 2016 Clément Bœsch <clement stupeflix.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

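// load_yoff_ycoeff fetches the y_offset and y_coeff stack arguments.
// Darwin's AArch64 ABI packs stack arguments to their natural alignment,
// so the two 32-bit values occupy adjacent 4-byte slots and a single ldp
// reaches both; standard AAPCS64 promotes each stack argument to its own
// 8-byte slot, hence the two ldr with distinct offsets.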
.macro load_yoff_ycoeff yoff ycoeff
#if defined(__APPLE__)
        ldp             w9, w10, [sp, #\yoff]
#else
        ldr             w9,  [sp, #\yoff]
        ldr             w10, [sp, #\ycoeff]
#endif
.endm

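// Argument layout assumed by the load_args_* macros below, inferred from how
// the operands are consumed (a sketch of the expected C prototype, not the
// authoritative declaration):
//   int ff_<ifmt>_to_<ofmt>_neon(int width, int height,
//                                uint8_t *dst, int linesize,
//                                const uint8_t *srcY, int linesizeY,
//                                const uint8_t *srcC, int linesizeC,   // srcU for planar input
//                                [const uint8_t *srcV, int linesizeV,] // planar input only
//                                const int16_t *table,
//                                int y_offset, int y_coeff);
// AAPCS64 places the leading arguments in w0/w1, x2/w3, x4/w5, x6/w7 and the
// remainder on the stack, which is what the ldr/load_yoff_ycoeff lines read.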
.macro load_args_nv12
        ldr             x8,  [sp]                                       // table
        load_yoff_ycoeff 8, 16                                          // y_offset, y_coeff
        ld1             {v1.1d}, [x8]
        dup             v0.8h, w10
        dup             v3.8h, w9
        sub             w3, w3, w0, lsl #2                              // w3 = linesize  - width * 4 (padding)
        sub             w5, w5, w0                                      // w5 = linesizeY - width     (paddingY)
        sub             w7, w7, w0                                      // w7 = linesizeC - width     (paddingC)
        neg             w11, w0
.endm

.macro load_args_nv21
        load_args_nv12
.endm

.macro load_args_yuv420p
        ldr             x13, [sp]                                       // srcV
        ldr             w14, [sp, #8]                                   // linesizeV
        ldr             x8,  [sp, #16]                                  // table
        load_yoff_ycoeff 24, 32                                         // y_offset, y_coeff
        ld1             {v1.1d}, [x8]
        dup             v0.8h, w10
        dup             v3.8h, w9
        sub             w3, w3, w0, lsl #2                              // w3 = linesize  - width * 4 (padding)
        sub             w5, w5, w0                                      // w5 = linesizeY - width     (paddingY)
        sub             w7,  w7,  w0, lsr #1                            // w7  = linesizeU - width / 2 (paddingU)
        sub             w14, w14, w0, lsr #1                            // w14 = linesizeV - width / 2 (paddingV)
        lsr             w11, w0, #1
        neg             w11, w11
.endm

.macro load_args_yuv422p
        ldr             x13, [sp]                                       // srcV
        ldr             w14, [sp, #8]                                   // linesizeV
        ldr             x8,  [sp, #16]                                  // table
        load_yoff_ycoeff 24, 32                                         // y_offset, y_coeff
        ld1             {v1.1d}, [x8]
        dup             v0.8h, w10
        dup             v3.8h, w9
        sub             w3, w3, w0, lsl #2                              // w3 = linesize  - width * 4 (padding)
        sub             w5, w5, w0                                      // w5 = linesizeY - width     (paddingY)
        sub             w7,  w7,  w0, lsr #1                            // w7  = linesizeU - width / 2 (paddingU)
        sub             w14, w14, w0, lsr #1                            // w14 = linesizeV - width / 2 (paddingV)
.endm

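// The load_chroma_* macros consume 8 chroma samples per component and leave
// U in v18 and V in v19, widened to 16 bits and scaled by 1<<3.  nv12 and
// nv21 differ only in which deinterleaved lane is U and which is V; the
// planar formats read the separate U (x6) and V (x13) planes.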
.macro load_chroma_nv12
        ld2             {v16.8b, v17.8b}, [x6], #16
        ushll           v18.8h, v16.8b, #3
        ushll           v19.8h, v17.8b, #3
.endm

.macro load_chroma_nv21
        ld2             {v16.8b, v17.8b}, [x6], #16
        ushll           v19.8h, v16.8b, #3
        ushll           v18.8h, v17.8b, #3
.endm

.macro load_chroma_yuv420p
        ld1             {v16.8b}, [ x6], #8
        ld1             {v17.8b}, [x13], #8
        ushll           v18.8h, v16.8b, #3
        ushll           v19.8h, v17.8b, #3
.endm

.macro load_chroma_yuv422p
        load_chroma_yuv420p
.endm

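// Per-row chroma pointer bookkeeping.  With 4:2:0 input a chroma row serves
// two luma rows, so the pointers advance only every second iteration: the
// parity of the remaining row count (w1) selects between adding the row
// padding (step to the next chroma row) and subtracting the width just
// consumed (rewind and reuse the same row).  4:2:2 input has one chroma row
// per luma row, so the padding is added unconditionally.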
.macro increment_nv12
        ands            w15, w1, #1
        csel            w16, w7, w11, ne                                // incC = (h & 1) ? paddingC : -width
        add             x6,  x6, w16, sxtw                              // srcC += incC
.endm

.macro increment_nv21
        increment_nv12
.endm

.macro increment_yuv420p
        ands            w15, w1, #1
        csel            w16,  w7, w11, ne                               // incU = (h & 1) ? paddingU : -width/2
        csel            w17, w14, w11, ne                               // incV = (h & 1) ? paddingV : -width/2
        add             x6,  x6,  w16, sxtw                             // srcU += incU
        add             x13, x13, w17, sxtw                             // srcV += incV
.endm

.macro increment_yuv422p
        add             x6,  x6,  w7, sxtw                              // srcU += paddingU
        add             x13, x13, w14, sxtw                             // srcV += paddingV
.endm

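// compute_rgba adds the luma terms (v26/v27) to the chroma-derived R/G/B
// terms and narrows everything to 8 bits.  The intermediates sit at twice
// the final pixel scale: the inputs carry a 1<<3 factor, sqdmulh contributes
// a >> 15, and sqrshrun #1 performs the last rounding shift with unsigned
// saturation.  The alpha lanes are simply set to 255 (opaque).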
.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
        add             v20.8h, v26.8h, v20.8h                          // Y1 + R1
        add             v21.8h, v27.8h, v21.8h                          // Y2 + R2
        add             v22.8h, v26.8h, v22.8h                          // Y1 + G1
        add             v23.8h, v27.8h, v23.8h                          // Y2 + G2
        add             v24.8h, v26.8h, v24.8h                          // Y1 + B1
        add             v25.8h, v27.8h, v25.8h                          // Y2 + B2
        sqrshrun        \r1, v20.8h, #1                                 // clip_u8((Y1 + R1) >> 1)
        sqrshrun        \r2, v21.8h, #1                                 // clip_u8((Y2 + R2) >> 1)
        sqrshrun        \g1, v22.8h, #1                                 // clip_u8((Y1 + G1) >> 1)
        sqrshrun        \g2, v23.8h, #1                                 // clip_u8((Y2 + G2) >> 1)
        sqrshrun        \b1, v24.8h, #1                                 // clip_u8((Y1 + B1) >> 1)
        sqrshrun        \b2, v25.8h, #1                                 // clip_u8((Y2 + B2) >> 1)
        movi            \a1, #255
        movi            \a2, #255
.endm

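// declare_func emits one conversion function per (input, output) format
// pair.  The outer loop (label 1) walks rows; the inner loop (label 2)
// converts 16 pixels per iteration: 8 chroma pairs become R/G/B terms,
// zip1/zip2 duplicate each term for the two horizontally adjacent pixels it
// covers, and the terms are combined with 16 luma samples.  The .ifc blocks
// choose the register order passed to compute_rgba so that the st4
// interleaving writes the requested channel layout.  w9 preserves the
// original height, which is returned in w0.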
.macro declare_func ifmt ofmt
function ff_\ifmt\()_to_\ofmt\()_neon, export=1
        load_args_\ifmt
        mov             w9, w1
1:
        mov             w8, w0                                          // w8 = width
2:
        movi            v5.8h, #4, lsl #8                               // 128 * (1<<3)
        load_chroma_\ifmt
        sub             v18.8h, v18.8h, v5.8h                           // U*(1<<3) - 128*(1<<3)
        sub             v19.8h, v19.8h, v5.8h                           // V*(1<<3) - 128*(1<<3)
        sqdmulh         v20.8h, v19.8h, v1.h[0]                         // V * v2r            (R)
        sqdmulh         v22.8h, v18.8h, v1.h[1]                         // U * u2g
        sqdmulh         v19.8h, v19.8h, v1.h[2]                         //           V * v2g
        add             v22.8h, v22.8h, v19.8h                          // U * u2g + V * v2g  (G)
        sqdmulh         v24.8h, v18.8h, v1.h[3]                         // U * u2b            (B)
        zip2            v21.8h, v20.8h, v20.8h                          // R2
        zip1            v20.8h, v20.8h, v20.8h                          // R1
        zip2            v23.8h, v22.8h, v22.8h                          // G2
        zip1            v22.8h, v22.8h, v22.8h                          // G1
        zip2            v25.8h, v24.8h, v24.8h                          // B2
        zip1            v24.8h, v24.8h, v24.8h                          // B1
        ld1             {v2.16b}, [x4], #16                             // load luma
        ushll           v26.8h, v2.8b,  #3                              // Y1*(1<<3)
        ushll2          v27.8h, v2.16b, #3                              // Y2*(1<<3)
        sub             v26.8h, v26.8h, v3.8h                           // Y1*(1<<3) - y_offset
        sub             v27.8h, v27.8h, v3.8h                           // Y2*(1<<3) - y_offset
        sqdmulh         v26.8h, v26.8h, v0.8h                           // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15
        sqdmulh         v27.8h, v27.8h, v0.8h                           // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15

.ifc \ofmt,argb // 1 2 3 0
        compute_rgba    v5.8b,v6.8b,v7.8b,v4.8b, v17.8b,v18.8b,v19.8b,v16.8b
.endif

.ifc \ofmt,rgba // 0 1 2 3
        compute_rgba    v4.8b,v5.8b,v6.8b,v7.8b, v16.8b,v17.8b,v18.8b,v19.8b
.endif

.ifc \ofmt,abgr // 3 2 1 0
        compute_rgba    v7.8b,v6.8b,v5.8b,v4.8b, v19.8b,v18.8b,v17.8b,v16.8b
.endif

.ifc \ofmt,bgra // 2 1 0 3
        compute_rgba    v6.8b,v5.8b,v4.8b,v7.8b, v18.8b,v17.8b,v16.8b,v19.8b
.endif

        st4             { v4.8b, v5.8b, v6.8b, v7.8b}, [x2], #32
        st4             {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], #32
        subs            w8, w8, #16                                     // width -= 16
        b.gt            2b
        add             x2, x2, w3, sxtw                                // dst  += padding
        add             x4, x4, w5, sxtw                                // srcY += paddingY
        increment_\ifmt
        subs            w1, w1, #1                                      // height -= 1
        b.gt            1b
        mov             w0, w9
        ret
endfunc
.endm

.macro declare_rgb_funcs ifmt
        declare_func    \ifmt, argb
        declare_func    \ifmt, rgba
        declare_func    \ifmt, abgr
        declare_func    \ifmt, bgra
.endm

declare_rgb_funcs nv12
declare_rgb_funcs nv21
declare_rgb_funcs yuv420p
declare_rgb_funcs yuv422p