/*
 * Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 */
/*
 * inner_product_gint16_none_1_neon:
 * @o: destination; a single gint16 is stored through this pointer
 * @a: first gint16 input, read sequentially with post-increment
 * @b: second gint16 input, read sequentially with post-increment
 * @len: number of samples to process
 * @icoeff: not referenced by the visible code of this variant
 *
 * ARM NEON dot product of @a and @b. The 16-bit products are widened to
 * 32 bits while accumulating, the four accumulator lanes are summed, and
 * the total is narrowed with a rounding, saturating right shift by 15
 * before one sample is stored to @o.
 *
 * NOTE(review): the visible end of the clobber list has no "memory" entry
 * although the asm stores through the o operand; the linear and cubic gint16
 * variants do list it. Confirm whether it should be added here too.
 * NOTE(review): the tail step consumes 4 samples at a time, so @len is
 * presumably expected to be a multiple of 4 - confirm against callers.
 */
21 inner_product_gint16_none_1_neon (gint16 * o, const gint16 * a,
22 const gint16 * b, gint len, const gint16 * icoeff)
/* Split len: a multiple of 16 for the wide step, the rest for the tail. */
24 uint32_t remainder = len % 16;
25 len = len - remainder;
/* q0 is used as a 4 x 32-bit accumulator and starts at zero. */
27 asm volatile (" vmov.s32 q0, #0\n"
/* Wide step: fetch 16 samples from each input, advancing both pointers. */
32 " vld1.16 {d16, d17, d18, d19}, [%[b]]!\n"
33 " vld1.16 {d20, d21, d22, d23}, [%[a]]!\n"
/* Count down; subs also updates the condition flags. */
34 " subs %[len], %[len], #16\n"
/* Widening multiply-accumulate into the 32-bit lanes of q0 and q1. */
35 " vmlal.s16 q0, d16, d20\n"
36 " vmlal.s16 q1, d17, d21\n"
37 " vmlal.s16 q0, d18, d22\n"
38 " vmlal.s16 q1, d19, d23\n"
/* Fold the second accumulator into the first. */
40 " vadd.s32 q0, q0, q1\n"
42 " cmp %[remainder], #0\n"
/* Tail step: 4 samples from each input. */
45 " vld1.16 {d16}, [%[b]]!\n"
46 " vld1.16 {d20}, [%[a]]!\n"
47 " subs %[remainder], %[remainder], #4\n"
48 " vmlal.s16 q0, d16, d20\n"
/* Horizontal sum of the four lanes; the total ends up in lane 0. */
51 " vadd.s32 d0, d0, d1\n"
52 " vpadd.s32 d0, d0, d0\n"
/* Rounding, saturating shift right by 15 while narrowing to 16 bits. */
53 " vqrshrn.s32 d0, q0, #15\n"
/* Store only lane 0 as the result. */
54 " vst1.s16 d0[0], [%[o]]\n"
/* Pointers and counters are modified by the asm, hence read-write. */
55 : [a] "+r" (a), [b] "+r" (b),
56 [len] "+r" (len), [remainder] "+r" (remainder)
59 "d16", "d17", "d18", "d19",
60 "d20", "d21", "d22", "d23");
/*
 * inner_product_gint16_linear_1_neon:
 * @o: destination; a single gint16 is stored through this pointer
 * @a: gint16 input samples, read sequentially with post-increment
 * @b: gint16 input read at twice the rate of @a (16 values per 8 samples
 *     of @a in the wide step) - presumably two filter taps per sample
 * @len: number of samples of @a to process
 * @icoeff: four gint16 values are loaded from here and multiplied lane-wise
 *     with the narrowed accumulator
 *
 * ARM NEON inner product whose four partial sums are weighted by @icoeff
 * before being reduced to one sample. The accumulator is shifted right by
 * 15 and narrowed, multiplied by the coefficients, summed across lanes,
 * then narrowed again with a rounding, saturating shift by 15.
 */
64 inner_product_gint16_linear_1_neon (gint16 * o, const gint16 * a,
65 const gint16 * b, gint len, const gint16 * icoeff)
/* Split len: a multiple of 8 for the wide step, the rest for the tail. */
67 uint32_t remainder = len % 8;
68 len = len - remainder;
/* Clear the q0 accumulator. */
70 asm volatile (" vmov.s16 q0, #0\n"
/* Wide step: 16 values from b, 8 samples from a. */
75 " vld1.16 {d16, d17, d18, d19}, [%[b]]!\n"
76 " vld1.16 {d20, d21}, [%[a]]!\n"
79 " subs %[len], %[len], #8\n"
/* Widening multiply-accumulate into the 32-bit lanes of q0 and q1. */
80 " vmlal.s16 q0, d16, d20\n"
81 " vmlal.s16 q1, d17, d21\n"
82 " vmlal.s16 q0, d18, d22\n"
83 " vmlal.s16 q1, d19, d23\n"
/* Fold the second accumulator into the first. */
85 " vadd.s32 q0, q0, q1\n"
87 " cmp %[remainder], #0\n"
/* Tail step: 8 values from b, 4 samples from a. */
90 " vld1.16 {d16, d17}, [%[b]]!\n"
91 " vld1.16 {d20}, [%[a]]!\n"
94 " subs %[remainder], %[remainder], #4\n"
95 " vmlal.s16 q0, d16, d20\n"
96 " vmlal.s16 q0, d17, d21\n"
/* Shift the 32-bit sums right by 15 and narrow to 16 bits (no rounding). */
99 " vshrn.s32 d0, q0, #15\n"
/* Weight each lane with its coefficient, widening back to 32 bits. */
100 " vld1.16 {d20}, [%[ic]]\n"
101 " vmull.s16 q0, d0, d20\n"
/* Horizontal sum of the four lanes; the total ends up in lane 0. */
102 " vadd.s32 d0, d0, d1\n"
103 " vpadd.s32 d0, d0, d0\n"
/* Rounding, saturating shift right by 15 while narrowing, then store. */
104 " vqrshrn.s32 d0, q0, #15\n"
105 " vst1.s16 d0[0], [%[o]]\n"
/* Pointers and counters are modified by the asm, hence read-write. */
106 : [a] "+r" (a), [b] "+r" (b),
107 [len] "+r" (len), [remainder] "+r" (remainder)
108 : [o] "r" (o), [ic] "r" (icoeff)
/* "memory" tells the compiler that the asm writes through o. */
110 "d16", "d17", "d18", "d19",
111 "d20", "d21", "d22", "d23" , "memory");
/*
 * inner_product_gint16_cubic_1_neon:
 * @o: destination; a single gint16 is stored through this pointer
 * @a: gint16 input samples; each sample is replicated across a vector
 * @b: gint16 input read at four times the rate of @a (4 values per
 *     sample) - presumably four filter taps per sample
 * @len: number of samples of @a to process
 * @icoeff: four gint16 values are loaded from here and multiplied lane-wise
 *     with the narrowed accumulator
 *
 * ARM NEON inner product in which every sample of @a multiplies a group of
 * four consecutive values of @b. The four per-lane sums are then weighted
 * by @icoeff and reduced to one sample, with the same shift-by-15 narrowing
 * steps as the linear gint16 variant.
 */
115 inner_product_gint16_cubic_1_neon (gint16 * o, const gint16 * a,
116 const gint16 * b, gint len, const gint16 * icoeff)
/* Split len: a multiple of 4 for the wide step, the rest for the tail. */
118 uint32_t remainder = len % 4;
119 len = len - remainder;
/* Clear the q0 accumulator. */
121 asm volatile (" vmov.s32 q0, #0\n"
/* Wide step: 16 values from b ... */
126 " vld1.16 {d16, d17, d18, d19}, [%[b]]!\n"
/* ... and 4 samples from a, each broadcast to all lanes of its register. */
127 " vld4.16 {d20[], d21[], d22[], d23[]}, [%[a]]!\n"
128 " subs %[len], %[len], #4\n"
/* Widening multiply-accumulate into the 32-bit lanes of q0 and q1. */
129 " vmlal.s16 q0, d16, d20\n"
130 " vmlal.s16 q1, d17, d21\n"
131 " vmlal.s16 q0, d18, d22\n"
132 " vmlal.s16 q1, d19, d23\n"
/* Fold the second accumulator into the first. */
134 " vadd.s32 q0, q0, q1\n"
136 " cmp %[remainder], #0\n"
/* Tail step: 4 values from b against one broadcast sample of a. */
139 " vld1.16 {d16}, [%[b]]!\n"
140 " vld1.16 {d20[]}, [%[a]]!\n"
141 " subs %[remainder], %[remainder], #1\n"
142 " vmlal.s16 q0, d16, d20\n"
/* Shift the 32-bit sums right by 15 and narrow to 16 bits (no rounding). */
145 " vshrn.s32 d0, q0, #15\n"
/* Weight each lane with its coefficient, widening back to 32 bits. */
146 " vld1.16 {d20}, [%[ic]]\n"
147 " vmull.s16 q0, d0, d20\n"
/* Horizontal sum of the four lanes; the total ends up in lane 0. */
148 " vadd.s32 d0, d0, d1\n"
149 " vpadd.s32 d0, d0, d0\n"
/* Rounding, saturating shift right by 15 while narrowing, then store. */
150 " vqrshrn.s32 d0, q0, #15\n"
151 " vst1.s16 d0[0], [%[o]]\n"
/* Pointers and counters are modified by the asm, hence read-write. */
152 : [a] "+r" (a), [b] "+r" (b),
153 [len] "+r" (len), [remainder] "+r" (remainder)
154 : [o] "r" (o), [ic] "r" (icoeff)
/* "memory" tells the compiler that the asm writes through o. */
156 "d16", "d17", "d18", "d19",
157 "d20", "d21", "d22", "d23" , "memory");
/*
 * inner_product_gint32_none_1_neon: gint32 counterpart of the gint16 and
 * gfloat "none" inner products, with the same parameter list.
 * NOTE(review): no body is visible for this function here - confirm
 * whether it is implemented before relying on it.
 */
161 inner_product_gint32_none_1_neon (gint32 * o, const gint32 * a,
162 const gint32 * b, gint len, const gint32 * icoeff)
/*
 * inner_product_gint32_linear_1_neon: gint32 counterpart of the gint16 and
 * gfloat "linear" inner products, with the same parameter list.
 * NOTE(review): no body is visible for this function here - confirm
 * whether it is implemented before relying on it.
 */
167 inner_product_gint32_linear_1_neon (gint32 * o, const gint32 * a,
168 const gint32 * b, gint len, const gint32 * icoeff)
/*
 * inner_product_gint32_cubic_1_neon: gint32 counterpart of the gint16 and
 * gfloat "cubic" inner products, with the same parameter list.
 * NOTE(review): no body is visible for this function here - confirm
 * whether it is implemented before relying on it.
 */
173 inner_product_gint32_cubic_1_neon (gint32 * o, const gint32 * a,
174 const gint32 * b, gint len, const gint32 * icoeff)
/*
 * inner_product_gfloat_none_1_neon:
 * @o: destination; a single gfloat is stored through this pointer
 * @a: first gfloat input, read sequentially with post-increment
 * @b: second gfloat input, read sequentially with post-increment
 * @len: number of samples to process
 * @icoeff: not referenced by the visible code of this variant
 *
 * ARM NEON single-precision dot product of @a and @b: products are
 * accumulated in two 4-lane registers, the lanes are summed and the
 * total is stored to @o.
 *
 * NOTE(review): the tail step consumes 4 samples at a time, so @len is
 * presumably expected to be a multiple of 4 - confirm against callers.
 */
179 inner_product_gfloat_none_1_neon (gfloat * o, const gfloat * a,
180 const gfloat * b, gint len, const gfloat * icoeff)
/* Split len: a multiple of 16 for the wide step, the rest for the tail. */
182 uint32_t remainder = len % 16;
183 len = len - remainder;
/* Clear both accumulators. */
185 asm volatile (" vmov.f32 q0, #0.0\n"
188 " vmov.f32 q1, #0.0\n"
/* Wide step: 16 floats from each input, fetched 8 at a time. */
190 " vld1.32 {q4, q5}, [%[b]]!\n"
191 " vld1.32 {q8, q9}, [%[a]]!\n"
192 " vld1.32 {q6, q7}, [%[b]]!\n"
193 " vld1.32 {q10, q11}, [%[a]]!\n"
/* Count down; subs also updates the condition flags. */
194 " subs %[len], %[len], #16\n"
/* Multiply-accumulate, alternating between q0 and q1. */
195 " vmla.f32 q0, q4, q8\n"
196 " vmla.f32 q1, q5, q9\n"
197 " vmla.f32 q0, q6, q10\n"
198 " vmla.f32 q1, q7, q11\n"
/* Fold the second accumulator into the first. */
200 " vadd.f32 q0, q0, q1\n"
202 " cmp %[remainder], #0\n"
/* Tail step: 4 floats from each input. */
205 " vld1.32 {q6}, [%[b]]!\n"
206 " vld1.32 {q10}, [%[a]]!\n"
207 " subs %[remainder], %[remainder], #4\n"
208 " vmla.f32 q0, q6, q10\n"
/* Horizontal sum of the four lanes; the total ends up in lane 0. */
211 " vadd.f32 d0, d0, d1\n"
212 " vpadd.f32 d0, d0, d0\n"
/* Store only lane 0 as the result. */
213 " vst1.f32 d0[0], [%[o]]\n"
/* Pointers and counters are modified by the asm, hence read-write. */
214 : [a] "+r" (a), [b] "+r" (b),
215 [len] "+r" (len), [remainder] "+r" (remainder)
/* Clobbers: condition flags plus every vector register touched above. */
217 : "cc", "q0", "q1", "q4", "q5", "q6", "q7", "q8",
/*
 * inner_product_gfloat_linear_1_neon:
 * @o: destination; a single gfloat is stored through this pointer
 * @a: gfloat input samples, read sequentially with post-increment
 * @b: gfloat input read at twice the rate of @a and de-interleaved into
 *     even and odd elements - presumably two filter taps per sample
 * @len: number of samples of @a to process
 * @icoeff: four gfloat values are loaded from here and multiplied lane-wise
 *     with the accumulator
 *
 * ARM NEON inner product: even elements of @b accumulate into q0, odd
 * elements into q1, both against the same samples of @a. The accumulators
 * are added, scaled lane-wise by @icoeff, summed across lanes and stored.
 *
 * NOTE(review): q0 (even elements) and q1 (odd elements) are added together
 * before the per-lane multiply by @icoeff - confirm that this matches the
 * intended weighting of the two taps.
 * NOTE(review): the tail step loads 4 floats from @a and 4 from @b while
 * counting down by 2, and multiplies a de-interleaved @b vector with a
 * linear @a vector - verify pointer advance and lane pairing.
 */
223 inner_product_gfloat_linear_1_neon (gfloat * o, const gfloat * a,
224 const gfloat * b, gint len, const gfloat * icoeff)
/* Split len: a multiple of 8 for the wide step, the rest for the tail. */
226 uint32_t remainder = len % 8;
227 len = len - remainder;
/* Clear both accumulators. */
229 asm volatile (" vmov.f32 q0, #0.0\n"
232 " vmov.f32 q1, #0.0\n"
/* vld2 de-interleaves: even elements to q4/q6, odd elements to q5/q7. */
234 " vld2.f32 {q4, q5}, [%[b]]!\n"
235 " vld2.f32 {q6, q7}, [%[b]]!\n"
/* 8 consecutive samples of a. */
236 " vld1.f32 {q8, q9}, [%[a]]!\n"
237 " subs %[len], %[len], #8\n"
/* Each a vector multiplies both the even and the odd b vector. */
238 " vmla.f32 q0, q4, q8\n"
239 " vmla.f32 q1, q5, q8\n"
240 " vmla.f32 q0, q6, q9\n"
241 " vmla.f32 q1, q7, q9\n"
/* Fold the second accumulator into the first. */
243 " vadd.f32 q0, q0, q1\n"
245 " cmp %[remainder], #0\n"
/* Tail step: 4 floats from b (de-interleaved) and 4 floats from a. */
248 " vld2.f32 {q4}, [%[b]]!\n"
249 " vld1.f32 {q8}, [%[a]]!\n"
250 " subs %[remainder], %[remainder], #2\n"
251 " vmla.f32 q0, q4, q8\n"
/* Weight each lane with its coefficient. */
254 " vld1.f32 {q10}, [%[ic]]\n"
255 " vmul.f32 q0, q0, q10\n"
/* Horizontal sum of the four lanes; the total ends up in lane 0. */
256 " vadd.f32 d0, d0, d1\n"
257 " vpadd.f32 d0, d0, d0\n"
/* Store only lane 0 as the result. */
258 " vst1.f32 d0[0], [%[o]]\n"
/* Pointers and counters are modified by the asm, hence read-write. */
259 : [a] "+r" (a), [b] "+r" (b),
260 [len] "+r" (len), [remainder] "+r" (remainder)
261 : [o] "r" (o), [ic] "r" (icoeff)
/* Clobbers: flags, vector registers, and memory (store through o). */
262 : "cc", "q0", "q1", "q4", "q5", "q6", "q7", "q8",
263 "q9", "q10", "q11", "memory");
/*
 * inner_product_gfloat_cubic_1_neon:
 * @o: destination; a single gfloat is stored through this pointer
 * @a: gfloat input samples; each sample is broadcast to a full vector
 * @b: gfloat input read at four times the rate of @a (4 values per
 *     sample) - presumably four filter taps per sample
 * @len: number of samples of @a to process
 * @icoeff: four gfloat values are loaded from here and multiplied lane-wise
 *     with the accumulator
 *
 * ARM NEON inner product in which every sample of @a multiplies a group of
 * four consecutive values of @b. The four per-lane sums are scaled by
 * @icoeff, summed across lanes, and the total is stored to @o.
 */
267 inner_product_gfloat_cubic_1_neon (gfloat * o, const gfloat * a,
268 const gfloat * b, gint len, const gfloat * icoeff)
/* Split len: a multiple of 4 for the wide step, the rest for the tail. */
270 uint32_t remainder = len % 4;
271 len = len - remainder;
/* Clear both accumulators. */
273 asm volatile (" vmov.f32 q0, #0.0\n"
276 " vmov.f32 q1, #0.0\n"
/* Wide step: 16 floats from b ... */
278 " vld1.f32 {q4, q5}, [%[b]]!\n"
279 " vld1.f32 {q6, q7}, [%[b]]!\n"
/* ... and 4 samples from a, each broadcast across one q register. */
280 " vld1.f32 {d16[], d17[]}, [%[a]]!\n"
281 " vld1.f32 {d18[], d19[]}, [%[a]]!\n"
282 " vld1.f32 {d20[], d21[]}, [%[a]]!\n"
283 " vld1.f32 {d22[], d23[]}, [%[a]]!\n"
284 " subs %[len], %[len], #4\n"
/* Multiply-accumulate, alternating between q0 and q1. */
285 " vmla.f32 q0, q4, q8\n"
286 " vmla.f32 q1, q5, q9\n"
287 " vmla.f32 q0, q6, q10\n"
288 " vmla.f32 q1, q7, q11\n"
/* Fold the second accumulator into the first. */
290 " vadd.f32 q0, q0, q1\n"
292 " cmp %[remainder], #0\n"
/* Tail step: 4 floats from b against one broadcast sample of a. */
295 " vld1.f32 {q4}, [%[b]]!\n"
296 " vld1.f32 {d16[], d17[]}, [%[a]]!\n"
297 " subs %[remainder], %[remainder], #1\n"
298 " vmla.f32 q0, q4, q8\n"
/* Weight each lane with its coefficient. */
301 " vld1.f32 {q10}, [%[ic]]\n"
302 " vmul.f32 q0, q0, q10\n"
/* Horizontal sum of the four lanes; the total ends up in lane 0. */
303 " vadd.f32 d0, d0, d1\n"
304 " vpadd.f32 d0, d0, d0\n"
/* Store only lane 0 as the result. */
305 " vst1.f32 d0[0], [%[o]]\n"
/* Pointers and counters are modified by the asm, hence read-write. */
306 : [a] "+r" (a), [b] "+r" (b),
307 [len] "+r" (len), [remainder] "+r" (remainder)
308 : [o] "r" (o), [ic] "r" (icoeff)
/* Clobbers: flags, vector registers, and memory (store through o). */
309 : "cc", "q0", "q1", "q4", "q5", "q6", "q7", "q8",
310 "q9", "q10", "q11", "memory");
/*
 * inner_product_gdouble_none_1_neon: gdouble counterpart of the gint16 and
 * gfloat "none" inner products, with the same parameter list.
 * NOTE(review): no body is visible for this function here - confirm
 * whether it is implemented before relying on it.
 */
314 inner_product_gdouble_none_1_neon (gdouble * o, const gdouble * a,
315 const gdouble * b, gint len, const gdouble * icoeff)
/*
 * inner_product_gdouble_linear_1_neon: gdouble counterpart of the gint16
 * and gfloat "linear" inner products, with the same parameter list.
 * NOTE(review): no body is visible for this function here - confirm
 * whether it is implemented before relying on it.
 */
320 inner_product_gdouble_linear_1_neon (gdouble * o, const gdouble * a,
321 const gdouble * b, gint len, const gdouble * icoeff)
/*
 * inner_product_gdouble_cubic_1_neon: gdouble counterpart of the gint16
 * and gfloat "cubic" inner products, with the same parameter list.
 * NOTE(review): no body is visible for this function here - confirm
 * whether it is implemented before relying on it.
 */
326 inner_product_gdouble_cubic_1_neon (gdouble * o, const gdouble * a,
327 const gdouble * b, gint len, const gdouble * icoeff)
/*
 * interpolate_gdouble_linear_neon: takes an output pointer @o, an input
 * pointer @a, a length @len and coefficients @icoeff; it is installed as
 * interpolate_gdouble_linear by audio_resampler_check_neon.
 * NOTE(review): no body is visible for this function here - confirm
 * whether it is implemented before relying on it.
 */
332 interpolate_gdouble_linear_neon (gdouble * o, const gdouble * a,
333 gint len, const gdouble * icoeff)
/*
 * interpolate_gdouble_cubic_neon: takes an output pointer @o, an input
 * pointer @a, a length @len and coefficients @icoeff; it is installed as
 * interpolate_gdouble_cubic by audio_resampler_check_neon.
 * NOTE(review): no body is visible for this function here - confirm
 * whether it is implemented before relying on it.
 */
338 interpolate_gdouble_cubic_neon (gdouble * o, const gdouble * a,
339 gint len, const gdouble * icoeff)
/*
 * Instantiate one resampler per sample type (gint16, gint32, gfloat,
 * gdouble) and mode (none, linear, cubic) for the "neon" suffix.
 * MAKE_RESAMPLE_FUNC is defined elsewhere; presumably each invocation
 * produces the resample_<type>_<mode>_1_neon function that
 * audio_resampler_check_neon installs - confirm against the macro.
 */
343 MAKE_RESAMPLE_FUNC (gint16, none, 1, neon);
344 MAKE_RESAMPLE_FUNC (gint16, linear, 1, neon);
345 MAKE_RESAMPLE_FUNC (gint16, cubic, 1, neon);
347 MAKE_RESAMPLE_FUNC (gint32, none, 1, neon);
348 MAKE_RESAMPLE_FUNC (gint32, linear, 1, neon);
349 MAKE_RESAMPLE_FUNC (gint32, cubic, 1, neon);
351 MAKE_RESAMPLE_FUNC (gfloat, none, 1, neon);
352 MAKE_RESAMPLE_FUNC (gfloat, linear, 1, neon);
353 MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, neon);
355 MAKE_RESAMPLE_FUNC (gdouble, none, 1, neon);
356 MAKE_RESAMPLE_FUNC (gdouble, linear, 1, neon);
357 MAKE_RESAMPLE_FUNC (gdouble, cubic, 1, neon);
/*
 * audio_resampler_check_neon:
 * @target_name: name of the optimisation target to test
 * @option: not referenced by the visible code
 *
 * When @target_name is exactly "neon", logs a debug message and points the
 * resample_* and interpolate_gdouble_* function pointers at their NEON
 * implementations.
 *
 * NOTE(review): the gint32 and gdouble pointers are assigned here, but no
 * bodies are visible for the matching inner-product functions - confirm
 * that these assignments are meant to be active.
 */
360 audio_resampler_check_neon (const gchar *target_name, const gchar *option)
/* strcmp returns 0 on an exact match. */
362 if (!strcmp (target_name, "neon")) {
363 GST_DEBUG ("enable NEON optimisations");
/* 16-bit integer resamplers. */
364 resample_gint16_none_1 = resample_gint16_none_1_neon;
365 resample_gint16_linear_1 = resample_gint16_linear_1_neon;
366 resample_gint16_cubic_1 = resample_gint16_cubic_1_neon;
/* Single-precision float resamplers. */
368 resample_gfloat_none_1 = resample_gfloat_none_1_neon;
369 resample_gfloat_linear_1 = resample_gfloat_linear_1_neon;
370 resample_gfloat_cubic_1 = resample_gfloat_cubic_1_neon;
/* 32-bit integer resamplers. */
373 resample_gint32_none_1 = resample_gint32_none_1_neon;
374 resample_gint32_linear_1 = resample_gint32_linear_1_neon;
375 resample_gint32_cubic_1 = resample_gint32_cubic_1_neon;
/* Double-precision float resamplers. */
377 resample_gdouble_none_1 = resample_gdouble_none_1_neon;
378 resample_gdouble_linear_1 = resample_gdouble_linear_1_neon;
379 resample_gdouble_cubic_1 = resample_gdouble_cubic_1_neon;
/* Double-precision interpolation helpers. */
381 interpolate_gdouble_linear = interpolate_gdouble_linear_neon;
382 interpolate_gdouble_cubic = interpolate_gdouble_cubic_neon;