/* GStreamer
 * Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 */
/* Inner product of gint16 samples a[] with gint16 taps b[], "none"
 * (no sub-sample interpolation) variant, ARM NEON inline asm.
 * Widening multiply-accumulate (vmlal.s16) into 32-bit lanes of q0/q1,
 * 16 samples per main-loop pass and 4 per remainder pass, then a
 * horizontal reduce and rounding/saturating #15 narrow (Q15 fixed
 * point) stores one gint16 result to *o.  icoeff is unused here.
 * NOTE(review): this copy is garbled -- original line numbers are fused
 * onto each line, and braces, loop labels and branches are elided
 * (fused numbering jumps 27->32, 42->45 etc.); verify against upstream. */
21 inner_product_gint16_none_1_neon (gint16 * o, const gint16 * a,
22     const gint16 * b, gint len, const gint16 * icoeff)
/* Split len into a multiple of 16 for the main loop; the rest is
 * consumed 4 samples at a time by the remainder sequence below. */
24   uint32_t remainder = len % 16;
25   len = len - remainder;
27   asm volatile ("      vmov.s32 q0, #0\n"
/* Main loop body: load 16 taps and 16 samples (post-increment),
 * widening MAC into the two accumulators q0/q1. */
32     "      vld1.16 {d16, d17, d18, d19}, [%[b]]!\n"
33     "      vld1.16 {d20, d21, d22, d23}, [%[a]]!\n"
34     "      subs %[len], %[len], #16\n"
35     "      vmlal.s16 q0, d16, d20\n"
36     "      vmlal.s16 q1, d17, d21\n"
37     "      vmlal.s16 q0, d18, d22\n"
38     "      vmlal.s16 q1, d19, d23\n"
/* Fold the two accumulators into q0. */
40     "      vadd.s32 q0, q0, q1\n"
/* Remainder: 4 samples per pass (branch targets elided in this copy). */
42     "      cmp %[remainder], #0\n"
45     "      vld1.16 {d16}, [%[b]]!\n"
46     "      vld1.16 {d20}, [%[a]]!\n"
47     "      subs %[remainder], %[remainder], #4\n"
48     "      vmlal.s16 q0, d16, d20\n"
/* Horizontal reduction of the 4 accumulator lanes, then round/saturate
 * back to 16 bits (shift #15 = Q15) and store the single result. */
51     "      vadd.s32 d0, d0, d1\n"
52     "      vpadd.s32 d0, d0, d0\n"
53     "      vqrshrn.s32 d0, q0, #15\n"
54     "      vst1.s16 d0[0], [%[o]]\n"
/* a/b/len/remainder are read-write ("+r"): the pointers advance and the
 * counters are decremented by the asm itself. */
55     : [a] "+r" (a), [b] "+r" (b),
56       [len] "+r" (len), [remainder] "+r" (remainder)
59       "d16", "d17", "d18", "d19",
60       "d20", "d21", "d22", "d23");
/* Inner product of gint16 samples with linear sub-sample interpolation.
 * Accumulates two filter phases, narrows them (vshrn #15), blends with
 * the two interpolation coefficients from icoeff (vmull.s16), then
 * reduces and stores one rounded/saturated gint16 to *o.
 * NOTE(review): garbled copy -- d22/d23 are used in the MACs but no
 * visible instruction loads them from a; the duplicating load and the
 * loop labels/branches appear to be elided.  Verify against upstream. */
64 inner_product_gint16_linear_1_neon (gint16 * o, const gint16 * a,
65     const gint16 * b, gint len, const gint16 * icoeff)
/* Main loop works on 8 samples; remainder handled 4 at a time. */
67   uint32_t remainder = len % 8;
68   len = len - remainder;
70   asm volatile ("      vmov.s16 q0, #0\n"
/* Main loop: 16 taps (two phases) against 8 samples. */
75     "      vld1.16 {d16, d17, d18, d19}, [%[b]]!\n"
76     "      vld1.16 {d20, d21}, [%[a]]!\n"
79     "      subs %[len], %[len], #8\n"
80     "      vmlal.s16 q0, d16, d20\n"
81     "      vmlal.s16 q1, d17, d21\n"
82     "      vmlal.s16 q0, d18, d22\n"
83     "      vmlal.s16 q1, d19, d23\n"
85     "      vadd.s32 q0, q0, q1\n"
/* Remainder: 4 samples per pass. */
87     "      cmp %[remainder], #0\n"
90     "      vld1.16 {d16, d17}, [%[b]]!\n"
91     "      vld1.16 {d20}, [%[a]]!\n"
94     "      subs %[remainder], %[remainder], #4\n"
95     "      vmlal.s16 q0, d16, d20\n"
96     "      vmlal.s16 q0, d17, d21\n"
/* Narrow the per-phase sums to 16 bits, weight by the interpolation
 * coefficients in icoeff, reduce, and store one Q15-rounded result. */
99     "      vshrn.s32 d0, q0, #15\n"
100    "      vld1.16 {d20}, [%[ic]]\n"
101    "      vmull.s16 q0, d0, d20\n"
102    "      vadd.s32 d0, d0, d1\n"
103    "      vpadd.s32 d0, d0, d0\n"
104    "      vqrshrn.s32 d0, q0, #15\n"
105    "      vst1.s16 d0[0], [%[o]]\n"
106    : [a] "+r" (a), [b] "+r" (b),
107      [len] "+r" (len), [remainder] "+r" (remainder)
/* o and ic are inputs only; "memory" because *o is written. */
108    : [o] "r" (o), [ic] "r" (icoeff)
110      "d16", "d17", "d18", "d19",
111      "d20", "d21", "d22", "d23" , "memory");
/* Inner product of gint16 samples with cubic sub-sample interpolation.
 * vld4.16 with [] broadcast replicates 4 consecutive values from a
 * across d20..d23; these are MAC'd against 4 groups of taps from b,
 * then the 4 phase sums are weighted by the 4 cubic coefficients in
 * icoeff and reduced to a single rounded/saturated gint16 at *o.
 * NOTE(review): garbled copy -- loop labels/branches and the d22/d23
 * final-stage uses are elided (fused numbering jumps 118->123, 140->142);
 * verify against upstream before relying on the exact sequence. */
115 inner_product_gint16_cubic_1_neon (gint16 * o, const gint16 * a,
116     const gint16 * b, gint len, const gint16 * icoeff)
118   asm volatile ("      vmov.s32 q0, #0\n"
/* Main loop: 4 samples per pass (subs #4). */
123    "      vld1.16 {d16, d17, d18, d19}, [%[b]]!\n"
124    "      vld4.16 {d20[], d21[], d22[], d23[]}, [%[a]]!\n"
125    "      subs %[len], %[len], #4\n"
126    "      vmlal.s16 q0, d16, d20\n"
127    "      vmlal.s16 q1, d17, d21\n"
128    "      vmlal.s16 q0, d18, d22\n"
129    "      vmlal.s16 q1, d19, d23\n"
131    "      vadd.s32 q0, q0, q1\n"
/* Weight the narrowed phase sums with the interpolation coefficients,
 * reduce horizontally, round/saturate to Q15, store one sample. */
133    "      vshrn.s32 d0, q0, #15\n"
134    "      vld1.16 {d20}, [%[ic]]\n"
135    "      vmull.s16 q0, d0, d20\n"
136    "      vadd.s32 d0, d0, d1\n"
137    "      vpadd.s32 d0, d0, d0\n"
138    "      vqrshrn.s32 d0, q0, #15\n"
139    "      vst1.s16 d0[0], [%[o]]\n"
140    : [a] "+r" (a), [b] "+r" (b),
142    : [o] "r" (o), [ic] "r" (icoeff)
144      "d16", "d17", "d18", "d19",
145      "d20", "d21", "d22", "d23" , "memory");
/* Inner product of gint32 samples/taps, "none" variant.  Same shape as
 * the gint16 version but with 32x32->64-bit MACs (vmlal.s32) and a
 * rounding/saturating #31 narrow (Q31 fixed point) back to gint32.
 * Main loop consumes 8 samples, remainder loop 4.  icoeff unused.
 * NOTE(review): garbled copy -- braces, loop labels and branches are
 * elided; verify against upstream. */
149 inner_product_gint32_none_1_neon (gint32 * o, const gint32 * a,
150     const gint32 * b, gint len, const gint32 * icoeff)
152   uint32_t remainder = len % 8;
153   len = len - remainder;
155   asm volatile ("      vmov.s64 q0, #0\n"
/* Main loop: 8 gint32 taps and samples, 64-bit accumulation. */
160    "      vld1.32 {d16, d17, d18, d19}, [%[b]]!\n"
161    "      vld1.32 {d20, d21, d22, d23}, [%[a]]!\n"
162    "      subs %[len], %[len], #8\n"
163    "      vmlal.s32 q0, d16, d20\n"
164    "      vmlal.s32 q1, d17, d21\n"
165    "      vmlal.s32 q0, d18, d22\n"
166    "      vmlal.s32 q1, d19, d23\n"
168    "      vadd.s64 q0, q0, q1\n"
/* Remainder: 4 samples per pass. */
170    "      cmp %[remainder], #0\n"
173    "      vld1.32 {d16, d17}, [%[b]]!\n"
174    "      vld1.32 {d20, d21}, [%[a]]!\n"
175    "      subs %[remainder], %[remainder], #4\n"
176    "      vmlal.s32 q0, d16, d20\n"
177    "      vmlal.s32 q0, d17, d21\n"
/* Reduce the two 64-bit lanes, round/saturate to 32 bits (Q31), store. */
180    "      vadd.s64 d0, d0, d1\n"
181    "      vqrshrn.s64 d0, q0, #31\n"
182    "      vst1.s32 d0[0], [%[o]]\n"
183    : [a] "+r" (a), [b] "+r" (b),
184      [len] "+r" (len), [remainder] "+r" (remainder)
187      "d16", "d17", "d18", "d19",
188      "d20", "d21", "d22", "d23");
/* Inner product of gint32 samples with linear interpolation.  vld2 with
 * [] broadcast duplicates sample pairs across d20/d21 and d22/d23 so
 * each sample meets both filter phases; the two 64-bit phase sums are
 * narrowed (#31), weighted by the two coefficients in icoeff, reduced,
 * and stored as one rounded/saturated gint32 at *o.
 * NOTE(review): garbled copy -- loop labels/branches elided (fused
 * numbering jumps 195->200, 217->219); verify against upstream. */
192 inner_product_gint32_linear_1_neon (gint32 * o, const gint32 * a,
193     const gint32 * b, gint len, const gint32 * icoeff)
195   asm volatile ("      vmov.s64 q0, #0\n"
/* Main loop: 8 taps (two phases) against 4 duplicated samples. */
200    "      vld1.s32 {d16, d17, d18, d19}, [%[b]]!\n"
201    "      vld2.s32 {d20[], d21[]}, [%[a]]!\n"
202    "      vld2.s32 {d22[], d23[]}, [%[a]]!\n"
203    "      subs %[len], %[len], #4\n"
204    "      vmlal.s32 q0, d16, d20\n"
205    "      vmlal.s32 q1, d17, d21\n"
206    "      vmlal.s32 q0, d18, d22\n"
207    "      vmlal.s32 q1, d19, d23\n"
209    "      vadd.s64 q0, q0, q1\n"
/* Blend phases with icoeff, reduce, round/saturate to Q31, store. */
211    "      vld1.s32 {d20}, [%[ic]]\n"
212    "      vshrn.s64 d0, q0, #31\n"
213    "      vmull.s32 q0, d0, d20\n"
214    "      vadd.s64 d0, d0, d1\n"
215    "      vqrshrn.s64 d0, q0, #31\n"
216    "      vst1.s32 d0[0], [%[o]]\n"
217    : [a] "+r" (a), [b] "+r" (b),
219    : [o] "r" (o), [ic] "r" (icoeff)
221      "d16", "d17", "d18", "d19",
222      "d20", "d21", "d22", "d23", "memory");
/* Inner product of gint32 samples with cubic interpolation.  Uses q4-q7
 * (d8-d15) for the 16 taps -- these are VFP callee-saved registers, but
 * they are declared in the clobber list so the compiler preserves them.
 * Four broadcast-loaded samples are MAC'd against four tap groups; the
 * four 64-bit phase sums are narrowed (#31), weighted by the 4 cubic
 * coefficients in icoeff, reduced, and stored rounded/saturated to *o.
 * NOTE(review): garbled copy -- loop labels/branches elided (fused
 * numbering jumps 229->234, 248->251, 259->261); verify upstream. */
226 inner_product_gint32_cubic_1_neon (gint32 * o, const gint32 * a,
227     const gint32 * b, gint len, const gint32 * icoeff)
229   asm volatile ("      vmov.s64 q0, #0\n"
/* Main loop: 16 taps in q4-q7, 4 broadcast samples in d16-d23. */
234    "      vld1.s32 {q4, q5}, [%[b]]!\n"
235    "      vld1.s32 {q6, q7}, [%[b]]!\n"
236    "      vld1.s32 {d16[], d17[]}, [%[a]]!\n"
237    "      vld1.s32 {d18[], d19[]}, [%[a]]!\n"
238    "      vld1.s32 {d20[], d21[]}, [%[a]]!\n"
239    "      vld1.s32 {d22[], d23[]}, [%[a]]!\n"
240    "      subs %[len], %[len], #4\n"
241    "      vmlal.s32 q0, d16, d8\n"
242    "      vmlal.s32 q1, d17, d9\n"
243    "      vmlal.s32 q0, d18, d10\n"
244    "      vmlal.s32 q1, d19, d11\n"
245    "      vmlal.s32 q0, d20, d12\n"
246    "      vmlal.s32 q1, d21, d13\n"
247    "      vmlal.s32 q0, d22, d14\n"
248    "      vmlal.s32 q1, d23, d15\n"
/* Narrow both accumulators, weight with the 4 icoeff values, reduce,
 * round/saturate to Q31 and store the single result. */
251    "      vld1.s32 {d20, d21}, [%[ic]]\n"
252    "      vshrn.s64 d16, q0, #31\n"
253    "      vshrn.s64 d17, q1, #31\n"
254    "      vmull.s32 q0, d20, d16\n"
255    "      vmlal.s32 q0, d21, d17\n"
256    "      vadd.s64 d0, d0, d1\n"
257    "      vqrshrn.s64 d0, q0, #31\n"
258    "      vst1.s32 d0[0], [%[o]]\n"
259    : [a] "+r" (a), [b] "+r" (b),
261    : [o] "r" (o), [ic] "r" (icoeff)
262    : "cc", "q0", "q1", "q4", "q5", "q6", "q7", "q8",
263      "q9", "q10", "q11", "memory");
/* Inner product of gfloat samples/taps, "none" variant.  Fused
 * multiply-accumulate (vmla.f32) over 16 floats per main-loop pass and
 * 4 per remainder pass, followed by a horizontal add (vadd + vpadd) and
 * a single-float store to *o.  icoeff is unused here.
 * NOTE(review): garbled copy -- loop labels/branches are elided and the
 * clobber list at the end appears cut off mid-line; verify upstream. */
267 inner_product_gfloat_none_1_neon (gfloat * o, const gfloat * a,
268     const gfloat * b, gint len, const gfloat * icoeff)
270   uint32_t remainder = len % 16;
271   len = len - remainder;
273   asm volatile ("      vmov.f32 q0, #0.0\n"
276    "      vmov.f32 q1, #0.0\n"
/* Main loop: 16 floats from each of a and b per pass. */
278    "      vld1.32 {q4, q5}, [%[b]]!\n"
279    "      vld1.32 {q8, q9}, [%[a]]!\n"
280    "      vld1.32 {q6, q7}, [%[b]]!\n"
281    "      vld1.32 {q10, q11}, [%[a]]!\n"
282    "      subs %[len], %[len], #16\n"
283    "      vmla.f32 q0, q4, q8\n"
284    "      vmla.f32 q1, q5, q9\n"
285    "      vmla.f32 q0, q6, q10\n"
286    "      vmla.f32 q1, q7, q11\n"
288    "      vadd.f32 q0, q0, q1\n"
/* Remainder: 4 floats per pass. */
290    "      cmp %[remainder], #0\n"
293    "      vld1.32 {q6}, [%[b]]!\n"
294    "      vld1.32 {q10}, [%[a]]!\n"
295    "      subs %[remainder], %[remainder], #4\n"
296    "      vmla.f32 q0, q6, q10\n"
/* Horizontal reduction of the 4 float lanes, store one float. */
299    "      vadd.f32 d0, d0, d1\n"
300    "      vpadd.f32 d0, d0, d0\n"
301    "      vst1.f32 d0[0], [%[o]]\n"
302    : [a] "+r" (a), [b] "+r" (b),
303      [len] "+r" (len), [remainder] "+r" (remainder)
305    : "cc", "q0", "q1", "q4", "q5", "q6", "q7", "q8",
/* Inner product of gfloat samples with linear interpolation.  vld2.f32
 * deinterleaves the taps in b into the two filter phases (q4/q6 even,
 * q5/q7 odd); each sample block is MAC'd against both.  The two phase
 * sums are weighted with the coefficients in icoeff (vmul), reduced
 * horizontally, and one float is stored to *o.
 * NOTE(review): garbled copy -- loop labels/branches are elided and the
 * remainder path uses q5 implicitly via vld2 {q4}; verify upstream. */
310 inner_product_gfloat_linear_1_neon (gfloat * o, const gfloat * a,
311     const gfloat * b, gint len, const gfloat * icoeff)
313   uint32_t remainder = len % 8;
314   len = len - remainder;
316   asm volatile ("      vmov.f32 q0, #0.0\n"
319    "      vmov.f32 q1, #0.0\n"
/* Main loop: 16 interleaved taps against 8 samples. */
321    "      vld2.f32 {q4, q5}, [%[b]]!\n"
322    "      vld2.f32 {q6, q7}, [%[b]]!\n"
323    "      vld1.f32 {q8, q9}, [%[a]]!\n"
324    "      subs %[len], %[len], #8\n"
325    "      vmla.f32 q0, q4, q8\n"
326    "      vmla.f32 q1, q5, q8\n"
327    "      vmla.f32 q0, q6, q9\n"
328    "      vmla.f32 q1, q7, q9\n"
330    "      vadd.f32 q0, q0, q1\n"
/* Remainder: 4 samples per pass. */
332    "      cmp %[remainder], #0\n"
335    "      vld2.f32 {q4}, [%[b]]!\n"
336    "      vld1.f32 {q8}, [%[a]]!\n"
337    "      subs %[remainder], %[remainder], #4\n"
338    "      vmla.f32 q0, q4, q8\n"
/* Weight by the interpolation coefficients, reduce, store one float. */
341    "      vld1.f32 {q10}, [%[ic]]\n"
342    "      vmul.f32 q0, q0, q10\n"
343    "      vadd.f32 d0, d0, d1\n"
344    "      vpadd.f32 d0, d0, d0\n"
345    "      vst1.f32 d0[0], [%[o]]\n"
346    : [a] "+r" (a), [b] "+r" (b),
347      [len] "+r" (len), [remainder] "+r" (remainder)
348    : [o] "r" (o), [ic] "r" (icoeff)
349    : "cc", "q0", "q1", "q4", "q5", "q6", "q7", "q8",
350      "q9", "q10", "q11", "memory");
/* Inner product of gfloat samples with cubic interpolation.  Four
 * broadcast-loaded samples (d16-d23 pairs) are MAC'd against the four
 * tap groups in q4-q7; the per-phase sums are weighted with the 4 cubic
 * coefficients from icoeff, reduced horizontally, and one float stored
 * to *o.  q4-q7 are callee-saved VFP registers, preserved via the
 * clobber list.
 * NOTE(review): garbled copy -- loop labels/branches elided (fused
 * numbering jumps 357->360, 372->374, 381->383); verify upstream. */
354 inner_product_gfloat_cubic_1_neon (gfloat * o, const gfloat * a,
355     const gfloat * b, gint len, const gfloat * icoeff)
357   asm volatile ("      vmov.f32 q0, #0.0\n"
360    "      vmov.f32 q1, #0.0\n"
/* Main loop: 4 samples per pass (subs #4). */
362    "      vld1.f32 {q4, q5}, [%[b]]!\n"
363    "      vld1.f32 {q6, q7}, [%[b]]!\n"
364    "      vld1.f32 {d16[], d17[]}, [%[a]]!\n"
365    "      vld1.f32 {d18[], d19[]}, [%[a]]!\n"
366    "      vld1.f32 {d20[], d21[]}, [%[a]]!\n"
367    "      vld1.f32 {d22[], d23[]}, [%[a]]!\n"
368    "      subs %[len], %[len], #4\n"
369    "      vmla.f32 q0, q4, q8\n"
370    "      vmla.f32 q1, q5, q9\n"
371    "      vmla.f32 q0, q6, q10\n"
372    "      vmla.f32 q1, q7, q11\n"
374    "      vadd.f32 q0, q0, q1\n"
/* Weight by the cubic coefficients, reduce, store one float. */
376    "      vld1.f32 {q10}, [%[ic]]\n"
377    "      vmul.f32 q0, q0, q10\n"
378    "      vadd.f32 d0, d0, d1\n"
379    "      vpadd.f32 d0, d0, d0\n"
380    "      vst1.f32 d0[0], [%[o]]\n"
381    : [a] "+r" (a), [b] "+r" (b),
383    : [o] "r" (o), [ic] "r" (icoeff)
384    : "cc", "q0", "q1", "q4", "q5", "q6", "q7", "q8",
385      "q9", "q10", "q11", "memory");
/* gdouble inner-product and interpolate entry points.  Only the
 * signatures survive in this copy -- the bodies (orig. lines 391-393,
 * 397-399, 403-405, 409-411, 415-416) are elided.  NOTE(review):
 * presumably these delegate to non-NEON/scalar implementations, since
 * no double-precision NEON asm appears above -- TODO confirm against
 * the upstream file. */
389 inner_product_gdouble_none_1_neon (gdouble * o, const gdouble * a,
390     const gdouble * b, gint len, const gdouble * icoeff)
395 inner_product_gdouble_linear_1_neon (gdouble * o, const gdouble * a,
396     const gdouble * b, gint len, const gdouble * icoeff)
401 inner_product_gdouble_cubic_1_neon (gdouble * o, const gdouble * a,
402     const gdouble * b, gint len, const gdouble * icoeff)
/* Interpolation-only variants: note these take no b/taps argument. */
407 interpolate_gdouble_linear_neon (gdouble * o, const gdouble * a,
408     gint len, const gdouble * icoeff)
413 interpolate_gdouble_cubic_neon (gdouble * o, const gdouble * a,
414     gint len, const gdouble * icoeff)
/* Instantiate the resampler wrappers for every (type, interpolation)
 * combination defined above.  The MAKE_RESAMPLE_FUNC macro is defined
 * elsewhere in this file/project; judging from the assignments in
 * audio_resampler_check_neon below, each expansion produces a function
 * named resample_<type>_<method>_1_neon wrapping the matching
 * inner_product_<type>_<method>_1_neon -- TODO confirm macro body. */
418 MAKE_RESAMPLE_FUNC (gint16, none, 1, neon);
419 MAKE_RESAMPLE_FUNC (gint16, linear, 1, neon);
420 MAKE_RESAMPLE_FUNC (gint16, cubic, 1, neon);
422 MAKE_RESAMPLE_FUNC (gint32, none, 1, neon);
423 MAKE_RESAMPLE_FUNC (gint32, linear, 1, neon);
424 MAKE_RESAMPLE_FUNC (gint32, cubic, 1, neon);
426 MAKE_RESAMPLE_FUNC (gfloat, none, 1, neon);
427 MAKE_RESAMPLE_FUNC (gfloat, linear, 1, neon);
428 MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, neon);
430 MAKE_RESAMPLE_FUNC (gdouble, none, 1, neon);
431 MAKE_RESAMPLE_FUNC (gdouble, linear, 1, neon);
432 MAKE_RESAMPLE_FUNC (gdouble, cubic, 1, neon);
435 audio_resampler_check_neon (const gchar *target_name, const gchar *option)
437 if (!strcmp (target_name, "neon")) {
438 GST_DEBUG ("enable NEON optimisations");
439 resample_gint16_none_1 = resample_gint16_none_1_neon;
440 resample_gint16_linear_1 = resample_gint16_linear_1_neon;
441 resample_gint16_cubic_1 = resample_gint16_cubic_1_neon;
443 resample_gint32_none_1 = resample_gint32_none_1_neon;
444 resample_gint32_linear_1 = resample_gint32_linear_1_neon;
445 resample_gint32_cubic_1 = resample_gint32_cubic_1_neon;
447 resample_gfloat_none_1 = resample_gfloat_none_1_neon;
448 resample_gfloat_linear_1 = resample_gfloat_linear_1_neon;
449 resample_gfloat_cubic_1 = resample_gfloat_cubic_1_neon;
453 resample_gdouble_none_1 = resample_gdouble_none_1_neon;
454 resample_gdouble_linear_1 = resample_gdouble_linear_1_neon;
455 resample_gdouble_cubic_1 = resample_gdouble_cubic_1_neon;
457 interpolate_gdouble_linear = interpolate_gdouble_linear_neon;
458 interpolate_gdouble_cubic = interpolate_gdouble_cubic_neon;