dali-physics/third-party/bullet3/src/LinearMath/btVector3.cpp
1 /*
2  Copyright (c) 2011 Apple Inc.
3  https://bulletphysics.org
4  
5  This software is provided 'as-is', without any express or implied warranty.
6  In no event will the authors be held liable for any damages arising from the use of this software.
7  Permission is granted to anyone to use this software for any purpose, 
8  including commercial applications, and to alter it and redistribute it freely, 
9  subject to the following restrictions:
10  
11  1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
12  2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
13  3. This notice may not be removed or altered from any source distribution.
14  
15  This source version has been altered.
16  */
17
18 #if defined(_WIN32) || defined(__i386__)
19 #define BT_USE_SSE_IN_API
20 #endif
21
22 #include "btVector3.h"
23
24 #if defined BT_USE_SIMD_VECTOR3
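// Everything below this guard implements the batch dot-product helpers
// _maxdot_large / _mindot_large that btVector3.h declares for SIMD builds;
// an SSE path comes first, followed by a NEON path with runtime dispatch.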
25
26 #if DEBUG
27 #include <string.h>  //for memset
28 #endif
29
30 #ifdef __APPLE__
31 #include <stdint.h>
32 typedef float float4 __attribute__((vector_size(16)));
33 #else
34 #define float4 __m128
35 #endif
36 //typedef  uint32_t uint4 __attribute__ ((vector_size(16)));
37
38 #if defined BT_USE_SSE || defined _WIN32
39
40 #define LOG2_ARRAY_SIZE 6
41 #define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE)
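// STACK_ARRAY_COUNT is 1 << 6 = 64 float4 slots (1 KB of stack); each slot caches four
// dot products, so a full segment covers 256 vertices before the winning index is
// re-derived from the cached results.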
42
43 #include <emmintrin.h>
44
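// _maxdot_large scans `count` xyz vertices (each stored in a 16-byte float4 slot, w
// ignored), returns the index of the vertex with the largest dot product against `vec`,
// and writes that dot product to *dotResult. A scalar sketch of the same contract
// (illustrative only, not part of the upstream source):
//
//   long best = -1;
//   float bestDot = -BT_INFINITY;
//   for (unsigned long i = 0; i < count; i++)
//   {
//     float d = vv[i * 4 + 0] * vec[0] + vv[i * 4 + 1] * vec[1] + vv[i * 4 + 2] * vec[2];
//     if (d > bestDot) { bestDot = d; best = (long)i; }
//   }
//   *dotResult = bestDot;
//   return best;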
45 long _maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult);
46 long _maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult)
47 {
48         const float4 *vertices = (const float4 *)vv;
49         static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
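        // indexTable maps a non-zero 4-bit _mm_movemask_ps value to the position of its
        // lowest set bit (a tiny count-trailing-zeros table); entry 0 is unused and kept
        // at -1 because the table is only consulted once a matching lane is known to exist.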
50         float4 dotMax = btAssign128(-BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY);
51         float4 vvec = _mm_loadu_ps(vec);
52         float4 vHi = btCastiTo128f(_mm_shuffle_epi32(btCastfTo128i(vvec), 0xaa));  /// zzzz
53         float4 vLo = _mm_movelh_ps(vvec, vvec);                                    /// xyxy
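        // Each dot product is x*vec.x + y*vec.y + z*vec.z: vLo carries vec.xy in both
        // 64-bit halves and vHi carries vec.z in every lane, so four vertices can be
        // multiplied per step without ever touching their w components.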
54
55         long maxIndex = -1L;
56
57         size_t segment = 0;
58         float4 stack_array[STACK_ARRAY_COUNT];
59
60 #if DEBUG
61         //memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
62 #endif
63
64         size_t index;
65         float4 max;
66         // Faster loop without cleanup code for full tiles
67         for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4)
68         {
69                 max = dotMax;
70
71                 for (index = 0; index < STACK_ARRAY_COUNT; index += 4)
72                 {  // do four dot products at a time. Carefully avoid touching the w element.
73                         float4 v0 = vertices[0];
74                         float4 v1 = vertices[1];
75                         float4 v2 = vertices[2];
76                         float4 v3 = vertices[3];
77                         vertices += 4;
78
79                         float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
80                         float4 hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
81                         float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
82                         float4 hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
83
84                         lo0 = lo0 * vLo;
85                         lo1 = lo1 * vLo;
86                         float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
87                         float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
88                         float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
89                         z = z * vHi;
90                         x = x + y;
91                         x = x + z;
92                         stack_array[index] = x;
93                         max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
94
95                         v0 = vertices[0];
96                         v1 = vertices[1];
97                         v2 = vertices[2];
98                         v3 = vertices[3];
99                         vertices += 4;
100
101                         lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
102                         hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
103                         lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
104                         hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
105
106                         lo0 = lo0 * vLo;
107                         lo1 = lo1 * vLo;
108                         z = _mm_shuffle_ps(hi0, hi1, 0x88);
109                         x = _mm_shuffle_ps(lo0, lo1, 0x88);
110                         y = _mm_shuffle_ps(lo0, lo1, 0xdd);
111                         z = z * vHi;
112                         x = x + y;
113                         x = x + z;
114                         stack_array[index + 1] = x;
115                         max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
116
117                         v0 = vertices[0];
118                         v1 = vertices[1];
119                         v2 = vertices[2];
120                         v3 = vertices[3];
121                         vertices += 4;
122
123                         lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
124                         hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
125                         lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
126                         hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
127
128                         lo0 = lo0 * vLo;
129                         lo1 = lo1 * vLo;
130                         z = _mm_shuffle_ps(hi0, hi1, 0x88);
131                         x = _mm_shuffle_ps(lo0, lo1, 0x88);
132                         y = _mm_shuffle_ps(lo0, lo1, 0xdd);
133                         z = z * vHi;
134                         x = x + y;
135                         x = x + z;
136                         stack_array[index + 2] = x;
137                         max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
138
139                         v0 = vertices[0];
140                         v1 = vertices[1];
141                         v2 = vertices[2];
142                         v3 = vertices[3];
143                         vertices += 4;
144
145                         lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
146                         hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
147                         lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
148                         hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
149
150                         lo0 = lo0 * vLo;
151                         lo1 = lo1 * vLo;
152                         z = _mm_shuffle_ps(hi0, hi1, 0x88);
153                         x = _mm_shuffle_ps(lo0, lo1, 0x88);
154                         y = _mm_shuffle_ps(lo0, lo1, 0xdd);
155                         z = z * vHi;
156                         x = x + y;
157                         x = x + z;
158                         stack_array[index + 3] = x;
159                         max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
160
161                         // It is too costly to keep the index of the max here. We will look for it again later.  We save a lot of work this way.
162                 }
163
164                 // If we found a new max
165                 if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax)))
166                 {
167                         // copy the new max across all lanes of our max accumulator
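                        // shuffle 0x4e swaps the two 64-bit halves and 0xb1 swaps adjacent
                        // lanes, so after these two max steps every lane holds the
                        // horizontal maximum of the accumulator.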
168                         max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e));
169                         max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1));
170
171                         dotMax = max;
172
173                         // find first occurrence of that max
174                         size_t test;
175                         for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++)  // local_count must be a multiple of 4
176                         {
177                         }
178                         // record where it is.
179                         maxIndex = 4 * index + segment + indexTable[test];
180                 }
181         }
182
183         // account for work we've already done
184         count -= segment;
185
186         // Deal with the last < STACK_ARRAY_COUNT vectors
187         max = dotMax;
188         index = 0;
189
190         if (btUnlikely(count > 16))
191         {
192                 for (; index + 4 <= count / 4; index += 4)
193                 {  // do four dot products at a time. Carefully avoid touching the w element.
194                         float4 v0 = vertices[0];
195                         float4 v1 = vertices[1];
196                         float4 v2 = vertices[2];
197                         float4 v3 = vertices[3];
198                         vertices += 4;
199
200                         float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
201                         float4 hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
202                         float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
203                         float4 hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
204
205                         lo0 = lo0 * vLo;
206                         lo1 = lo1 * vLo;
207                         float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
208                         float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
209                         float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
210                         z = z * vHi;
211                         x = x + y;
212                         x = x + z;
213                         stack_array[index] = x;
214                         max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
215
216                         v0 = vertices[0];
217                         v1 = vertices[1];
218                         v2 = vertices[2];
219                         v3 = vertices[3];
220                         vertices += 4;
221
222                         lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
223                         hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
224                         lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
225                         hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
226
227                         lo0 = lo0 * vLo;
228                         lo1 = lo1 * vLo;
229                         z = _mm_shuffle_ps(hi0, hi1, 0x88);
230                         x = _mm_shuffle_ps(lo0, lo1, 0x88);
231                         y = _mm_shuffle_ps(lo0, lo1, 0xdd);
232                         z = z * vHi;
233                         x = x + y;
234                         x = x + z;
235                         stack_array[index + 1] = x;
236                         max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
237
238                         v0 = vertices[0];
239                         v1 = vertices[1];
240                         v2 = vertices[2];
241                         v3 = vertices[3];
242                         vertices += 4;
243
244                         lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
245                         hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
246                         lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
247                         hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
248
249                         lo0 = lo0 * vLo;
250                         lo1 = lo1 * vLo;
251                         z = _mm_shuffle_ps(hi0, hi1, 0x88);
252                         x = _mm_shuffle_ps(lo0, lo1, 0x88);
253                         y = _mm_shuffle_ps(lo0, lo1, 0xdd);
254                         z = z * vHi;
255                         x = x + y;
256                         x = x + z;
257                         stack_array[index + 2] = x;
258                         max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
259
260                         v0 = vertices[0];
261                         v1 = vertices[1];
262                         v2 = vertices[2];
263                         v3 = vertices[3];
264                         vertices += 4;
265
266                         lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
267                         hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
268                         lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
269                         hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
270
271                         lo0 = lo0 * vLo;
272                         lo1 = lo1 * vLo;
273                         z = _mm_shuffle_ps(hi0, hi1, 0x88);
274                         x = _mm_shuffle_ps(lo0, lo1, 0x88);
275                         y = _mm_shuffle_ps(lo0, lo1, 0xdd);
276                         z = z * vHi;
277                         x = x + y;
278                         x = x + z;
279                         stack_array[index + 3] = x;
280                         max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
281
282                         // It is too costly to keep the index of the max here. We will look for it again later.  We save a lot of work this way.
283                 }
284         }
285
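        // count & -4L rounds the remaining count down to a multiple of four; subtracting
        // the 4 * index vertices already handled above leaves the full groups of four
        // still to be processed before the final 1-3 stragglers.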
286         size_t localCount = (count & -4L) - 4 * index;
287         if (localCount)
288         {
289 #ifdef __APPLE__
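                // Hand-scheduled SSE assembly for the Apple toolchain: byteIndex starts out
                // negative and counts up to zero, so each iteration loads four vertices,
                // forms their four dot products, stores them to stack_array through sap,
                // and folds them into max. Functionally it matches the intrinsic loop in
                // the #else branch below.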
290                 float4 t0, t1, t2, t3, t4;
291                 float4 *sap = &stack_array[index + localCount / 4];
292                 vertices += localCount;  // step past the block; the negative byteIndex below walks back over it
293                 size_t byteIndex = -(localCount) * sizeof(float);
294                 // AT&T-style assembly
295                 asm volatile(
296                         ".align 4                                                                   \n\
297              0: movaps  %[max], %[t2]                            // move max out of the way to avoid propagating NaNs in max \n\
298           movaps  (%[vertices], %[byteIndex], 4),    %[t0]    // vertices[0]      \n\
299           movaps  16(%[vertices], %[byteIndex], 4),  %[t1]    // vertices[1]      \n\
300           movaps  %[t0], %[max]                               // vertices[0]      \n\
301           movlhps %[t1], %[max]                               // x0y0x1y1         \n\
302          movaps  32(%[vertices], %[byteIndex], 4),  %[t3]    // vertices[2]      \n\
303          movaps  48(%[vertices], %[byteIndex], 4),  %[t4]    // vertices[3]      \n\
304           mulps   %[vLo], %[max]                              // x0y0x1y1 * vLo   \n\
305          movhlps %[t0], %[t1]                                // z0w0z1w1         \n\
306          movaps  %[t3], %[t0]                                // vertices[2]      \n\
307          movlhps %[t4], %[t0]                                // x2y2x3y3         \n\
308          mulps   %[vLo], %[t0]                               // x2y2x3y3 * vLo   \n\
309           movhlps %[t3], %[t4]                                // z2w2z3w3         \n\
310           shufps  $0x88, %[t4], %[t1]                         // z0z1z2z3         \n\
311           mulps   %[vHi], %[t1]                               // z0z1z2z3 * vHi   \n\
312          movaps  %[max], %[t3]                               // x0y0x1y1 * vLo   \n\
313          shufps  $0x88, %[t0], %[max]                        // x0x1x2x3 * vLo.x \n\
314          shufps  $0xdd, %[t0], %[t3]                         // y0y1y2y3 * vLo.y \n\
315          addps   %[t3], %[max]                               // x + y            \n\
316          addps   %[t1], %[max]                               // x + y + z        \n\
317          movaps  %[max], (%[sap], %[byteIndex])              // record result for later scrutiny \n\
318          maxps   %[t2], %[max]                               // record max, restore max   \n\
319          add     $16, %[byteIndex]                           // advance loop counter\n\
320          jnz     0b                                          \n\
321      "
322                         : [max] "+x"(max), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex)
323                         : [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap)
324                         : "memory", "cc");
325                 index += localCount / 4;
326 #else
327                 {
328                         for (unsigned int i = 0; i < localCount / 4; i++, index++)
329                         {  // do four dot products at a time. Carefully avoid touching the w element.
330                                 float4 v0 = vertices[0];
331                                 float4 v1 = vertices[1];
332                                 float4 v2 = vertices[2];
333                                 float4 v3 = vertices[3];
334                                 vertices += 4;
335
336                                 float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
337                                 float4 hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
338                                 float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
339                                 float4 hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
340
341                                 lo0 = lo0 * vLo;
342                                 lo1 = lo1 * vLo;
343                                 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
344                                 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
345                                 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
346                                 z = z * vHi;
347                                 x = x + y;
348                                 x = x + z;
349                                 stack_array[index] = x;
350                                 max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
351                         }
352                 }
353 #endif  //__APPLE__
354         }
355
356         // process the last few points
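        // The final 1-3 vertices are handled by duplicating lanes to fill the float4;
        // the duplicates are harmless because the index search below always reports the
        // lowest matching lane.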
357         if (count & 3)
358         {
359                 float4 v0, v1, v2, x, y, z;
360                 switch (count & 3)
361                 {
362                         case 3:
363                         {
364                                 v0 = vertices[0];
365                                 v1 = vertices[1];
366                                 v2 = vertices[2];
367
368                                 // Calculate 3 dot products, transpose, duplicate v2
369                                 float4 lo0 = _mm_movelh_ps(v0, v1);  // xyxy.lo
370                                 float4 hi0 = _mm_movehl_ps(v1, v0);  // z?z?.lo
371                                 lo0 = lo0 * vLo;
372                                 z = _mm_shuffle_ps(hi0, v2, 0xa8);  // z0z1z2z2
373                                 z = z * vHi;
374                                 float4 lo1 = _mm_movelh_ps(v2, v2);  // xyxy
375                                 lo1 = lo1 * vLo;
376                                 x = _mm_shuffle_ps(lo0, lo1, 0x88);
377                                 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
378                         }
379                         break;
380                         case 2:
381                         {
382                                 v0 = vertices[0];
383                                 v1 = vertices[1];
384                                 float4 xy = _mm_movelh_ps(v0, v1);
385                                 z = _mm_movehl_ps(v1, v0);
386                                 xy = xy * vLo;
387                                 z = _mm_shuffle_ps(z, z, 0xa8);
388                                 x = _mm_shuffle_ps(xy, xy, 0xa8);
389                                 y = _mm_shuffle_ps(xy, xy, 0xfd);
390                                 z = z * vHi;
391                         }
392                         break;
393                         case 1:
394                         {
395                                 float4 xy = vertices[0];
396                                 z = _mm_shuffle_ps(xy, xy, 0xaa);
397                                 xy = xy * vLo;
398                                 z = z * vHi;
399                                 x = _mm_shuffle_ps(xy, xy, 0);
400                                 y = _mm_shuffle_ps(xy, xy, 0x55);
401                         }
402                         break;
403                 }
404                 x = x + y;
405                 x = x + z;
406                 stack_array[index] = x;
407                 max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
408                 index++;
409         }
410
411         // if we found a new max.
412         if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax)))
413         {  // we found a new max. Search for it
414                 // find max across the max vector, place in all elements of max -- big latency hit here
415                 max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e));
416                 max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1));
417
418                 // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
419                 // this where it actually makes a difference is handled in the early out at the top of the function,
420                 // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
421                 // complexity, and removed it.
422
423                 dotMax = max;
424
425                 // scan for the first occurrence of max in the array
426                 size_t test;
427                 for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++)  // local_count must be a multiple of 4
428                 {
429                 }
430                 maxIndex = 4 * index + segment + indexTable[test];
431         }
432
433         _mm_store_ss(dotResult, dotMax);
434         return maxIndex;
435 }
436
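// _mindot_large mirrors _maxdot_large above: same tiling, same cached stack_array of
// partial results, with every max / _mm_max_ps replaced by min / _mm_min_ps. It returns
// the index of the smallest dot product against `vec` and writes its value to *dotResult.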
437 long _mindot_large(const float *vv, const float *vec, unsigned long count, float *dotResult);
438
439 long _mindot_large(const float *vv, const float *vec, unsigned long count, float *dotResult)
440 {
441         const float4 *vertices = (const float4 *)vv;
442         static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
443         float4 dotmin = btAssign128(BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY);
444         float4 vvec = _mm_loadu_ps(vec);
445         float4 vHi = btCastiTo128f(_mm_shuffle_epi32(btCastfTo128i(vvec), 0xaa));  /// zzzz
446         float4 vLo = _mm_movelh_ps(vvec, vvec);                                    /// xyxy
447
448         long minIndex = -1L;
449
450         size_t segment = 0;
451         float4 stack_array[STACK_ARRAY_COUNT];
452
453 #if DEBUG
454         //memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
455 #endif
456
457         size_t index;
458         float4 min;
459         // Faster loop without cleanup code for full tiles
460         for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4)
461         {
462                 min = dotmin;
463
464                 for (index = 0; index < STACK_ARRAY_COUNT; index += 4)
465                 {  // do four dot products at a time. Carefully avoid touching the w element.
466                         float4 v0 = vertices[0];
467                         float4 v1 = vertices[1];
468                         float4 v2 = vertices[2];
469                         float4 v3 = vertices[3];
470                         vertices += 4;
471
472                         float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
473                         float4 hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
474                         float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
475                         float4 hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
476
477                         lo0 = lo0 * vLo;
478                         lo1 = lo1 * vLo;
479                         float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
480                         float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
481                         float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
482                         z = z * vHi;
483                         x = x + y;
484                         x = x + z;
485                         stack_array[index] = x;
486                         min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
487
488                         v0 = vertices[0];
489                         v1 = vertices[1];
490                         v2 = vertices[2];
491                         v3 = vertices[3];
492                         vertices += 4;
493
494                         lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
495                         hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
496                         lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
497                         hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
498
499                         lo0 = lo0 * vLo;
500                         lo1 = lo1 * vLo;
501                         z = _mm_shuffle_ps(hi0, hi1, 0x88);
502                         x = _mm_shuffle_ps(lo0, lo1, 0x88);
503                         y = _mm_shuffle_ps(lo0, lo1, 0xdd);
504                         z = z * vHi;
505                         x = x + y;
506                         x = x + z;
507                         stack_array[index + 1] = x;
508                         min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
509
510                         v0 = vertices[0];
511                         v1 = vertices[1];
512                         v2 = vertices[2];
513                         v3 = vertices[3];
514                         vertices += 4;
515
516                         lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
517                         hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
518                         lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
519                         hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
520
521                         lo0 = lo0 * vLo;
522                         lo1 = lo1 * vLo;
523                         z = _mm_shuffle_ps(hi0, hi1, 0x88);
524                         x = _mm_shuffle_ps(lo0, lo1, 0x88);
525                         y = _mm_shuffle_ps(lo0, lo1, 0xdd);
526                         z = z * vHi;
527                         x = x + y;
528                         x = x + z;
529                         stack_array[index + 2] = x;
530                         min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
531
532                         v0 = vertices[0];
533                         v1 = vertices[1];
534                         v2 = vertices[2];
535                         v3 = vertices[3];
536                         vertices += 4;
537
538                         lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
539                         hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
540                         lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
541                         hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
542
543                         lo0 = lo0 * vLo;
544                         lo1 = lo1 * vLo;
545                         z = _mm_shuffle_ps(hi0, hi1, 0x88);
546                         x = _mm_shuffle_ps(lo0, lo1, 0x88);
547                         y = _mm_shuffle_ps(lo0, lo1, 0xdd);
548                         z = z * vHi;
549                         x = x + y;
550                         x = x + z;
551                         stack_array[index + 3] = x;
552                         min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
553
554                         // It is too costly to keep the index of the min here. We will look for it again later.  We save a lot of work this way.
555                 }
556
557                 // If we found a new min
558                 if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin)))
559                 {
560                         // copy the new min across all lanes of our min accumulator
561                         min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e));
562                         min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1));
563
564                         dotmin = min;
565
566                         // find first occurrence of that min
567                         size_t test;
568                         for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++)  // local_count must be a multiple of 4
569                         {
570                         }
571                         // record where it is.
572                         minIndex = 4 * index + segment + indexTable[test];
573                 }
574         }
575
576         // account for work we've already done
577         count -= segment;
578
579         // Deal with the last < STACK_ARRAY_COUNT vectors
580         min = dotmin;
581         index = 0;
582
583         if (btUnlikely(count > 16))
584         {
585                 for (; index + 4 <= count / 4; index += 4)
586                 {  // do four dot products at a time. Carefully avoid touching the w element.
587                         float4 v0 = vertices[0];
588                         float4 v1 = vertices[1];
589                         float4 v2 = vertices[2];
590                         float4 v3 = vertices[3];
591                         vertices += 4;
592
593                         float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
594                         float4 hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
595                         float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
596                         float4 hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
597
598                         lo0 = lo0 * vLo;
599                         lo1 = lo1 * vLo;
600                         float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
601                         float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
602                         float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
603                         z = z * vHi;
604                         x = x + y;
605                         x = x + z;
606                         stack_array[index] = x;
607                         min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
608
609                         v0 = vertices[0];
610                         v1 = vertices[1];
611                         v2 = vertices[2];
612                         v3 = vertices[3];
613                         vertices += 4;
614
615                         lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
616                         hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
617                         lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
618                         hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
619
620                         lo0 = lo0 * vLo;
621                         lo1 = lo1 * vLo;
622                         z = _mm_shuffle_ps(hi0, hi1, 0x88);
623                         x = _mm_shuffle_ps(lo0, lo1, 0x88);
624                         y = _mm_shuffle_ps(lo0, lo1, 0xdd);
625                         z = z * vHi;
626                         x = x + y;
627                         x = x + z;
628                         stack_array[index + 1] = x;
629                         min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
630
631                         v0 = vertices[0];
632                         v1 = vertices[1];
633                         v2 = vertices[2];
634                         v3 = vertices[3];
635                         vertices += 4;
636
637                         lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
638                         hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
639                         lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
640                         hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
641
642                         lo0 = lo0 * vLo;
643                         lo1 = lo1 * vLo;
644                         z = _mm_shuffle_ps(hi0, hi1, 0x88);
645                         x = _mm_shuffle_ps(lo0, lo1, 0x88);
646                         y = _mm_shuffle_ps(lo0, lo1, 0xdd);
647                         z = z * vHi;
648                         x = x + y;
649                         x = x + z;
650                         stack_array[index + 2] = x;
651                         min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
652
653                         v0 = vertices[0];
654                         v1 = vertices[1];
655                         v2 = vertices[2];
656                         v3 = vertices[3];
657                         vertices += 4;
658
659                         lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
660                         hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
661                         lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
662                         hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
663
664                         lo0 = lo0 * vLo;
665                         lo1 = lo1 * vLo;
666                         z = _mm_shuffle_ps(hi0, hi1, 0x88);
667                         x = _mm_shuffle_ps(lo0, lo1, 0x88);
668                         y = _mm_shuffle_ps(lo0, lo1, 0xdd);
669                         z = z * vHi;
670                         x = x + y;
671                         x = x + z;
672                         stack_array[index + 3] = x;
673                         min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
674
675                         // It is too costly to keep the index of the min here. We will look for it again later.  We save a lot of work this way.
676                 }
677         }
678
679         size_t localCount = (count & -4L) - 4 * index;
680         if (localCount)
681         {
682 #ifdef __APPLE__
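                // Same hand-scheduled tail loop as in _maxdot_large, with maxps replaced by minps.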
683                 vertices += localCount;  // step past the block; the negative byteIndex below walks back over it
684                 float4 t0, t1, t2, t3, t4;
685                 size_t byteIndex = -(localCount) * sizeof(float);
686                 float4 *sap = &stack_array[index + localCount / 4];
687
688                 asm volatile(
689                         ".align 4                                                                   \n\
690              0: movaps  %[min], %[t2]                            // move min out of the way to avoid propagating NaNs in min \n\
691              movaps  (%[vertices], %[byteIndex], 4),    %[t0]    // vertices[0]      \n\
692              movaps  16(%[vertices], %[byteIndex], 4),  %[t1]    // vertices[1]      \n\
693              movaps  %[t0], %[min]                               // vertices[0]      \n\
694              movlhps %[t1], %[min]                               // x0y0x1y1         \n\
695              movaps  32(%[vertices], %[byteIndex], 4),  %[t3]    // vertices[2]      \n\
696              movaps  48(%[vertices], %[byteIndex], 4),  %[t4]    // vertices[3]      \n\
697              mulps   %[vLo], %[min]                              // x0y0x1y1 * vLo   \n\
698              movhlps %[t0], %[t1]                                // z0w0z1w1         \n\
699              movaps  %[t3], %[t0]                                // vertices[2]      \n\
700              movlhps %[t4], %[t0]                                // x2y2x3y3         \n\
701              movhlps %[t3], %[t4]                                // z2w2z3w3         \n\
702              mulps   %[vLo], %[t0]                               // x2y2x3y3 * vLo   \n\
703              shufps  $0x88, %[t4], %[t1]                         // z0z1z2z3         \n\
704              mulps   %[vHi], %[t1]                               // z0z1z2z3 * vHi   \n\
705              movaps  %[min], %[t3]                               // x0y0x1y1 * vLo   \n\
706              shufps  $0x88, %[t0], %[min]                        // x0x1x2x3 * vLo.x \n\
707              shufps  $0xdd, %[t0], %[t3]                         // y0y1y2y3 * vLo.y \n\
708              addps   %[t3], %[min]                               // x + y            \n\
709              addps   %[t1], %[min]                               // x + y + z        \n\
710              movaps  %[min], (%[sap], %[byteIndex])              // record result for later scrutiny \n\
711              minps   %[t2], %[min]                               // record min, restore min   \n\
712              add     $16, %[byteIndex]                           // advance loop counter\n\
713              jnz     0b                                          \n\
714              "
715                         : [min] "+x"(min), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex)
716                         : [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap)
717                         : "memory", "cc");
718                 index += localCount / 4;
719 #else
720                 {
721                         for (unsigned int i = 0; i < localCount / 4; i++, index++)
722                         {  // do four dot products at a time. Carefully avoid touching the w element.
723                                 float4 v0 = vertices[0];
724                                 float4 v1 = vertices[1];
725                                 float4 v2 = vertices[2];
726                                 float4 v3 = vertices[3];
727                                 vertices += 4;
728
729                                 float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
730                                 float4 hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
731                                 float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
732                                 float4 hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
733
734                                 lo0 = lo0 * vLo;
735                                 lo1 = lo1 * vLo;
736                                 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
737                                 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
738                                 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
739                                 z = z * vHi;
740                                 x = x + y;
741                                 x = x + z;
742                                 stack_array[index] = x;
743                                 min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
744                         }
745                 }
746
747 #endif
748         }
749
750         // process the last few points
751         if (count & 3)
752         {
753                 float4 v0, v1, v2, x, y, z;
754                 switch (count & 3)
755                 {
756                         case 3:
757                         {
758                                 v0 = vertices[0];
759                                 v1 = vertices[1];
760                                 v2 = vertices[2];
761
762                                 // Calculate 3 dot products, transpose, duplicate v2
763                                 float4 lo0 = _mm_movelh_ps(v0, v1);  // xyxy.lo
764                                 float4 hi0 = _mm_movehl_ps(v1, v0);  // z?z?.lo
765                                 lo0 = lo0 * vLo;
766                                 z = _mm_shuffle_ps(hi0, v2, 0xa8);  // z0z1z2z2
767                                 z = z * vHi;
768                                 float4 lo1 = _mm_movelh_ps(v2, v2);  // xyxy
769                                 lo1 = lo1 * vLo;
770                                 x = _mm_shuffle_ps(lo0, lo1, 0x88);
771                                 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
772                         }
773                         break;
774                         case 2:
775                         {
776                                 v0 = vertices[0];
777                                 v1 = vertices[1];
778                                 float4 xy = _mm_movelh_ps(v0, v1);
779                                 z = _mm_movehl_ps(v1, v0);
780                                 xy = xy * vLo;
781                                 z = _mm_shuffle_ps(z, z, 0xa8);
782                                 x = _mm_shuffle_ps(xy, xy, 0xa8);
783                                 y = _mm_shuffle_ps(xy, xy, 0xfd);
784                                 z = z * vHi;
785                         }
786                         break;
787                         case 1:
788                         {
789                                 float4 xy = vertices[0];
790                                 z = _mm_shuffle_ps(xy, xy, 0xaa);
791                                 xy = xy * vLo;
792                                 z = z * vHi;
793                                 x = _mm_shuffle_ps(xy, xy, 0);
794                                 y = _mm_shuffle_ps(xy, xy, 0x55);
795                         }
796                         break;
797                 }
798                 x = x + y;
799                 x = x + z;
800                 stack_array[index] = x;
801                 min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
802                 index++;
803         }
804
805         // if we found a new min.
806         if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin)))
807         {  // we found a new min. Search for it
808                 // find min across the min vector, place in all elements of min -- big latency hit here
809                 min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e));
810                 min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1));
811
812                 // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
813                 // this where it actually makes a difference is handled in the early out at the top of the function,
814                 // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
815                 // complexity, and removed it.
816
817                 dotmin = min;
818
819                 // scan for the first occurrence of min in the array
820                 size_t test;
821                 for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++)  // local_count must be a multiple of 4
822                 {
823                 }
824                 minIndex = 4 * index + segment + indexTable[test];
825         }
826
827         _mm_store_ss(dotResult, dotmin);
828         return minIndex;
829 }
830
831 #elif defined BT_USE_NEON
832
833 #define ARM_NEON_GCC_COMPATIBILITY 1
834 #include <arm_neon.h>
835 #include <sys/types.h>
836 #include <sys/sysctl.h>  //for sysctlbyname
837
838 static long _maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult);
839 static long _maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult);
840 static long _maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult);
841 static long _mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult);
842 static long _mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult);
843 static long _mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult);
844
845 long (*_maxdot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = _maxdot_large_sel;
846 long (*_mindot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = _mindot_large_sel;
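// On NEON builds _maxdot_large / _mindot_large are function pointers. They start out at
// the *_sel stubs, which probe the CPU once through sysctlbyname(), rebind the pointer to
// the _v0 variant (paired 64-bit d-register math) or the _v1 variant (full 128-bit
// q-register math, chosen when hw.optional.neon_hpfp is reported), and then tail-call it,
// so the capability check is paid only on the first call.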
847
848 static inline uint32_t btGetCpuCapabilities(void)
849 {
850         static uint32_t capabilities = 0;
851         static bool testedCapabilities = false;
852
853         if (0 == testedCapabilities)
854         {
855                 uint32_t hasFeature = 0;
856                 size_t featureSize = sizeof(hasFeature);
857                 int err = sysctlbyname("hw.optional.neon_hpfp", &hasFeature, &featureSize, NULL, 0);
858
859                 if (0 == err && hasFeature)
860                         capabilities |= 0x2000;
861
862                 testedCapabilities = true;
863         }
864
865         return capabilities;
866 }
867
868 static long _maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult)
869 {
870         if (btGetCpuCapabilities() & 0x2000)
871                 _maxdot_large = _maxdot_large_v1;
872         else
873                 _maxdot_large = _maxdot_large_v0;
874
875         return _maxdot_large(vv, vec, count, dotResult);
876 }
877
878 static long _mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult)
879 {
880         if (btGetCpuCapabilities() & 0x2000)
881                 _mindot_large = _mindot_large_v1;
882         else
883                 _mindot_large = _mindot_large_v0;
884
885         return _mindot_large(vv, vec, count, dotResult);
886 }
887
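// vld1q_f32_aligned_postincrement loads one 16-byte-aligned float32x4_t and advances the
// pointer past it: 32-bit ARM uses inline vld1.f32 with a :128 alignment hint, while the
// 64-bit fallback is a plain aligned load followed by a 16-byte pointer bump.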
888 #if defined __arm__
889 #define vld1q_f32_aligned_postincrement(_ptr) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; })
890 #else
891 // support 64-bit ARM
892 #define vld1q_f32_aligned_postincrement(_ptr) ({ float32x4_t _r = ((float32x4_t*)(_ptr))[0]; (_ptr) = (const float*) ((const char*)(_ptr) + 16L); /*return*/ _r; })
893 #endif
894
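// _maxdot_large_v0: NEON variant built on 64-bit d registers. It keeps two independent
// two-lane accumulators (dotMaxLo / dotMaxHi for the even and odd result pairs) and,
// unlike the SSE path, tracks the best index in-register with vbsl selects; the halves
// and lanes are only reduced against each other once at the end.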
895 long _maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult)
896 {
897         unsigned long i = 0;
898         float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
899         float32x2_t vLo = vget_low_f32(vvec);
900         float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
901         float32x2_t dotMaxLo = (float32x2_t){-BT_INFINITY, -BT_INFINITY};
902         float32x2_t dotMaxHi = (float32x2_t){-BT_INFINITY, -BT_INFINITY};
903         uint32x2_t indexLo = (uint32x2_t){0, 1};
904         uint32x2_t indexHi = (uint32x2_t){2, 3};
905         uint32x2_t iLo = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
906         uint32x2_t iHi = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
907         const uint32x2_t four = (uint32x2_t){4, 4};
908
909         for (; i + 8 <= count; i += 8)
910         {
911                 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
912                 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
913                 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
914                 float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
915
916                 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
917                 float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
918                 float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
919                 float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
920
921                 float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
922                 float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
923                 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
924                 float32x2_t zHi = vmul_f32(z1.val[0], vHi);
925
926                 float32x2_t rLo = vpadd_f32(xy0, xy1);
927                 float32x2_t rHi = vpadd_f32(xy2, xy3);
928                 rLo = vadd_f32(rLo, zLo);
929                 rHi = vadd_f32(rHi, zHi);
930
931                 uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
932                 uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
933                 dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
934                 dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
935                 iLo = vbsl_u32(maskLo, indexLo, iLo);
936                 iHi = vbsl_u32(maskHi, indexHi, iHi);
937                 indexLo = vadd_u32(indexLo, four);
938                 indexHi = vadd_u32(indexHi, four);
939
940                 v0 = vld1q_f32_aligned_postincrement(vv);
941                 v1 = vld1q_f32_aligned_postincrement(vv);
942                 v2 = vld1q_f32_aligned_postincrement(vv);
943                 v3 = vld1q_f32_aligned_postincrement(vv);
944
945                 xy0 = vmul_f32(vget_low_f32(v0), vLo);
946                 xy1 = vmul_f32(vget_low_f32(v1), vLo);
947                 xy2 = vmul_f32(vget_low_f32(v2), vLo);
948                 xy3 = vmul_f32(vget_low_f32(v3), vLo);
949
950                 z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
951                 z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
952                 zLo = vmul_f32(z0.val[0], vHi);
953                 zHi = vmul_f32(z1.val[0], vHi);
954
955                 rLo = vpadd_f32(xy0, xy1);
956                 rHi = vpadd_f32(xy2, xy3);
957                 rLo = vadd_f32(rLo, zLo);
958                 rHi = vadd_f32(rHi, zHi);
959
960                 maskLo = vcgt_f32(rLo, dotMaxLo);
961                 maskHi = vcgt_f32(rHi, dotMaxHi);
962                 dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
963                 dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
964                 iLo = vbsl_u32(maskLo, indexLo, iLo);
965                 iHi = vbsl_u32(maskHi, indexHi, iHi);
966                 indexLo = vadd_u32(indexLo, four);
967                 indexHi = vadd_u32(indexHi, four);
968         }
969
970         for (; i + 4 <= count; i += 4)
971         {
972                 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
973                 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
974                 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
975                 float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
976
977                 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
978                 float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
979                 float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
980                 float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
981
982                 float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
983                 float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
984                 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
985                 float32x2_t zHi = vmul_f32(z1.val[0], vHi);
986
987                 float32x2_t rLo = vpadd_f32(xy0, xy1);
988                 float32x2_t rHi = vpadd_f32(xy2, xy3);
989                 rLo = vadd_f32(rLo, zLo);
990                 rHi = vadd_f32(rHi, zHi);
991
992                 uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
993                 uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
994                 dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
995                 dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
996                 iLo = vbsl_u32(maskLo, indexLo, iLo);
997                 iHi = vbsl_u32(maskHi, indexHi, iHi);
998                 indexLo = vadd_u32(indexLo, four);
999                 indexHi = vadd_u32(indexHi, four);
1000         }
1001
1002         switch (count & 3)
1003         {
1004                 case 3:
1005                 {
1006                         float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1007                         float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1008                         float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1009
1010                         float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1011                         float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
1012                         float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
1013
1014                         float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1015                         float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1016                         float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi);
1017
1018                         float32x2_t rLo = vpadd_f32(xy0, xy1);
1019                         float32x2_t rHi = vpadd_f32(xy2, xy2);
1020                         rLo = vadd_f32(rLo, zLo);
1021                         rHi = vadd_f32(rHi, zHi);
1022
1023                         uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
1024                         uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
1025                         dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
1026                         dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
1027                         iLo = vbsl_u32(maskLo, indexLo, iLo);
1028                         iHi = vbsl_u32(maskHi, indexHi, iHi);
1029                 }
1030                 break;
1031                 case 2:
1032                 {
1033                         float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1034                         float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1035
1036                         float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1037                         float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
1038
1039                         float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1040                         float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1041
1042                         float32x2_t rLo = vpadd_f32(xy0, xy1);
1043                         rLo = vadd_f32(rLo, zLo);
1044
1045                         uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
1046                         dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
1047                         iLo = vbsl_u32(maskLo, indexLo, iLo);
1048                 }
1049                 break;
1050                 case 1:
1051                 {
1052                         float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1053                         float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1054                         float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
1055                         float32x2_t zLo = vmul_f32(z0, vHi);
1056                         float32x2_t rLo = vpadd_f32(xy0, xy0);
1057                         rLo = vadd_f32(rLo, zLo);
1058                         uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
1059                         dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
1060                         iLo = vbsl_u32(maskLo, indexLo, iLo);
1061                 }
1062                 break;
1063
1064                 default:
1065                         break;
1066         }
1067
1068         // select best answer between hi and lo results
1069         uint32x2_t mask = vcgt_f32(dotMaxHi, dotMaxLo);
1070         dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
1071         iLo = vbsl_u32(mask, iHi, iLo);
1072
1073         // select best answer between even and odd results
1074         dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
1075         iHi = vdup_lane_u32(iLo, 1);
1076         mask = vcgt_f32(dotMaxHi, dotMaxLo);
1077         dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
1078         iLo = vbsl_u32(mask, iHi, iLo);
1079
1080         *dotResult = vget_lane_f32(dotMaxLo, 0);
1081         return vget_lane_u32(iLo, 0);
1082 }
1083
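// _maxdot_large_v1: full 128-bit NEON variant. vuzpq_f32 de-interleaves the x/y/z columns
// of four vertices at a time so a single vmulq/vaddq sequence yields four dot products,
// with the best indices again tracked through vbslq selects.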
1084 long _maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult)
1085 {
1086         float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
1087         float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
1088         float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
1089         const uint32x4_t four = (uint32x4_t){4, 4, 4, 4};
1090         uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3};
1091         uint32x4_t index = (uint32x4_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
1092         float32x4_t maxDot = (float32x4_t){-BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY};
1093
1094         unsigned long i = 0;
1095         for (; i + 8 <= count; i += 8)
1096         {
1097                 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1098                 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1099                 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1100                 float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
1101
1102                 // the next two lines should resolve to a single vswp d, d
1103                 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1104                 float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
1105                 // the next two lines should resolve to a single vswp d, d
1106                 float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1107                 float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
1108
1109                 xy0 = vmulq_f32(xy0, vLo);
1110                 xy1 = vmulq_f32(xy1, vLo);
1111
1112                 float32x4x2_t zb = vuzpq_f32(z0, z1);
1113                 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1114                 float32x4x2_t xy = vuzpq_f32(xy0, xy1);
1115                 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1116                 x = vaddq_f32(x, z);
1117
1118                 uint32x4_t mask = vcgtq_f32(x, maxDot);
1119                 maxDot = vbslq_f32(mask, x, maxDot);
1120                 index = vbslq_u32(mask, local_index, index);
1121                 local_index = vaddq_u32(local_index, four);
1122
1123                 v0 = vld1q_f32_aligned_postincrement(vv);
1124                 v1 = vld1q_f32_aligned_postincrement(vv);
1125                 v2 = vld1q_f32_aligned_postincrement(vv);
1126                 v3 = vld1q_f32_aligned_postincrement(vv);
1127
1128                 // the next two lines should resolve to a single vswp d, d
1129                 xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1130                 xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
1131                 // the next two lines should resolve to a single vswp d, d
1132                 z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1133                 z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
1134
1135                 xy0 = vmulq_f32(xy0, vLo);
1136                 xy1 = vmulq_f32(xy1, vLo);
1137
1138                 zb = vuzpq_f32(z0, z1);
1139                 z = vmulq_f32(zb.val[0], vHi);
1140                 xy = vuzpq_f32(xy0, xy1);
1141                 x = vaddq_f32(xy.val[0], xy.val[1]);
1142                 x = vaddq_f32(x, z);
1143
1144                 mask = vcgtq_f32(x, maxDot);
1145                 maxDot = vbslq_f32(mask, x, maxDot);
1146                 index = vbslq_u32(mask, local_index, index);
1147                 local_index = vaddq_u32(local_index, four);
1148         }
1149
1150         for (; i + 4 <= count; i += 4)
1151         {
1152                 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1153                 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1154                 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1155                 float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
1156
1157                 // the next two lines should resolve to a single vswp d, d
1158                 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1159                 float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
1160                 // the next two lines should resolve to a single vswp d, d
1161                 float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1162                 float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
1163
1164                 xy0 = vmulq_f32(xy0, vLo);
1165                 xy1 = vmulq_f32(xy1, vLo);
1166
1167                 float32x4x2_t zb = vuzpq_f32(z0, z1);
1168                 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1169                 float32x4x2_t xy = vuzpq_f32(xy0, xy1);
1170                 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1171                 x = vaddq_f32(x, z);
1172
1173                 uint32x4_t mask = vcgtq_f32(x, maxDot);
1174                 maxDot = vbslq_f32(mask, x, maxDot);
1175                 index = vbslq_u32(mask, local_index, index);
1176                 local_index = vaddq_u32(local_index, four);
1177         }
1178
1179         switch (count & 3)
1180         {
1181                 case 3:
1182                 {
1183                         float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1184                         float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1185                         float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1186
1187                         // the next two lines should resolve to a single vswp d, d
1188                         float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1189                         float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2));
1190                         // the next two lines should resolve to a single vswp d, d
1191                         float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1192                         float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2));
1193
1194                         xy0 = vmulq_f32(xy0, vLo);
1195                         xy1 = vmulq_f32(xy1, vLo);
1196
1197                         float32x4x2_t zb = vuzpq_f32(z0, z1);
1198                         float32x4_t z = vmulq_f32(zb.val[0], vHi);
1199                         float32x4x2_t xy = vuzpq_f32(xy0, xy1);
1200                         float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1201                         x = vaddq_f32(x, z);
1202
1203                         uint32x4_t mask = vcgtq_f32(x, maxDot);
1204                         maxDot = vbslq_f32(mask, x, maxDot);
1205                         index = vbslq_u32(mask, local_index, index);
1206                         local_index = vaddq_u32(local_index, four);
1207                 }
1208                 break;
1209
1210                 case 2:
1211                 {
1212                         float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1213                         float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1214
			// combine the two low (x,y) halves; should resolve to a single vswp d, d
			float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
			// combine the two high (z,w) halves; should resolve to a single vswp d, d
			float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));

			xy0 = vmulq_f32(xy0, vLo);

			float32x4x2_t zb = vuzpq_f32(z0, z0);
			float32x4_t z = vmulq_f32(zb.val[0], vHi);
			float32x4x2_t xy = vuzpq_f32(xy0, xy0);
			float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
			x = vaddq_f32(x, z);

			uint32x4_t mask = vcgtq_f32(x, maxDot);
			maxDot = vbslq_f32(mask, x, maxDot);
			index = vbslq_u32(mask, local_index, index);
			local_index = vaddq_u32(local_index, four);
		}
		break;

		case 1:
		{
			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);

			// duplicate the single low (x,y) half into both halves of the q register
			float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0));
			// broadcast z across all four lanes
			float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);

			xy0 = vmulq_f32(xy0, vLo);

			z = vmulq_f32(z, vHi);
			float32x4x2_t xy = vuzpq_f32(xy0, xy0);
			float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
			x = vaddq_f32(x, z);

			uint32x4_t mask = vcgtq_f32(x, maxDot);
			maxDot = vbslq_f32(mask, x, maxDot);
			index = vbslq_u32(mask, local_index, index);
			local_index = vaddq_u32(local_index, four);
		}
		break;

		default:
			break;
	}

	// select best answer between hi and lo results
	uint32x2_t mask = vcgt_f32(vget_high_f32(maxDot), vget_low_f32(maxDot));
	float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
	uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));

	// select best answer between even and odd results
	float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
	uint32x2_t indexHi = vdup_lane_u32(index2, 1);
	mask = vcgt_f32(maxDotO, maxDot2);
	maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
	index2 = vbsl_u32(mask, indexHi, index2);

	*dotResult = vget_lane_f32(maxDot2, 0);
	return vget_lane_u32(index2, 0);
}
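
// In the (count & 3) tail above, the last loaded vertex is duplicated into the lanes
// that carry no real data, so the same dot product is also present in a valid lane.
// Because every comparison in this file is strict (vcgtq_f32 / vcgt_f32), the final
// reduction keeps the valid lane on a tie, so the out-of-range index held by a
// duplicated lane can never be returned. A minimal scalar illustration of that
// tie-breaking rule follows (hypothetical helper, not compiled):
#if 0
static inline void _select_strictly_greater(float candidate, long candidateIndex, float *best, long *bestIndex)
{
	// mirrors vbslq_f32(vcgtq_f32(x, maxDot), x, maxDot): update only on a strictly
	// greater value, so an equal value coming from a duplicated lane is ignored
	if (candidate > *best)
	{
		*best = candidate;
		*bestIndex = candidateIndex;
	}
}
#endif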

long _mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult)
{
	unsigned long i = 0;
	float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
	float32x2_t vLo = vget_low_f32(vvec);
	float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
	float32x2_t dotMinLo = (float32x2_t){BT_INFINITY, BT_INFINITY};
	float32x2_t dotMinHi = (float32x2_t){BT_INFINITY, BT_INFINITY};
	uint32x2_t indexLo = (uint32x2_t){0, 1};
	uint32x2_t indexHi = (uint32x2_t){2, 3};
	uint32x2_t iLo = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
	uint32x2_t iHi = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
	const uint32x2_t four = (uint32x2_t){4, 4};

	for (; i + 8 <= count; i += 8)
	{
		float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
		float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
		float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
		float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

		float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
		float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
		float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
		float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);

		float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
		float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
		float32x2_t zLo = vmul_f32(z0.val[0], vHi);
		float32x2_t zHi = vmul_f32(z1.val[0], vHi);

		float32x2_t rLo = vpadd_f32(xy0, xy1);
		float32x2_t rHi = vpadd_f32(xy2, xy3);
		rLo = vadd_f32(rLo, zLo);
		rHi = vadd_f32(rHi, zHi);

		uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
		uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
		dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
		dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
		iLo = vbsl_u32(maskLo, indexLo, iLo);
		iHi = vbsl_u32(maskHi, indexHi, iHi);
		indexLo = vadd_u32(indexLo, four);
		indexHi = vadd_u32(indexHi, four);

		v0 = vld1q_f32_aligned_postincrement(vv);
		v1 = vld1q_f32_aligned_postincrement(vv);
		v2 = vld1q_f32_aligned_postincrement(vv);
		v3 = vld1q_f32_aligned_postincrement(vv);

		xy0 = vmul_f32(vget_low_f32(v0), vLo);
		xy1 = vmul_f32(vget_low_f32(v1), vLo);
		xy2 = vmul_f32(vget_low_f32(v2), vLo);
		xy3 = vmul_f32(vget_low_f32(v3), vLo);

		z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
		z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
		zLo = vmul_f32(z0.val[0], vHi);
		zHi = vmul_f32(z1.val[0], vHi);

		rLo = vpadd_f32(xy0, xy1);
		rHi = vpadd_f32(xy2, xy3);
		rLo = vadd_f32(rLo, zLo);
		rHi = vadd_f32(rHi, zHi);

		maskLo = vclt_f32(rLo, dotMinLo);
		maskHi = vclt_f32(rHi, dotMinHi);
		dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
		dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
		iLo = vbsl_u32(maskLo, indexLo, iLo);
		iHi = vbsl_u32(maskHi, indexHi, iHi);
		indexLo = vadd_u32(indexLo, four);
		indexHi = vadd_u32(indexHi, four);
	}

	for (; i + 4 <= count; i += 4)
	{
		float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
		float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
		float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
		float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

		float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
		float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
		float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
		float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);

		float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
		float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
		float32x2_t zLo = vmul_f32(z0.val[0], vHi);
		float32x2_t zHi = vmul_f32(z1.val[0], vHi);

		float32x2_t rLo = vpadd_f32(xy0, xy1);
		float32x2_t rHi = vpadd_f32(xy2, xy3);
		rLo = vadd_f32(rLo, zLo);
		rHi = vadd_f32(rHi, zHi);

		uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
		uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
		dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
		dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
		iLo = vbsl_u32(maskLo, indexLo, iLo);
		iHi = vbsl_u32(maskHi, indexHi, iHi);
		indexLo = vadd_u32(indexLo, four);
		indexHi = vadd_u32(indexHi, four);
	}
	switch (count & 3)
	{
		case 3:
		{
			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
			float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
			float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);

			float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
			float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
			float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);

			float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
			float32x2_t zLo = vmul_f32(z0.val[0], vHi);
			float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi);

			float32x2_t rLo = vpadd_f32(xy0, xy1);
			float32x2_t rHi = vpadd_f32(xy2, xy2);
			rLo = vadd_f32(rLo, zLo);
			rHi = vadd_f32(rHi, zHi);

			uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
			uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
			dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
			dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
			iLo = vbsl_u32(maskLo, indexLo, iLo);
			iHi = vbsl_u32(maskHi, indexHi, iHi);
		}
		break;
		case 2:
		{
			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
			float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);

			float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
			float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);

			float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
			float32x2_t zLo = vmul_f32(z0.val[0], vHi);

			float32x2_t rLo = vpadd_f32(xy0, xy1);
			rLo = vadd_f32(rLo, zLo);

			uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
			dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
			iLo = vbsl_u32(maskLo, indexLo, iLo);
		}
		break;
		case 1:
		{
			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
			float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
			float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
			float32x2_t zLo = vmul_f32(z0, vHi);
			float32x2_t rLo = vpadd_f32(xy0, xy0);
			rLo = vadd_f32(rLo, zLo);
			uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
			dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
			iLo = vbsl_u32(maskLo, indexLo, iLo);
		}
		break;

		default:
			break;
	}

	// select best answer between hi and lo results
	uint32x2_t mask = vclt_f32(dotMinHi, dotMinLo);
	dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
	iLo = vbsl_u32(mask, iHi, iLo);

	// select best answer between even and odd results
	dotMinHi = vdup_lane_f32(dotMinLo, 1);
	iHi = vdup_lane_u32(iLo, 1);
	mask = vclt_f32(dotMinHi, dotMinLo);
	dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
	iLo = vbsl_u32(mask, iHi, iLo);

	*dotResult = vget_lane_f32(dotMinLo, 0);
	return vget_lane_u32(iLo, 0);
}
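
// The two "select best answer" steps above reduce the four running minima (one per
// index modulo 4) to a single value/index pair: the high pair is compared against the
// low pair, then the odd lane against the even lane. A scalar sketch of the same
// reduction, with hypothetical names and not compiled:
#if 0
static inline long _reduce4_min_sketch(const float dot[4], const long idx[4], float *dotResult)
{
	// hi vs lo: lane 2 vs lane 0 and lane 3 vs lane 1 (strict '<' keeps the lower index on ties)
	float d0 = (dot[2] < dot[0]) ? dot[2] : dot[0];
	long i0 = (dot[2] < dot[0]) ? idx[2] : idx[0];
	float d1 = (dot[3] < dot[1]) ? dot[3] : dot[1];
	long i1 = (dot[3] < dot[1]) ? idx[3] : idx[1];
	// odd vs even
	*dotResult = (d1 < d0) ? d1 : d0;
	return (d1 < d0) ? i1 : i0;
}
#endif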

long _mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult)
{
	float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
	float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
	float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
	const uint32x4_t four = (uint32x4_t){4, 4, 4, 4};
	uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3};
	uint32x4_t index = (uint32x4_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
	float32x4_t minDot = (float32x4_t){BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY};

	unsigned long i = 0;
	for (; i + 8 <= count; i += 8)
	{
		float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
		float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
		float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
		float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

		// the next two lines should resolve to a single vswp d, d
		float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
		float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
		// the next two lines should resolve to a single vswp d, d
		float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
		float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));

		xy0 = vmulq_f32(xy0, vLo);
		xy1 = vmulq_f32(xy1, vLo);

		float32x4x2_t zb = vuzpq_f32(z0, z1);
		float32x4_t z = vmulq_f32(zb.val[0], vHi);
		float32x4x2_t xy = vuzpq_f32(xy0, xy1);
		float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
		x = vaddq_f32(x, z);

		uint32x4_t mask = vcltq_f32(x, minDot);
		minDot = vbslq_f32(mask, x, minDot);
		index = vbslq_u32(mask, local_index, index);
		local_index = vaddq_u32(local_index, four);

		v0 = vld1q_f32_aligned_postincrement(vv);
		v1 = vld1q_f32_aligned_postincrement(vv);
		v2 = vld1q_f32_aligned_postincrement(vv);
		v3 = vld1q_f32_aligned_postincrement(vv);

		// the next two lines should resolve to a single vswp d, d
		xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
		xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
		// the next two lines should resolve to a single vswp d, d
		z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
		z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));

		xy0 = vmulq_f32(xy0, vLo);
		xy1 = vmulq_f32(xy1, vLo);

		zb = vuzpq_f32(z0, z1);
		z = vmulq_f32(zb.val[0], vHi);
		xy = vuzpq_f32(xy0, xy1);
		x = vaddq_f32(xy.val[0], xy.val[1]);
		x = vaddq_f32(x, z);

		mask = vcltq_f32(x, minDot);
		minDot = vbslq_f32(mask, x, minDot);
		index = vbslq_u32(mask, local_index, index);
		local_index = vaddq_u32(local_index, four);
	}

	for (; i + 4 <= count; i += 4)
	{
		float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
		float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
		float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
		float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

		// the next two lines should resolve to a single vswp d, d
		float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
		float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
		// the next two lines should resolve to a single vswp d, d
		float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
		float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));

		xy0 = vmulq_f32(xy0, vLo);
		xy1 = vmulq_f32(xy1, vLo);

		float32x4x2_t zb = vuzpq_f32(z0, z1);
		float32x4_t z = vmulq_f32(zb.val[0], vHi);
		float32x4x2_t xy = vuzpq_f32(xy0, xy1);
		float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
		x = vaddq_f32(x, z);

		uint32x4_t mask = vcltq_f32(x, minDot);
		minDot = vbslq_f32(mask, x, minDot);
		index = vbslq_u32(mask, local_index, index);
		local_index = vaddq_u32(local_index, four);
	}

	switch (count & 3)
	{
		case 3:
		{
			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
			float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
			float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);

			// the next two lines should resolve to a single vswp d, d
			float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
			float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2));
			// the next two lines should resolve to a single vswp d, d
			float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
			float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2));

			xy0 = vmulq_f32(xy0, vLo);
			xy1 = vmulq_f32(xy1, vLo);

			float32x4x2_t zb = vuzpq_f32(z0, z1);
			float32x4_t z = vmulq_f32(zb.val[0], vHi);
			float32x4x2_t xy = vuzpq_f32(xy0, xy1);
			float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
			x = vaddq_f32(x, z);

			uint32x4_t mask = vcltq_f32(x, minDot);
			minDot = vbslq_f32(mask, x, minDot);
			index = vbslq_u32(mask, local_index, index);
			local_index = vaddq_u32(local_index, four);
		}
		break;

		case 2:
		{
			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
			float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);

			// combine the two low (x,y) halves; should resolve to a single vswp d, d
			float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
			// combine the two high (z,w) halves; should resolve to a single vswp d, d
			float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));

			xy0 = vmulq_f32(xy0, vLo);

			float32x4x2_t zb = vuzpq_f32(z0, z0);
			float32x4_t z = vmulq_f32(zb.val[0], vHi);
			float32x4x2_t xy = vuzpq_f32(xy0, xy0);
			float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
			x = vaddq_f32(x, z);

			uint32x4_t mask = vcltq_f32(x, minDot);
			minDot = vbslq_f32(mask, x, minDot);
			index = vbslq_u32(mask, local_index, index);
			local_index = vaddq_u32(local_index, four);
		}
		break;

		case 1:
		{
			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);

			// duplicate the single low (x,y) half into both halves of the q register
			float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0));
			// broadcast z across all four lanes
			float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);

			xy0 = vmulq_f32(xy0, vLo);

			z = vmulq_f32(z, vHi);
			float32x4x2_t xy = vuzpq_f32(xy0, xy0);
			float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
			x = vaddq_f32(x, z);

			uint32x4_t mask = vcltq_f32(x, minDot);
			minDot = vbslq_f32(mask, x, minDot);
			index = vbslq_u32(mask, local_index, index);
			local_index = vaddq_u32(local_index, four);
		}
		break;

		default:
			break;
	}

	// select best answer between hi and lo results
	uint32x2_t mask = vclt_f32(vget_high_f32(minDot), vget_low_f32(minDot));
	float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
	uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));

	// select best answer between even and odd results
	float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
	uint32x2_t indexHi = vdup_lane_u32(index2, 1);
	mask = vclt_f32(minDotO, minDot2);
	minDot2 = vbsl_f32(mask, minDotO, minDot2);
	index2 = vbsl_u32(mask, indexHi, index2);

	*dotResult = vget_lane_f32(minDot2, 0);
	return vget_lane_u32(index2, 0);
}
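
// A minimal usage sketch for these kernels (not compiled). It assumes the caller stores
// vertices as 16-byte-aligned x, y, z, w groups (the w element is ignored); how btVector3
// dispatches between the _v0/_v1 variants is outside this section, so the direct call
// below is illustrative only.
#if 0
static void _mindot_usage_sketch()
{
	// 8 vertices of 4 floats each, 16-byte aligned for the aligned NEON loads
	float vertices[8 * 4] __attribute__((aligned(16))) = {0};
	vertices[5 * 4 + 0] = -3.0f;  // give vertex 5 the most negative x component
	float direction[4] __attribute__((aligned(16))) = {1.0f, 0.0f, 0.0f, 0.0f};

	float minDot = 0.0f;
	long winner = _mindot_large_v1(vertices, direction, 8, &minDot);
	// expected: winner == 5 and minDot == -3.0f
	(void)winner;
}
#endif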

#else
#error Unhandled __APPLE__ arch
#endif

#endif /* __APPLE__ */