2 Copyright (c) 2011 Apple Inc.
3 https://bulletphysics.org
5 This software is provided 'as-is', without any express or implied warranty.
6 In no event will the authors be held liable for any damages arising from the use of this software.
7 Permission is granted to anyone to use this software for any purpose,
8 including commercial applications, and to alter it and redistribute it freely,
9 subject to the following restrictions:
11 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
12 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
13 3. This notice may not be removed or altered from any source distribution.
15 This source version has been altered.
18 #if defined(_WIN32) || defined(__i386__)
19 #define BT_USE_SSE_IN_API
22 #include "btVector3.h"
24 #if defined BT_USE_SIMD_VECTOR3
27 #include <string.h> //for memset
32 typedef float float4 __attribute__((vector_size(16)));
36 //typedef uint32_t uint4 __attribute__ ((vector_size(16)));
38 #if defined BT_USE_SSE || defined _WIN32
40 #define LOG2_ARRAY_SIZE 6
41 #define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE)
43 #include <emmintrin.h>
45 long _maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult);
46 long _maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult)
48 const float4 *vertices = (const float4 *)vv;
49 static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
50 float4 dotMax = btAssign128(-BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY);
51 float4 vvec = _mm_loadu_ps(vec);
52 float4 vHi = btCastiTo128f(_mm_shuffle_epi32(btCastfTo128i(vvec), 0xaa)); /// zzzz
53 float4 vLo = _mm_movelh_ps(vvec, vvec); /// xyxy
58 float4 stack_array[STACK_ARRAY_COUNT];
61 //memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
66 // Faster loop without cleanup code for full tiles
67 for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4)
71 for (index = 0; index < STACK_ARRAY_COUNT; index += 4)
72 { // do four dot products at a time. Carefully avoid touching the w element.
73 float4 v0 = vertices[0];
74 float4 v1 = vertices[1];
75 float4 v2 = vertices[2];
76 float4 v3 = vertices[3];
79 float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
80 float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
81 float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
82 float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
86 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
87 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
88 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
92 stack_array[index] = x;
93 max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
101 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
102 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
103 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
104 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
108 z = _mm_shuffle_ps(hi0, hi1, 0x88);
109 x = _mm_shuffle_ps(lo0, lo1, 0x88);
110 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
114 stack_array[index + 1] = x;
115 max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
123 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
124 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
125 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
126 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
130 z = _mm_shuffle_ps(hi0, hi1, 0x88);
131 x = _mm_shuffle_ps(lo0, lo1, 0x88);
132 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
136 stack_array[index + 2] = x;
137 max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
145 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
146 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
147 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
148 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
152 z = _mm_shuffle_ps(hi0, hi1, 0x88);
153 x = _mm_shuffle_ps(lo0, lo1, 0x88);
154 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
158 stack_array[index + 3] = x;
159 max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
161 // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.
164 // If we found a new max
165 if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax)))
167 // copy the new max across all lanes of our max accumulator
168 max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e));
169 max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1));
173 // find first occurrence of that max
175 for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++) // local_count must be a multiple of 4
178 // record where it is.
179 maxIndex = 4 * index + segment + indexTable[test];
183 // account for work we've already done
186 // Deal with the last < STACK_ARRAY_COUNT vectors
190 if (btUnlikely(count > 16))
192 for (; index + 4 <= count / 4; index += 4)
193 { // do four dot products at a time. Carefully avoid touching the w element.
194 float4 v0 = vertices[0];
195 float4 v1 = vertices[1];
196 float4 v2 = vertices[2];
197 float4 v3 = vertices[3];
200 float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
201 float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
202 float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
203 float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
207 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
208 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
209 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
213 stack_array[index] = x;
214 max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
222 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
223 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
224 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
225 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
229 z = _mm_shuffle_ps(hi0, hi1, 0x88);
230 x = _mm_shuffle_ps(lo0, lo1, 0x88);
231 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
235 stack_array[index + 1] = x;
236 max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
244 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
245 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
246 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
247 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
251 z = _mm_shuffle_ps(hi0, hi1, 0x88);
252 x = _mm_shuffle_ps(lo0, lo1, 0x88);
253 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
257 stack_array[index + 2] = x;
258 max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
266 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
267 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
268 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
269 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
273 z = _mm_shuffle_ps(hi0, hi1, 0x88);
274 x = _mm_shuffle_ps(lo0, lo1, 0x88);
275 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
279 stack_array[index + 3] = x;
280 max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
282 // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.
286 size_t localCount = (count & -4L) - 4 * index;
290 float4 t0, t1, t2, t3, t4;
291 float4 *sap = &stack_array[index + localCount / 4];
292 vertices += localCount; // counter the offset
293 size_t byteIndex = -(localCount) * sizeof(float);
294 //AT&T Code style assembly
297 0: movaps %[max], %[t2] // move max out of the way to avoid propagating NaNs in max \n\
298 movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
299 movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
300 movaps %[t0], %[max] // vertices[0] \n\
301 movlhps %[t1], %[max] // x0y0x1y1 \n\
302 movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
303 movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
304 mulps %[vLo], %[max] // x0y0x1y1 * vLo \n\
305 movhlps %[t0], %[t1] // z0w0z1w1 \n\
306 movaps %[t3], %[t0] // vertices[2] \n\
307 movlhps %[t4], %[t0] // x2y2x3y3 \n\
308 mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
309 movhlps %[t3], %[t4] // z2w2z3w3 \n\
310 shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
311 mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
312 movaps %[max], %[t3] // x0y0x1y1 * vLo \n\
313 shufps $0x88, %[t0], %[max] // x0x1x2x3 * vLo.x \n\
314 shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
315 addps %[t3], %[max] // x + y \n\
316 addps %[t1], %[max] // x + y + z \n\
317 movaps %[max], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
318 maxps %[t2], %[max] // record max, restore max \n\
319 add $16, %[byteIndex] // advance loop counter\n\
322 : [max] "+x"(max), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex)
323 : [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap)
325 index += localCount / 4;
328 for (unsigned int i = 0; i < localCount / 4; i++, index++)
329 { // do four dot products at a time. Carefully avoid touching the w element.
330 float4 v0 = vertices[0];
331 float4 v1 = vertices[1];
332 float4 v2 = vertices[2];
333 float4 v3 = vertices[3];
336 float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
337 float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
338 float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
339 float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
343 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
344 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
345 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
349 stack_array[index] = x;
350 max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
356 // process the last few points
359 float4 v0, v1, v2, x, y, z;
368 // Calculate 3 dot products, transpose, duplicate v2
369 float4 lo0 = _mm_movelh_ps(v0, v1); // xyxy.lo
370 float4 hi0 = _mm_movehl_ps(v1, v0); // z?z?.lo
372 z = _mm_shuffle_ps(hi0, v2, 0xa8); // z0z1z2z2
374 float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy
376 x = _mm_shuffle_ps(lo0, lo1, 0x88);
377 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
384 float4 xy = _mm_movelh_ps(v0, v1);
385 z = _mm_movehl_ps(v1, v0);
387 z = _mm_shuffle_ps(z, z, 0xa8);
388 x = _mm_shuffle_ps(xy, xy, 0xa8);
389 y = _mm_shuffle_ps(xy, xy, 0xfd);
395 float4 xy = vertices[0];
396 z = _mm_shuffle_ps(xy, xy, 0xaa);
399 x = _mm_shuffle_ps(xy, xy, 0);
400 y = _mm_shuffle_ps(xy, xy, 0x55);
406 stack_array[index] = x;
407 max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
411 // if we found a new max.
412 if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax)))
413 { // we found a new max. Search for it
414 // find max across the max vector, place in all elements of max -- big latency hit here
415 max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e));
416 max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1));
418 // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
419 // this where it actually makes a difference is handled in the early out at the top of the function,
420 // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
421 // complexity, and removed it.
425 // scan for the first occurence of max in the array
427 for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++) // local_count must be a multiple of 4
430 maxIndex = 4 * index + segment + indexTable[test];
433 _mm_store_ss(dotResult, dotMax);
// _mindot_large (SSE2 path): mirror image of _maxdot_large — scan `count`
// xyz vertices (one 16-byte float4 each; w never read) for the SMALLEST dot
// product against the 3-vector `vec`.  Returns the index of the first vertex
// attaining the minimum and writes the minimum value through `dotResult`.
// NOTE(review): as with _maxdot_large, some interior lines (braces, loop
// variable declarations, the multiply/add arithmetic after the shuffles) are
// elided from this excerpt — verify against the full file.
437 long _mindot_large(const float *vv, const float *vec, unsigned long count, float *dotResult);
439 long _mindot_large(const float *vv, const float *vec, unsigned long count, float *dotResult)
441 const float4 *vertices = (const float4 *)vv;
// movemask->first-set-lane lookup table; entry 0 is an unused -1 sentinel.
442 static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
// Running minimum starts at +infinity so any finite dot product beats it.
443 float4 dotmin = btAssign128(BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY);
444 float4 vvec = _mm_loadu_ps(vec);
445 float4 vHi = btCastiTo128f(_mm_shuffle_epi32(btCastfTo128i(vvec), 0xaa)); /// zzzz
446 float4 vLo = _mm_movelh_ps(vvec, vvec); /// xyxy
// Scratch buffer of per-group dot products; rescanned only when a new
// minimum is detected, keeping index bookkeeping out of the hot loop.
451 float4 stack_array[STACK_ARRAY_COUNT];
454 //memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
459 // Faster loop without cleanup code for full tiles
460 for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4)
464 for (index = 0; index < STACK_ARRAY_COUNT; index += 4)
465 { // do four dot products at a time. Carefully avoid touching the w element.
466 float4 v0 = vertices[0];
467 float4 v1 = vertices[1];
468 float4 v2 = vertices[2];
469 float4 v3 = vertices[3];
// 4x4 transpose of the xyz lanes; w is never touched.
472 float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
473 float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
474 float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
475 float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
479 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
480 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
481 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
485 stack_array[index] = x;
486 min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
494 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
495 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
496 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
497 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
501 z = _mm_shuffle_ps(hi0, hi1, 0x88);
502 x = _mm_shuffle_ps(lo0, lo1, 0x88);
503 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
507 stack_array[index + 1] = x;
508 min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
516 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
517 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
518 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
519 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
523 z = _mm_shuffle_ps(hi0, hi1, 0x88);
524 x = _mm_shuffle_ps(lo0, lo1, 0x88);
525 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
529 stack_array[index + 2] = x;
530 min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
538 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
539 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
540 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
541 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
545 z = _mm_shuffle_ps(hi0, hi1, 0x88);
546 x = _mm_shuffle_ps(lo0, lo1, 0x88);
547 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
551 stack_array[index + 3] = x;
552 min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
554 // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.
// Only rescan the scratch array when some lane of `min` beat dotmin.
557 // If we found a new min
558 if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin)))
560 // copy the new min across all lanes of our min accumulator
561 min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e));
562 min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1));
566 // find first occurrence of that min
568 for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++) // local_count must be a multiple of 4
571 // record where it is.
572 minIndex = 4 * index + segment + indexTable[test];
576 // account for work we've already done
579 // Deal with the last < STACK_ARRAY_COUNT vectors
583 if (btUnlikely(count > 16))
585 for (; index + 4 <= count / 4; index += 4)
586 { // do four dot products at a time. Carefully avoid touching the w element.
587 float4 v0 = vertices[0];
588 float4 v1 = vertices[1];
589 float4 v2 = vertices[2];
590 float4 v3 = vertices[3];
593 float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
594 float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
595 float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
596 float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
600 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
601 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
602 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
606 stack_array[index] = x;
607 min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
615 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
616 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
617 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
618 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
622 z = _mm_shuffle_ps(hi0, hi1, 0x88);
623 x = _mm_shuffle_ps(lo0, lo1, 0x88);
624 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
628 stack_array[index + 1] = x;
629 min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
637 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
638 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
639 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
640 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
644 z = _mm_shuffle_ps(hi0, hi1, 0x88);
645 x = _mm_shuffle_ps(lo0, lo1, 0x88);
646 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
650 stack_array[index + 2] = x;
651 min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
659 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
660 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
661 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
662 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
666 z = _mm_shuffle_ps(hi0, hi1, 0x88);
667 x = _mm_shuffle_ps(lo0, lo1, 0x88);
668 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
672 stack_array[index + 3] = x;
673 min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
675 // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.
// Remaining whole groups of 4 vertices after the unrolled loop above.
679 size_t localCount = (count & -4L) - 4 * index;
683 vertices += localCount; // counter the offset
684 float4 t0, t1, t2, t3, t4;
// byteIndex counts up from a negative offset to 0, so the asm indexes
// backwards from the pre-advanced `vertices` / `sap` pointers.
685 size_t byteIndex = -(localCount) * sizeof(float);
686 float4 *sap = &stack_array[index + localCount / 4];
// Hand-written AT&T-syntax SSE loop; identical shape to the _maxdot_large
// asm with maxps replaced by minps.  NOTE(review): presumably guarded by a
// compiler-specific #if (elided) with the intrinsics loop below as fallback.
690 0: movaps %[min], %[t2] // move min out of the way to avoid propagating NaNs in min \n\
691 movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
692 movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
693 movaps %[t0], %[min] // vertices[0] \n\
694 movlhps %[t1], %[min] // x0y0x1y1 \n\
695 movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
696 movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
697 mulps %[vLo], %[min] // x0y0x1y1 * vLo \n\
698 movhlps %[t0], %[t1] // z0w0z1w1 \n\
699 movaps %[t3], %[t0] // vertices[2] \n\
700 movlhps %[t4], %[t0] // x2y2x3y3 \n\
701 movhlps %[t3], %[t4] // z2w2z3w3 \n\
702 mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
703 shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
704 mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
705 movaps %[min], %[t3] // x0y0x1y1 * vLo \n\
706 shufps $0x88, %[t0], %[min] // x0x1x2x3 * vLo.x \n\
707 shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
708 addps %[t3], %[min] // x + y \n\
709 addps %[t1], %[min] // x + y + z \n\
710 movaps %[min], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
711 minps %[t2], %[min] // record min, restore min \n\
712 add $16, %[byteIndex] // advance loop counter\n\
715 : [min] "+x"(min), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex)
716 : [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap)
718 index += localCount / 4;
// Intrinsics fallback covering the same remaining groups of 4 as the asm.
721 for (unsigned int i = 0; i < localCount / 4; i++, index++)
722 { // do four dot products at a time. Carefully avoid touching the w element.
723 float4 v0 = vertices[0];
724 float4 v1 = vertices[1];
725 float4 v2 = vertices[2];
726 float4 v3 = vertices[3];
729 float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
730 float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
731 float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
732 float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
736 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
737 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
738 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
742 stack_array[index] = x;
743 min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
// Tail of 1-3 leftover vertices.  NOTE(review): the switch/case lines that
// select between the three bodies below are elided from this excerpt.
750 // process the last few points
753 float4 v0, v1, v2, x, y, z;
762 // Calculate 3 dot products, transpose, duplicate v2
763 float4 lo0 = _mm_movelh_ps(v0, v1); // xyxy.lo
764 float4 hi0 = _mm_movehl_ps(v1, v0); // z?z?.lo
766 z = _mm_shuffle_ps(hi0, v2, 0xa8); // z0z1z2z2
768 float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy
770 x = _mm_shuffle_ps(lo0, lo1, 0x88);
771 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
778 float4 xy = _mm_movelh_ps(v0, v1);
779 z = _mm_movehl_ps(v1, v0);
781 z = _mm_shuffle_ps(z, z, 0xa8);
782 x = _mm_shuffle_ps(xy, xy, 0xa8);
783 y = _mm_shuffle_ps(xy, xy, 0xfd);
789 float4 xy = vertices[0];
790 z = _mm_shuffle_ps(xy, xy, 0xaa);
793 x = _mm_shuffle_ps(xy, xy, 0);
794 y = _mm_shuffle_ps(xy, xy, 0x55);
800 stack_array[index] = x;
801 min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
// Final search: `0 == segment` means the main full-tile loop never ran, so
// the scratch array must be scanned unconditionally.
805 // if we found a new min.
806 if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin)))
807 { // we found a new min. Search for it
808 // find min across the min vector, place in all elements of min -- big latency hit here
809 min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e));
810 min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1));
812 // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
813 // this where it actually makes a difference is handled in the early out at the top of the function,
814 // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
815 // complexity, and removed it.
819 // scan for the first occurence of min in the array
821 for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++) // local_count must be a multiple of 4
824 minIndex = 4 * index + segment + indexTable[test];
// dotmin holds the broadcast winning value at this point; store lane 0.
// NOTE(review): the assignment of the broadcast min back into dotmin (and
// the `return minIndex;`) are among the lines elided from this excerpt.
827 _mm_store_ss(dotResult, dotmin);
831 #elif defined BT_USE_NEON
833 #define ARM_NEON_GCC_COMPATIBILITY 1
834 #include <arm_neon.h>
835 #include <sys/types.h>
836 #include <sys/sysctl.h> //for sysctlbyname
// NEON path: two implementations each (v0 = 64-bit float32x2 pipeline,
// v1 = 128-bit float32x4 pipeline) plus a one-shot selector shim.
838 static long _maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult);
839 static long _maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult);
840 static long _maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult);
841 static long _mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult);
842 static long _mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult);
843 static long _mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult);
// Public entry points are function pointers.  They start at the *_sel shims,
// which probe CPU capabilities on first call and rebind the pointer to the
// appropriate _v0/_v1 implementation, so later calls dispatch directly.
845 long (*_maxdot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = _maxdot_large_sel;
846 long (*_mindot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = _mindot_large_sel;
// Lazily probes CPU features once and caches the result in a function-local
// static.  Queries the Apple sysctl "hw.optional.neon_hpfp" (NEON with
// half-precision FP support) and records it as capability flag 0x2000.
// NOTE(review): the lazy init is not synchronized (plain static bool), so
// concurrent first calls could race — benign only if the first call happens
// before threads start; confirm against callers.
// NOTE(review): the `return capabilities;` line is elided from this excerpt.
848 static inline uint32_t btGetCpuCapabilities(void)
850 static uint32_t capabilities = 0;
851 static bool testedCapabilities = false;
853 if (0 == testedCapabilities)
855 uint32_t hasFeature = 0;
856 size_t featureSize = sizeof(hasFeature);
857 int err = sysctlbyname("hw.optional.neon_hpfp", &hasFeature, &featureSize, NULL, 0);
859 if (0 == err && hasFeature)
860 capabilities |= 0x2000;
862 testedCapabilities = true;
// One-time selector: rebinds the global _maxdot_large function pointer to
// the capability-appropriate implementation (v1 if the 0x2000 NEON-hpfp flag
// is present, v0 otherwise), then forwards this first call through it.
// NOTE(review): the `else` keyword line between the two assignments is
// elided from this excerpt.
868 static long _maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult)
870 if (btGetCpuCapabilities() & 0x2000)
871 _maxdot_large = _maxdot_large_v1;
873 _maxdot_large = _maxdot_large_v0;
875 return _maxdot_large(vv, vec, count, dotResult);
// One-time selector for _mindot_large, mirroring _maxdot_large_sel: rebind
// the global pointer based on the 0x2000 capability flag, then forward.
// NOTE(review): the `else` keyword line is elided from this excerpt.
878 static long _mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult)
880 if (btGetCpuCapabilities() & 0x2000)
881 _mindot_large = _mindot_large_v1;
883 _mindot_large = _mindot_large_v0;
885 return _mindot_large(vv, vec, count, dotResult);
// Two alternative definitions of a 128-bit-aligned load that also advances
// the pointer by 16 bytes.  First: ARM32 inline-asm vld1.f32 with the :128
// alignment hint and post-increment writeback.  Second: portable fallback
// (plain vector load + explicit pointer bump).  NOTE(review): the #if/#else
// that selects between them is elided from this excerpt — do not apply both.
889 #define vld1q_f32_aligned_postincrement(_ptr) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; })
892 #define vld1q_f32_aligned_postincrement(_ptr) ({ float32x4_t _r = ((float32x4_t*)(_ptr))[0]; (_ptr) = (const float*) ((const char*)(_ptr) + 16L); /*return*/ _r; })
// _maxdot_large_v0 (NEON, 64-bit-register variant): same contract as the SSE
// _maxdot_large — returns the index of the vertex with the largest dot
// product against `vec` and writes the value through `dotResult`.  Unlike
// the SSE path it tracks candidate indices inline: two float32x2 accumulator
// pairs (Lo = lanes 0/1, Hi = lanes 2/3) with matching uint32x2 index
// vectors, merged by vbsl selects after each compare.
// NOTE(review): braces, the `i` loop-counter declaration, and the switch
// labels for the 1-3 vertex tail are elided from this excerpt.
895 long _maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult)
898 float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
899 float32x2_t vLo = vget_low_f32(vvec);
900 float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
901 float32x2_t dotMaxLo = (float32x2_t){-BT_INFINITY, -BT_INFINITY};
902 float32x2_t dotMaxHi = (float32x2_t){-BT_INFINITY, -BT_INFINITY};
// indexLo/indexHi are the running vertex indices for the current group;
// iLo/iHi remember the indices of the best dot products seen so far
// (initialized to the -1 sentinel).
903 uint32x2_t indexLo = (uint32x2_t){0, 1};
904 uint32x2_t indexHi = (uint32x2_t){2, 3};
905 uint32x2_t iLo = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
906 uint32x2_t iHi = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
907 const uint32x2_t four = (uint32x2_t){4, 4};
// Main loop, unrolled 2x: each half processes 4 vertices.
909 for (; i + 8 <= count; i += 8)
911 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
912 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
913 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
914 float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
// x*vx,y*vy per vertex; vtrn gathers the z lanes; vpadd sums the xy pairs.
916 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
917 float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
918 float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
919 float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
921 float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
922 float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
923 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
924 float32x2_t zHi = vmul_f32(z1.val[0], vHi);
926 float32x2_t rLo = vpadd_f32(xy0, xy1);
927 float32x2_t rHi = vpadd_f32(xy2, xy3);
928 rLo = vadd_f32(rLo, zLo);
929 rHi = vadd_f32(rHi, zHi);
// Lane-wise compare-and-select keeps both the best values and their indices.
931 uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
932 uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
933 dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
934 dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
935 iLo = vbsl_u32(maskLo, indexLo, iLo);
936 iHi = vbsl_u32(maskHi, indexHi, iHi);
937 indexLo = vadd_u32(indexLo, four);
938 indexHi = vadd_u32(indexHi, four);
// Second half of the 2x unroll — identical step for the next 4 vertices.
940 v0 = vld1q_f32_aligned_postincrement(vv);
941 v1 = vld1q_f32_aligned_postincrement(vv);
942 v2 = vld1q_f32_aligned_postincrement(vv);
943 v3 = vld1q_f32_aligned_postincrement(vv);
945 xy0 = vmul_f32(vget_low_f32(v0), vLo);
946 xy1 = vmul_f32(vget_low_f32(v1), vLo);
947 xy2 = vmul_f32(vget_low_f32(v2), vLo);
948 xy3 = vmul_f32(vget_low_f32(v3), vLo);
950 z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
951 z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
952 zLo = vmul_f32(z0.val[0], vHi);
953 zHi = vmul_f32(z1.val[0], vHi);
955 rLo = vpadd_f32(xy0, xy1);
956 rHi = vpadd_f32(xy2, xy3);
957 rLo = vadd_f32(rLo, zLo);
958 rHi = vadd_f32(rHi, zHi);
960 maskLo = vcgt_f32(rLo, dotMaxLo);
961 maskHi = vcgt_f32(rHi, dotMaxHi);
962 dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
963 dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
964 iLo = vbsl_u32(maskLo, indexLo, iLo);
965 iHi = vbsl_u32(maskHi, indexHi, iHi);
966 indexLo = vadd_u32(indexLo, four);
967 indexHi = vadd_u32(indexHi, four);
// Leftover whole group of 4 vertices.
970 for (; i + 4 <= count; i += 4)
972 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
973 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
974 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
975 float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
977 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
978 float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
979 float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
980 float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
982 float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
983 float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
984 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
985 float32x2_t zHi = vmul_f32(z1.val[0], vHi);
987 float32x2_t rLo = vpadd_f32(xy0, xy1);
988 float32x2_t rHi = vpadd_f32(xy2, xy3);
989 rLo = vadd_f32(rLo, zLo);
990 rHi = vadd_f32(rHi, zHi);
992 uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
993 uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
994 dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
995 dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
996 iLo = vbsl_u32(maskLo, indexLo, iLo);
997 iHi = vbsl_u32(maskHi, indexHi, iHi);
998 indexLo = vadd_u32(indexLo, four);
999 indexHi = vadd_u32(indexHi, four);
// Tail: presumably the (count & 3) == 3 case (switch label elided).
1006 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1007 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1008 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1010 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1011 float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
1012 float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
1014 float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1015 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1016 float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi);
1018 float32x2_t rLo = vpadd_f32(xy0, xy1);
// v2's dot product is duplicated into both Hi lanes; harmless since both
// lanes then carry the same candidate.
1019 float32x2_t rHi = vpadd_f32(xy2, xy2);
1020 rLo = vadd_f32(rLo, zLo);
1021 rHi = vadd_f32(rHi, zHi);
1023 uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
1024 uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
1025 dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
1026 dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
1027 iLo = vbsl_u32(maskLo, indexLo, iLo);
1028 iHi = vbsl_u32(maskHi, indexHi, iHi);
// Tail: presumably the (count & 3) == 2 case (switch label elided).
1033 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1034 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1036 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1037 float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
1039 float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1040 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1042 float32x2_t rLo = vpadd_f32(xy0, xy1);
1043 rLo = vadd_f32(rLo, zLo);
1045 uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
1046 dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
1047 iLo = vbsl_u32(maskLo, indexLo, iLo);
// Tail: presumably the (count & 3) == 1 case (switch label elided).
1052 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1053 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1054 float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
1055 float32x2_t zLo = vmul_f32(z0, vHi);
1056 float32x2_t rLo = vpadd_f32(xy0, xy0);
1057 rLo = vadd_f32(rLo, zLo);
1058 uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
1059 dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
1060 iLo = vbsl_u32(maskLo, indexLo, iLo);
// Horizontal reduction: fold Hi into Lo, then odd lane into even lane,
// carrying the matching index along with each select.
1068 // select best answer between hi and lo results
1069 uint32x2_t mask = vcgt_f32(dotMaxHi, dotMaxLo);
1070 dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
1071 iLo = vbsl_u32(mask, iHi, iLo);
1073 // select best answer between even and odd results
1074 dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
1075 iHi = vdup_lane_u32(iLo, 1);
1076 mask = vcgt_f32(dotMaxHi, dotMaxLo);
1077 dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
1078 iLo = vbsl_u32(mask, iHi, iLo);
1080 *dotResult = vget_lane_f32(dotMaxLo, 0);
1081 return vget_lane_u32(iLo, 0);
// Finds the vertex with the LARGEST dot product against *vec.
// Scans `count` 4-float (xyzw) vertices at vv, four dot products per
// q-register iteration; writes the winning dot value to *dotResult and
// returns the winning vertex's index.
// NOTE(review): lone-brace, switch/case and blank lines appear to have been
// stripped from this extraction (the leading numbers are original-file line
// numbers and jump across the gaps) — confirm structure against the original
// btVector3 source before editing logic here.
1084 long _maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult)
// vLo = (x, y, x, y) of the reference vector; vHi = (z, z, z, z).
1086 float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
1087 float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
1088 float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
// local_index: candidate indices of the 4 vertices currently in flight,
// stepped by 4 per group; index: per-lane best index so far ((uint32_t)-1 = none yet).
1089 const uint32x4_t four = (uint32x4_t){4, 4, 4, 4};
1090 uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3};
1091 uint32x4_t index = (uint32x4_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
// Per-lane running maxima, seeded with -infinity so any real dot wins.
1092 float32x4_t maxDot = (float32x4_t){-BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY};
1094 unsigned long i = 0;
// Main loop: two unrolled groups of 4 vertices (8 vertices per pass).
1095 for (; i + 8 <= count; i += 8)
1097 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1098 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1099 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1100 float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
1102 // the next two lines should resolve to a single vswp d, d
1103 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1104 float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
1105 // the next two lines should resolve to a single vswp d, d
1106 float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1107 float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
// Multiply the packed x/y pairs by (vx, vy) and the z lanes by vz, then
// unzip so x-products, y-products and z-products each occupy one vector;
// summing them yields the 4 dot products in `x`.
1109 xy0 = vmulq_f32(xy0, vLo);
1110 xy1 = vmulq_f32(xy1, vLo);
1112 float32x4x2_t zb = vuzpq_f32(z0, z1);
1113 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1114 float32x4x2_t xy = vuzpq_f32(xy0, xy1);
1115 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1116 x = vaddq_f32(x, z);
// Branchless per-lane max + index tracking via bitwise select.
1118 uint32x4_t mask = vcgtq_f32(x, maxDot);
1119 maxDot = vbslq_f32(mask, x, maxDot);
1120 index = vbslq_u32(mask, local_index, index);
1121 local_index = vaddq_u32(local_index, four);
// Second unrolled group of 4 vertices — identical to the first group above.
1123 v0 = vld1q_f32_aligned_postincrement(vv);
1124 v1 = vld1q_f32_aligned_postincrement(vv);
1125 v2 = vld1q_f32_aligned_postincrement(vv);
1126 v3 = vld1q_f32_aligned_postincrement(vv);
1128 // the next two lines should resolve to a single vswp d, d
1129 xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1130 xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
1131 // the next two lines should resolve to a single vswp d, d
1132 z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1133 z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
1135 xy0 = vmulq_f32(xy0, vLo);
1136 xy1 = vmulq_f32(xy1, vLo);
1138 zb = vuzpq_f32(z0, z1);
1139 z = vmulq_f32(zb.val[0], vHi);
1140 xy = vuzpq_f32(xy0, xy1);
1141 x = vaddq_f32(xy.val[0], xy.val[1]);
1142 x = vaddq_f32(x, z);
1144 mask = vcgtq_f32(x, maxDot);
1145 maxDot = vbslq_f32(mask, x, maxDot);
1146 index = vbslq_u32(mask, local_index, index);
1147 local_index = vaddq_u32(local_index, four);
// Cleanup loop: one group of 4 vertices at a time.
1150 for (; i + 4 <= count; i += 4)
1152 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1153 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1154 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1155 float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
1157 // the next two lines should resolve to a single vswp d, d
1158 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1159 float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
1160 // the next two lines should resolve to a single vswp d, d
1161 float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1162 float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
1164 xy0 = vmulq_f32(xy0, vLo);
1165 xy1 = vmulq_f32(xy1, vLo);
1167 float32x4x2_t zb = vuzpq_f32(z0, z1);
1168 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1169 float32x4x2_t xy = vuzpq_f32(xy0, xy1);
1170 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1171 x = vaddq_f32(x, z);
1173 uint32x4_t mask = vcgtq_f32(x, maxDot);
1174 maxDot = vbslq_f32(mask, x, maxDot);
1175 index = vbslq_u32(mask, local_index, index);
1176 local_index = vaddq_u32(local_index, four);
// Remainder, 3 leftover vertices: v2 is duplicated into the 4th lane to fill
// the register (the duplicate lane cannot displace the true winner because the
// final reduction uses a strict compare, so ties resolve to the even lane).
// NOTE(review): the enclosing switch/case guard lines are missing here.
1183 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1184 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1185 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1187 // the next two lines should resolve to a single vswp d, d
1188 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1189 float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2));
1190 // the next two lines should resolve to a single vswp d, d
1191 float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1192 float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2));
1194 xy0 = vmulq_f32(xy0, vLo);
1195 xy1 = vmulq_f32(xy1, vLo);
1197 float32x4x2_t zb = vuzpq_f32(z0, z1);
1198 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1199 float32x4x2_t xy = vuzpq_f32(xy0, xy1);
1200 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1201 x = vaddq_f32(x, z);
1203 uint32x4_t mask = vcgtq_f32(x, maxDot);
1204 maxDot = vbslq_f32(mask, x, maxDot);
1205 index = vbslq_u32(mask, local_index, index);
1206 local_index = vaddq_u32(local_index, four);
// Remainder, 2 leftover vertices: both halves of the register carry the same
// two dot products.
1212 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1213 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1215 // the next two lines should resolve to a single vswp d, d
1216 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1217 // the next two lines should resolve to a single vswp d, d
1218 float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1220 xy0 = vmulq_f32(xy0, vLo);
1222 float32x4x2_t zb = vuzpq_f32(z0, z0);
1223 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1224 float32x4x2_t xy = vuzpq_f32(xy0, xy0);
1225 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1226 x = vaddq_f32(x, z);
1228 uint32x4_t mask = vcgtq_f32(x, maxDot);
1229 maxDot = vbslq_f32(mask, x, maxDot);
1230 index = vbslq_u32(mask, local_index, index);
1231 local_index = vaddq_u32(local_index, four);
// Remainder, 1 leftover vertex: its dot product is broadcast to all 4 lanes.
1237 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1239 // the next two lines should resolve to a single vswp d, d
1240 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0));
1241 // the next two lines should resolve to a single vswp d, d
1242 float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
1244 xy0 = vmulq_f32(xy0, vLo);
1246 z = vmulq_f32(z, vHi);
1247 float32x4x2_t xy = vuzpq_f32(xy0, xy0);
1248 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1249 x = vaddq_f32(x, z);
1251 uint32x4_t mask = vcgtq_f32(x, maxDot);
1252 maxDot = vbslq_f32(mask, x, maxDot);
1253 index = vbslq_u32(mask, local_index, index);
1254 local_index = vaddq_u32(local_index, four);
// Horizontal reduction: fold the 4 per-lane maxima down to one winner.
// Strict greater-than means ties keep the earlier (lower-lane) index.
1262 // select best answer between hi and lo results
1263 uint32x2_t mask = vcgt_f32(vget_high_f32(maxDot), vget_low_f32(maxDot));
1264 float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
1265 uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
1267 // select best answer between even and odd results
1268 float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
1269 uint32x2_t indexHi = vdup_lane_u32(index2, 1);
1270 mask = vcgt_f32(maxDotO, maxDot2);
1271 maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
1272 index2 = vbsl_u32(mask, indexHi, index2);
// Lane 0 now holds the global maximum and its vertex index.
1274 *dotResult = vget_lane_f32(maxDot2, 0);
1275 return vget_lane_u32(index2, 0);
// Finds the vertex with the SMALLEST dot product against *vec, using 64-bit
// d-registers (two dot products per vector). Writes the winning dot value to
// *dotResult and returns the winning vertex's index.
// Even-indexed vertices accumulate in the "Lo" registers, odd pairs in "Hi";
// the two streams are merged at the end.
// NOTE(review): lone-brace, switch/case and blank lines appear to have been
// stripped from this extraction (the leading numbers are original-file line
// numbers) — confirm structure against the original btVector3 source before
// editing logic here.
1278 long _mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult)
1280 unsigned long i = 0;
// vLo = (x, y) of the reference vector; vHi = (z, z).
1281 float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
1282 float32x2_t vLo = vget_low_f32(vvec);
1283 float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
// Running minima seeded with +infinity; iLo/iHi are the best indices so far
// ((uint32_t)-1 = none yet); indexLo/indexHi are the candidate indices for
// the current group of 4 vertices, stepped by 4 per group.
1284 float32x2_t dotMinLo = (float32x2_t){BT_INFINITY, BT_INFINITY};
1285 float32x2_t dotMinHi = (float32x2_t){BT_INFINITY, BT_INFINITY};
1286 uint32x2_t indexLo = (uint32x2_t){0, 1};
1287 uint32x2_t indexHi = (uint32x2_t){2, 3};
1288 uint32x2_t iLo = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
1289 uint32x2_t iHi = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
1290 const uint32x2_t four = (uint32x2_t){4, 4};
// Main loop: two unrolled groups of 4 vertices (8 vertices per pass).
1292 for (; i + 8 <= count; i += 8)
1294 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1295 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1296 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1297 float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
// xyN = (xN*vx, yN*vy); vtrn extracts the z lanes; vpadd forms the x+y sums,
// so rLo holds dots for vertices 0,1 and rHi for vertices 2,3.
1299 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1300 float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
1301 float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
1302 float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
1304 float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1305 float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
1306 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1307 float32x2_t zHi = vmul_f32(z1.val[0], vHi);
1309 float32x2_t rLo = vpadd_f32(xy0, xy1);
1310 float32x2_t rHi = vpadd_f32(xy2, xy3);
1311 rLo = vadd_f32(rLo, zLo);
1312 rHi = vadd_f32(rHi, zHi);
// Branchless per-lane min + index tracking via bitwise select.
1314 uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
1315 uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
1316 dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
1317 dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
1318 iLo = vbsl_u32(maskLo, indexLo, iLo);
1319 iHi = vbsl_u32(maskHi, indexHi, iHi);
1320 indexLo = vadd_u32(indexLo, four);
1321 indexHi = vadd_u32(indexHi, four);
// Second unrolled group of 4 vertices — identical to the first group above.
1323 v0 = vld1q_f32_aligned_postincrement(vv);
1324 v1 = vld1q_f32_aligned_postincrement(vv);
1325 v2 = vld1q_f32_aligned_postincrement(vv);
1326 v3 = vld1q_f32_aligned_postincrement(vv);
1328 xy0 = vmul_f32(vget_low_f32(v0), vLo);
1329 xy1 = vmul_f32(vget_low_f32(v1), vLo);
1330 xy2 = vmul_f32(vget_low_f32(v2), vLo);
1331 xy3 = vmul_f32(vget_low_f32(v3), vLo);
1333 z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1334 z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
1335 zLo = vmul_f32(z0.val[0], vHi);
1336 zHi = vmul_f32(z1.val[0], vHi);
1338 rLo = vpadd_f32(xy0, xy1);
1339 rHi = vpadd_f32(xy2, xy3);
1340 rLo = vadd_f32(rLo, zLo);
1341 rHi = vadd_f32(rHi, zHi);
1343 maskLo = vclt_f32(rLo, dotMinLo);
1344 maskHi = vclt_f32(rHi, dotMinHi);
1345 dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
1346 dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
1347 iLo = vbsl_u32(maskLo, indexLo, iLo);
1348 iHi = vbsl_u32(maskHi, indexHi, iHi);
1349 indexLo = vadd_u32(indexLo, four);
1350 indexHi = vadd_u32(indexHi, four);
// Cleanup loop: one group of 4 vertices at a time.
1353 for (; i + 4 <= count; i += 4)
1355 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1356 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1357 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1358 float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
1360 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1361 float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
1362 float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
1363 float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
1365 float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1366 float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
1367 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1368 float32x2_t zHi = vmul_f32(z1.val[0], vHi);
1370 float32x2_t rLo = vpadd_f32(xy0, xy1);
1371 float32x2_t rHi = vpadd_f32(xy2, xy3);
1372 rLo = vadd_f32(rLo, zLo);
1373 rHi = vadd_f32(rHi, zHi);
1375 uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
1376 uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
1377 dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
1378 dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
1379 iLo = vbsl_u32(maskLo, indexLo, iLo);
1380 iHi = vbsl_u32(maskHi, indexHi, iHi);
1381 indexLo = vadd_u32(indexLo, four);
1382 indexHi = vadd_u32(indexHi, four);
// Remainder, 3 leftover vertices: vertex 2's dot fills both lanes of rHi
// (the duplicate cannot steal the win — the final reduction's strict compare
// resolves ties to the even lane, which carries the real index).
// NOTE(review): the enclosing switch/case guard lines are missing here.
1388 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1389 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1390 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1392 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1393 float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
1394 float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
1396 float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1397 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1398 float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi);
1400 float32x2_t rLo = vpadd_f32(xy0, xy1);
1401 float32x2_t rHi = vpadd_f32(xy2, xy2);
1402 rLo = vadd_f32(rLo, zLo);
1403 rHi = vadd_f32(rHi, zHi);
1405 uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
1406 uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
1407 dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
1408 dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
1409 iLo = vbsl_u32(maskLo, indexLo, iLo);
1410 iHi = vbsl_u32(maskHi, indexHi, iHi);
// Remainder, 2 leftover vertices: only the Lo stream is updated.
1415 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1416 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1418 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1419 float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
1421 float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1422 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1424 float32x2_t rLo = vpadd_f32(xy0, xy1);
1425 rLo = vadd_f32(rLo, zLo);
1427 uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
1428 dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
1429 iLo = vbsl_u32(maskLo, indexLo, iLo);
// Remainder, 1 leftover vertex: its dot product fills both lanes of rLo.
1434 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1435 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1436 float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
1437 float32x2_t zLo = vmul_f32(z0, vHi);
1438 float32x2_t rLo = vpadd_f32(xy0, xy0);
1439 rLo = vadd_f32(rLo, zLo);
1440 uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
1441 dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
1442 iLo = vbsl_u32(maskLo, indexLo, iLo);
// Horizontal reduction: merge the Hi stream into Lo, then odd lane into even.
// Strict less-than means ties keep the earlier (lower-index) vertex.
1450 // select best answer between hi and lo results
1451 uint32x2_t mask = vclt_f32(dotMinHi, dotMinLo);
1452 dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
1453 iLo = vbsl_u32(mask, iHi, iLo);
1455 // select best answer between even and odd results
1456 dotMinHi = vdup_lane_f32(dotMinLo, 1);
1457 iHi = vdup_lane_u32(iLo, 1);
1458 mask = vclt_f32(dotMinHi, dotMinLo);
1459 dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
1460 iLo = vbsl_u32(mask, iHi, iLo);
// Lane 0 now holds the global minimum and its vertex index.
1462 *dotResult = vget_lane_f32(dotMinLo, 0);
1463 return vget_lane_u32(iLo, 0);
// Finds the vertex with the SMALLEST dot product against *vec.
// Mirror of _maxdot_large_v1 with the comparison flipped (vcltq/vclt and a
// +infinity seed): scans `count` 4-float (xyzw) vertices at vv, four dot
// products per q-register iteration; writes the winning dot value to
// *dotResult and returns the winning vertex's index.
// NOTE(review): lone-brace, switch/case and blank lines appear to have been
// stripped from this extraction (the leading numbers are original-file line
// numbers) — confirm structure against the original btVector3 source before
// editing logic here.
1466 long _mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult)
// vLo = (x, y, x, y) of the reference vector; vHi = (z, z, z, z).
1468 float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
1469 float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
1470 float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
// local_index: candidate indices of the 4 vertices currently in flight,
// stepped by 4 per group; index: per-lane best index so far ((uint32_t)-1 = none yet).
1471 const uint32x4_t four = (uint32x4_t){4, 4, 4, 4};
1472 uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3};
1473 uint32x4_t index = (uint32x4_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
// Per-lane running minima, seeded with +infinity so any real dot wins.
1474 float32x4_t minDot = (float32x4_t){BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY};
1476 unsigned long i = 0;
// Main loop: two unrolled groups of 4 vertices (8 vertices per pass).
1477 for (; i + 8 <= count; i += 8)
1479 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1480 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1481 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1482 float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
1484 // the next two lines should resolve to a single vswp d, d
1485 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1486 float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
1487 // the next two lines should resolve to a single vswp d, d
1488 float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1489 float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
// Multiply the packed x/y pairs by (vx, vy) and the z lanes by vz, then
// unzip so x-products, y-products and z-products each occupy one vector;
// summing them yields the 4 dot products in `x`.
1491 xy0 = vmulq_f32(xy0, vLo);
1492 xy1 = vmulq_f32(xy1, vLo);
1494 float32x4x2_t zb = vuzpq_f32(z0, z1);
1495 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1496 float32x4x2_t xy = vuzpq_f32(xy0, xy1);
1497 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1498 x = vaddq_f32(x, z);
// Branchless per-lane min + index tracking via bitwise select.
1500 uint32x4_t mask = vcltq_f32(x, minDot);
1501 minDot = vbslq_f32(mask, x, minDot);
1502 index = vbslq_u32(mask, local_index, index);
1503 local_index = vaddq_u32(local_index, four);
// Second unrolled group of 4 vertices — identical to the first group above.
1505 v0 = vld1q_f32_aligned_postincrement(vv);
1506 v1 = vld1q_f32_aligned_postincrement(vv);
1507 v2 = vld1q_f32_aligned_postincrement(vv);
1508 v3 = vld1q_f32_aligned_postincrement(vv);
1510 // the next two lines should resolve to a single vswp d, d
1511 xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1512 xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
1513 // the next two lines should resolve to a single vswp d, d
1514 z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1515 z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
1517 xy0 = vmulq_f32(xy0, vLo);
1518 xy1 = vmulq_f32(xy1, vLo);
1520 zb = vuzpq_f32(z0, z1);
1521 z = vmulq_f32(zb.val[0], vHi);
1522 xy = vuzpq_f32(xy0, xy1);
1523 x = vaddq_f32(xy.val[0], xy.val[1]);
1524 x = vaddq_f32(x, z);
1526 mask = vcltq_f32(x, minDot);
1527 minDot = vbslq_f32(mask, x, minDot);
1528 index = vbslq_u32(mask, local_index, index);
1529 local_index = vaddq_u32(local_index, four);
// Cleanup loop: one group of 4 vertices at a time.
1532 for (; i + 4 <= count; i += 4)
1534 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1535 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1536 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1537 float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
1539 // the next two lines should resolve to a single vswp d, d
1540 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1541 float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
1542 // the next two lines should resolve to a single vswp d, d
1543 float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1544 float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
1546 xy0 = vmulq_f32(xy0, vLo);
1547 xy1 = vmulq_f32(xy1, vLo);
1549 float32x4x2_t zb = vuzpq_f32(z0, z1);
1550 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1551 float32x4x2_t xy = vuzpq_f32(xy0, xy1);
1552 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1553 x = vaddq_f32(x, z);
1555 uint32x4_t mask = vcltq_f32(x, minDot);
1556 minDot = vbslq_f32(mask, x, minDot);
1557 index = vbslq_u32(mask, local_index, index);
1558 local_index = vaddq_u32(local_index, four);
// Remainder, 3 leftover vertices: v2 is duplicated into the 4th lane to fill
// the register (ties resolve to the even lane in the strict final reduction,
// so the duplicate lane cannot yield a wrong index).
// NOTE(review): the enclosing switch/case guard lines are missing here.
1565 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1566 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1567 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1569 // the next two lines should resolve to a single vswp d, d
1570 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1571 float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2));
1572 // the next two lines should resolve to a single vswp d, d
1573 float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1574 float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2));
1576 xy0 = vmulq_f32(xy0, vLo);
1577 xy1 = vmulq_f32(xy1, vLo);
1579 float32x4x2_t zb = vuzpq_f32(z0, z1);
1580 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1581 float32x4x2_t xy = vuzpq_f32(xy0, xy1);
1582 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1583 x = vaddq_f32(x, z);
1585 uint32x4_t mask = vcltq_f32(x, minDot);
1586 minDot = vbslq_f32(mask, x, minDot);
1587 index = vbslq_u32(mask, local_index, index);
1588 local_index = vaddq_u32(local_index, four);
// Remainder, 2 leftover vertices: both halves of the register carry the same
// two dot products.
1594 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1595 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1597 // the next two lines should resolve to a single vswp d, d
1598 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1599 // the next two lines should resolve to a single vswp d, d
1600 float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1602 xy0 = vmulq_f32(xy0, vLo);
1604 float32x4x2_t zb = vuzpq_f32(z0, z0);
1605 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1606 float32x4x2_t xy = vuzpq_f32(xy0, xy0);
1607 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1608 x = vaddq_f32(x, z);
1610 uint32x4_t mask = vcltq_f32(x, minDot);
1611 minDot = vbslq_f32(mask, x, minDot);
1612 index = vbslq_u32(mask, local_index, index);
1613 local_index = vaddq_u32(local_index, four);
// Remainder, 1 leftover vertex: its dot product is broadcast to all 4 lanes.
1619 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1621 // the next two lines should resolve to a single vswp d, d
1622 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0));
1623 // the next two lines should resolve to a single vswp d, d
1624 float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
1626 xy0 = vmulq_f32(xy0, vLo);
1628 z = vmulq_f32(z, vHi);
1629 float32x4x2_t xy = vuzpq_f32(xy0, xy0);
1630 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1631 x = vaddq_f32(x, z);
1633 uint32x4_t mask = vcltq_f32(x, minDot);
1634 minDot = vbslq_f32(mask, x, minDot);
1635 index = vbslq_u32(mask, local_index, index);
1636 local_index = vaddq_u32(local_index, four);
// Horizontal reduction: fold the 4 per-lane minima down to one winner.
// Strict less-than means ties keep the earlier (lower-lane) index.
1644 // select best answer between hi and lo results
1645 uint32x2_t mask = vclt_f32(vget_high_f32(minDot), vget_low_f32(minDot));
1646 float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
1647 uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
1649 // select best answer between even and odd results
1650 float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
1651 uint32x2_t indexHi = vdup_lane_u32(index2, 1);
1652 mask = vclt_f32(minDotO, minDot2);
1653 minDot2 = vbsl_f32(mask, minDotO, minDot2);
1654 index2 = vbsl_u32(mask, indexHi, index2);
// Lane 0 now holds the global minimum and its vertex index.
1656 *dotResult = vget_lane_f32(minDot2, 0);
1657 return vget_lane_u32(index2, 0);
1661 #error Unhandled __APPLE__ arch
1664 #endif /* __APPLE__ */