Copyright (c) 2011-2013 Apple Inc. http://bulletphysics.org

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.

This source version has been altered.

#if defined(_WIN32) || defined(__i386__)
#define B3_USE_SSE_IN_API

#include "b3Vector3.h"

#if defined(B3_USE_SSE) || defined(B3_USE_NEON)

typedef float float4 __attribute__((vector_size(16)));
//typedef uint32_t uint4 __attribute__ ((vector_size(16)));

#if defined B3_USE_SSE || defined _WIN32

#define LOG2_ARRAY_SIZE 6
#define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE)

#include <emmintrin.h>
long b3_maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult);
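/*
   For exposition: a minimal scalar sketch of what this routine computes (the
   name maxdot_scalar_ref is hypothetical, not part of the library). Vertices
   are packed xyzw float4s and the w element is ignored. Ignoring NaN corner
   cases, the SSE path below produces the same index and dot value, four
   vertices at a time.

   long maxdot_scalar_ref(const float *vv, const float *vec,
                          unsigned long count, float *dotResult)
   {
       long maxIndex = -1;
       float maxDot = -B3_INFINITY;
       for (unsigned long i = 0; i < count; i++)
       {
           const float *v = vv + 4 * i; // stride of one float4 per vertex
           float dot = v[0] * vec[0] + v[1] * vec[1] + v[2] * vec[2];
           if (dot > maxDot)
           {
               maxDot = dot;
               maxIndex = (long)i;
           }
       }
       *dotResult = maxDot;
       return maxIndex;
   }
*/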
long b3_maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult)
const float4 *vertices = (const float4 *)vv;
static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
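// indexTable maps a nonzero 4-bit _mm_movemask_ps result to the position of its
// lowest set bit, i.e. the first SIMD lane whose comparison matched. Entry 0 is
// unreachable, since the table is only consulted when at least one bit is set.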
float4 dotMax = b3Assign128(-B3_INFINITY, -B3_INFINITY, -B3_INFINITY, -B3_INFINITY);
float4 vvec = _mm_loadu_ps(vec);
float4 vHi = b3CastiTo128f(_mm_shuffle_epi32(b3CastfTo128i(vvec), 0xaa)); /// zzzz
float4 vLo = _mm_movelh_ps(vvec, vvec); /// xyxy
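// Each dot product only needs x*vec.x + y*vec.y + z*vec.z (w is ignored), so the
// direction vector is pre-split: vLo = (x, y, x, y) multiplies packed xy pairs
// from two vertices at once, and vHi = (z, z, z, z) multiplies four packed z lanes.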
float4 stack_array[STACK_ARRAY_COUNT];
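// stack_array caches the dot products computed in each tile so that, when a new
// maximum is detected, the tile can be re-scanned for its position instead of
// tracking an index vector inside the hot loop.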
// memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );

// Faster loop without cleanup code for full tiles
for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4)

for (index = 0; index < STACK_ARRAY_COUNT; index += 4)
{ // do four dot products at a time. Carefully avoid touching the w element.
float4 v0 = vertices[0];
float4 v1 = vertices[1];
float4 v2 = vertices[2];
float4 v3 = vertices[3];

float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3

float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
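// Shuffle immediate 0x88 selects lanes (0,2) of each operand and 0xdd selects
// lanes (1,3), so together with movelh/movehl this transposes four xyzw
// vertices into x = x0x1x2x3, y = y0y1y2y3, z = z0z1z2z3.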
stack_array[index] = x;
max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan

lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index + 1] = x;
max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan

lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index + 2] = x;
max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan

lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index + 3] = x;
max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan

// It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.

// If we found a new max
if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax)))
// copy the new max across all lanes of our max accumulator
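// 0x4e swaps the two 64-bit halves and 0xb1 swaps adjacent lane pairs, so two
// max operations broadcast the horizontal maximum into every lane.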
max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e));
max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1));

// find first occurrence of that max
for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++) // local_count must be a multiple of 4

// record where it is.
maxIndex = 4 * index + segment + indexTable[test];

// account for work we've already done

// Deal with the last < STACK_ARRAY_COUNT vectors
if (b3Unlikely(count > 16))

for (; index + 4 <= count / 4; index += 4)
{ // do four dot products at a time. Carefully avoid touching the w element.
float4 v0 = vertices[0];
float4 v1 = vertices[1];
float4 v2 = vertices[2];
float4 v3 = vertices[3];

float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3

float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index] = x;
max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan

lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index + 1] = x;
max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan

lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index + 2] = x;
max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan

lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index + 3] = x;
max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan

// It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.

size_t localCount = (count & -4L) - 4 * index;

float4 t0, t1, t2, t3, t4;
float4 *sap = &stack_array[index + localCount / 4];
vertices += localCount; // counter the offset
size_t byteIndex = -(localCount) * sizeof(float);
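// byteIndex is a negative byte offset that the assembly below counts up toward
// zero in steps of 16. Because vertex loads use an addressing scale of 4, the
// effective start offset is -16*localCount bytes (localCount float4s before the
// advanced vertices pointer), while the unscaled store offset walks the results
// through sap into stack_array[index .. index + localCount/4).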
//AT&T Code style assembly
0: movaps %[max], %[t2] // move max out of the way to avoid propagating NaNs in max \n\
movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
movaps %[t0], %[max] // vertices[0] \n\
movlhps %[t1], %[max] // x0y0x1y1 \n\
movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
mulps %[vLo], %[max] // x0y0x1y1 * vLo \n\
movhlps %[t0], %[t1] // z0w0z1w1 \n\
movaps %[t3], %[t0] // vertices[2] \n\
movlhps %[t4], %[t0] // x2y2x3y3 \n\
mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
movhlps %[t3], %[t4] // z2w2z3w3 \n\
shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
movaps %[max], %[t3] // x0y0x1y1 * vLo \n\
shufps $0x88, %[t0], %[max] // x0x1x2x3 * vLo.x \n\
shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
addps %[t3], %[max] // x + y \n\
addps %[t1], %[max] // x + y + z \n\
movaps %[max], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
maxps %[t2], %[max] // record max, restore max \n\
add $16, %[byteIndex] // advance loop counter\n\
: [max] "+x"(max), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex)
: [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap)

index += localCount / 4;

for (unsigned int i = 0; i < localCount / 4; i++, index++)
{ // do four dot products at a time. Carefully avoid touching the w element.
float4 v0 = vertices[0];
float4 v1 = vertices[1];
float4 v2 = vertices[2];
float4 v3 = vertices[3];

float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3

float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index] = x;
max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan

// process the last few points
float4 v0, v1, v2, x, y, z;

// Calculate 3 dot products, transpose, duplicate v2
float4 lo0 = _mm_movelh_ps(v0, v1); // xyxy.lo
float4 hi0 = _mm_movehl_ps(v1, v0); // z?z?.lo
z = _mm_shuffle_ps(hi0, v2, 0xa8); // z0z1z2z2
float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

float4 xy = _mm_movelh_ps(v0, v1);
z = _mm_movehl_ps(v1, v0);
z = _mm_shuffle_ps(z, z, 0xa8);
x = _mm_shuffle_ps(xy, xy, 0xa8);
y = _mm_shuffle_ps(xy, xy, 0xfd);

float4 xy = vertices[0];
z = _mm_shuffle_ps(xy, xy, 0xaa);
x = _mm_shuffle_ps(xy, xy, 0);
y = _mm_shuffle_ps(xy, xy, 0x55);

stack_array[index] = x;
max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan

// if we found a new max.
if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax)))
{ // we found a new max. Search for it
// find max across the max vector, place in all elements of max -- big latency hit here
max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e));
max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1));

// It is slightly faster to do this part in scalar code when count < 8. However, the common case for
// this where it actually makes a difference is handled in the early out at the top of the function,
// so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
// complexity, and removed it.
// scan for the first occurrence of max in the array
for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++) // local_count must be a multiple of 4

maxIndex = 4 * index + segment + indexTable[test];

_mm_store_ss(dotResult, dotMax);

long b3_mindot_large(const float *vv, const float *vec, unsigned long count, float *dotResult);
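// b3_mindot_large mirrors b3_maxdot_large line for line, with the comparison
// direction flipped: it tracks the smallest dot product (seeded with
// +B3_INFINITY) and returns the index of its first occurrence.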
long b3_mindot_large(const float *vv, const float *vec, unsigned long count, float *dotResult)
const float4 *vertices = (const float4 *)vv;
static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};

float4 dotmin = b3Assign128(B3_INFINITY, B3_INFINITY, B3_INFINITY, B3_INFINITY);
float4 vvec = _mm_loadu_ps(vec);
float4 vHi = b3CastiTo128f(_mm_shuffle_epi32(b3CastfTo128i(vvec), 0xaa)); /// zzzz
float4 vLo = _mm_movelh_ps(vvec, vvec); /// xyxy

float4 stack_array[STACK_ARRAY_COUNT];

// memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );

// Faster loop without cleanup code for full tiles
for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4)

for (index = 0; index < STACK_ARRAY_COUNT; index += 4)
{ // do four dot products at a time. Carefully avoid touching the w element.
float4 v0 = vertices[0];
float4 v1 = vertices[1];
float4 v2 = vertices[2];
float4 v3 = vertices[3];

float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3

float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index] = x;
min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan

lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index + 1] = x;
min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan

lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index + 2] = x;
min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan

lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index + 3] = x;
min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan

// It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.

// If we found a new min
if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin)))
// copy the new min across all lanes of our min accumulator
min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e));
min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1));

// find first occurrence of that min
for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++) // local_count must be a multiple of 4

// record where it is.
minIndex = 4 * index + segment + indexTable[test];

// account for work we've already done

// Deal with the last < STACK_ARRAY_COUNT vectors
if (b3Unlikely(count > 16))

for (; index + 4 <= count / 4; index += 4)
{ // do four dot products at a time. Carefully avoid touching the w element.
float4 v0 = vertices[0];
float4 v1 = vertices[1];
float4 v2 = vertices[2];
float4 v3 = vertices[3];

float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3

float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index] = x;
min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan

lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index + 1] = x;
min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan

lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index + 2] = x;
min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan

lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index + 3] = x;
min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan

// It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.

size_t localCount = (count & -4L) - 4 * index;

vertices += localCount; // counter the offset
float4 t0, t1, t2, t3, t4;
size_t byteIndex = -(localCount) * sizeof(float);
float4 *sap = &stack_array[index + localCount / 4];

0: movaps %[min], %[t2] // move min out of the way to avoid propagating NaNs in min \n\
movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
movaps %[t0], %[min] // vertices[0] \n\
movlhps %[t1], %[min] // x0y0x1y1 \n\
movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
mulps %[vLo], %[min] // x0y0x1y1 * vLo \n\
movhlps %[t0], %[t1] // z0w0z1w1 \n\
movaps %[t3], %[t0] // vertices[2] \n\
movlhps %[t4], %[t0] // x2y2x3y3 \n\
movhlps %[t3], %[t4] // z2w2z3w3 \n\
mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
movaps %[min], %[t3] // x0y0x1y1 * vLo \n\
shufps $0x88, %[t0], %[min] // x0x1x2x3 * vLo.x \n\
shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
addps %[t3], %[min] // x + y \n\
addps %[t1], %[min] // x + y + z \n\
movaps %[min], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
minps %[t2], %[min] // record min, restore min \n\
add $16, %[byteIndex] // advance loop counter\n\
: [min] "+x"(min), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex)
: [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap)

index += localCount / 4;

for (unsigned int i = 0; i < localCount / 4; i++, index++)
{ // do four dot products at a time. Carefully avoid touching the w element.
float4 v0 = vertices[0];
float4 v1 = vertices[1];
float4 v2 = vertices[2];
float4 v3 = vertices[3];

float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3

float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index] = x;
min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
// process the last few points
float4 v0, v1, v2, x, y, z;

// Calculate 3 dot products, transpose, duplicate v2
float4 lo0 = _mm_movelh_ps(v0, v1); // xyxy.lo
float4 hi0 = _mm_movehl_ps(v1, v0); // z?z?.lo
z = _mm_shuffle_ps(hi0, v2, 0xa8); // z0z1z2z2
float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

float4 xy = _mm_movelh_ps(v0, v1);
z = _mm_movehl_ps(v1, v0);
z = _mm_shuffle_ps(z, z, 0xa8);
x = _mm_shuffle_ps(xy, xy, 0xa8);
y = _mm_shuffle_ps(xy, xy, 0xfd);

float4 xy = vertices[0];
z = _mm_shuffle_ps(xy, xy, 0xaa);
x = _mm_shuffle_ps(xy, xy, 0);
y = _mm_shuffle_ps(xy, xy, 0x55);

stack_array[index] = x;
min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan

// if we found a new min.
if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin)))
{ // we found a new min. Search for it
// find min across the min vector, place in all elements of min -- big latency hit here
min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e));
min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1));

// It is slightly faster to do this part in scalar code when count < 8. However, the common case for
// this where it actually makes a difference is handled in the early out at the top of the function,
// so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
// complexity, and removed it.
// scan for the first occurrence of min in the array
for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++) // local_count must be a multiple of 4

minIndex = 4 * index + segment + indexTable[test];

_mm_store_ss(dotResult, dotmin);
#elif defined B3_USE_NEON

#define ARM_NEON_GCC_COMPATIBILITY 1
#include <arm_neon.h>

static long b3_maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult);
static long b3_maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult);
static long b3_maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult);
static long b3_mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult);
static long b3_mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult);
static long b3_mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult);

long (*b3_maxdot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = b3_maxdot_large_sel;
long (*b3_mindot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = b3_mindot_large_sel;
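// One-time dispatch: the public symbols start out pointing at the _sel stubs.
// On first call a stub queries the CPU capabilities, rebinds the function
// pointer to the v0 or v1 implementation, and tail-calls it, so every later
// call goes straight to the selected version with no extra branch.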
int _get_cpu_capabilities(void);

static long b3_maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult)
if (_get_cpu_capabilities() & 0x2000)
b3_maxdot_large = b3_maxdot_large_v1;
else
b3_maxdot_large = b3_maxdot_large_v0;

return b3_maxdot_large(vv, vec, count, dotResult);
static long b3_mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult)
if (_get_cpu_capabilities() & 0x2000)
b3_mindot_large = b3_mindot_large_v1;
else
b3_mindot_large = b3_mindot_large_v0;

return b3_mindot_large(vv, vec, count, dotResult);

#define vld1q_f32_aligned_postincrement(_ptr) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; })
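// GCC statement-expression: loads one float32x4_t through a 128-bit alignment
// hint (:128) and post-increments _ptr as a side effect, which is why the
// loops below never advance vv explicitly.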
long b3_maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult)
float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
float32x2_t vLo = vget_low_f32(vvec);
float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
float32x2_t dotMaxLo = (float32x2_t){-B3_INFINITY, -B3_INFINITY};
float32x2_t dotMaxHi = (float32x2_t){-B3_INFINITY, -B3_INFINITY};
uint32x2_t indexLo = (uint32x2_t){0, 1};
uint32x2_t indexHi = (uint32x2_t){2, 3};
uint32x2_t iLo = (uint32x2_t){-1, -1};
uint32x2_t iHi = (uint32x2_t){-1, -1};
const uint32x2_t four = (uint32x2_t){4, 4};
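// v0 bookkeeping: two 2-lane running maxima (dotMaxLo for vertices 4k and 4k+1,
// dotMaxHi for 4k+2 and 4k+3) with matching winner indices in iLo/iHi.
// indexLo/indexHi hold the candidate indices for the current group and advance
// by 4 each step, so no re-scan is needed, unlike the SSE path.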
for (; i + 8 <= count; i += 8)
float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);

float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
float32x2_t zLo = vmul_f32(z0.val[0], vHi);
float32x2_t zHi = vmul_f32(z1.val[0], vHi);

float32x2_t rLo = vpadd_f32(xy0, xy1);
float32x2_t rHi = vpadd_f32(xy2, xy3);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);
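// vtrn gathers (z0, z1) and (z2, z3) into val[0], and vpadd sums each xy product
// pair, so rLo = (dot(v0), dot(v1)) and rHi = (dot(v2), dot(v3)).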
uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);
indexLo = vadd_u32(indexLo, four);
indexHi = vadd_u32(indexHi, four);

v0 = vld1q_f32_aligned_postincrement(vv);
v1 = vld1q_f32_aligned_postincrement(vv);
v2 = vld1q_f32_aligned_postincrement(vv);
v3 = vld1q_f32_aligned_postincrement(vv);

xy0 = vmul_f32(vget_low_f32(v0), vLo);
xy1 = vmul_f32(vget_low_f32(v1), vLo);
xy2 = vmul_f32(vget_low_f32(v2), vLo);
xy3 = vmul_f32(vget_low_f32(v3), vLo);

z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
zLo = vmul_f32(z0.val[0], vHi);
zHi = vmul_f32(z1.val[0], vHi);

rLo = vpadd_f32(xy0, xy1);
rHi = vpadd_f32(xy2, xy3);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);

maskLo = vcgt_f32(rLo, dotMaxLo);
maskHi = vcgt_f32(rHi, dotMaxHi);
dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);
indexLo = vadd_u32(indexLo, four);
indexHi = vadd_u32(indexHi, four);

for (; i + 4 <= count; i += 4)
float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);

float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
float32x2_t zLo = vmul_f32(z0.val[0], vHi);
float32x2_t zHi = vmul_f32(z1.val[0], vHi);

float32x2_t rLo = vpadd_f32(xy0, xy1);
float32x2_t rHi = vpadd_f32(xy2, xy3);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);

uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);
indexLo = vadd_u32(indexLo, four);
indexHi = vadd_u32(indexHi, four);
float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);

float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);

float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
float32x2_t zLo = vmul_f32(z0.val[0], vHi);
float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi);

float32x2_t rLo = vpadd_f32(xy0, xy1);
float32x2_t rHi = vpadd_f32(xy2, xy2);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);

uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);

float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);

float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);

float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
float32x2_t zLo = vmul_f32(z0.val[0], vHi);

float32x2_t rLo = vpadd_f32(xy0, xy1);
rLo = vadd_f32(rLo, zLo);

uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
iLo = vbsl_u32(maskLo, indexLo, iLo);

float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
float32x2_t zLo = vmul_f32(z0, vHi);
float32x2_t rLo = vpadd_f32(xy0, xy0);
rLo = vadd_f32(rLo, zLo);
uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
iLo = vbsl_u32(maskLo, indexLo, iLo);

// select best answer between hi and lo results
uint32x2_t mask = vcgt_f32(dotMaxHi, dotMaxLo);
dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
iLo = vbsl_u32(mask, iHi, iLo);

// select best answer between even and odd results
dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
iHi = vdup_lane_u32(iLo, 1);
mask = vcgt_f32(dotMaxHi, dotMaxLo);
dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
iLo = vbsl_u32(mask, iHi, iLo);

*dotResult = vget_lane_f32(dotMaxLo, 0);
return vget_lane_u32(iLo, 0);

long b3_maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult)
float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
const uint32x4_t four = (uint32x4_t){4, 4, 4, 4};
uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3};
uint32x4_t index = (uint32x4_t){-1, -1, -1, -1};
float32x4_t maxDot = (float32x4_t){-B3_INFINITY, -B3_INFINITY, -B3_INFINITY, -B3_INFINITY};
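// v1 bookkeeping: a single 4-wide running maximum plus a 4-wide index vector;
// vuzp separates the packed xy products from the z lanes so that four dot
// products land in one quad register per step.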
unsigned long i = 0;
for (; i + 8 <= count; i += 8)
float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

// the next two lines should resolve to a single vswp d, d
float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
// the next two lines should resolve to a single vswp d, d
float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));

xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);

float32x4x2_t zb = vuzpq_f32(z0, z1);
float32x4_t z = vmulq_f32(zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32(xy0, xy1);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

uint32x4_t mask = vcgtq_f32(x, maxDot);
maxDot = vbslq_f32(mask, x, maxDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);

v0 = vld1q_f32_aligned_postincrement(vv);
v1 = vld1q_f32_aligned_postincrement(vv);
v2 = vld1q_f32_aligned_postincrement(vv);
v3 = vld1q_f32_aligned_postincrement(vv);

// the next two lines should resolve to a single vswp d, d
xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
// the next two lines should resolve to a single vswp d, d
z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));

xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);

zb = vuzpq_f32(z0, z1);
z = vmulq_f32(zb.val[0], vHi);
xy = vuzpq_f32(xy0, xy1);
x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

mask = vcgtq_f32(x, maxDot);
maxDot = vbslq_f32(mask, x, maxDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);

for (; i + 4 <= count; i += 4)
float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

// the next two lines should resolve to a single vswp d, d
float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
// the next two lines should resolve to a single vswp d, d
float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));

xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);

float32x4x2_t zb = vuzpq_f32(z0, z1);
float32x4_t z = vmulq_f32(zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32(xy0, xy1);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

uint32x4_t mask = vcgtq_f32(x, maxDot);
maxDot = vbslq_f32(mask, x, maxDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);

float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);

// the next two lines should resolve to a single vswp d, d
float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2));
// the next two lines should resolve to a single vswp d, d
float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2));

xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);

float32x4x2_t zb = vuzpq_f32(z0, z1);
float32x4_t z = vmulq_f32(zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32(xy0, xy1);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

uint32x4_t mask = vcgtq_f32(x, maxDot);
maxDot = vbslq_f32(mask, x, maxDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);

float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);

// the next two lines should resolve to a single vswp d, d
float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
// the next two lines should resolve to a single vswp d, d
float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));

xy0 = vmulq_f32(xy0, vLo);

float32x4x2_t zb = vuzpq_f32(z0, z0);
float32x4_t z = vmulq_f32(zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32(xy0, xy0);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

uint32x4_t mask = vcgtq_f32(x, maxDot);
maxDot = vbslq_f32(mask, x, maxDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);

float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);

// the next two lines should resolve to a single vswp d, d
float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0));
// the next two lines should resolve to a single vswp d, d
float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);

xy0 = vmulq_f32(xy0, vLo);

z = vmulq_f32(z, vHi);
float32x4x2_t xy = vuzpq_f32(xy0, xy0);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

uint32x4_t mask = vcgtq_f32(x, maxDot);
maxDot = vbslq_f32(mask, x, maxDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);

// select best answer between hi and lo results
uint32x2_t mask = vcgt_f32(vget_high_f32(maxDot), vget_low_f32(maxDot));
float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));

// select best answer between even and odd results
float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
uint32x2_t indexHi = vdup_lane_u32(index2, 1);
mask = vcgt_f32(maxDotO, maxDot2);
maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
index2 = vbsl_u32(mask, indexHi, index2);

*dotResult = vget_lane_f32(maxDot2, 0);
return vget_lane_u32(index2, 0);
long b3_mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult)
unsigned long i = 0;
float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
float32x2_t vLo = vget_low_f32(vvec);
float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
float32x2_t dotMinLo = (float32x2_t){B3_INFINITY, B3_INFINITY};
float32x2_t dotMinHi = (float32x2_t){B3_INFINITY, B3_INFINITY};
uint32x2_t indexLo = (uint32x2_t){0, 1};
uint32x2_t indexHi = (uint32x2_t){2, 3};
uint32x2_t iLo = (uint32x2_t){-1, -1};
uint32x2_t iHi = (uint32x2_t){-1, -1};
const uint32x2_t four = (uint32x2_t){4, 4};

for (; i + 8 <= count; i += 8)
float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);

float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
float32x2_t zLo = vmul_f32(z0.val[0], vHi);
float32x2_t zHi = vmul_f32(z1.val[0], vHi);

float32x2_t rLo = vpadd_f32(xy0, xy1);
float32x2_t rHi = vpadd_f32(xy2, xy3);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);

uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);
indexLo = vadd_u32(indexLo, four);
indexHi = vadd_u32(indexHi, four);

v0 = vld1q_f32_aligned_postincrement(vv);
v1 = vld1q_f32_aligned_postincrement(vv);
v2 = vld1q_f32_aligned_postincrement(vv);
v3 = vld1q_f32_aligned_postincrement(vv);

xy0 = vmul_f32(vget_low_f32(v0), vLo);
xy1 = vmul_f32(vget_low_f32(v1), vLo);
xy2 = vmul_f32(vget_low_f32(v2), vLo);
xy3 = vmul_f32(vget_low_f32(v3), vLo);

z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
zLo = vmul_f32(z0.val[0], vHi);
zHi = vmul_f32(z1.val[0], vHi);

rLo = vpadd_f32(xy0, xy1);
rHi = vpadd_f32(xy2, xy3);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);

maskLo = vclt_f32(rLo, dotMinLo);
maskHi = vclt_f32(rHi, dotMinHi);
dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);
indexLo = vadd_u32(indexLo, four);
indexHi = vadd_u32(indexHi, four);

for (; i + 4 <= count; i += 4)
float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);

float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
float32x2_t zLo = vmul_f32(z0.val[0], vHi);
float32x2_t zHi = vmul_f32(z1.val[0], vHi);

float32x2_t rLo = vpadd_f32(xy0, xy1);
float32x2_t rHi = vpadd_f32(xy2, xy3);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);

uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);
indexLo = vadd_u32(indexLo, four);
indexHi = vadd_u32(indexHi, four);

float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);

float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);

float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
float32x2_t zLo = vmul_f32(z0.val[0], vHi);
float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi);

float32x2_t rLo = vpadd_f32(xy0, xy1);
float32x2_t rHi = vpadd_f32(xy2, xy2);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);

uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);

float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);

float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);

float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
float32x2_t zLo = vmul_f32(z0.val[0], vHi);

float32x2_t rLo = vpadd_f32(xy0, xy1);
rLo = vadd_f32(rLo, zLo);

uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
iLo = vbsl_u32(maskLo, indexLo, iLo);

float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
float32x2_t zLo = vmul_f32(z0, vHi);
float32x2_t rLo = vpadd_f32(xy0, xy0);
rLo = vadd_f32(rLo, zLo);
uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
iLo = vbsl_u32(maskLo, indexLo, iLo);

// select best answer between hi and lo results
uint32x2_t mask = vclt_f32(dotMinHi, dotMinLo);
dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
iLo = vbsl_u32(mask, iHi, iLo);

// select best answer between even and odd results
dotMinHi = vdup_lane_f32(dotMinLo, 1);
iHi = vdup_lane_u32(iLo, 1);
mask = vclt_f32(dotMinHi, dotMinLo);
dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
iLo = vbsl_u32(mask, iHi, iLo);

*dotResult = vget_lane_f32(dotMinLo, 0);
return vget_lane_u32(iLo, 0);
long b3_mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult)
float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
const uint32x4_t four = (uint32x4_t){4, 4, 4, 4};
uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3};
uint32x4_t index = (uint32x4_t){-1, -1, -1, -1};
float32x4_t minDot = (float32x4_t){B3_INFINITY, B3_INFINITY, B3_INFINITY, B3_INFINITY};

unsigned long i = 0;
for (; i + 8 <= count; i += 8)
float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

// the next two lines should resolve to a single vswp d, d
float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
// the next two lines should resolve to a single vswp d, d
float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));

xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);

float32x4x2_t zb = vuzpq_f32(z0, z1);
float32x4_t z = vmulq_f32(zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32(xy0, xy1);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

uint32x4_t mask = vcltq_f32(x, minDot);
minDot = vbslq_f32(mask, x, minDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);

v0 = vld1q_f32_aligned_postincrement(vv);
v1 = vld1q_f32_aligned_postincrement(vv);
v2 = vld1q_f32_aligned_postincrement(vv);
v3 = vld1q_f32_aligned_postincrement(vv);

// the next two lines should resolve to a single vswp d, d
xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
// the next two lines should resolve to a single vswp d, d
z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));

xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);

zb = vuzpq_f32(z0, z1);
z = vmulq_f32(zb.val[0], vHi);
xy = vuzpq_f32(xy0, xy1);
x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

mask = vcltq_f32(x, minDot);
minDot = vbslq_f32(mask, x, minDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);

for (; i + 4 <= count; i += 4)
float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

// the next two lines should resolve to a single vswp d, d
float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
// the next two lines should resolve to a single vswp d, d
float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));

xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);

float32x4x2_t zb = vuzpq_f32(z0, z1);
float32x4_t z = vmulq_f32(zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32(xy0, xy1);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

uint32x4_t mask = vcltq_f32(x, minDot);
minDot = vbslq_f32(mask, x, minDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);

float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);

// the next two lines should resolve to a single vswp d, d
float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2));
// the next two lines should resolve to a single vswp d, d
float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2));

xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);

float32x4x2_t zb = vuzpq_f32(z0, z1);
float32x4_t z = vmulq_f32(zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32(xy0, xy1);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

uint32x4_t mask = vcltq_f32(x, minDot);
minDot = vbslq_f32(mask, x, minDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);

float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);

// the next two lines should resolve to a single vswp d, d
float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
// the next two lines should resolve to a single vswp d, d
float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));

xy0 = vmulq_f32(xy0, vLo);

float32x4x2_t zb = vuzpq_f32(z0, z0);
float32x4_t z = vmulq_f32(zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32(xy0, xy0);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

uint32x4_t mask = vcltq_f32(x, minDot);
minDot = vbslq_f32(mask, x, minDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);

float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);

// the next two lines should resolve to a single vswp d, d
float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0));
// the next two lines should resolve to a single vswp d, d
float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);

xy0 = vmulq_f32(xy0, vLo);

z = vmulq_f32(z, vHi);
float32x4x2_t xy = vuzpq_f32(xy0, xy0);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

uint32x4_t mask = vcltq_f32(x, minDot);
minDot = vbslq_f32(mask, x, minDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);

// select best answer between hi and lo results
uint32x2_t mask = vclt_f32(vget_high_f32(minDot), vget_low_f32(minDot));
float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));

// select best answer between even and odd results
float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
uint32x2_t indexHi = vdup_lane_u32(index2, 1);
mask = vclt_f32(minDotO, minDot2);
minDot2 = vbsl_f32(mask, minDotO, minDot2);
index2 = vbsl_u32(mask, indexHi, index2);

*dotResult = vget_lane_f32(minDot2, 0);
return vget_lane_u32(index2, 0);

#error Unhandled __APPLE__ arch

#endif /* __APPLE__ */