Copyright (c) 2011 Apple Inc.
http://continuousphysics.com/Bullet/

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.

This source version has been altered.
#if defined (_WIN32) || defined (__i386__)
#define BT_USE_SSE_IN_API

#include "btVector3.h"

#if defined (BT_USE_SSE) || defined (BT_USE_NEON)

typedef float float4 __attribute__ ((vector_size(16)));
//typedef uint32_t uint4 __attribute__ ((vector_size(16)));

#if defined BT_USE_SSE || defined _WIN32

#define LOG2_ARRAY_SIZE 6
#define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE)
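// Dot products are buffered through a fixed-size on-stack tile: each float4
// slot holds four results, so a full tile covers STACK_ARRAY_COUNT * 4 = 256
// vertices before the running max is rechecked and the tile is reused.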
#include <emmintrin.h>

long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )

const float4 *vertices = (const float4*) vv;
static const unsigned char indexTable[16] = {-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
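// indexTable maps the 4-bit result of _mm_movemask_ps to the index of its
// lowest set bit (mask 1 -> lane 0, mask 2 -> lane 1, mask 4 -> lane 2,
// mask 8 -> lane 3). Entry 0 is a placeholder; the table is only consulted
// once a nonzero mask has been found.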
float4 dotMax = btAssign128( -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY );
float4 vvec = _mm_loadu_ps( vec );
float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa )); /// zzzz
float4 vLo = _mm_movelh_ps( vvec, vvec ); /// xyxy
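// The dot product is computed in split form to avoid touching the unused w lane:
//     dot(v, vec) = v.x*vec.x + v.y*vec.y + v.z*vec.z
// vLo carries vec.xy for pairs of vertices and vHi carries vec.z, so each
// transposed group of four vertices yields four dots in one register.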
float4 stack_array[ STACK_ARRAY_COUNT ];

memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
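// Note: filling the floats with -1 bytes produces all-ones bit patterns,
// i.e. NaNs; a NaN never compares equal under _mm_cmpeq_ps, presumably so
// that unwritten slots can never be mistaken for the max during the rescan.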
// Faster loop without cleanup code for full tiles
for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )

for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )
{ // do four dot products at a time. Carefully avoid touching the w element.
float4 v0 = vertices[0];
float4 v1 = vertices[1];
float4 v2 = vertices[2];
float4 v3 = vertices[3]; vertices += 4;

float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3

float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index] = x;
max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan

v3 = vertices[3]; vertices += 4;

lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index+1] = x;
max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan

v3 = vertices[3]; vertices += 4;

lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index+2] = x;
max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan

v3 = vertices[3]; vertices += 4;

lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index+3] = x;
max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan

// It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.

// If we found a new max
if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))

// copy the new max across all lanes of our max accumulator
max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));

// find first occurrence of that max
for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ ) // local_count must be a multiple of 4

// record where it is.
maxIndex = 4*index + segment + indexTable[test];
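// The winner's global position is reassembled from three parts: segment
// (base of the current tile), 4*index (float4 slot within the tile), and
// indexTable[test] (lane within that slot).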
// account for work we've already done

// Deal with the last < STACK_ARRAY_COUNT vectors
if( btUnlikely( count > 16) )

for( ; index + 4 <= count / 4; index+=4 )
{ // do four dot products at a time. Carefully avoid touching the w element.
float4 v0 = vertices[0];
float4 v1 = vertices[1];
float4 v2 = vertices[2];
float4 v3 = vertices[3]; vertices += 4;

float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3

float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index] = x;
max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan

v3 = vertices[3]; vertices += 4;

lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index+1] = x;
max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan

v3 = vertices[3]; vertices += 4;

lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index+2] = x;
max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan

v3 = vertices[3]; vertices += 4;

lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index+3] = x;
max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan

// It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.

size_t localCount = (count & -4L) - 4*index;

float4 t0, t1, t2, t3, t4;
float4 * sap = &stack_array[index + localCount / 4];
vertices += localCount; // counter the offset
size_t byteIndex = -(localCount) * sizeof(float);
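// byteIndex runs from -localCount*sizeof(float) up toward zero. With the
// *4 scale in the addressing mode below it walks the vertex block forward
// even though 'vertices' was already advanced past it, and the unscaled
// (%[sap], %[byteIndex]) address drops each result into stack_array[index]
// onward, one float4 per 16-byte step.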
// AT&T-style inline assembly
0: movaps %[max], %[t2] // move max out of the way to avoid propagating NaNs in max \n\
movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
movaps %[t0], %[max] // vertices[0] \n\
movlhps %[t1], %[max] // x0y0x1y1 \n\
movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
mulps %[vLo], %[max] // x0y0x1y1 * vLo \n\
movhlps %[t0], %[t1] // z0w0z1w1 \n\
movaps %[t3], %[t0] // vertices[2] \n\
movlhps %[t4], %[t0] // x2y2x3y3 \n\
mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
movhlps %[t3], %[t4] // z2w2z3w3 \n\
shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
movaps %[max], %[t3] // x0y0x1y1 * vLo \n\
shufps $0x88, %[t0], %[max] // x0x1x2x3 * vLo.x \n\
shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
addps %[t3], %[max] // x + y \n\
addps %[t1], %[max] // x + y + z \n\
movaps %[max], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
maxps %[t2], %[max] // record max, restore max \n\
add $16, %[byteIndex] // advance loop counter\n\

: [max] "+x" (max), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
: [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
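// Constraint notes: "+x" ties the accumulator to an SSE register that is
// both read and written, "=&x" marks the temporaries as early-clobber
// outputs so the compiler cannot alias them with inputs, and "+r"/"r" keep
// the counter and pointers in general-purpose registers.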
index += localCount/4;

for( unsigned int i=0; i<localCount/4; i++,index++)
{ // do four dot products at a time. Carefully avoid touching the w element.
float4 v0 = vertices[0];
float4 v1 = vertices[1];
float4 v2 = vertices[2];
float4 v3 = vertices[3];

float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3

float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index] = x;
max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan

// process the last few points

float4 v0, v1, v2, x, y, z;

// Calculate 3 dot products, transpose, duplicate v2
float4 lo0 = _mm_movelh_ps( v0, v1); // xyxy.lo
float4 hi0 = _mm_movehl_ps( v1, v0); // z?z?.lo

z = _mm_shuffle_ps(hi0, v2, 0xa8 ); // z0z1z2z2

float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy

x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

float4 xy = _mm_movelh_ps(v0, v1);
z = _mm_movehl_ps(v1, v0);

z = _mm_shuffle_ps( z, z, 0xa8);
x = _mm_shuffle_ps( xy, xy, 0xa8);
y = _mm_shuffle_ps( xy, xy, 0xfd);

float4 xy = vertices[0];
z = _mm_shuffle_ps( xy, xy, 0xaa);

x = _mm_shuffle_ps(xy, xy, 0);
y = _mm_shuffle_ps(xy, xy, 0x55);

stack_array[index] = x;
max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan

// if we found a new max.
if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
{ // we found a new max. Search for it
// find max across the max vector, place in all elements of max -- big latency hit here
max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));

// It is slightly faster to do this part in scalar code when count < 8. However, the common case
// where it actually makes a difference is handled in the early-out at the top of the function,
// so it is less than a 1% difference here. I opted for improved code size, fewer branches and
// reduced complexity, and removed it.

// scan for the first occurrence of max in the array
for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ ) // local_count must be a multiple of 4

maxIndex = 4*index + segment + indexTable[test];

_mm_store_ss( dotResult, dotMax);
long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );

long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )

const float4 *vertices = (const float4*) vv;
static const unsigned char indexTable[16] = {-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
float4 dotmin = btAssign128( BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY );
float4 vvec = _mm_loadu_ps( vec );
float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa )); /// zzzz
float4 vLo = _mm_movelh_ps( vvec, vvec ); /// xyxy

float4 stack_array[ STACK_ARRAY_COUNT ];

memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );

// Faster loop without cleanup code for full tiles
for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )

for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )
{ // do four dot products at a time. Carefully avoid touching the w element.
float4 v0 = vertices[0];
float4 v1 = vertices[1];
float4 v2 = vertices[2];
float4 v3 = vertices[3]; vertices += 4;

float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3

float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index] = x;
min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan

v3 = vertices[3]; vertices += 4;

lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index+1] = x;
min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan

v3 = vertices[3]; vertices += 4;

lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index+2] = x;
min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan

v3 = vertices[3]; vertices += 4;

lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index+3] = x;
min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan

// It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.

// If we found a new min
if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))

// copy the new min across all lanes of our min accumulator
min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));

// find first occurrence of that min
for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ ) // local_count must be a multiple of 4
// record where it is.
minIndex = 4*index + segment + indexTable[test];

// account for work we've already done

// Deal with the last < STACK_ARRAY_COUNT vectors
if( btUnlikely( count > 16) )

for( ; index + 4 <= count / 4; index+=4 )
{ // do four dot products at a time. Carefully avoid touching the w element.
float4 v0 = vertices[0];
float4 v1 = vertices[1];
float4 v2 = vertices[2];
float4 v3 = vertices[3]; vertices += 4;

float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3

float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index] = x;
min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan

v3 = vertices[3]; vertices += 4;

lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index+1] = x;
min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan

v3 = vertices[3]; vertices += 4;

lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index+2] = x;
min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan

v3 = vertices[3]; vertices += 4;

lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index+3] = x;
min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan

// It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.

size_t localCount = (count & -4L) - 4*index;

vertices += localCount; // counter the offset
float4 t0, t1, t2, t3, t4;
size_t byteIndex = -(localCount) * sizeof(float);
float4 * sap = &stack_array[index + localCount / 4];
0: movaps %[min], %[t2] // move min out of the way to avoid propagating NaNs in min \n\
movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
movaps %[t0], %[min] // vertices[0] \n\
movlhps %[t1], %[min] // x0y0x1y1 \n\
movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
mulps %[vLo], %[min] // x0y0x1y1 * vLo \n\
movhlps %[t0], %[t1] // z0w0z1w1 \n\
movaps %[t3], %[t0] // vertices[2] \n\
movlhps %[t4], %[t0] // x2y2x3y3 \n\
movhlps %[t3], %[t4] // z2w2z3w3 \n\
mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
movaps %[min], %[t3] // x0y0x1y1 * vLo \n\
shufps $0x88, %[t0], %[min] // x0x1x2x3 * vLo.x \n\
shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
addps %[t3], %[min] // x + y \n\
addps %[t1], %[min] // x + y + z \n\
movaps %[min], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
minps %[t2], %[min] // record min, restore min \n\
add $16, %[byteIndex] // advance loop counter\n\

: [min] "+x" (min), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
: [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
index += localCount/4;

for( unsigned int i=0; i<localCount/4; i++,index++)
{ // do four dot products at a time. Carefully avoid touching the w element.
float4 v0 = vertices[0];
float4 v1 = vertices[1];
float4 v2 = vertices[2];
float4 v3 = vertices[3];

float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3

float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index] = x;
min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan

// process the last few points

float4 v0, v1, v2, x, y, z;

// Calculate 3 dot products, transpose, duplicate v2
float4 lo0 = _mm_movelh_ps( v0, v1); // xyxy.lo
float4 hi0 = _mm_movehl_ps( v1, v0); // z?z?.lo

z = _mm_shuffle_ps(hi0, v2, 0xa8 ); // z0z1z2z2

float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy

x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

float4 xy = _mm_movelh_ps(v0, v1);
z = _mm_movehl_ps(v1, v0);

z = _mm_shuffle_ps( z, z, 0xa8);
x = _mm_shuffle_ps( xy, xy, 0xa8);
y = _mm_shuffle_ps( xy, xy, 0xfd);

float4 xy = vertices[0];
z = _mm_shuffle_ps( xy, xy, 0xaa);

x = _mm_shuffle_ps(xy, xy, 0);
y = _mm_shuffle_ps(xy, xy, 0x55);

stack_array[index] = x;
min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan

// if we found a new min.
if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
{ // we found a new min. Search for it
// find min across the min vector, place in all elements of min -- big latency hit here
min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));

// It is slightly faster to do this part in scalar code when count < 8. However, the common case
// where it actually makes a difference is handled in the early-out at the top of the function,
// so it is less than a 1% difference here. I opted for improved code size, fewer branches and
// reduced complexity, and removed it.

// scan for the first occurrence of min in the array
for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ ) // local_count must be a multiple of 4

minIndex = 4*index + segment + indexTable[test];

_mm_store_ss( dotResult, dotmin);
#elif defined BT_USE_NEON

#define ARM_NEON_GCC_COMPATIBILITY 1
#include <arm_neon.h>

static long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );

long (*_maxdot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _maxdot_large_sel;
long (*_mindot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _mindot_large_sel;

extern "C" {int _get_cpu_capabilities( void );}

static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )

if( _get_cpu_capabilities() & 0x2000 )
    _maxdot_large = _maxdot_large_v1;
else
    _maxdot_large = _maxdot_large_v0;

return _maxdot_large(vv, vec, count, dotResult);

static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )

if( _get_cpu_capabilities() & 0x2000 )
    _mindot_large = _mindot_large_v1;
else
    _mindot_large = _mindot_large_v0;

return _mindot_large(vv, vec, count, dotResult);
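// Lazy dispatch: _maxdot_large/_mindot_large start out pointing at these
// _sel routines, which probe the CPU once, repoint the public function
// pointer at the preferred variant, and then forward the current call.
// Later calls skip the probe entirely. (What the 0x2000 capability bit
// denotes is not documented here; it is simply the condition gating the
// v1 path.)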
#define vld1q_f32_aligned_postincrement( _ptr ) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; })
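// This macro is a GNU statement expression: it performs one NEON load with
// a :128 alignment hint and post-increments the pointer in the same
// instruction; the "+r" constraint writes the advanced pointer back to the
// caller's variable, and _r is the expression's value.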
long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )

float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
float32x2_t vLo = vget_low_f32(vvec);
float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
float32x2_t dotMaxLo = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
float32x2_t dotMaxHi = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
uint32x2_t indexLo = (uint32x2_t) {0, 1};
uint32x2_t indexHi = (uint32x2_t) {2, 3};
uint32x2_t iLo = (uint32x2_t) {-1, -1};
uint32x2_t iHi = (uint32x2_t) {-1, -1};
const uint32x2_t four = (uint32x2_t) {4,4};
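// v0 strategy: the running max is tracked in two 2-lane halves
// (dotMaxLo/dotMaxHi) with parallel per-lane vertex indices (iLo/iHi,
// seeded with -1 for "none yet"). vbsl merges new winners lane by lane
// under a vcgt mask, and the halves are reduced to a single winner after
// the loops.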
for( ; i+8 <= count; i+= 8 )

float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);

float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
float32x2_t zLo = vmul_f32( z0.val[0], vHi);
float32x2_t zHi = vmul_f32( z1.val[0], vHi);

float32x2_t rLo = vpadd_f32( xy0, xy1);
float32x2_t rHi = vpadd_f32( xy2, xy3);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);

uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);
indexLo = vadd_u32(indexLo, four);
indexHi = vadd_u32(indexHi, four);

v0 = vld1q_f32_aligned_postincrement( vv );
v1 = vld1q_f32_aligned_postincrement( vv );
v2 = vld1q_f32_aligned_postincrement( vv );
v3 = vld1q_f32_aligned_postincrement( vv );

xy0 = vmul_f32( vget_low_f32(v0), vLo);
xy1 = vmul_f32( vget_low_f32(v1), vLo);
xy2 = vmul_f32( vget_low_f32(v2), vLo);
xy3 = vmul_f32( vget_low_f32(v3), vLo);

z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
zLo = vmul_f32( z0.val[0], vHi);
zHi = vmul_f32( z1.val[0], vHi);

rLo = vpadd_f32( xy0, xy1);
rHi = vpadd_f32( xy2, xy3);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);

maskLo = vcgt_f32( rLo, dotMaxLo );
maskHi = vcgt_f32( rHi, dotMaxHi );
dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);
indexLo = vadd_u32(indexLo, four);
indexHi = vadd_u32(indexHi, four);

for( ; i+4 <= count; i+= 4 )

float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);

float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
float32x2_t zLo = vmul_f32( z0.val[0], vHi);
float32x2_t zHi = vmul_f32( z1.val[0], vHi);

float32x2_t rLo = vpadd_f32( xy0, xy1);
float32x2_t rHi = vpadd_f32( xy2, xy3);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);

uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);
indexLo = vadd_u32(indexLo, four);
indexHi = vadd_u32(indexHi, four);

float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );

float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);

float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
float32x2_t zLo = vmul_f32( z0.val[0], vHi);
float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);

float32x2_t rLo = vpadd_f32( xy0, xy1);
float32x2_t rHi = vpadd_f32( xy2, xy2);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);

uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);

float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );

float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);

float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
float32x2_t zLo = vmul_f32( z0.val[0], vHi);

float32x2_t rLo = vpadd_f32( xy0, xy1);
rLo = vadd_f32(rLo, zLo);

uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
iLo = vbsl_u32(maskLo, indexLo, iLo);

float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
float32x2_t zLo = vmul_f32( z0, vHi);
float32x2_t rLo = vpadd_f32( xy0, xy0);
rLo = vadd_f32(rLo, zLo);
uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
iLo = vbsl_u32(maskLo, indexLo, iLo);

// select best answer between hi and lo results
uint32x2_t mask = vcgt_f32( dotMaxHi, dotMaxLo );
dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
iLo = vbsl_u32(mask, iHi, iLo);

// select best answer between even and odd results
dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
iHi = vdup_lane_u32(iLo, 1);
mask = vcgt_f32( dotMaxHi, dotMaxLo );
dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
iLo = vbsl_u32(mask, iHi, iLo);

*dotResult = vget_lane_f32( dotMaxLo, 0);
return vget_lane_u32(iLo, 0);

long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )

float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
uint32x4_t index = (uint32x4_t) { -1, -1, -1, -1 };
float32x4_t maxDot = (float32x4_t) { -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY };
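// v1 strategy: unlike v0's paired 2-lane halves, this variant keeps one
// 4-lane accumulator with matching per-lane indices that advance by four
// each step; the cross-lane reduction is deferred to the very end.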
unsigned long i = 0;
for( ; i + 8 <= count; i += 8 )

float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

// the next two lines should resolve to a single vswp d, d
float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
// the next two lines should resolve to a single vswp d, d
float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);

float32x4x2_t zb = vuzpq_f32( z0, z1);
float32x4_t z = vmulq_f32( zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32( xy0, xy1);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

uint32x4_t mask = vcgtq_f32(x, maxDot);
maxDot = vbslq_f32( mask, x, maxDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);

v0 = vld1q_f32_aligned_postincrement( vv );
v1 = vld1q_f32_aligned_postincrement( vv );
v2 = vld1q_f32_aligned_postincrement( vv );
v3 = vld1q_f32_aligned_postincrement( vv );

// the next two lines should resolve to a single vswp d, d
xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
// the next two lines should resolve to a single vswp d, d
z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);

zb = vuzpq_f32( z0, z1);
z = vmulq_f32( zb.val[0], vHi);
xy = vuzpq_f32( xy0, xy1);
x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

mask = vcgtq_f32(x, maxDot);
maxDot = vbslq_f32( mask, x, maxDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);

for( ; i + 4 <= count; i += 4 )

float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

// the next two lines should resolve to a single vswp d, d
float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
// the next two lines should resolve to a single vswp d, d
float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);

float32x4x2_t zb = vuzpq_f32( z0, z1);
float32x4_t z = vmulq_f32( zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32( xy0, xy1);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

uint32x4_t mask = vcgtq_f32(x, maxDot);
maxDot = vbslq_f32( mask, x, maxDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);

switch (count & 3) {

float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );

// the next two lines should resolve to a single vswp d, d
float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
// the next two lines should resolve to a single vswp d, d
float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));

xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);

float32x4x2_t zb = vuzpq_f32( z0, z1);
float32x4_t z = vmulq_f32( zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32( xy0, xy1);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

uint32x4_t mask = vcgtq_f32(x, maxDot);
maxDot = vbslq_f32( mask, x, maxDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);

float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );

// this should resolve to a single vswp d, d
float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
// this should resolve to a single vswp d, d
float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));

xy0 = vmulq_f32(xy0, vLo);

float32x4x2_t zb = vuzpq_f32( z0, z0);
float32x4_t z = vmulq_f32( zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32( xy0, xy0);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

uint32x4_t mask = vcgtq_f32(x, maxDot);
maxDot = vbslq_f32( mask, x, maxDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);

float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );

// duplicate the low half of v0
float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
// splat z0 across all four lanes
float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);

xy0 = vmulq_f32(xy0, vLo);

z = vmulq_f32( z, vHi);
float32x4x2_t xy = vuzpq_f32( xy0, xy0);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

uint32x4_t mask = vcgtq_f32(x, maxDot);
maxDot = vbslq_f32( mask, x, maxDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);

// select best answer between hi and lo results
uint32x2_t mask = vcgt_f32( vget_high_f32(maxDot), vget_low_f32(maxDot));
float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));

// select best answer between even and odd results
float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
uint32x2_t indexHi = vdup_lane_u32(index2, 1);
mask = vcgt_f32( maxDotO, maxDot2 );
maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
index2 = vbsl_u32(mask, indexHi, index2);

*dotResult = vget_lane_f32( maxDot2, 0);
return vget_lane_u32(index2, 0);
long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )

unsigned long i = 0;
float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
float32x2_t vLo = vget_low_f32(vvec);
float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
float32x2_t dotMinLo = (float32x2_t) { BT_INFINITY, BT_INFINITY };
float32x2_t dotMinHi = (float32x2_t) { BT_INFINITY, BT_INFINITY };
uint32x2_t indexLo = (uint32x2_t) {0, 1};
uint32x2_t indexHi = (uint32x2_t) {2, 3};
uint32x2_t iLo = (uint32x2_t) {-1, -1};
uint32x2_t iHi = (uint32x2_t) {-1, -1};
const uint32x2_t four = (uint32x2_t) {4,4};

for( ; i+8 <= count; i+= 8 )

float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);

float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
float32x2_t zLo = vmul_f32( z0.val[0], vHi);
float32x2_t zHi = vmul_f32( z1.val[0], vHi);

float32x2_t rLo = vpadd_f32( xy0, xy1);
float32x2_t rHi = vpadd_f32( xy2, xy3);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);

uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);
indexLo = vadd_u32(indexLo, four);
indexHi = vadd_u32(indexHi, four);

v0 = vld1q_f32_aligned_postincrement( vv );
v1 = vld1q_f32_aligned_postincrement( vv );
v2 = vld1q_f32_aligned_postincrement( vv );
v3 = vld1q_f32_aligned_postincrement( vv );

xy0 = vmul_f32( vget_low_f32(v0), vLo);
xy1 = vmul_f32( vget_low_f32(v1), vLo);
xy2 = vmul_f32( vget_low_f32(v2), vLo);
xy3 = vmul_f32( vget_low_f32(v3), vLo);

z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
zLo = vmul_f32( z0.val[0], vHi);
zHi = vmul_f32( z1.val[0], vHi);

rLo = vpadd_f32( xy0, xy1);
rHi = vpadd_f32( xy2, xy3);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);

maskLo = vclt_f32( rLo, dotMinLo );
maskHi = vclt_f32( rHi, dotMinHi );
dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);
indexLo = vadd_u32(indexLo, four);
indexHi = vadd_u32(indexHi, four);

for( ; i+4 <= count; i+= 4 )

float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);

float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
float32x2_t zLo = vmul_f32( z0.val[0], vHi);
float32x2_t zHi = vmul_f32( z1.val[0], vHi);

float32x2_t rLo = vpadd_f32( xy0, xy1);
float32x2_t rHi = vpadd_f32( xy2, xy3);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);

uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);
indexLo = vadd_u32(indexLo, four);
indexHi = vadd_u32(indexHi, four);

float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );

float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);

float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
float32x2_t zLo = vmul_f32( z0.val[0], vHi);
float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);

float32x2_t rLo = vpadd_f32( xy0, xy1);
float32x2_t rHi = vpadd_f32( xy2, xy2);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);

uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);

float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );

float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);

float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
float32x2_t zLo = vmul_f32( z0.val[0], vHi);

float32x2_t rLo = vpadd_f32( xy0, xy1);
rLo = vadd_f32(rLo, zLo);

uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
iLo = vbsl_u32(maskLo, indexLo, iLo);

float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
float32x2_t zLo = vmul_f32( z0, vHi);
float32x2_t rLo = vpadd_f32( xy0, xy0);
rLo = vadd_f32(rLo, zLo);
uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
iLo = vbsl_u32(maskLo, indexLo, iLo);

// select best answer between hi and lo results
uint32x2_t mask = vclt_f32( dotMinHi, dotMinLo );
dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
iLo = vbsl_u32(mask, iHi, iLo);

// select best answer between even and odd results
dotMinHi = vdup_lane_f32(dotMinLo, 1);
iHi = vdup_lane_u32(iLo, 1);
mask = vclt_f32( dotMinHi, dotMinLo );
dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
iLo = vbsl_u32(mask, iHi, iLo);

*dotResult = vget_lane_f32( dotMinLo, 0);
return vget_lane_u32(iLo, 0);
long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )

float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
uint32x4_t index = (uint32x4_t) { -1, -1, -1, -1 };
float32x4_t minDot = (float32x4_t) { BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY };

unsigned long i = 0;
for( ; i + 8 <= count; i += 8 )

float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

// the next two lines should resolve to a single vswp d, d
float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
// the next two lines should resolve to a single vswp d, d
float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);

float32x4x2_t zb = vuzpq_f32( z0, z1);
float32x4_t z = vmulq_f32( zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32( xy0, xy1);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

uint32x4_t mask = vcltq_f32(x, minDot);
minDot = vbslq_f32( mask, x, minDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);

v0 = vld1q_f32_aligned_postincrement( vv );
v1 = vld1q_f32_aligned_postincrement( vv );
v2 = vld1q_f32_aligned_postincrement( vv );
v3 = vld1q_f32_aligned_postincrement( vv );

// the next two lines should resolve to a single vswp d, d
xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
// the next two lines should resolve to a single vswp d, d
z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);

zb = vuzpq_f32( z0, z1);
z = vmulq_f32( zb.val[0], vHi);
xy = vuzpq_f32( xy0, xy1);
x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

mask = vcltq_f32(x, minDot);
minDot = vbslq_f32( mask, x, minDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);

for( ; i + 4 <= count; i += 4 )

float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

// the next two lines should resolve to a single vswp d, d
float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
// the next two lines should resolve to a single vswp d, d
float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);

float32x4x2_t zb = vuzpq_f32( z0, z1);
float32x4_t z = vmulq_f32( zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32( xy0, xy1);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

uint32x4_t mask = vcltq_f32(x, minDot);
minDot = vbslq_f32( mask, x, minDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);

switch (count & 3) {

float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );

// the next two lines should resolve to a single vswp d, d
float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
// the next two lines should resolve to a single vswp d, d
float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));

xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);

float32x4x2_t zb = vuzpq_f32( z0, z1);
float32x4_t z = vmulq_f32( zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32( xy0, xy1);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

uint32x4_t mask = vcltq_f32(x, minDot);
minDot = vbslq_f32( mask, x, minDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);

float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );

// this should resolve to a single vswp d, d
float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
// this should resolve to a single vswp d, d
float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));

xy0 = vmulq_f32(xy0, vLo);

float32x4x2_t zb = vuzpq_f32( z0, z0);
float32x4_t z = vmulq_f32( zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32( xy0, xy0);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

uint32x4_t mask = vcltq_f32(x, minDot);
minDot = vbslq_f32( mask, x, minDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);

float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );

// duplicate the low half of v0
float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
// splat z0 across all four lanes
float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);

xy0 = vmulq_f32(xy0, vLo);

z = vmulq_f32( z, vHi);
float32x4x2_t xy = vuzpq_f32( xy0, xy0);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

uint32x4_t mask = vcltq_f32(x, minDot);
minDot = vbslq_f32( mask, x, minDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);

// select best answer between hi and lo results
uint32x2_t mask = vclt_f32( vget_high_f32(minDot), vget_low_f32(minDot));
float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));

// select best answer between even and odd results
float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
uint32x2_t indexHi = vdup_lane_u32(index2, 1);
mask = vclt_f32( minDotO, minDot2 );
minDot2 = vbsl_f32(mask, minDotO, minDot2);
index2 = vbsl_u32(mask, indexHi, index2);

*dotResult = vget_lane_f32( minDot2, 0);
return vget_lane_u32(index2, 0);

#error Unhandled __APPLE__ arch

#endif /* __APPLE__ */