src/LinearMath/btVector3.cpp

   1 /*
   2  Copyright (c) 2011 Apple Inc.
   3  http://continuousphysics.com/Bullet/
   4
   5  This software is provided 'as-is', without any express or implied warranty.
   6  In no event will the authors be held liable for any damages arising from the use of this software.
   7  Permission is granted to anyone to use this software for any purpose,
   8  including commercial applications, and to alter it and redistribute it freely,
   9  subject to the following restrictions:
  10
  11  1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
  12  2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
  13  3. This notice may not be removed or altered from any source distribution.
  14
  15  This source version has been altered.
  16  */
  17
  18 #if defined (_WIN32) || defined (__i386__)
  19 #define BT_USE_SSE_IN_API
  20 #endif
  21
  22 #include "btVector3.h"
  23
  24 #if defined (BT_USE_SSE) || defined (BT_USE_NEON)
  25
  26 #ifdef __APPLE__
  27 #include <stdint.h>
  28 typedef  float float4 __attribute__ ((vector_size(16)));
  29 #else
  30 #define float4 __m128
  31 #endif
  32 //typedef  uint32_t uint4 __attribute__ ((vector_size(16)));
  33
  34
  35 #if defined BT_USE_SSE || defined _WIN32
  36
  37 #define LOG2_ARRAY_SIZE     6
  38 #define STACK_ARRAY_COUNT   (1UL << LOG2_ARRAY_SIZE)
  39
  40 #include <emmintrin.h>
  41
  42 long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
  43 long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
  44 {
  45     const float4 *vertices = (const float4*) vv;
  46     static const unsigned char indexTable[16] = {-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
  47     float4 dotMax = btAssign128( -BT_INFINITY,  -BT_INFINITY,  -BT_INFINITY,  -BT_INFINITY );
  48     float4 vvec = _mm_loadu_ps( vec );
  49     float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa ));          /// zzzz
  50     float4 vLo = _mm_movelh_ps( vvec, vvec );                               /// xyxy
  51
  52     long maxIndex = -1L;
  53
  54     size_t segment = 0;
  55     float4 stack_array[ STACK_ARRAY_COUNT ];
  56
  57 #if DEBUG
  58     memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
  59 #endif
  60
  61     size_t index;
  62     float4 max;
  63     // Faster loop without cleanup code for full tiles
  64     for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
  65     {
  66         max = dotMax;
  67
  68         for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )
  69         { // do four dot products at a time. Carefully avoid touching the w element.
  70             float4 v0 = vertices[0];
  71             float4 v1 = vertices[1];
  72             float4 v2 = vertices[2];
  73             float4 v3 = vertices[3];            vertices += 4;
  74
  75             float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
  76             float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
  77             float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
  78             float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
  79
  80             lo0 = lo0*vLo;
  81             lo1 = lo1*vLo;
  82             float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
  83             float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
  84             float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
  85             z = z*vHi;
  86             x = x+y;
  87             x = x+z;
  88             stack_array[index] = x;
  89             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
  90
  91             v0 = vertices[0];
  92             v1 = vertices[1];
  93             v2 = vertices[2];
  94             v3 = vertices[3];            vertices += 4;
  95
  96             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
  97             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
  98             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
  99             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
 100
 101             lo0 = lo0*vLo;
 102             lo1 = lo1*vLo;
 103             z = _mm_shuffle_ps(hi0, hi1, 0x88);
 104             x = _mm_shuffle_ps(lo0, lo1, 0x88);
 105             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
 106             z = z*vHi;
 107             x = x+y;
 108             x = x+z;
 109             stack_array[index+1] = x;
 110             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
 111
 112             v0 = vertices[0];
 113             v1 = vertices[1];
 114             v2 = vertices[2];
 115             v3 = vertices[3];            vertices += 4;
 116
 117             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
 118             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
 119             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
 120             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
 121
 122             lo0 = lo0*vLo;
 123             lo1 = lo1*vLo;
 124             z = _mm_shuffle_ps(hi0, hi1, 0x88);
 125             x = _mm_shuffle_ps(lo0, lo1, 0x88);
 126             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
 127             z = z*vHi;
 128             x = x+y;
 129             x = x+z;
 130             stack_array[index+2] = x;
 131             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
 132
 133             v0 = vertices[0];
 134             v1 = vertices[1];
 135             v2 = vertices[2];
 136             v3 = vertices[3];            vertices += 4;
 137
 138             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
 139             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
 140             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
 141             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
 142
 143             lo0 = lo0*vLo;
 144             lo1 = lo1*vLo;
 145             z = _mm_shuffle_ps(hi0, hi1, 0x88);
 146             x = _mm_shuffle_ps(lo0, lo1, 0x88);
 147             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
 148             z = z*vHi;
 149             x = x+y;
 150             x = x+z;
 151             stack_array[index+3] = x;
 152             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
 153
 154             // It is too costly to keep the index of the max here. We will look for it again later.  We save a lot of work this way.
 155         }
 156
 157         // If we found a new max
 158         if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
 159         {
 160             // copy the new max across all lanes of our max accumulator
 161             max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
 162             max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
 163
 164             dotMax = max;
 165
 166             // find first occurrence of that max
 167             size_t test;
 168             for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ )   // local_count must be a multiple of 4
 169             {}
 170             // record where it is.
 171             maxIndex = 4*index + segment + indexTable[test];
 172         }
 173     }
 174
 175     // account for work we've already done
 176     count -= segment;
 177
 178     // Deal with the last < STACK_ARRAY_COUNT vectors
 179     max = dotMax;
 180     index = 0;
 181
 182
 183     if( btUnlikely( count > 16) )
 184     {
 185         for( ; index + 4 <= count / 4; index+=4 )
 186         { // do four dot products at a time. Carefully avoid touching the w element.
 187             float4 v0 = vertices[0];
 188             float4 v1 = vertices[1];
 189             float4 v2 = vertices[2];
 190             float4 v3 = vertices[3];            vertices += 4;
 191
 192             float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
 193             float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
 194             float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
 195             float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
 196
 197             lo0 = lo0*vLo;
 198             lo1 = lo1*vLo;
 199             float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
 200             float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
 201             float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
 202             z = z*vHi;
 203             x = x+y;
 204             x = x+z;
 205             stack_array[index] = x;
 206             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
 207
 208             v0 = vertices[0];
 209             v1 = vertices[1];
 210             v2 = vertices[2];
 211             v3 = vertices[3];            vertices += 4;
 212
 213             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
 214             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
 215             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
 216             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
 217
 218             lo0 = lo0*vLo;
 219             lo1 = lo1*vLo;
 220             z = _mm_shuffle_ps(hi0, hi1, 0x88);
 221             x = _mm_shuffle_ps(lo0, lo1, 0x88);
 222             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
 223             z = z*vHi;
 224             x = x+y;
 225             x = x+z;
 226             stack_array[index+1] = x;
 227             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
 228
 229             v0 = vertices[0];
 230             v1 = vertices[1];
 231             v2 = vertices[2];
 232             v3 = vertices[3];            vertices += 4;
 233
 234             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
 235             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
 236             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
 237             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
 238
 239             lo0 = lo0*vLo;
 240             lo1 = lo1*vLo;
 241             z = _mm_shuffle_ps(hi0, hi1, 0x88);
 242             x = _mm_shuffle_ps(lo0, lo1, 0x88);
 243             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
 244             z = z*vHi;
 245             x = x+y;
 246             x = x+z;
 247             stack_array[index+2] = x;
 248             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
 249
 250             v0 = vertices[0];
 251             v1 = vertices[1];
 252             v2 = vertices[2];
 253             v3 = vertices[3];            vertices += 4;
 254
 255             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
 256             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
 257             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
 258             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
 259
 260             lo0 = lo0*vLo;
 261             lo1 = lo1*vLo;
 262             z = _mm_shuffle_ps(hi0, hi1, 0x88);
 263             x = _mm_shuffle_ps(lo0, lo1, 0x88);
 264             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
 265             z = z*vHi;
 266             x = x+y;
 267             x = x+z;
 268             stack_array[index+3] = x;
 269             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
 270
 271             // It is too costly to keep the index of the max here. We will look for it again later.  We save a lot of work this way.
 272         }
 273     }
 274
 275     size_t localCount = (count & -4L) - 4*index;
 276     if( localCount )
 277     {
 278 #ifdef __APPLE__
 279         float4 t0, t1, t2, t3, t4;
 280         float4 * sap = &stack_array[index + localCount / 4];
 281           vertices += localCount;      // counter the offset
 282          size_t byteIndex = -(localCount) * sizeof(float);
 283         //AT&T Code style assembly
 284         asm volatile
 285         (   ".align 4                                                                   \n\
 286              0: movaps  %[max], %[t2]                            // move max out of the way to avoid propagating NaNs in max \n\
 287           movaps  (%[vertices], %[byteIndex], 4),    %[t0]    // vertices[0]      \n\
 288           movaps  16(%[vertices], %[byteIndex], 4),  %[t1]    // vertices[1]      \n\
 289           movaps  %[t0], %[max]                               // vertices[0]      \n\
 290           movlhps %[t1], %[max]                               // x0y0x1y1         \n\
 291          movaps  32(%[vertices], %[byteIndex], 4),  %[t3]    // vertices[2]      \n\
 292          movaps  48(%[vertices], %[byteIndex], 4),  %[t4]    // vertices[3]      \n\
 293           mulps   %[vLo], %[max]                              // x0y0x1y1 * vLo   \n\
 294          movhlps %[t0], %[t1]                                // z0w0z1w1         \n\
 295          movaps  %[t3], %[t0]                                // vertices[2]      \n\
 296          movlhps %[t4], %[t0]                                // x2y2x3y3         \n\
 297          mulps   %[vLo], %[t0]                               // x2y2x3y3 * vLo   \n\
 298           movhlps %[t3], %[t4]                                // z2w2z3w3         \n\
 299           shufps  $0x88, %[t4], %[t1]                         // z0z1z2z3         \n\
 300           mulps   %[vHi], %[t1]                               // z0z1z2z3 * vHi   \n\
 301          movaps  %[max], %[t3]                               // x0y0x1y1 * vLo   \n\
 302          shufps  $0x88, %[t0], %[max]                        // x0x1x2x3 * vLo.x \n\
 303          shufps  $0xdd, %[t0], %[t3]                         // y0y1y2y3 * vLo.y \n\
 304          addps   %[t3], %[max]                               // x + y            \n\
 305          addps   %[t1], %[max]                               // x + y + z        \n\
 306          movaps  %[max], (%[sap], %[byteIndex])              // record result for later scrutiny \n\
 307          maxps   %[t2], %[max]                               // record max, restore max   \n\
 308          add     $16, %[byteIndex]                           // advance loop counter\n\
 309          jnz     0b                                          \n\
 310      "
 311          : [max] "+x" (max), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
 312          : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
 313          : "memory", "cc"
 314          );
 315         index += localCount/4;
 316 #else
 317         {
 318             for( unsigned int i=0; i<localCount/4; i++,index++)
 319             { // do four dot products at a time. Carefully avoid touching the w element.
 320                 float4 v0 = vertices[0];
 321                 float4 v1 = vertices[1];
 322                 float4 v2 = vertices[2];
 323                 float4 v3 = vertices[3];
 324                 vertices += 4;
 325
 326                 float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
 327                 float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
 328                 float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
 329                 float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
 330
 331                 lo0 = lo0*vLo;
 332                 lo1 = lo1*vLo;
 333                 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
 334                 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
 335                 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
 336                 z = z*vHi;
 337                 x = x+y;
 338                 x = x+z;
 339                 stack_array[index] = x;
 340                 max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
 341             }
 342         }
 343 #endif //__APPLE__
 344     }
 345
 346     // process the last few points
 347     if( count & 3 )
 348     {
 349         float4 v0, v1, v2, x, y, z;
 350         switch( count & 3 )
 351         {
 352             case 3:
 353             {
 354                 v0 = vertices[0];
 355                 v1 = vertices[1];
 356                 v2 = vertices[2];
 357
 358                 // Calculate 3 dot products, transpose, duplicate v2
 359                 float4 lo0 = _mm_movelh_ps( v0, v1);        // xyxy.lo
 360                 float4 hi0 = _mm_movehl_ps( v1, v0);        // z?z?.lo
 361                 lo0 = lo0*vLo;
 362                 z = _mm_shuffle_ps(hi0, v2,  0xa8 );           // z0z1z2z2
 363                 z = z*vHi;
 364                 float4 lo1 = _mm_movelh_ps(v2, v2);          // xyxy
 365                 lo1 = lo1*vLo;
 366                 x = _mm_shuffle_ps(lo0, lo1, 0x88);
 367                 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
 368             }
 369                 break;
 370             case 2:
 371             {
 372                 v0 = vertices[0];
 373                 v1 = vertices[1];
 374                 float4 xy = _mm_movelh_ps(v0, v1);
 375                 z = _mm_movehl_ps(v1, v0);
 376                 xy = xy*vLo;
 377                 z = _mm_shuffle_ps( z, z,  0xa8);
 378                 x = _mm_shuffle_ps( xy, xy, 0xa8);
 379                 y = _mm_shuffle_ps( xy, xy, 0xfd);
 380                 z = z*vHi;
 381             }
 382                 break;
 383             case 1:
 384             {
 385                 float4 xy = vertices[0];
 386                 z =  _mm_shuffle_ps( xy, xy, 0xaa);
 387                 xy = xy*vLo;
 388                 z = z*vHi;
 389                 x = _mm_shuffle_ps(xy, xy, 0);
 390                 y = _mm_shuffle_ps(xy, xy, 0x55);
 391             }
 392                 break;
 393         }
 394         x = x+y;
 395         x = x+z;
 396         stack_array[index] = x;
 397         max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
 398         index++;
 399     }
 400
 401     // if we found a new max.
 402     if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
 403     { // we found a new max. Search for it
 404       // find max across the max vector, place in all elements of max -- big latency hit here
 405         max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
 406         max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
 407
 408         // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
 409         // this where it actually makes a difference is handled in the early out at the top of the function,
 410         // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
 411         // complexity, and removed it.
 412
 413         dotMax = max;
 414
 415         // scan for the first occurence of max in the array
 416         size_t test;
 417         for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ )   // local_count must be a multiple of 4
 418         {}
 419         maxIndex = 4*index + segment + indexTable[test];
 420     }
 421
 422     _mm_store_ss( dotResult, dotMax);
 423     return maxIndex;
 424 }
 425
 426 long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
 427
 428 long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
 429 {
 430     const float4 *vertices = (const float4*) vv;
 431     static const unsigned char indexTable[16] = {-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
 432     float4 dotmin = btAssign128( BT_INFINITY,  BT_INFINITY,  BT_INFINITY,  BT_INFINITY );
 433     float4 vvec = _mm_loadu_ps( vec );
 434     float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa ));          /// zzzz
 435     float4 vLo = _mm_movelh_ps( vvec, vvec );                               /// xyxy
 436
 437     long minIndex = -1L;
 438
 439     size_t segment = 0;
 440     float4 stack_array[ STACK_ARRAY_COUNT ];
 441
 442 #if DEBUG
 443     memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
 444 #endif
 445
 446     size_t index;
 447     float4 min;
 448     // Faster loop without cleanup code for full tiles
 449     for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
 450     {
 451         min = dotmin;
 452
 453         for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )
 454         { // do four dot products at a time. Carefully avoid touching the w element.
 455             float4 v0 = vertices[0];
 456             float4 v1 = vertices[1];
 457             float4 v2 = vertices[2];
 458             float4 v3 = vertices[3];            vertices += 4;
 459
 460             float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
 461             float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
 462             float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
 463             float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
 464
 465             lo0 = lo0*vLo;
 466             lo1 = lo1*vLo;
 467             float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
 468             float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
 469             float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
 470             z = z*vHi;
 471             x = x+y;
 472             x = x+z;
 473             stack_array[index] = x;
 474             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
 475
 476             v0 = vertices[0];
 477             v1 = vertices[1];
 478             v2 = vertices[2];
 479             v3 = vertices[3];            vertices += 4;
 480
 481             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
 482             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
 483             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
 484             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
 485
 486             lo0 = lo0*vLo;
 487             lo1 = lo1*vLo;
 488             z = _mm_shuffle_ps(hi0, hi1, 0x88);
 489             x = _mm_shuffle_ps(lo0, lo1, 0x88);
 490             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
 491             z = z*vHi;
 492             x = x+y;
 493             x = x+z;
 494             stack_array[index+1] = x;
 495             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
 496
 497             v0 = vertices[0];
 498             v1 = vertices[1];
 499             v2 = vertices[2];
 500             v3 = vertices[3];            vertices += 4;
 501
 502             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
 503             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
 504             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
 505             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
 506
 507             lo0 = lo0*vLo;
 508             lo1 = lo1*vLo;
 509             z = _mm_shuffle_ps(hi0, hi1, 0x88);
 510             x = _mm_shuffle_ps(lo0, lo1, 0x88);
 511             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
 512             z = z*vHi;
 513             x = x+y;
 514             x = x+z;
 515             stack_array[index+2] = x;
 516             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
 517
 518             v0 = vertices[0];
 519             v1 = vertices[1];
 520             v2 = vertices[2];
 521             v3 = vertices[3];            vertices += 4;
 522
 523             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
 524             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
 525             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
 526             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
 527
 528             lo0 = lo0*vLo;
 529             lo1 = lo1*vLo;
 530             z = _mm_shuffle_ps(hi0, hi1, 0x88);
 531             x = _mm_shuffle_ps(lo0, lo1, 0x88);
 532             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
 533             z = z*vHi;
 534             x = x+y;
 535             x = x+z;
 536             stack_array[index+3] = x;
 537             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
 538
 539             // It is too costly to keep the index of the min here. We will look for it again later.  We save a lot of work this way.
 540         }
 541
 542         // If we found a new min
 543         if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
 544         {
 545             // copy the new min across all lanes of our min accumulator
 546             min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
 547             min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
 548
 549             dotmin = min;
 550
 551             // find first occurrence of that min
 552             size_t test;
 553             for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ )   // local_count must be a multiple of 4
 554             {}
 555             // record where it is.
 556             minIndex = 4*index + segment + indexTable[test];
 557         }
 558     }
 559
 560     // account for work we've already done
 561     count -= segment;
 562
 563     // Deal with the last < STACK_ARRAY_COUNT vectors
 564     min = dotmin;
 565     index = 0;
 566
 567
 568     if(btUnlikely( count > 16) )
 569     {
 570         for( ; index + 4 <= count / 4; index+=4 )
 571         { // do four dot products at a time. Carefully avoid touching the w element.
 572             float4 v0 = vertices[0];
 573             float4 v1 = vertices[1];
 574             float4 v2 = vertices[2];
 575             float4 v3 = vertices[3];            vertices += 4;
 576
 577             float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
 578             float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
 579             float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
 580             float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
 581
 582             lo0 = lo0*vLo;
 583             lo1 = lo1*vLo;
 584             float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
 585             float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
 586             float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
 587             z = z*vHi;
 588             x = x+y;
 589             x = x+z;
 590             stack_array[index] = x;
 591             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
 592
 593             v0 = vertices[0];
 594             v1 = vertices[1];
 595             v2 = vertices[2];
 596             v3 = vertices[3];            vertices += 4;
 597
 598             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
 599             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
 600             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
 601             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
 602
 603             lo0 = lo0*vLo;
 604             lo1 = lo1*vLo;
 605             z = _mm_shuffle_ps(hi0, hi1, 0x88);
 606             x = _mm_shuffle_ps(lo0, lo1, 0x88);
 607             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
 608             z = z*vHi;
 609             x = x+y;
 610             x = x+z;
 611             stack_array[index+1] = x;
 612             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
 613
 614             v0 = vertices[0];
 615             v1 = vertices[1];
 616             v2 = vertices[2];
 617             v3 = vertices[3];            vertices += 4;
 618
 619             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
 620             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
 621             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
 622             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
 623
 624             lo0 = lo0*vLo;
 625             lo1 = lo1*vLo;
 626             z = _mm_shuffle_ps(hi0, hi1, 0x88);
 627             x = _mm_shuffle_ps(lo0, lo1, 0x88);
 628             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
 629             z = z*vHi;
 630             x = x+y;
 631             x = x+z;
 632             stack_array[index+2] = x;
 633             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
 634
 635             v0 = vertices[0];
 636             v1 = vertices[1];
 637             v2 = vertices[2];
 638             v3 = vertices[3];            vertices += 4;
 639
 640             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
 641             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
 642             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
 643             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
 644
 645             lo0 = lo0*vLo;
 646             lo1 = lo1*vLo;
 647             z = _mm_shuffle_ps(hi0, hi1, 0x88);
 648             x = _mm_shuffle_ps(lo0, lo1, 0x88);
 649             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
 650             z = z*vHi;
 651             x = x+y;
 652             x = x+z;
 653             stack_array[index+3] = x;
 654             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
 655
 656             // It is too costly to keep the index of the min here. We will look for it again later.  We save a lot of work this way.
 657         }
 658     }
 659
 660     size_t localCount = (count & -4L) - 4*index;
 661     if( localCount )
 662     {
 663
 664
 665 #ifdef __APPLE__
 666         vertices += localCount;      // counter the offset
 667         float4 t0, t1, t2, t3, t4;
 668         size_t byteIndex = -(localCount) * sizeof(float);
 669         float4 * sap = &stack_array[index + localCount / 4];
 670
 671         asm volatile
 672         (   ".align 4                                                                   \n\
 673              0: movaps  %[min], %[t2]                            // move min out of the way to avoid propagating NaNs in min \n\
 674              movaps  (%[vertices], %[byteIndex], 4),    %[t0]    // vertices[0]      \n\
 675              movaps  16(%[vertices], %[byteIndex], 4),  %[t1]    // vertices[1]      \n\
 676              movaps  %[t0], %[min]                               // vertices[0]      \n\
 677              movlhps %[t1], %[min]                               // x0y0x1y1         \n\
 678              movaps  32(%[vertices], %[byteIndex], 4),  %[t3]    // vertices[2]      \n\
 679              movaps  48(%[vertices], %[byteIndex], 4),  %[t4]    // vertices[3]      \n\
 680              mulps   %[vLo], %[min]                              // x0y0x1y1 * vLo   \n\
 681              movhlps %[t0], %[t1]                                // z0w0z1w1         \n\
 682              movaps  %[t3], %[t0]                                // vertices[2]      \n\
 683              movlhps %[t4], %[t0]                                // x2y2x3y3         \n\
 684              movhlps %[t3], %[t4]                                // z2w2z3w3         \n\
 685              mulps   %[vLo], %[t0]                               // x2y2x3y3 * vLo   \n\
 686              shufps  $0x88, %[t4], %[t1]                         // z0z1z2z3         \n\
 687              mulps   %[vHi], %[t1]                               // z0z1z2z3 * vHi   \n\
 688              movaps  %[min], %[t3]                               // x0y0x1y1 * vLo   \n\
 689              shufps  $0x88, %[t0], %[min]                        // x0x1x2x3 * vLo.x \n\
 690              shufps  $0xdd, %[t0], %[t3]                         // y0y1y2y3 * vLo.y \n\
 691              addps   %[t3], %[min]                               // x + y            \n\
 692              addps   %[t1], %[min]                               // x + y + z        \n\
 693              movaps  %[min], (%[sap], %[byteIndex])              // record result for later scrutiny \n\
 694              minps   %[t2], %[min]                               // record min, restore min   \n\
 695              add     $16, %[byteIndex]                           // advance loop counter\n\
 696              jnz     0b                                          \n\
 697              "
 698          : [min] "+x" (min), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
 699          : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
 700          : "memory", "cc"
 701          );
 702         index += localCount/4;
 703 #else
 704         {
 705             for( unsigned int i=0; i<localCount/4; i++,index++)
 706             { // do four dot products at a time. Carefully avoid touching the w element.
 707                 float4 v0 = vertices[0];
 708                 float4 v1 = vertices[1];
 709                 float4 v2 = vertices[2];
 710                 float4 v3 = vertices[3];
 711                 vertices += 4;
 712
 713                 float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
 714                 float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
 715                 float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
 716                 float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
 717
 718                 lo0 = lo0*vLo;
 719                 lo1 = lo1*vLo;
 720                 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
 721                 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
 722                 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
 723                 z = z*vHi;
 724                 x = x+y;
 725                 x = x+z;
 726                 stack_array[index] = x;
 727                 min = _mm_min_ps( x, min );         // control the order here so that max is never NaN even if x is nan
 728             }
 729         }
 730
 731 #endif
 732     }
 733
 734     // process the last few points
 735     if( count & 3 )
 736     {
 737         float4 v0, v1, v2, x, y, z;
 738         switch( count & 3 )
 739         {
 740             case 3:
 741             {
 742                 v0 = vertices[0];
 743                 v1 = vertices[1];
 744                 v2 = vertices[2];
 745
 746                 // Calculate 3 dot products, transpose, duplicate v2
 747                 float4 lo0 = _mm_movelh_ps( v0, v1);        // xyxy.lo
 748                 float4 hi0 = _mm_movehl_ps( v1, v0);        // z?z?.lo
 749                 lo0 = lo0*vLo;
 750                 z = _mm_shuffle_ps(hi0, v2,  0xa8 );           // z0z1z2z2
 751                 z = z*vHi;
 752                 float4 lo1 = _mm_movelh_ps(v2, v2);          // xyxy
 753                 lo1 = lo1*vLo;
 754                 x = _mm_shuffle_ps(lo0, lo1, 0x88);
 755                 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
 756             }
 757                 break;
 758             case 2:
 759             {
 760                 v0 = vertices[0];
 761                 v1 = vertices[1];
 762                 float4 xy = _mm_movelh_ps(v0, v1);
 763                 z = _mm_movehl_ps(v1, v0);
 764                 xy = xy*vLo;
 765                 z = _mm_shuffle_ps( z, z,  0xa8);
 766                 x = _mm_shuffle_ps( xy, xy, 0xa8);
 767                 y = _mm_shuffle_ps( xy, xy, 0xfd);
 768                 z = z*vHi;
 769             }
 770                 break;
 771             case 1:
 772             {
 773                 float4 xy = vertices[0];
 774                 z =  _mm_shuffle_ps( xy, xy, 0xaa);
 775                 xy = xy*vLo;
 776                 z = z*vHi;
 777                 x = _mm_shuffle_ps(xy, xy, 0);
 778                 y = _mm_shuffle_ps(xy, xy, 0x55);
 779             }
 780                 break;
 781         }
 782         x = x+y;
 783         x = x+z;
 784         stack_array[index] = x;
 785         min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
 786         index++;
 787     }
 788
 789     // if we found a new min.
 790     if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
 791     { // we found a new min. Search for it
 792       // find min across the min vector, place in all elements of min -- big latency hit here
 793         min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
 794         min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
 795
 796         // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
 797         // this where it actually makes a difference is handled in the early out at the top of the function,
 798         // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
 799         // complexity, and removed it.
 800
 801         dotmin = min;
 802
 803         // scan for the first occurence of min in the array
 804         size_t test;
 805         for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ )   // local_count must be a multiple of 4
 806         {}
 807         minIndex = 4*index + segment + indexTable[test];
 808     }
 809
 810     _mm_store_ss( dotResult, dotmin);
 811     return minIndex;
 812 }
 813
 814
 815 #elif defined BT_USE_NEON
 816 #define ARM_NEON_GCC_COMPATIBILITY  1
 817 #include <arm_neon.h>
 818
 819
 820 static long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
 821 static long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
 822 static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
 823 static long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
 824 static long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
 825 static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
 826
 827 long (*_maxdot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _maxdot_large_sel;
 828 long (*_mindot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _mindot_large_sel;
 829
 830 extern "C" {int  _get_cpu_capabilities( void );}
 831
 832 static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
 833 {
 834     if( _get_cpu_capabilities() & 0x2000 )
 835         _maxdot_large = _maxdot_large_v1;
 836     else
 837         _maxdot_large = _maxdot_large_v0;
 838
 839     return _maxdot_large(vv, vec, count, dotResult);
 840 }
 841
 842 static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
 843 {
 844     if( _get_cpu_capabilities() & 0x2000 )
 845         _mindot_large = _mindot_large_v1;
 846     else
 847         _mindot_large = _mindot_large_v0;
 848
 849     return _mindot_large(vv, vec, count, dotResult);
 850 }
 851
 852
 853
 854 #define vld1q_f32_aligned_postincrement( _ptr ) ({ float32x4_t _r; asm( "vld1.f32  {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; })
 855
 856
 857 long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
 858 {
 859     unsigned long i = 0;
 860     float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
 861     float32x2_t vLo = vget_low_f32(vvec);
 862     float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
 863     float32x2_t dotMaxLo = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
 864     float32x2_t dotMaxHi = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
 865     uint32x2_t indexLo = (uint32x2_t) {0, 1};
 866     uint32x2_t indexHi = (uint32x2_t) {2, 3};
 867     uint32x2_t iLo = (uint32x2_t) {-1, -1};
 868     uint32x2_t iHi = (uint32x2_t) {-1, -1};
 869     const uint32x2_t four = (uint32x2_t) {4,4};
 870
 871     for( ; i+8 <= count; i+= 8 )
 872     {
 873         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
 874         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
 875         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
 876         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
 877
 878         float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
 879         float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
 880         float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
 881         float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
 882
 883         float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
 884         float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
 885         float32x2_t zLo = vmul_f32( z0.val[0], vHi);
 886         float32x2_t zHi = vmul_f32( z1.val[0], vHi);
 887
 888         float32x2_t rLo = vpadd_f32( xy0, xy1);
 889         float32x2_t rHi = vpadd_f32( xy2, xy3);
 890         rLo = vadd_f32(rLo, zLo);
 891         rHi = vadd_f32(rHi, zHi);
 892
 893         uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
 894         uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
 895         dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
 896         dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
 897         iLo = vbsl_u32(maskLo, indexLo, iLo);
 898         iHi = vbsl_u32(maskHi, indexHi, iHi);
 899         indexLo = vadd_u32(indexLo, four);
 900         indexHi = vadd_u32(indexHi, four);
 901
 902         v0 = vld1q_f32_aligned_postincrement( vv );
 903         v1 = vld1q_f32_aligned_postincrement( vv );
 904         v2 = vld1q_f32_aligned_postincrement( vv );
 905         v3 = vld1q_f32_aligned_postincrement( vv );
 906
 907         xy0 = vmul_f32( vget_low_f32(v0), vLo);
 908         xy1 = vmul_f32( vget_low_f32(v1), vLo);
 909         xy2 = vmul_f32( vget_low_f32(v2), vLo);
 910         xy3 = vmul_f32( vget_low_f32(v3), vLo);
 911
 912         z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
 913         z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
 914         zLo = vmul_f32( z0.val[0], vHi);
 915         zHi = vmul_f32( z1.val[0], vHi);
 916
 917         rLo = vpadd_f32( xy0, xy1);
 918         rHi = vpadd_f32( xy2, xy3);
 919         rLo = vadd_f32(rLo, zLo);
 920         rHi = vadd_f32(rHi, zHi);
 921
 922         maskLo = vcgt_f32( rLo, dotMaxLo );
 923         maskHi = vcgt_f32( rHi, dotMaxHi );
 924         dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
 925         dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
 926         iLo = vbsl_u32(maskLo, indexLo, iLo);
 927         iHi = vbsl_u32(maskHi, indexHi, iHi);
 928         indexLo = vadd_u32(indexLo, four);
 929         indexHi = vadd_u32(indexHi, four);
 930     }
 931
 932     for( ; i+4 <= count; i+= 4 )
 933     {
 934         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
 935         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
 936         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
 937         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
 938
 939         float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
 940         float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
 941         float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
 942         float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
 943
 944         float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
 945         float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
 946         float32x2_t zLo = vmul_f32( z0.val[0], vHi);
 947         float32x2_t zHi = vmul_f32( z1.val[0], vHi);
 948
 949         float32x2_t rLo = vpadd_f32( xy0, xy1);
 950         float32x2_t rHi = vpadd_f32( xy2, xy3);
 951         rLo = vadd_f32(rLo, zLo);
 952         rHi = vadd_f32(rHi, zHi);
 953
 954         uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
 955         uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
 956         dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
 957         dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
 958         iLo = vbsl_u32(maskLo, indexLo, iLo);
 959         iHi = vbsl_u32(maskHi, indexHi, iHi);
 960         indexLo = vadd_u32(indexLo, four);
 961         indexHi = vadd_u32(indexHi, four);
 962     }
 963
 964     switch( count & 3 )
 965     {
 966         case 3:
 967         {
 968             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
 969             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
 970             float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
 971
 972             float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
 973             float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
 974             float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
 975
 976             float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
 977             float32x2_t zLo = vmul_f32( z0.val[0], vHi);
 978             float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
 979
 980             float32x2_t rLo = vpadd_f32( xy0, xy1);
 981             float32x2_t rHi = vpadd_f32( xy2, xy2);
 982             rLo = vadd_f32(rLo, zLo);
 983             rHi = vadd_f32(rHi, zHi);
 984
 985             uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
 986             uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
 987             dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
 988             dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
 989             iLo = vbsl_u32(maskLo, indexLo, iLo);
 990             iHi = vbsl_u32(maskHi, indexHi, iHi);
 991         }
 992             break;
 993         case 2:
 994         {
 995             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
 996             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
 997
 998             float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
 999             float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
1000
1001             float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1002             float32x2_t zLo = vmul_f32( z0.val[0], vHi);
1003
1004             float32x2_t rLo = vpadd_f32( xy0, xy1);
1005             rLo = vadd_f32(rLo, zLo);
1006
1007             uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
1008             dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
1009             iLo = vbsl_u32(maskLo, indexLo, iLo);
1010         }
1011             break;
1012         case 1:
1013         {
1014             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1015             float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1016             float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
1017             float32x2_t zLo = vmul_f32( z0, vHi);
1018             float32x2_t rLo = vpadd_f32( xy0, xy0);
1019             rLo = vadd_f32(rLo, zLo);
1020             uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
1021             dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
1022             iLo = vbsl_u32(maskLo, indexLo, iLo);
1023         }
1024             break;
1025
1026         default:
1027             break;
1028     }
1029
1030     // select best answer between hi and lo results
1031     uint32x2_t mask = vcgt_f32( dotMaxHi, dotMaxLo );
1032     dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
1033     iLo = vbsl_u32(mask, iHi, iLo);
1034
1035     // select best answer between even and odd results
1036     dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
1037     iHi = vdup_lane_u32(iLo, 1);
1038     mask = vcgt_f32( dotMaxHi, dotMaxLo );
1039     dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
1040     iLo = vbsl_u32(mask, iHi, iLo);
1041
1042     *dotResult = vget_lane_f32( dotMaxLo, 0);
1043     return vget_lane_u32(iLo, 0);
1044 }
1045
1046
1047 long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
1048 {
1049     float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
1050     float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
1051     float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
1052     const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
1053     uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
1054     uint32x4_t index = (uint32x4_t) { -1, -1, -1, -1 };
1055     float32x4_t maxDot = (float32x4_t) { -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY };
1056
1057     unsigned long i = 0;
1058     for( ; i + 8 <= count; i += 8 )
1059     {
1060         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1061         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1062         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1063         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
1064
1065         // the next two lines should resolve to a single vswp d, d
1066         float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1067         float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
1068         // the next two lines should resolve to a single vswp d, d
1069         float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1070         float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
1071
1072         xy0 = vmulq_f32(xy0, vLo);
1073         xy1 = vmulq_f32(xy1, vLo);
1074
1075         float32x4x2_t zb = vuzpq_f32( z0, z1);
1076         float32x4_t z = vmulq_f32( zb.val[0], vHi);
1077         float32x4x2_t xy = vuzpq_f32( xy0, xy1);
1078         float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1079         x = vaddq_f32(x, z);
1080
1081         uint32x4_t mask = vcgtq_f32(x, maxDot);
1082         maxDot = vbslq_f32( mask, x, maxDot);
1083         index = vbslq_u32(mask, local_index, index);
1084         local_index = vaddq_u32(local_index, four);
1085
1086         v0 = vld1q_f32_aligned_postincrement( vv );
1087         v1 = vld1q_f32_aligned_postincrement( vv );
1088         v2 = vld1q_f32_aligned_postincrement( vv );
1089         v3 = vld1q_f32_aligned_postincrement( vv );
1090
1091         // the next two lines should resolve to a single vswp d, d
1092         xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1093         xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
1094         // the next two lines should resolve to a single vswp d, d
1095         z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1096         z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
1097
1098         xy0 = vmulq_f32(xy0, vLo);
1099         xy1 = vmulq_f32(xy1, vLo);
1100
1101         zb = vuzpq_f32( z0, z1);
1102         z = vmulq_f32( zb.val[0], vHi);
1103         xy = vuzpq_f32( xy0, xy1);
1104         x = vaddq_f32(xy.val[0], xy.val[1]);
1105         x = vaddq_f32(x, z);
1106
1107         mask = vcgtq_f32(x, maxDot);
1108         maxDot = vbslq_f32( mask, x, maxDot);
1109         index = vbslq_u32(mask, local_index, index);
1110         local_index = vaddq_u32(local_index, four);
1111     }
1112
1113     for( ; i + 4 <= count; i += 4 )
1114     {
1115         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1116         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1117         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1118         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
1119
1120         // the next two lines should resolve to a single vswp d, d
1121         float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1122         float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
1123         // the next two lines should resolve to a single vswp d, d
1124         float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1125         float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
1126
1127         xy0 = vmulq_f32(xy0, vLo);
1128         xy1 = vmulq_f32(xy1, vLo);
1129
1130         float32x4x2_t zb = vuzpq_f32( z0, z1);
1131         float32x4_t z = vmulq_f32( zb.val[0], vHi);
1132         float32x4x2_t xy = vuzpq_f32( xy0, xy1);
1133         float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1134         x = vaddq_f32(x, z);
1135
1136         uint32x4_t mask = vcgtq_f32(x, maxDot);
1137         maxDot = vbslq_f32( mask, x, maxDot);
1138         index = vbslq_u32(mask, local_index, index);
1139         local_index = vaddq_u32(local_index, four);
1140     }
1141
1142     switch (count & 3) {
1143         case 3:
1144         {
1145             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1146             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1147             float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1148
1149             // the next two lines should resolve to a single vswp d, d
1150             float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1151             float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
1152             // the next two lines should resolve to a single vswp d, d
1153             float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1154             float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
1155
1156             xy0 = vmulq_f32(xy0, vLo);
1157             xy1 = vmulq_f32(xy1, vLo);
1158
1159             float32x4x2_t zb = vuzpq_f32( z0, z1);
1160             float32x4_t z = vmulq_f32( zb.val[0], vHi);
1161             float32x4x2_t xy = vuzpq_f32( xy0, xy1);
1162             float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1163             x = vaddq_f32(x, z);
1164
1165             uint32x4_t mask = vcgtq_f32(x, maxDot);
1166             maxDot = vbslq_f32( mask, x, maxDot);
1167             index = vbslq_u32(mask, local_index, index);
1168             local_index = vaddq_u32(local_index, four);
1169         }
1170             break;
1171
1172         case 2:
1173         {
1174             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1175             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1176
1177             // the next two lines should resolve to a single vswp d, d
1178             float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1179             // the next two lines should resolve to a single vswp d, d
1180             float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1181
1182             xy0 = vmulq_f32(xy0, vLo);
1183
1184             float32x4x2_t zb = vuzpq_f32( z0, z0);
1185             float32x4_t z = vmulq_f32( zb.val[0], vHi);
1186             float32x4x2_t xy = vuzpq_f32( xy0, xy0);
1187             float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1188             x = vaddq_f32(x, z);
1189
1190             uint32x4_t mask = vcgtq_f32(x, maxDot);
1191             maxDot = vbslq_f32( mask, x, maxDot);
1192             index = vbslq_u32(mask, local_index, index);
1193             local_index = vaddq_u32(local_index, four);
1194         }
1195             break;
1196
1197         case 1:
1198         {
1199             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1200
1201             // the next two lines should resolve to a single vswp d, d
1202             float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
1203             // the next two lines should resolve to a single vswp d, d
1204             float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
1205
1206             xy0 = vmulq_f32(xy0, vLo);
1207
1208             z = vmulq_f32( z, vHi);
1209             float32x4x2_t xy = vuzpq_f32( xy0, xy0);
1210             float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1211             x = vaddq_f32(x, z);
1212
1213             uint32x4_t mask = vcgtq_f32(x, maxDot);
1214             maxDot = vbslq_f32( mask, x, maxDot);
1215             index = vbslq_u32(mask, local_index, index);
1216             local_index = vaddq_u32(local_index, four);
1217         }
1218             break;
1219
1220         default:
1221             break;
1222     }
1223
1224
1225     // select best answer between hi and lo results
1226     uint32x2_t mask = vcgt_f32( vget_high_f32(maxDot), vget_low_f32(maxDot));
1227     float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
1228     uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
1229
1230     // select best answer between even and odd results
1231     float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
1232     uint32x2_t indexHi = vdup_lane_u32(index2, 1);
1233     mask = vcgt_f32( maxDotO, maxDot2 );
1234     maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
1235     index2 = vbsl_u32(mask, indexHi, index2);
1236
1237     *dotResult = vget_lane_f32( maxDot2, 0);
1238     return vget_lane_u32(index2, 0);
1239
1240 }
1241
1242 long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
1243 {
1244     unsigned long i = 0;
1245     float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
1246     float32x2_t vLo = vget_low_f32(vvec);
1247     float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
1248     float32x2_t dotMinLo = (float32x2_t) { BT_INFINITY, BT_INFINITY };
1249     float32x2_t dotMinHi = (float32x2_t) { BT_INFINITY, BT_INFINITY };
1250     uint32x2_t indexLo = (uint32x2_t) {0, 1};
1251     uint32x2_t indexHi = (uint32x2_t) {2, 3};
1252     uint32x2_t iLo = (uint32x2_t) {-1, -1};
1253     uint32x2_t iHi = (uint32x2_t) {-1, -1};
1254     const uint32x2_t four = (uint32x2_t) {4,4};
1255
1256     for( ; i+8 <= count; i+= 8 )
1257     {
1258         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1259         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1260         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1261         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
1262
1263         float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1264         float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
1265         float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
1266         float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
1267
1268         float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1269         float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
1270         float32x2_t zLo = vmul_f32( z0.val[0], vHi);
1271         float32x2_t zHi = vmul_f32( z1.val[0], vHi);
1272
1273         float32x2_t rLo = vpadd_f32( xy0, xy1);
1274         float32x2_t rHi = vpadd_f32( xy2, xy3);
1275         rLo = vadd_f32(rLo, zLo);
1276         rHi = vadd_f32(rHi, zHi);
1277
1278         uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
1279         uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
1280         dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
1281         dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
1282         iLo = vbsl_u32(maskLo, indexLo, iLo);
1283         iHi = vbsl_u32(maskHi, indexHi, iHi);
1284         indexLo = vadd_u32(indexLo, four);
1285         indexHi = vadd_u32(indexHi, four);
1286
1287         v0 = vld1q_f32_aligned_postincrement( vv );
1288         v1 = vld1q_f32_aligned_postincrement( vv );
1289         v2 = vld1q_f32_aligned_postincrement( vv );
1290         v3 = vld1q_f32_aligned_postincrement( vv );
1291
1292         xy0 = vmul_f32( vget_low_f32(v0), vLo);
1293         xy1 = vmul_f32( vget_low_f32(v1), vLo);
1294         xy2 = vmul_f32( vget_low_f32(v2), vLo);
1295         xy3 = vmul_f32( vget_low_f32(v3), vLo);
1296
1297         z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1298         z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
1299         zLo = vmul_f32( z0.val[0], vHi);
1300         zHi = vmul_f32( z1.val[0], vHi);
1301
1302         rLo = vpadd_f32( xy0, xy1);
1303         rHi = vpadd_f32( xy2, xy3);
1304         rLo = vadd_f32(rLo, zLo);
1305         rHi = vadd_f32(rHi, zHi);
1306
1307         maskLo = vclt_f32( rLo, dotMinLo );
1308         maskHi = vclt_f32( rHi, dotMinHi );
1309         dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
1310         dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
1311         iLo = vbsl_u32(maskLo, indexLo, iLo);
1312         iHi = vbsl_u32(maskHi, indexHi, iHi);
1313         indexLo = vadd_u32(indexLo, four);
1314         indexHi = vadd_u32(indexHi, four);
1315     }
1316
1317     for( ; i+4 <= count; i+= 4 )
1318     {
1319         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1320         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1321         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1322         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
1323
1324         float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1325         float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
1326         float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
1327         float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
1328
1329         float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1330         float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
1331         float32x2_t zLo = vmul_f32( z0.val[0], vHi);
1332         float32x2_t zHi = vmul_f32( z1.val[0], vHi);
1333
1334         float32x2_t rLo = vpadd_f32( xy0, xy1);
1335         float32x2_t rHi = vpadd_f32( xy2, xy3);
1336         rLo = vadd_f32(rLo, zLo);
1337         rHi = vadd_f32(rHi, zHi);
1338
1339         uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
1340         uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
1341         dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
1342         dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
1343         iLo = vbsl_u32(maskLo, indexLo, iLo);
1344         iHi = vbsl_u32(maskHi, indexHi, iHi);
1345         indexLo = vadd_u32(indexLo, four);
1346         indexHi = vadd_u32(indexHi, four);
1347     }
1348     switch( count & 3 )
1349     {
1350         case 3:
1351         {
1352             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1353             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1354             float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1355
1356             float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1357             float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
1358             float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
1359
1360             float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1361             float32x2_t zLo = vmul_f32( z0.val[0], vHi);
1362             float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
1363
1364             float32x2_t rLo = vpadd_f32( xy0, xy1);
1365             float32x2_t rHi = vpadd_f32( xy2, xy2);
1366             rLo = vadd_f32(rLo, zLo);
1367             rHi = vadd_f32(rHi, zHi);
1368
1369             uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
1370             uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
1371             dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
1372             dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
1373             iLo = vbsl_u32(maskLo, indexLo, iLo);
1374             iHi = vbsl_u32(maskHi, indexHi, iHi);
1375         }
1376             break;
1377         case 2:
1378         {
1379             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1380             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1381
1382             float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1383             float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
1384
1385             float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1386             float32x2_t zLo = vmul_f32( z0.val[0], vHi);
1387
1388             float32x2_t rLo = vpadd_f32( xy0, xy1);
1389             rLo = vadd_f32(rLo, zLo);
1390
1391             uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
1392             dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
1393             iLo = vbsl_u32(maskLo, indexLo, iLo);
1394         }
1395             break;
1396         case 1:
1397         {
1398             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1399             float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1400             float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
1401             float32x2_t zLo = vmul_f32( z0, vHi);
1402             float32x2_t rLo = vpadd_f32( xy0, xy0);
1403             rLo = vadd_f32(rLo, zLo);
1404             uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
1405             dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
1406             iLo = vbsl_u32(maskLo, indexLo, iLo);
1407         }
1408             break;
1409
1410         default:
1411             break;
1412     }
1413
1414     // select best answer between hi and lo results
1415     uint32x2_t mask = vclt_f32( dotMinHi, dotMinLo );
1416     dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
1417     iLo = vbsl_u32(mask, iHi, iLo);
1418
1419     // select best answer between even and odd results
1420     dotMinHi = vdup_lane_f32(dotMinLo, 1);
1421     iHi = vdup_lane_u32(iLo, 1);
1422     mask = vclt_f32( dotMinHi, dotMinLo );
1423     dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
1424     iLo = vbsl_u32(mask, iHi, iLo);
1425
1426     *dotResult = vget_lane_f32( dotMinLo, 0);
1427     return vget_lane_u32(iLo, 0);
1428 }
1429
1430 long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
1431 {
1432     float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
1433     float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
1434     float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
1435     const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
1436     uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
1437     uint32x4_t index = (uint32x4_t) { -1, -1, -1, -1 };
1438     float32x4_t minDot = (float32x4_t) { BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY };
1439
1440     unsigned long i = 0;
1441     for( ; i + 8 <= count; i += 8 )
1442     {
1443         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1444         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1445         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1446         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
1447
1448         // the next two lines should resolve to a single vswp d, d
1449         float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1450         float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
1451         // the next two lines should resolve to a single vswp d, d
1452         float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1453         float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
1454
1455         xy0 = vmulq_f32(xy0, vLo);
1456         xy1 = vmulq_f32(xy1, vLo);
1457
1458         float32x4x2_t zb = vuzpq_f32( z0, z1);
1459         float32x4_t z = vmulq_f32( zb.val[0], vHi);
1460         float32x4x2_t xy = vuzpq_f32( xy0, xy1);
1461         float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1462         x = vaddq_f32(x, z);
1463
1464         uint32x4_t mask = vcltq_f32(x, minDot);
1465         minDot = vbslq_f32( mask, x, minDot);
1466         index = vbslq_u32(mask, local_index, index);
1467         local_index = vaddq_u32(local_index, four);
1468
1469         v0 = vld1q_f32_aligned_postincrement( vv );
1470         v1 = vld1q_f32_aligned_postincrement( vv );
1471         v2 = vld1q_f32_aligned_postincrement( vv );
1472         v3 = vld1q_f32_aligned_postincrement( vv );
1473
1474         // the next two lines should resolve to a single vswp d, d
1475         xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1476         xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
1477         // the next two lines should resolve to a single vswp d, d
1478         z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1479         z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
1480
1481         xy0 = vmulq_f32(xy0, vLo);
1482         xy1 = vmulq_f32(xy1, vLo);
1483
1484         zb = vuzpq_f32( z0, z1);
1485         z = vmulq_f32( zb.val[0], vHi);
1486         xy = vuzpq_f32( xy0, xy1);
1487         x = vaddq_f32(xy.val[0], xy.val[1]);
1488         x = vaddq_f32(x, z);
1489
1490         mask = vcltq_f32(x, minDot);
1491         minDot = vbslq_f32( mask, x, minDot);
1492         index = vbslq_u32(mask, local_index, index);
1493         local_index = vaddq_u32(local_index, four);
1494     }
1495
1496     for( ; i + 4 <= count; i += 4 )
1497     {
1498         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1499         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1500         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1501         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
1502
1503         // the next two lines should resolve to a single vswp d, d
1504         float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1505         float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
1506         // the next two lines should resolve to a single vswp d, d
1507         float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1508         float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
1509
1510         xy0 = vmulq_f32(xy0, vLo);
1511         xy1 = vmulq_f32(xy1, vLo);
1512
1513         float32x4x2_t zb = vuzpq_f32( z0, z1);
1514         float32x4_t z = vmulq_f32( zb.val[0], vHi);
1515         float32x4x2_t xy = vuzpq_f32( xy0, xy1);
1516         float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1517         x = vaddq_f32(x, z);
1518
1519         uint32x4_t mask = vcltq_f32(x, minDot);
1520         minDot = vbslq_f32( mask, x, minDot);
1521         index = vbslq_u32(mask, local_index, index);
1522         local_index = vaddq_u32(local_index, four);
1523     }
1524
1525     switch (count & 3) {
1526         case 3:
1527         {
1528             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1529             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1530             float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1531
1532             // the next two lines should resolve to a single vswp d, d
1533             float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1534             float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
1535             // the next two lines should resolve to a single vswp d, d
1536             float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1537             float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
1538
1539             xy0 = vmulq_f32(xy0, vLo);
1540             xy1 = vmulq_f32(xy1, vLo);
1541
1542             float32x4x2_t zb = vuzpq_f32( z0, z1);
1543             float32x4_t z = vmulq_f32( zb.val[0], vHi);
1544             float32x4x2_t xy = vuzpq_f32( xy0, xy1);
1545             float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1546             x = vaddq_f32(x, z);
1547
1548             uint32x4_t mask = vcltq_f32(x, minDot);
1549             minDot = vbslq_f32( mask, x, minDot);
1550             index = vbslq_u32(mask, local_index, index);
1551             local_index = vaddq_u32(local_index, four);
1552         }
1553             break;
1554
1555         case 2:
1556         {
1557             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1558             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1559
1560             // the next two lines should resolve to a single vswp d, d
1561             float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1562             // the next two lines should resolve to a single vswp d, d
1563             float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1564
1565             xy0 = vmulq_f32(xy0, vLo);
1566
1567             float32x4x2_t zb = vuzpq_f32( z0, z0);
1568             float32x4_t z = vmulq_f32( zb.val[0], vHi);
1569             float32x4x2_t xy = vuzpq_f32( xy0, xy0);
1570             float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1571             x = vaddq_f32(x, z);
1572
1573             uint32x4_t mask = vcltq_f32(x, minDot);
1574             minDot = vbslq_f32( mask, x, minDot);
1575             index = vbslq_u32(mask, local_index, index);
1576             local_index = vaddq_u32(local_index, four);
1577         }
1578             break;
1579
1580         case 1:
1581         {
1582             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1583
1584             // the next two lines should resolve to a single vswp d, d
1585             float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
1586             // the next two lines should resolve to a single vswp d, d
1587             float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
1588
1589             xy0 = vmulq_f32(xy0, vLo);
1590
1591             z = vmulq_f32( z, vHi);
1592             float32x4x2_t xy = vuzpq_f32( xy0, xy0);
1593             float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1594             x = vaddq_f32(x, z);
1595
1596             uint32x4_t mask = vcltq_f32(x, minDot);
1597             minDot = vbslq_f32( mask, x, minDot);
1598             index = vbslq_u32(mask, local_index, index);
1599             local_index = vaddq_u32(local_index, four);
1600         }
1601             break;
1602
1603         default:
1604             break;
1605     }
1606
1607
1608     // select best answer between hi and lo results
1609     uint32x2_t mask = vclt_f32( vget_high_f32(minDot), vget_low_f32(minDot));
1610     float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
1611     uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
1612
1613     // select best answer between even and odd results
1614     float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
1615     uint32x2_t indexHi = vdup_lane_u32(index2, 1);
1616     mask = vclt_f32( minDotO, minDot2 );
1617     minDot2 = vbsl_f32(mask, minDotO, minDot2);
1618     index2 = vbsl_u32(mask, indexHi, index2);
1619
1620     *dotResult = vget_lane_f32( minDot2, 0);
1621     return vget_lane_u32(index2, 0);
1622
1623 }
1624
1625 #else
1626     #error Unhandled __APPLE__ arch
1627 #endif
1628
1629 #endif  /* __APPLE__ */
1630
1631