dali-physics/third-party/bullet3/src/Bullet3Common/b3Vector3.h

   1 /*
   2 Copyright (c) 2003-2013 Gino van den Bergen / Erwin Coumans  http://bulletphysics.org
   3
   4 This software is provided 'as-is', without any express or implied warranty.
   5 In no event will the authors be held liable for any damages arising from the use of this software.
   6 Permission is granted to anyone to use this software for any purpose,
   7 including commercial applications, and to alter it and redistribute it freely,
   8 subject to the following restrictions:
   9
  10 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
  11 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
  12 3. This notice may not be removed or altered from any source distribution.
  13 */
  14
  15 #ifndef B3_VECTOR3_H
  16 #define B3_VECTOR3_H
  17
  18 //#include <stdint.h>
  19 #include "b3Scalar.h"
  20 #include "b3MinMax.h"
  21 #include "b3AlignedAllocator.h"
  22
// b3Vector3Data names the struct taken by b3Vector3::serialize()/deSerialize(),
// and b3Vector3DataName is that struct's name as a string literal.
// Both are selected by B3_USE_DOUBLE_PRECISION.
#ifdef B3_USE_DOUBLE_PRECISION
#define b3Vector3Data b3Vector3DoubleData
#define b3Vector3DataName "b3Vector3DoubleData"
#else
#define b3Vector3Data b3Vector3FloatData
#define b3Vector3DataName "b3Vector3FloatData"
#endif  //B3_USE_DOUBLE_PRECISION
  30
// Helpers and constants used by the SSE code paths of b3Vector3.
#if defined B3_USE_SSE

//typedef  uint32_t __m128i __attribute__ ((vector_size(16)));

#ifdef _MSC_VER
#pragma warning(disable : 4556)  // value of intrinsic immediate argument '4294967239' is out of range '0 - 255'
#endif

// Packs four 2-bit lane indices into the 8-bit immediate of _mm_shuffle_ps:
// result lane 0 takes source lane x, lane 1 takes y, lane 2 takes z, lane 3 takes w.
// The final "& 0xff" discards any bits above the immediate's 8 bits.
#define B3_SHUFFLE(x, y, z, w) (((w) << 6 | (z) << 4 | (y) << 2 | (x)) & 0xff)
//#define b3_pshufd_ps( _a, _mask ) (__m128) _mm_shuffle_epi32((__m128i)(_a), (_mask) )
// Shuffles the lanes of a single register (the register is passed as both shuffle operands).
#define b3_pshufd_ps(_a, _mask) _mm_shuffle_ps((_a), (_a), (_mask))
// Copies lane _i into lanes 0..2 and keeps lane 3 in place.
#define b3_splat3_ps(_a, _i) b3_pshufd_ps((_a), B3_SHUFFLE(_i, _i, _i, 3))
// Copies lane _i into all four lanes.
#define b3_splat_ps(_a, _i) b3_pshufd_ps((_a), B3_SHUFFLE(_i, _i, _i, _i))

// Integer bit masks. _mm_set_epi32 lists lanes from highest (w) to lowest (x).
// b3v3AbsiMask: clears the sign bit of x, y, z and zeroes w.
#define b3v3AbsiMask (_mm_set_epi32(0x00000000, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF))
// b3vAbsMask: clears the sign bit of all four lanes.
#define b3vAbsMask (_mm_set_epi32(0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF))
// b3vFFF0Mask: keeps x, y, z unchanged and zeroes w.
#define b3vFFF0Mask (_mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))
// The same masks passed through b3CastiTo128f so they can be used with the _ps intrinsics.
#define b3v3AbsfMask b3CastiTo128f(b3v3AbsiMask)
#define b3vFFF0fMask b3CastiTo128f(b3vFFF0Mask)
#define b3vxyzMaskf b3vFFF0fMask
#define b3vAbsfMask b3CastiTo128f(b3vAbsMask)

// -0.0f in every lane; XOR-ing a vector with it flips the sign of each lane.
const __m128 B3_ATTRIBUTE_ALIGNED16(b3vMzeroMask) = {-0.0f, -0.0f, -0.0f, -0.0f};
// (1, 1, 1, 0)
const __m128 B3_ATTRIBUTE_ALIGNED16(b3v1110) = {1.0f, 1.0f, 1.0f, 0.0f};
// 0.5 and 1.5 in every lane; used by the Newton-Raphson step in b3Vector3::normalize().
const __m128 B3_ATTRIBUTE_ALIGNED16(b3vHalf) = {0.5f, 0.5f, 0.5f, 0.5f};
const __m128 B3_ATTRIBUTE_ALIGNED16(b3v1_5) = {1.5f, 1.5f, 1.5f, 1.5f};

#endif
  59
// Constants used by the NEON code paths of b3Vector3. Lanes are listed in x, y, z, w order.
#ifdef B3_USE_NEON

// -0.0f in every lane; XOR-ing a vector with it flips the sign of each lane.
const float32x4_t B3_ATTRIBUTE_ALIGNED16(b3vMzeroMask) = (float32x4_t){-0.0f, -0.0f, -0.0f, -0.0f};
// Keeps x, y, z unchanged and zeroes w.
const int32x4_t B3_ATTRIBUTE_ALIGNED16(b3vFFF0Mask) = (int32x4_t){0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0};
// Clears the sign bit of all four lanes.
const int32x4_t B3_ATTRIBUTE_ALIGNED16(b3vAbsMask) = (int32x4_t){0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
// Clears the sign bit of x, y, z and zeroes w.
const int32x4_t B3_ATTRIBUTE_ALIGNED16(b3v3AbsMask) = (int32x4_t){0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x0};

#endif
  68
// Forward declarations so that b3Vector3's inline members can call the factory functions below.
class b3Vector3;
class b3Vector4;

// Factories taking a SIMD register; only declared when the SSE API is enabled.
#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
//#if defined (B3_USE_SSE) || defined (B3_USE_NEON)
inline b3Vector3 b3MakeVector3(b3SimdFloat4 v);
inline b3Vector4 b3MakeVector4(b3SimdFloat4 vec);
#endif

// Factories taking individual components; these are only declarations, the definitions are not in this part of the file.
inline b3Vector3 b3MakeVector3(b3Scalar x, b3Scalar y, b3Scalar z);
inline b3Vector3 b3MakeVector3(b3Scalar x, b3Scalar y, b3Scalar z, b3Scalar w);
inline b3Vector4 b3MakeVector4(b3Scalar x, b3Scalar y, b3Scalar z, b3Scalar w);
  81
  82 /**@brief b3Vector3 can be used to represent 3D points and vectors.
  83  * It has an un-used w component to suit 16-byte alignment when b3Vector3 is stored in containers. This extra component can be used by derived classes (Quaternion?) or by user
  84  * Ideally, this class should be replaced by a platform optimized SIMD version that keeps the data in registers
  85  */
  86 B3_ATTRIBUTE_ALIGNED16(class)
  87 b3Vector3
  88 {
  89 public:
  90 #if defined(B3_USE_SSE) || defined(B3_USE_NEON)  // _WIN32 || ARM
  91         union {
  92                 b3SimdFloat4 mVec128;
  93                 float m_floats[4];
  94                 struct
  95                 {
  96                         float x, y, z, w;
  97                 };
  98         };
  99 #else
 100         union {
 101                 float m_floats[4];
 102                 struct
 103                 {
 104                         float x, y, z, w;
 105                 };
 106         };
 107 #endif
 108
 109 public:
 110         B3_DECLARE_ALIGNED_ALLOCATOR();
 111
 112 #if defined(B3_USE_SSE) || defined(B3_USE_NEON)  // _WIN32 || ARM
 113
 114         /*B3_FORCE_INLINE               b3Vector3()
 115         {
 116         }
 117         */
 118
 119         B3_FORCE_INLINE b3SimdFloat4 get128() const
 120         {
 121                 return mVec128;
 122         }
 123         B3_FORCE_INLINE void set128(b3SimdFloat4 v128)
 124         {
 125                 mVec128 = v128;
 126         }
 127 #endif
 128
 129 public:
 130         /**@brief Add a vector to this one
 131  * @param The vector to add to this one */
 132         B3_FORCE_INLINE b3Vector3& operator+=(const b3Vector3& v)
 133         {
 134 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 135                 mVec128 = _mm_add_ps(mVec128, v.mVec128);
 136 #elif defined(B3_USE_NEON)
 137                 mVec128 = vaddq_f32(mVec128, v.mVec128);
 138 #else
 139                 m_floats[0] += v.m_floats[0];
 140                 m_floats[1] += v.m_floats[1];
 141                 m_floats[2] += v.m_floats[2];
 142 #endif
 143                 return *this;
 144         }
 145
 146         /**@brief Subtract a vector from this one
 147    * @param The vector to subtract */
 148         B3_FORCE_INLINE b3Vector3& operator-=(const b3Vector3& v)
 149         {
 150 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 151                 mVec128 = _mm_sub_ps(mVec128, v.mVec128);
 152 #elif defined(B3_USE_NEON)
 153                 mVec128 = vsubq_f32(mVec128, v.mVec128);
 154 #else
 155                 m_floats[0] -= v.m_floats[0];
 156                 m_floats[1] -= v.m_floats[1];
 157                 m_floats[2] -= v.m_floats[2];
 158 #endif
 159                 return *this;
 160         }
 161
 162         /**@brief Scale the vector
 163    * @param s Scale factor */
 164         B3_FORCE_INLINE b3Vector3& operator*=(const b3Scalar& s)
 165         {
 166 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 167                 __m128 vs = _mm_load_ss(&s);  //        (S 0 0 0)
 168                 vs = b3_pshufd_ps(vs, 0x80);  //        (S S S 0.0)
 169                 mVec128 = _mm_mul_ps(mVec128, vs);
 170 #elif defined(B3_USE_NEON)
 171                 mVec128 = vmulq_n_f32(mVec128, s);
 172 #else
 173                 m_floats[0] *= s;
 174                 m_floats[1] *= s;
 175                 m_floats[2] *= s;
 176 #endif
 177                 return *this;
 178         }
 179
 180         /**@brief Inversely scale the vector
 181    * @param s Scale factor to divide by */
 182         B3_FORCE_INLINE b3Vector3& operator/=(const b3Scalar& s)
 183         {
 184                 b3FullAssert(s != b3Scalar(0.0));
 185
 186 #if 0  //defined(B3_USE_SSE_IN_API)
 187 // this code is not faster !
 188                 __m128 vs = _mm_load_ss(&s);
 189                 vs = _mm_div_ss(b3v1110, vs);
 190                 vs = b3_pshufd_ps(vs, 0x00);    //      (S S S S)
 191
 192                 mVec128 = _mm_mul_ps(mVec128, vs);
 193
 194                 return *this;
 195 #else
 196                 return *this *= b3Scalar(1.0) / s;
 197 #endif
 198         }
 199
 200         /**@brief Return the dot product
 201    * @param v The other vector in the dot product */
 202         B3_FORCE_INLINE b3Scalar dot(const b3Vector3& v) const
 203         {
 204 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 205                 __m128 vd = _mm_mul_ps(mVec128, v.mVec128);
 206                 __m128 z = _mm_movehl_ps(vd, vd);
 207                 __m128 y = _mm_shuffle_ps(vd, vd, 0x55);
 208                 vd = _mm_add_ss(vd, y);
 209                 vd = _mm_add_ss(vd, z);
 210                 return _mm_cvtss_f32(vd);
 211 #elif defined(B3_USE_NEON)
 212                 float32x4_t vd = vmulq_f32(mVec128, v.mVec128);
 213                 float32x2_t x = vpadd_f32(vget_low_f32(vd), vget_low_f32(vd));
 214                 x = vadd_f32(x, vget_high_f32(vd));
 215                 return vget_lane_f32(x, 0);
 216 #else
 217                 return m_floats[0] * v.m_floats[0] +
 218                            m_floats[1] * v.m_floats[1] +
 219                            m_floats[2] * v.m_floats[2];
 220 #endif
 221         }
 222
 223         /**@brief Return the length of the vector squared */
 224         B3_FORCE_INLINE b3Scalar length2() const
 225         {
 226                 return dot(*this);
 227         }
 228
 229         /**@brief Return the length of the vector */
 230         B3_FORCE_INLINE b3Scalar length() const
 231         {
 232                 return b3Sqrt(length2());
 233         }
 234
 235         /**@brief Return the distance squared between the ends of this and another vector
 236    * This is symantically treating the vector like a point */
 237         B3_FORCE_INLINE b3Scalar distance2(const b3Vector3& v) const;
 238
 239         /**@brief Return the distance between the ends of this and another vector
 240    * This is symantically treating the vector like a point */
 241         B3_FORCE_INLINE b3Scalar distance(const b3Vector3& v) const;
 242
 243         B3_FORCE_INLINE b3Vector3& safeNormalize()
 244         {
 245                 b3Scalar l2 = length2();
 246                 //triNormal.normalize();
 247                 if (l2 >= B3_EPSILON * B3_EPSILON)
 248                 {
 249                         (*this) /= b3Sqrt(l2);
 250                 }
 251                 else
 252                 {
 253                         setValue(1, 0, 0);
 254                 }
 255                 return *this;
 256         }
 257
 258         /**@brief Normalize this vector
 259    * x^2 + y^2 + z^2 = 1 */
 260         B3_FORCE_INLINE b3Vector3& normalize()
 261         {
 262 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 263                 // dot product first
 264                 __m128 vd = _mm_mul_ps(mVec128, mVec128);
 265                 __m128 z = _mm_movehl_ps(vd, vd);
 266                 __m128 y = _mm_shuffle_ps(vd, vd, 0x55);
 267                 vd = _mm_add_ss(vd, y);
 268                 vd = _mm_add_ss(vd, z);
 269
 270 #if 0
 271         vd = _mm_sqrt_ss(vd);
 272                 vd = _mm_div_ss(b3v1110, vd);
 273                 vd = b3_splat_ps(vd, 0x80);
 274                 mVec128 = _mm_mul_ps(mVec128, vd);
 275 #else
 276
 277                 // NR step 1/sqrt(x) - vd is x, y is output
 278                 y = _mm_rsqrt_ss(vd);  // estimate
 279
 280                 //  one step NR
 281                 z = b3v1_5;
 282                 vd = _mm_mul_ss(vd, b3vHalf);  // vd * 0.5
 283                 //x2 = vd;
 284                 vd = _mm_mul_ss(vd, y);  // vd * 0.5 * y0
 285                 vd = _mm_mul_ss(vd, y);  // vd * 0.5 * y0 * y0
 286                 z = _mm_sub_ss(z, vd);   // 1.5 - vd * 0.5 * y0 * y0
 287
 288                 y = _mm_mul_ss(y, z);  // y0 * (1.5 - vd * 0.5 * y0 * y0)
 289
 290                 y = b3_splat_ps(y, 0x80);
 291                 mVec128 = _mm_mul_ps(mVec128, y);
 292
 293 #endif
 294
 295                 return *this;
 296 #else
 297                 return *this /= length();
 298 #endif
 299         }
 300
 301         /**@brief Return a normalized version of this vector */
 302         B3_FORCE_INLINE b3Vector3 normalized() const;
 303
 304         /**@brief Return a rotated version of this vector
 305    * @param wAxis The axis to rotate about
 306    * @param angle The angle to rotate by */
 307         B3_FORCE_INLINE b3Vector3 rotate(const b3Vector3& wAxis, const b3Scalar angle) const;
 308
 309         /**@brief Return the angle between this and another vector
 310    * @param v The other vector */
 311         B3_FORCE_INLINE b3Scalar angle(const b3Vector3& v) const
 312         {
 313                 b3Scalar s = b3Sqrt(length2() * v.length2());
 314                 b3FullAssert(s != b3Scalar(0.0));
 315                 return b3Acos(dot(v) / s);
 316         }
 317
 318         /**@brief Return a vector will the absolute values of each element */
 319         B3_FORCE_INLINE b3Vector3 absolute() const
 320         {
 321 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 322                 return b3MakeVector3(_mm_and_ps(mVec128, b3v3AbsfMask));
 323 #elif defined(B3_USE_NEON)
 324                 return b3Vector3(vabsq_f32(mVec128));
 325 #else
 326                 return b3MakeVector3(
 327                         b3Fabs(m_floats[0]),
 328                         b3Fabs(m_floats[1]),
 329                         b3Fabs(m_floats[2]));
 330 #endif
 331         }
 332
 333         /**@brief Return the cross product between this and another vector
 334    * @param v The other vector */
 335         B3_FORCE_INLINE b3Vector3 cross(const b3Vector3& v) const
 336         {
 337 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 338                 __m128 T, V;
 339
 340                 T = b3_pshufd_ps(mVec128, B3_SHUFFLE(1, 2, 0, 3));    //        (Y Z X 0)
 341                 V = b3_pshufd_ps(v.mVec128, B3_SHUFFLE(1, 2, 0, 3));  //        (Y Z X 0)
 342
 343                 V = _mm_mul_ps(V, mVec128);
 344                 T = _mm_mul_ps(T, v.mVec128);
 345                 V = _mm_sub_ps(V, T);
 346
 347                 V = b3_pshufd_ps(V, B3_SHUFFLE(1, 2, 0, 3));
 348                 return b3MakeVector3(V);
 349 #elif defined(B3_USE_NEON)
 350                 float32x4_t T, V;
 351                 // form (Y, Z, X, _) of mVec128 and v.mVec128
 352                 float32x2_t Tlow = vget_low_f32(mVec128);
 353                 float32x2_t Vlow = vget_low_f32(v.mVec128);
 354                 T = vcombine_f32(vext_f32(Tlow, vget_high_f32(mVec128), 1), Tlow);
 355                 V = vcombine_f32(vext_f32(Vlow, vget_high_f32(v.mVec128), 1), Vlow);
 356
 357                 V = vmulq_f32(V, mVec128);
 358                 T = vmulq_f32(T, v.mVec128);
 359                 V = vsubq_f32(V, T);
 360                 Vlow = vget_low_f32(V);
 361                 // form (Y, Z, X, _);
 362                 V = vcombine_f32(vext_f32(Vlow, vget_high_f32(V), 1), Vlow);
 363                 V = (float32x4_t)vandq_s32((int32x4_t)V, b3vFFF0Mask);
 364
 365                 return b3Vector3(V);
 366 #else
 367                 return b3MakeVector3(
 368                         m_floats[1] * v.m_floats[2] - m_floats[2] * v.m_floats[1],
 369                         m_floats[2] * v.m_floats[0] - m_floats[0] * v.m_floats[2],
 370                         m_floats[0] * v.m_floats[1] - m_floats[1] * v.m_floats[0]);
 371 #endif
 372         }
 373
 374         B3_FORCE_INLINE b3Scalar triple(const b3Vector3& v1, const b3Vector3& v2) const
 375         {
 376 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 377                 // cross:
 378                 __m128 T = _mm_shuffle_ps(v1.mVec128, v1.mVec128, B3_SHUFFLE(1, 2, 0, 3));  //  (Y Z X 0)
 379                 __m128 V = _mm_shuffle_ps(v2.mVec128, v2.mVec128, B3_SHUFFLE(1, 2, 0, 3));  //  (Y Z X 0)
 380
 381                 V = _mm_mul_ps(V, v1.mVec128);
 382                 T = _mm_mul_ps(T, v2.mVec128);
 383                 V = _mm_sub_ps(V, T);
 384
 385                 V = _mm_shuffle_ps(V, V, B3_SHUFFLE(1, 2, 0, 3));
 386
 387                 // dot:
 388                 V = _mm_mul_ps(V, mVec128);
 389                 __m128 z = _mm_movehl_ps(V, V);
 390                 __m128 y = _mm_shuffle_ps(V, V, 0x55);
 391                 V = _mm_add_ss(V, y);
 392                 V = _mm_add_ss(V, z);
 393                 return _mm_cvtss_f32(V);
 394
 395 #elif defined(B3_USE_NEON)
 396                 // cross:
 397                 float32x4_t T, V;
 398                 // form (Y, Z, X, _) of mVec128 and v.mVec128
 399                 float32x2_t Tlow = vget_low_f32(v1.mVec128);
 400                 float32x2_t Vlow = vget_low_f32(v2.mVec128);
 401                 T = vcombine_f32(vext_f32(Tlow, vget_high_f32(v1.mVec128), 1), Tlow);
 402                 V = vcombine_f32(vext_f32(Vlow, vget_high_f32(v2.mVec128), 1), Vlow);
 403
 404                 V = vmulq_f32(V, v1.mVec128);
 405                 T = vmulq_f32(T, v2.mVec128);
 406                 V = vsubq_f32(V, T);
 407                 Vlow = vget_low_f32(V);
 408                 // form (Y, Z, X, _);
 409                 V = vcombine_f32(vext_f32(Vlow, vget_high_f32(V), 1), Vlow);
 410
 411                 // dot:
 412                 V = vmulq_f32(mVec128, V);
 413                 float32x2_t x = vpadd_f32(vget_low_f32(V), vget_low_f32(V));
 414                 x = vadd_f32(x, vget_high_f32(V));
 415                 return vget_lane_f32(x, 0);
 416 #else
 417                 return m_floats[0] * (v1.m_floats[1] * v2.m_floats[2] - v1.m_floats[2] * v2.m_floats[1]) +
 418                            m_floats[1] * (v1.m_floats[2] * v2.m_floats[0] - v1.m_floats[0] * v2.m_floats[2]) +
 419                            m_floats[2] * (v1.m_floats[0] * v2.m_floats[1] - v1.m_floats[1] * v2.m_floats[0]);
 420 #endif
 421         }
 422
 423         /**@brief Return the axis with the smallest value
 424    * Note return values are 0,1,2 for x, y, or z */
 425         B3_FORCE_INLINE int minAxis() const
 426         {
 427                 return m_floats[0] < m_floats[1] ? (m_floats[0] < m_floats[2] ? 0 : 2) : (m_floats[1] < m_floats[2] ? 1 : 2);
 428         }
 429
 430         /**@brief Return the axis with the largest value
 431    * Note return values are 0,1,2 for x, y, or z */
 432         B3_FORCE_INLINE int maxAxis() const
 433         {
 434                 return m_floats[0] < m_floats[1] ? (m_floats[1] < m_floats[2] ? 2 : 1) : (m_floats[0] < m_floats[2] ? 2 : 0);
 435         }
 436
 437         B3_FORCE_INLINE int furthestAxis() const
 438         {
 439                 return absolute().minAxis();
 440         }
 441
 442         B3_FORCE_INLINE int closestAxis() const
 443         {
 444                 return absolute().maxAxis();
 445         }
 446
 447         B3_FORCE_INLINE void setInterpolate3(const b3Vector3& v0, const b3Vector3& v1, b3Scalar rt)
 448         {
 449 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 450                 __m128 vrt = _mm_load_ss(&rt);  //      (rt 0 0 0)
 451                 b3Scalar s = b3Scalar(1.0) - rt;
 452                 __m128 vs = _mm_load_ss(&s);  //        (S 0 0 0)
 453                 vs = b3_pshufd_ps(vs, 0x80);  //        (S S S 0.0)
 454                 __m128 r0 = _mm_mul_ps(v0.mVec128, vs);
 455                 vrt = b3_pshufd_ps(vrt, 0x80);  //      (rt rt rt 0.0)
 456                 __m128 r1 = _mm_mul_ps(v1.mVec128, vrt);
 457                 __m128 tmp3 = _mm_add_ps(r0, r1);
 458                 mVec128 = tmp3;
 459 #elif defined(B3_USE_NEON)
 460                 float32x4_t vl = vsubq_f32(v1.mVec128, v0.mVec128);
 461                 vl = vmulq_n_f32(vl, rt);
 462                 mVec128 = vaddq_f32(vl, v0.mVec128);
 463 #else
 464                 b3Scalar s = b3Scalar(1.0) - rt;
 465                 m_floats[0] = s * v0.m_floats[0] + rt * v1.m_floats[0];
 466                 m_floats[1] = s * v0.m_floats[1] + rt * v1.m_floats[1];
 467                 m_floats[2] = s * v0.m_floats[2] + rt * v1.m_floats[2];
 468                 //don't do the unused w component
 469                 //              m_co[3] = s * v0[3] + rt * v1[3];
 470 #endif
 471         }
 472
 473         /**@brief Return the linear interpolation between this and another vector
 474    * @param v The other vector
 475    * @param t The ration of this to v (t = 0 => return this, t=1 => return other) */
 476         B3_FORCE_INLINE b3Vector3 lerp(const b3Vector3& v, const b3Scalar& t) const
 477         {
 478 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 479                 __m128 vt = _mm_load_ss(&t);  //        (t 0 0 0)
 480                 vt = b3_pshufd_ps(vt, 0x80);  //        (rt rt rt 0.0)
 481                 __m128 vl = _mm_sub_ps(v.mVec128, mVec128);
 482                 vl = _mm_mul_ps(vl, vt);
 483                 vl = _mm_add_ps(vl, mVec128);
 484
 485                 return b3MakeVector3(vl);
 486 #elif defined(B3_USE_NEON)
 487                 float32x4_t vl = vsubq_f32(v.mVec128, mVec128);
 488                 vl = vmulq_n_f32(vl, t);
 489                 vl = vaddq_f32(vl, mVec128);
 490
 491                 return b3Vector3(vl);
 492 #else
 493                 return b3MakeVector3(m_floats[0] + (v.m_floats[0] - m_floats[0]) * t,
 494                                                          m_floats[1] + (v.m_floats[1] - m_floats[1]) * t,
 495                                                          m_floats[2] + (v.m_floats[2] - m_floats[2]) * t);
 496 #endif
 497         }
 498
 499         /**@brief Elementwise multiply this vector by the other
 500    * @param v The other vector */
 501         B3_FORCE_INLINE b3Vector3& operator*=(const b3Vector3& v)
 502         {
 503 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 504                 mVec128 = _mm_mul_ps(mVec128, v.mVec128);
 505 #elif defined(B3_USE_NEON)
 506                 mVec128 = vmulq_f32(mVec128, v.mVec128);
 507 #else
 508                 m_floats[0] *= v.m_floats[0];
 509                 m_floats[1] *= v.m_floats[1];
 510                 m_floats[2] *= v.m_floats[2];
 511 #endif
 512                 return *this;
 513         }
 514
 515         /**@brief Return the x value */
 516         B3_FORCE_INLINE const b3Scalar& getX() const { return m_floats[0]; }
 517         /**@brief Return the y value */
 518         B3_FORCE_INLINE const b3Scalar& getY() const { return m_floats[1]; }
 519         /**@brief Return the z value */
 520         B3_FORCE_INLINE const b3Scalar& getZ() const { return m_floats[2]; }
 521         /**@brief Return the w value */
 522         B3_FORCE_INLINE const b3Scalar& getW() const { return m_floats[3]; }
 523
 524         /**@brief Set the x value */
 525         B3_FORCE_INLINE void setX(b3Scalar _x) { m_floats[0] = _x; };
 526         /**@brief Set the y value */
 527         B3_FORCE_INLINE void setY(b3Scalar _y) { m_floats[1] = _y; };
 528         /**@brief Set the z value */
 529         B3_FORCE_INLINE void setZ(b3Scalar _z) { m_floats[2] = _z; };
 530         /**@brief Set the w value */
 531         B3_FORCE_INLINE void setW(b3Scalar _w) { m_floats[3] = _w; };
 532
 533         //B3_FORCE_INLINE b3Scalar&       operator[](int i)       { return (&m_floats[0])[i];   }
 534         //B3_FORCE_INLINE const b3Scalar& operator[](int i) const { return (&m_floats[0])[i]; }
 535         ///operator b3Scalar*() replaces operator[], using implicit conversion. We added operator != and operator == to avoid pointer comparisons.
 536         B3_FORCE_INLINE operator b3Scalar*() { return &m_floats[0]; }
 537         B3_FORCE_INLINE operator const b3Scalar*() const { return &m_floats[0]; }
 538
 539         B3_FORCE_INLINE bool operator==(const b3Vector3& other) const
 540         {
 541 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 542                 return (0xf == _mm_movemask_ps((__m128)_mm_cmpeq_ps(mVec128, other.mVec128)));
 543 #else
 544                 return ((m_floats[3] == other.m_floats[3]) &&
 545                                 (m_floats[2] == other.m_floats[2]) &&
 546                                 (m_floats[1] == other.m_floats[1]) &&
 547                                 (m_floats[0] == other.m_floats[0]));
 548 #endif
 549         }
 550
 551         B3_FORCE_INLINE bool operator!=(const b3Vector3& other) const
 552         {
 553                 return !(*this == other);
 554         }
 555
 556         /**@brief Set each element to the max of the current values and the values of another b3Vector3
 557    * @param other The other b3Vector3 to compare with
 558    */
 559         B3_FORCE_INLINE void setMax(const b3Vector3& other)
 560         {
 561 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 562                 mVec128 = _mm_max_ps(mVec128, other.mVec128);
 563 #elif defined(B3_USE_NEON)
 564                 mVec128 = vmaxq_f32(mVec128, other.mVec128);
 565 #else
 566                 b3SetMax(m_floats[0], other.m_floats[0]);
 567                 b3SetMax(m_floats[1], other.m_floats[1]);
 568                 b3SetMax(m_floats[2], other.m_floats[2]);
 569                 b3SetMax(m_floats[3], other.m_floats[3]);
 570 #endif
 571         }
 572
 573         /**@brief Set each element to the min of the current values and the values of another b3Vector3
 574    * @param other The other b3Vector3 to compare with
 575    */
 576         B3_FORCE_INLINE void setMin(const b3Vector3& other)
 577         {
 578 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 579                 mVec128 = _mm_min_ps(mVec128, other.mVec128);
 580 #elif defined(B3_USE_NEON)
 581                 mVec128 = vminq_f32(mVec128, other.mVec128);
 582 #else
 583                 b3SetMin(m_floats[0], other.m_floats[0]);
 584                 b3SetMin(m_floats[1], other.m_floats[1]);
 585                 b3SetMin(m_floats[2], other.m_floats[2]);
 586                 b3SetMin(m_floats[3], other.m_floats[3]);
 587 #endif
 588         }
 589
 590         B3_FORCE_INLINE void setValue(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z)
 591         {
 592                 m_floats[0] = _x;
 593                 m_floats[1] = _y;
 594                 m_floats[2] = _z;
 595                 m_floats[3] = b3Scalar(0.f);
 596         }
 597
 598         void getSkewSymmetricMatrix(b3Vector3 * v0, b3Vector3 * v1, b3Vector3 * v2) const
 599         {
 600 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 601
 602                 __m128 V = _mm_and_ps(mVec128, b3vFFF0fMask);
 603                 __m128 V0 = _mm_xor_ps(b3vMzeroMask, V);
 604                 __m128 V2 = _mm_movelh_ps(V0, V);
 605
 606                 __m128 V1 = _mm_shuffle_ps(V, V0, 0xCE);
 607
 608                 V0 = _mm_shuffle_ps(V0, V, 0xDB);
 609                 V2 = _mm_shuffle_ps(V2, V, 0xF9);
 610
 611                 v0->mVec128 = V0;
 612                 v1->mVec128 = V1;
 613                 v2->mVec128 = V2;
 614 #else
 615                 v0->setValue(0., -getZ(), getY());
 616                 v1->setValue(getZ(), 0., -getX());
 617                 v2->setValue(-getY(), getX(), 0.);
 618 #endif
 619         }
 620
 621         void setZero()
 622         {
 623 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 624                 mVec128 = (__m128)_mm_xor_ps(mVec128, mVec128);
 625 #elif defined(B3_USE_NEON)
 626                 int32x4_t vi = vdupq_n_s32(0);
 627                 mVec128 = vreinterpretq_f32_s32(vi);
 628 #else
 629                 setValue(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.));
 630 #endif
 631         }
 632
 633         B3_FORCE_INLINE bool isZero() const
 634         {
 635                 return m_floats[0] == b3Scalar(0) && m_floats[1] == b3Scalar(0) && m_floats[2] == b3Scalar(0);
 636         }
 637
 638         B3_FORCE_INLINE bool fuzzyZero() const
 639         {
 640                 return length2() < B3_EPSILON;
 641         }
 642
 643         B3_FORCE_INLINE void serialize(struct b3Vector3Data & dataOut) const;
 644
 645         B3_FORCE_INLINE void deSerialize(const struct b3Vector3Data& dataIn);
 646
 647         B3_FORCE_INLINE void serializeFloat(struct b3Vector3FloatData & dataOut) const;
 648
 649         B3_FORCE_INLINE void deSerializeFloat(const struct b3Vector3FloatData& dataIn);
 650
 651         B3_FORCE_INLINE void serializeDouble(struct b3Vector3DoubleData & dataOut) const;
 652
 653         B3_FORCE_INLINE void deSerializeDouble(const struct b3Vector3DoubleData& dataIn);
 654
 655         /**@brief returns index of maximum dot product between this and vectors in array[]
 656          * @param array The other vectors
 657          * @param array_count The number of other vectors
 658          * @param dotOut The maximum dot product */
 659         B3_FORCE_INLINE long maxDot(const b3Vector3* array, long array_count, b3Scalar& dotOut) const;
 660
 661         /**@brief returns index of minimum dot product between this and vectors in array[]
 662          * @param array The other vectors
 663          * @param array_count The number of other vectors
 664          * @param dotOut The minimum dot product */
 665         B3_FORCE_INLINE long minDot(const b3Vector3* array, long array_count, b3Scalar& dotOut) const;
 666
 667         /* create a vector as  b3Vector3( this->dot( b3Vector3 v0 ), this->dot( b3Vector3 v1), this->dot( b3Vector3 v2 ))  */
 668         B3_FORCE_INLINE b3Vector3 dot3(const b3Vector3& v0, const b3Vector3& v1, const b3Vector3& v2) const
 669         {
 670 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 671
 672                 __m128 a0 = _mm_mul_ps(v0.mVec128, this->mVec128);
 673                 __m128 a1 = _mm_mul_ps(v1.mVec128, this->mVec128);
 674                 __m128 a2 = _mm_mul_ps(v2.mVec128, this->mVec128);
 675                 __m128 b0 = _mm_unpacklo_ps(a0, a1);
 676                 __m128 b1 = _mm_unpackhi_ps(a0, a1);
 677                 __m128 b2 = _mm_unpacklo_ps(a2, _mm_setzero_ps());
 678                 __m128 r = _mm_movelh_ps(b0, b2);
 679                 r = _mm_add_ps(r, _mm_movehl_ps(b2, b0));
 680                 a2 = _mm_and_ps(a2, b3vxyzMaskf);
 681                 r = _mm_add_ps(r, b3CastdTo128f(_mm_move_sd(b3CastfTo128d(a2), b3CastfTo128d(b1))));
 682                 return b3MakeVector3(r);
 683
 684 #elif defined(B3_USE_NEON)
 685                 static const uint32x4_t xyzMask = (const uint32x4_t){-1, -1, -1, 0};
 686                 float32x4_t a0 = vmulq_f32(v0.mVec128, this->mVec128);
 687                 float32x4_t a1 = vmulq_f32(v1.mVec128, this->mVec128);
 688                 float32x4_t a2 = vmulq_f32(v2.mVec128, this->mVec128);
 689                 float32x2x2_t zLo = vtrn_f32(vget_high_f32(a0), vget_high_f32(a1));
 690                 a2 = (float32x4_t)vandq_u32((uint32x4_t)a2, xyzMask);
 691                 float32x2_t b0 = vadd_f32(vpadd_f32(vget_low_f32(a0), vget_low_f32(a1)), zLo.val[0]);
 692                 float32x2_t b1 = vpadd_f32(vpadd_f32(vget_low_f32(a2), vget_high_f32(a2)), vdup_n_f32(0.0f));
 693                 return b3Vector3(vcombine_f32(b0, b1));
 694 #else
 695                 return b3MakeVector3(dot(v0), dot(v1), dot(v2));
 696 #endif
 697         }
 698 };
 699
 700 /**@brief Return the sum of two vectors (Point symantics)*/
 701 B3_FORCE_INLINE b3Vector3
 702 operator+(const b3Vector3& v1, const b3Vector3& v2)
 703 {
 704 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 705         return b3MakeVector3(_mm_add_ps(v1.mVec128, v2.mVec128));
 706 #elif defined(B3_USE_NEON)
 707         return b3MakeVector3(vaddq_f32(v1.mVec128, v2.mVec128));
 708 #else
 709         return b3MakeVector3(
 710                 v1.m_floats[0] + v2.m_floats[0],
 711                 v1.m_floats[1] + v2.m_floats[1],
 712                 v1.m_floats[2] + v2.m_floats[2]);
 713 #endif
 714 }
 715
 716 /**@brief Return the elementwise product of two vectors */
 717 B3_FORCE_INLINE b3Vector3
 718 operator*(const b3Vector3& v1, const b3Vector3& v2)
 719 {
 720 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 721         return b3MakeVector3(_mm_mul_ps(v1.mVec128, v2.mVec128));
 722 #elif defined(B3_USE_NEON)
 723         return b3MakeVector3(vmulq_f32(v1.mVec128, v2.mVec128));
 724 #else
 725         return b3MakeVector3(
 726                 v1.m_floats[0] * v2.m_floats[0],
 727                 v1.m_floats[1] * v2.m_floats[1],
 728                 v1.m_floats[2] * v2.m_floats[2]);
 729 #endif
 730 }
 731
 732 /**@brief Return the difference between two vectors */
 733 B3_FORCE_INLINE b3Vector3
 734 operator-(const b3Vector3& v1, const b3Vector3& v2)
 735 {
 736 #if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
 737
 738         //      without _mm_and_ps this code causes slowdown in Concave moving
 739         __m128 r = _mm_sub_ps(v1.mVec128, v2.mVec128);
 740         return b3MakeVector3(_mm_and_ps(r, b3vFFF0fMask));
 741 #elif defined(B3_USE_NEON)
 742         float32x4_t r = vsubq_f32(v1.mVec128, v2.mVec128);
 743         return b3MakeVector3((float32x4_t)vandq_s32((int32x4_t)r, b3vFFF0Mask));
 744 #else
 745         return b3MakeVector3(
 746                 v1.m_floats[0] - v2.m_floats[0],
 747                 v1.m_floats[1] - v2.m_floats[1],
 748                 v1.m_floats[2] - v2.m_floats[2]);
 749 #endif
 750 }
 751
 752 /**@brief Return the negative of the vector */
 753 B3_FORCE_INLINE b3Vector3
 754 operator-(const b3Vector3& v)
 755 {
 756 #if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
 757         __m128 r = _mm_xor_ps(v.mVec128, b3vMzeroMask);
 758         return b3MakeVector3(_mm_and_ps(r, b3vFFF0fMask));
 759 #elif defined(B3_USE_NEON)
 760         return b3MakeVector3((b3SimdFloat4)veorq_s32((int32x4_t)v.mVec128, (int32x4_t)b3vMzeroMask));
 761 #else
 762         return b3MakeVector3(-v.m_floats[0], -v.m_floats[1], -v.m_floats[2]);
 763 #endif
 764 }
 765
 766 /**@brief Return the vector scaled by s */
 767 B3_FORCE_INLINE b3Vector3
 768 operator*(const b3Vector3& v, const b3Scalar& s)
 769 {
 770 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 771         __m128 vs = _mm_load_ss(&s);  //        (S 0 0 0)
 772         vs = b3_pshufd_ps(vs, 0x80);  //        (S S S 0.0)
 773         return b3MakeVector3(_mm_mul_ps(v.mVec128, vs));
 774 #elif defined(B3_USE_NEON)
 775         float32x4_t r = vmulq_n_f32(v.mVec128, s);
 776         return b3MakeVector3((float32x4_t)vandq_s32((int32x4_t)r, b3vFFF0Mask));
 777 #else
 778         return b3MakeVector3(v.m_floats[0] * s, v.m_floats[1] * s, v.m_floats[2] * s);
 779 #endif
 780 }
 781
 782 /**@brief Return the vector scaled by s */
 783 B3_FORCE_INLINE b3Vector3
 784 operator*(const b3Scalar& s, const b3Vector3& v)
 785 {
 786         return v * s;
 787 }
 788
 789 /**@brief Return the vector inversely scaled by s */
 790 B3_FORCE_INLINE b3Vector3
 791 operator/(const b3Vector3& v, const b3Scalar& s)
 792 {
 793         b3FullAssert(s != b3Scalar(0.0));
 794 #if 0  //defined(B3_USE_SSE_IN_API)
 795 // this code is not faster !
 796         __m128 vs = _mm_load_ss(&s);
 797     vs = _mm_div_ss(b3v1110, vs);
 798         vs = b3_pshufd_ps(vs, 0x00);    //      (S S S S)
 799
 800         return b3Vector3(_mm_mul_ps(v.mVec128, vs));
 801 #else
 802         return v * (b3Scalar(1.0) / s);
 803 #endif
 804 }
 805
/**@brief Return the component-wise quotient v1 / v2
 *
 * Each component of v1 is divided by the matching component of v2.
 * No check for zero components in v2 is performed here. */
B3_FORCE_INLINE b3Vector3
operator/(const b3Vector3& v1, const b3Vector3& v2)
{
#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
	// Divide all four lanes, then AND the result with b3vFFF0fMask.
	__m128 vec = _mm_div_ps(v1.mVec128, v2.mVec128);
	vec = _mm_and_ps(vec, b3vFFF0fMask);
	return b3MakeVector3(vec);
#elif defined(B3_USE_NEON)
	// NEON path: start from a reciprocal estimate of y and refine it with
	// vrecpsq_f32 correction steps, folding the multiplication by x into the
	// last step. The result is approximate, not an exact IEEE division.
	// NOTE(review): unlike the SSE path, no mask is applied to the result here.
	float32x4_t x, y, v, m;

	x = v1.mVec128;
	y = v2.mVec128;

	v = vrecpeq_f32(y);     // v ~ 1/y
	m = vrecpsq_f32(y, v);  // m = (2-v*y)
	v = vmulq_f32(v, m);    // vv = v*m ~~ 1/y
	m = vrecpsq_f32(y, v);  // mm = (2-vv*y)
	v = vmulq_f32(v, x);    // x*vv
	v = vmulq_f32(v, m);    // (x*vv)*(2-vv*y) = x*(vv(2-vv*y)) ~~~ x/y

	return b3Vector3(v);
#else
	// Scalar fallback: divide x, y and z individually.
	return b3MakeVector3(
		v1.m_floats[0] / v2.m_floats[0],
		v1.m_floats[1] / v2.m_floats[1],
		v1.m_floats[2] / v2.m_floats[2]);
#endif
}
 835
 836 /**@brief Return the dot product between two vectors */
 837 B3_FORCE_INLINE b3Scalar
 838 b3Dot(const b3Vector3& v1, const b3Vector3& v2)
 839 {
 840         return v1.dot(v2);
 841 }
 842
 843 /**@brief Return the distance squared between two vectors */
 844 B3_FORCE_INLINE b3Scalar
 845 b3Distance2(const b3Vector3& v1, const b3Vector3& v2)
 846 {
 847         return v1.distance2(v2);
 848 }
 849
 850 /**@brief Return the distance between two vectors */
 851 B3_FORCE_INLINE b3Scalar
 852 b3Distance(const b3Vector3& v1, const b3Vector3& v2)
 853 {
 854         return v1.distance(v2);
 855 }
 856
 857 /**@brief Return the angle between two vectors */
 858 B3_FORCE_INLINE b3Scalar
 859 b3Angle(const b3Vector3& v1, const b3Vector3& v2)
 860 {
 861         return v1.angle(v2);
 862 }
 863
 864 /**@brief Return the cross product of two vectors */
 865 B3_FORCE_INLINE b3Vector3
 866 b3Cross(const b3Vector3& v1, const b3Vector3& v2)
 867 {
 868         return v1.cross(v2);
 869 }
 870
 871 B3_FORCE_INLINE b3Scalar
 872 b3Triple(const b3Vector3& v1, const b3Vector3& v2, const b3Vector3& v3)
 873 {
 874         return v1.triple(v2, v3);
 875 }
 876
 877 /**@brief Return the linear interpolation between two vectors
 878  * @param v1 One vector
 879  * @param v2 The other vector
 880  * @param t The ration of this to v (t = 0 => return v1, t=1 => return v2) */
 881 B3_FORCE_INLINE b3Vector3
 882 b3Lerp(const b3Vector3& v1, const b3Vector3& v2, const b3Scalar& t)
 883 {
 884         return v1.lerp(v2, t);
 885 }
 886
 887 B3_FORCE_INLINE b3Scalar b3Vector3::distance2(const b3Vector3& v) const
 888 {
 889         return (v - *this).length2();
 890 }
 891
 892 B3_FORCE_INLINE b3Scalar b3Vector3::distance(const b3Vector3& v) const
 893 {
 894         return (v - *this).length();
 895 }
 896
 897 B3_FORCE_INLINE b3Vector3 b3Vector3::normalized() const
 898 {
 899 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 900         b3Vector3 norm = *this;
 901
 902         return norm.normalize();
 903 #else
 904         return *this / length();
 905 #endif
 906 }
 907
/**@brief Return this vector rotated about wAxis by _angle
 * @param wAxis Rotation axis; must be a unit length vector
 * @param _angle Rotation angle, passed to b3Sin / b3Cos
 *
 * The vector is split into the part parallel to the axis (o) and the part
 * perpendicular to it (x); the result is o + x * cos(angle) + (wAxis cross this) * sin(angle). */
B3_FORCE_INLINE b3Vector3 b3Vector3::rotate(const b3Vector3& wAxis, const b3Scalar _angle) const
{
	// wAxis must be a unit length vector

#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)

	// O starts as the lane-wise product wAxis * this; the shuffles and adds below
	// turn it into the dot product replicated in x, y and z.
	__m128 O = _mm_mul_ps(wAxis.mVec128, mVec128);
	b3Scalar ssin = b3Sin(_angle);
	// C = wAxis cross this (the sine-weighted term)
	__m128 C = wAxis.cross(b3MakeVector3(mVec128)).mVec128;
	O = _mm_and_ps(O, b3vFFF0fMask);
	b3Scalar scos = b3Cos(_angle);

	__m128 vsin = _mm_load_ss(&ssin);  //	(S 0 0 0)
	__m128 vcos = _mm_load_ss(&scos);  //	(S 0 0 0)

	__m128 Y = b3_pshufd_ps(O, 0xC9);  //	(Y Z X 0)
	__m128 Z = b3_pshufd_ps(O, 0xD2);  //	(Z X Y 0)
	O = _mm_add_ps(O, Y);
	vsin = b3_pshufd_ps(vsin, 0x80);  //	(S S S 0)
	O = _mm_add_ps(O, Z);
	vcos = b3_pshufd_ps(vcos, 0x80);  //	(S S S 0)

	vsin = vsin * C;
	// O becomes the component of this vector parallel to the axis
	O = O * wAxis.mVec128;
	// X is the component perpendicular to the axis
	__m128 X = mVec128 - O;

	// result = parallel + sin * (axis cross this) + cos * perpendicular
	O = O + vsin;
	vcos = vcos * X;
	O = O + vcos;

	return b3MakeVector3(O);
#else
	// o: component parallel to the axis; _x: component perpendicular to it
	b3Vector3 o = wAxis * wAxis.dot(*this);
	b3Vector3 _x = *this - o;
	b3Vector3 _y;

	// _y: wAxis cross this, the sine-weighted term
	_y = wAxis.cross(*this);

	return (o + _x * b3Cos(_angle) + _y * b3Sin(_angle));
#endif
}
 949
 950 B3_FORCE_INLINE long b3Vector3::maxDot(const b3Vector3* array, long array_count, b3Scalar& dotOut) const
 951 {
 952 #if defined(B3_USE_SSE) || defined(B3_USE_NEON)
 953 #if defined _WIN32 || defined(B3_USE_SSE)
 954         const long scalar_cutoff = 10;
 955         long b3_maxdot_large(const float* array, const float* vec, unsigned long array_count, float* dotOut);
 956 #elif defined B3_USE_NEON
 957         const long scalar_cutoff = 4;
 958         extern long (*_maxdot_large)(const float* array, const float* vec, unsigned long array_count, float* dotOut);
 959 #endif
 960         if (array_count < scalar_cutoff)
 961 #else
 962
 963 #endif  //B3_USE_SSE || B3_USE_NEON
 964         {
 965                 b3Scalar maxDot = -B3_INFINITY;
 966                 int i = 0;
 967                 int ptIndex = -1;
 968                 for (i = 0; i < array_count; i++)
 969                 {
 970                         b3Scalar dot = array[i].dot(*this);
 971
 972                         if (dot > maxDot)
 973                         {
 974                                 maxDot = dot;
 975                                 ptIndex = i;
 976                         }
 977                 }
 978
 979                 b3Assert(ptIndex >= 0);
 980                 if (ptIndex < 0)
 981                 {
 982                         ptIndex = 0;
 983                 }
 984                 dotOut = maxDot;
 985                 return ptIndex;
 986         }
 987 #if defined(B3_USE_SSE) || defined(B3_USE_NEON)
 988         return b3_maxdot_large((float*)array, (float*)&m_floats[0], array_count, &dotOut);
 989 #endif
 990 }
 991
 992 B3_FORCE_INLINE long b3Vector3::minDot(const b3Vector3* array, long array_count, b3Scalar& dotOut) const
 993 {
 994 #if defined(B3_USE_SSE) || defined(B3_USE_NEON)
 995 #if defined B3_USE_SSE
 996         const long scalar_cutoff = 10;
 997         long b3_mindot_large(const float* array, const float* vec, unsigned long array_count, float* dotOut);
 998 #elif defined B3_USE_NEON
 999         const long scalar_cutoff = 4;
1000         extern long (*b3_mindot_large)(const float* array, const float* vec, unsigned long array_count, float* dotOut);
1001 #else
1002 #error unhandled arch!
1003 #endif
1004
1005         if (array_count < scalar_cutoff)
1006 #endif  //B3_USE_SSE || B3_USE_NEON
1007         {
1008                 b3Scalar minDot = B3_INFINITY;
1009                 int i = 0;
1010                 int ptIndex = -1;
1011
1012                 for (i = 0; i < array_count; i++)
1013                 {
1014                         b3Scalar dot = array[i].dot(*this);
1015
1016                         if (dot < minDot)
1017                         {
1018                                 minDot = dot;
1019                                 ptIndex = i;
1020                         }
1021                 }
1022
1023                 dotOut = minDot;
1024
1025                 return ptIndex;
1026         }
1027 #if defined(B3_USE_SSE) || defined(B3_USE_NEON)
1028         return b3_mindot_large((float*)array, (float*)&m_floats[0], array_count, &dotOut);
1029 #endif
1030 }
1031
1032 class b3Vector4 : public b3Vector3
1033 {
1034 public:
1035         B3_FORCE_INLINE b3Vector4 absolute4() const
1036         {
1037 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
1038                 return b3MakeVector4(_mm_and_ps(mVec128, b3vAbsfMask));
1039 #elif defined(B3_USE_NEON)
1040                 return b3Vector4(vabsq_f32(mVec128));
1041 #else
1042                 return b3MakeVector4(
1043                         b3Fabs(m_floats[0]),
1044                         b3Fabs(m_floats[1]),
1045                         b3Fabs(m_floats[2]),
1046                         b3Fabs(m_floats[3]));
1047 #endif
1048         }
1049
1050         b3Scalar getW() const { return m_floats[3]; }
1051
1052         B3_FORCE_INLINE int maxAxis4() const
1053         {
1054                 int maxIndex = -1;
1055                 b3Scalar maxVal = b3Scalar(-B3_LARGE_FLOAT);
1056                 if (m_floats[0] > maxVal)
1057                 {
1058                         maxIndex = 0;
1059                         maxVal = m_floats[0];
1060                 }
1061                 if (m_floats[1] > maxVal)
1062                 {
1063                         maxIndex = 1;
1064                         maxVal = m_floats[1];
1065                 }
1066                 if (m_floats[2] > maxVal)
1067                 {
1068                         maxIndex = 2;
1069                         maxVal = m_floats[2];
1070                 }
1071                 if (m_floats[3] > maxVal)
1072                 {
1073                         maxIndex = 3;
1074                 }
1075
1076                 return maxIndex;
1077         }
1078
1079         B3_FORCE_INLINE int minAxis4() const
1080         {
1081                 int minIndex = -1;
1082                 b3Scalar minVal = b3Scalar(B3_LARGE_FLOAT);
1083                 if (m_floats[0] < minVal)
1084                 {
1085                         minIndex = 0;
1086                         minVal = m_floats[0];
1087                 }
1088                 if (m_floats[1] < minVal)
1089                 {
1090                         minIndex = 1;
1091                         minVal = m_floats[1];
1092                 }
1093                 if (m_floats[2] < minVal)
1094                 {
1095                         minIndex = 2;
1096                         minVal = m_floats[2];
1097                 }
1098                 if (m_floats[3] < minVal)
1099                 {
1100                         minIndex = 3;
1101                         minVal = m_floats[3];
1102                 }
1103
1104                 return minIndex;
1105         }
1106
1107         B3_FORCE_INLINE int closestAxis4() const
1108         {
1109                 return absolute4().maxAxis4();
1110         }
1111
1112         /**@brief Set x,y,z and zero w
1113    * @param x Value of x
1114    * @param y Value of y
1115    * @param z Value of z
1116    */
1117
1118         /*              void getValue(b3Scalar *m) const
1119                 {
1120                         m[0] = m_floats[0];
1121                         m[1] = m_floats[1];
1122                         m[2] =m_floats[2];
1123                 }
1124 */
1125         /**@brief Set the values
1126    * @param x Value of x
1127    * @param y Value of y
1128    * @param z Value of z
1129    * @param w Value of w
1130    */
1131         B3_FORCE_INLINE void setValue(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z, const b3Scalar& _w)
1132         {
1133                 m_floats[0] = _x;
1134                 m_floats[1] = _y;
1135                 m_floats[2] = _z;
1136                 m_floats[3] = _w;
1137         }
1138 };
1139
1140 ///b3SwapVector3Endian swaps vector endianness, useful for network and cross-platform serialization
1141 B3_FORCE_INLINE void b3SwapScalarEndian(const b3Scalar& sourceVal, b3Scalar& destVal)
1142 {
1143 #ifdef B3_USE_DOUBLE_PRECISION
1144         unsigned char* dest = (unsigned char*)&destVal;
1145         unsigned char* src = (unsigned char*)&sourceVal;
1146         dest[0] = src[7];
1147         dest[1] = src[6];
1148         dest[2] = src[5];
1149         dest[3] = src[4];
1150         dest[4] = src[3];
1151         dest[5] = src[2];
1152         dest[6] = src[1];
1153         dest[7] = src[0];
1154 #else
1155         unsigned char* dest = (unsigned char*)&destVal;
1156         unsigned char* src = (unsigned char*)&sourceVal;
1157         dest[0] = src[3];
1158         dest[1] = src[2];
1159         dest[2] = src[1];
1160         dest[3] = src[0];
1161 #endif  //B3_USE_DOUBLE_PRECISION
1162 }
1163 ///b3SwapVector3Endian swaps vector endianness, useful for network and cross-platform serialization
1164 B3_FORCE_INLINE void b3SwapVector3Endian(const b3Vector3& sourceVec, b3Vector3& destVec)
1165 {
1166         for (int i = 0; i < 4; i++)
1167         {
1168                 b3SwapScalarEndian(sourceVec[i], destVec[i]);
1169         }
1170 }
1171
1172 ///b3UnSwapVector3Endian swaps vector endianness, useful for network and cross-platform serialization
1173 B3_FORCE_INLINE void b3UnSwapVector3Endian(b3Vector3& vector)
1174 {
1175         b3Vector3 swappedVec;
1176         for (int i = 0; i < 4; i++)
1177         {
1178                 b3SwapScalarEndian(vector[i], swappedVec[i]);
1179         }
1180         vector = swappedVec;
1181 }
1182
1183 template <class T>
1184 B3_FORCE_INLINE void b3PlaneSpace1(const T& n, T& p, T& q)
1185 {
1186         if (b3Fabs(n[2]) > B3_SQRT12)
1187         {
1188                 // choose p in y-z plane
1189                 b3Scalar a = n[1] * n[1] + n[2] * n[2];
1190                 b3Scalar k = b3RecipSqrt(a);
1191                 p[0] = 0;
1192                 p[1] = -n[2] * k;
1193                 p[2] = n[1] * k;
1194                 // set q = n x p
1195                 q[0] = a * k;
1196                 q[1] = -n[0] * p[2];
1197                 q[2] = n[0] * p[1];
1198         }
1199         else
1200         {
1201                 // choose p in x-y plane
1202                 b3Scalar a = n[0] * n[0] + n[1] * n[1];
1203                 b3Scalar k = b3RecipSqrt(a);
1204                 p[0] = -n[1] * k;
1205                 p[1] = n[0] * k;
1206                 p[2] = 0;
1207                 // set q = n x p
1208                 q[0] = -n[2] * p[1];
1209                 q[1] = n[2] * p[0];
1210                 q[2] = a * k;
1211         }
1212 }
1213
/// Single-precision storage for a vector: four floats, written by
/// b3Vector3::serializeFloat and read by b3Vector3::deSerializeFloat.
struct b3Vector3FloatData
{
	float m_floats[4];
};
1218
/// Double-precision storage for a vector: four doubles, written by
/// b3Vector3::serializeDouble and read by b3Vector3::deSerializeDouble.
struct b3Vector3DoubleData
{
	double m_floats[4];
};
1223
1224 B3_FORCE_INLINE void b3Vector3::serializeFloat(struct b3Vector3FloatData& dataOut) const
1225 {
1226         ///could also do a memcpy, check if it is worth it
1227         for (int i = 0; i < 4; i++)
1228                 dataOut.m_floats[i] = float(m_floats[i]);
1229 }
1230
1231 B3_FORCE_INLINE void b3Vector3::deSerializeFloat(const struct b3Vector3FloatData& dataIn)
1232 {
1233         for (int i = 0; i < 4; i++)
1234                 m_floats[i] = b3Scalar(dataIn.m_floats[i]);
1235 }
1236
1237 B3_FORCE_INLINE void b3Vector3::serializeDouble(struct b3Vector3DoubleData& dataOut) const
1238 {
1239         ///could also do a memcpy, check if it is worth it
1240         for (int i = 0; i < 4; i++)
1241                 dataOut.m_floats[i] = double(m_floats[i]);
1242 }
1243
1244 B3_FORCE_INLINE void b3Vector3::deSerializeDouble(const struct b3Vector3DoubleData& dataIn)
1245 {
1246         for (int i = 0; i < 4; i++)
1247                 m_floats[i] = b3Scalar(dataIn.m_floats[i]);
1248 }
1249
1250 B3_FORCE_INLINE void b3Vector3::serialize(struct b3Vector3Data& dataOut) const
1251 {
1252         ///could also do a memcpy, check if it is worth it
1253         for (int i = 0; i < 4; i++)
1254                 dataOut.m_floats[i] = m_floats[i];
1255 }
1256
1257 B3_FORCE_INLINE void b3Vector3::deSerialize(const struct b3Vector3Data& dataIn)
1258 {
1259         for (int i = 0; i < 4; i++)
1260                 m_floats[i] = dataIn.m_floats[i];
1261 }
1262
1263 inline b3Vector3 b3MakeVector3(b3Scalar x, b3Scalar y, b3Scalar z)
1264 {
1265         b3Vector3 tmp;
1266         tmp.setValue(x, y, z);
1267         return tmp;
1268 }
1269
1270 inline b3Vector3 b3MakeVector3(b3Scalar x, b3Scalar y, b3Scalar z, b3Scalar w)
1271 {
1272         b3Vector3 tmp;
1273         tmp.setValue(x, y, z);
1274         tmp.w = w;
1275         return tmp;
1276 }
1277
1278 inline b3Vector4 b3MakeVector4(b3Scalar x, b3Scalar y, b3Scalar z, b3Scalar w)
1279 {
1280         b3Vector4 tmp;
1281         tmp.setValue(x, y, z, w);
1282         return tmp;
1283 }
1284
1285 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
1286
1287 inline b3Vector3 b3MakeVector3(b3SimdFloat4 v)
1288 {
1289         b3Vector3 tmp;
1290         tmp.set128(v);
1291         return tmp;
1292 }
1293
1294 inline b3Vector4 b3MakeVector4(b3SimdFloat4 vec)
1295 {
1296         b3Vector4 tmp;
1297         tmp.set128(vec);
1298         return tmp;
1299 }
1300
1301 #endif
1302
1303 #endif  //B3_VECTOR3_H