dali-physics/third-party/bullet3/src/LinearMath/btVector3.h

   1 /*
   2 Copyright (c) 2003-2006 Gino van den Bergen / Erwin Coumans  https://bulletphysics.org
   3
   4 This software is provided 'as-is', without any express or implied warranty.
   5 In no event will the authors be held liable for any damages arising from the use of this software.
   6 Permission is granted to anyone to use this software for any purpose,
   7 including commercial applications, and to alter it and redistribute it freely,
   8 subject to the following restrictions:
   9
  10 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
  11 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
  12 3. This notice may not be removed or altered from any source distribution.
  13 */
  14
  15 #ifndef BT_VECTOR3_H
  16 #define BT_VECTOR3_H
  17
  18 //#include <stdint.h>
  19 #include "btScalar.h"
  20 #include "btMinMax.h"
  21 #include "btAlignedAllocator.h"
  22
// btVector3Data / btVector3DataName alias the serialization struct (and its
// name as a string) selected by the BT_USE_DOUBLE_PRECISION build setting:
// the double layout when it is defined, the float layout otherwise.
#ifdef BT_USE_DOUBLE_PRECISION
#define btVector3Data btVector3DoubleData
#define btVector3DataName "btVector3DoubleData"
#else
#define btVector3Data btVector3FloatData
#define btVector3DataName "btVector3FloatData"
#endif  //BT_USE_DOUBLE_PRECISION
  30
// SSE-only helper macros and constants used by the btVector3 SIMD code paths.
#if defined BT_USE_SSE

//typedef  uint32_t __m128i __attribute__ ((vector_size(16)));

#ifdef _MSC_VER
#pragma warning(disable : 4556)  // value of intrinsic immediate argument '4294967239' is out of range '0 - 255'
#endif

// Pack four 2-bit lane selectors into the 8-bit immediate expected by the
// shuffle intrinsics: x picks the source lane for result lane 0, y for lane 1,
// z for lane 2 and w for lane 3. The result is truncated to 8 bits.
#define BT_SHUFFLE(x, y, z, w) (((w) << 6 | (z) << 4 | (y) << 2 | (x)) & 0xff)
//#define bt_pshufd_ps( _a, _mask ) (__m128) _mm_shuffle_epi32((__m128i)(_a), (_mask) )
// Permute the lanes of a single register (both shuffle operands are _a).
#define bt_pshufd_ps(_a, _mask) _mm_shuffle_ps((_a), (_a), (_mask))
// Broadcast lane _i into lanes 0..2; lane 3 keeps the source's lane 3.
#define bt_splat3_ps(_a, _i) bt_pshufd_ps((_a), BT_SHUFFLE(_i, _i, _i, 3))
// Broadcast lane _i into all four lanes (_i is expected to be a lane index 0..3).
#define bt_splat_ps(_a, _i) bt_pshufd_ps((_a), BT_SHUFFLE(_i, _i, _i, _i))

// Bit masks. _mm_set_epi32 / _mm_set_ps take their arguments from the highest
// lane down to lane 0, so the first argument below is the w lane.
// btv3AbsiMask: clears the sign bit of x, y, z and zeroes w.
#define btv3AbsiMask (_mm_set_epi32(0x00000000, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF))
// btvAbsMask: clears the sign bit of all four lanes.
#define btvAbsMask (_mm_set_epi32(0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF))
// btvFFF0Mask: keeps x, y, z and zeroes w.
#define btvFFF0Mask (_mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))
// Float-register views of the integer masks above.
#define btv3AbsfMask btCastiTo128f(btv3AbsiMask)
#define btvFFF0fMask btCastiTo128f(btvFFF0Mask)
#define btvxyzMaskf btvFFF0fMask
#define btvAbsfMask btCastiTo128f(btvAbsMask)

//there is an issue with XCode 3.2 (LCx errors)
// Constants are built with _mm_set_ps at each use instead of being stored as
// const __m128 objects (see the disabled definitions below).
// btvMzeroMask: -0.0f in every lane, i.e. only the sign bits set.
#define btvMzeroMask (_mm_set_ps(-0.0f, -0.0f, -0.0f, -0.0f))
// v1110: (1, 1, 1, 0) in lane order x, y, z, w.
#define v1110 (_mm_set_ps(0.0f, 1.0f, 1.0f, 1.0f))
#define vHalf (_mm_set_ps(0.5f, 0.5f, 0.5f, 0.5f))
#define v1_5 (_mm_set_ps(1.5f, 1.5f, 1.5f, 1.5f))

//const __m128 ATTRIBUTE_ALIGNED16(btvMzeroMask) = {-0.0f, -0.0f, -0.0f, -0.0f};
//const __m128 ATTRIBUTE_ALIGNED16(v1110) = {1.0f, 1.0f, 1.0f, 0.0f};
//const __m128 ATTRIBUTE_ALIGNED16(vHalf) = {0.5f, 0.5f, 0.5f, 0.5f};
//const __m128 ATTRIBUTE_ALIGNED16(v1_5)  = {1.5f, 1.5f, 1.5f, 1.5f};

#endif
  65
// NEON-only constants used by the btVector3 SIMD code paths. Initializers are
// listed in lane order (lane 0 first).
#ifdef BT_USE_NEON

// -0.0f in every lane, i.e. only the sign bits set.
const float32x4_t ATTRIBUTE_ALIGNED16(btvMzeroMask) = (float32x4_t){-0.0f, -0.0f, -0.0f, -0.0f};
// All bits set in lanes 0..2, zero in lane 3: keeps x, y, z and clears w when ANDed.
const int32x4_t ATTRIBUTE_ALIGNED16(btvFFF0Mask) = (int32x4_t){static_cast<int32_t>(0xFFFFFFFF),
                                                                                                                           static_cast<int32_t>(0xFFFFFFFF), static_cast<int32_t>(0xFFFFFFFF), 0x0};
// Clears the sign bit of all four lanes when ANDed.
const int32x4_t ATTRIBUTE_ALIGNED16(btvAbsMask) = (int32x4_t){0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
// Clears the sign bit of lanes 0..2 and zeroes lane 3 when ANDed.
const int32x4_t ATTRIBUTE_ALIGNED16(btv3AbsMask) = (int32x4_t){0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x0};

#endif
  75
/**@brief btVector3 can be used to represent 3D points and vectors.
 * It has an unused w component so that it stays 16-byte aligned when btVector3 is stored in containers. This extra component can be used by derived classes (Quaternion?) or by the user.
 * Ideally, this class should be replaced by a platform-optimized SIMD version that keeps the data in registers.
 */
  80 ATTRIBUTE_ALIGNED16(class)
  81 btVector3
  82 {
  83 public:
        // Macro provided outside this file (btAlignedAllocator.h is included above);
        // presumably declares aligned operator new/delete for the class — confirm there.
        BT_DECLARE_ALIGNED_ALLOCATOR();

        // Component storage. In every configuration m_floats[0..2] hold x, y, z and
        // m_floats[3] is the fourth (w) slot.
#if defined(__SPU__) && defined(__CELLOS_LV2__)
        btScalar m_floats[4];

public:
        /// Reinterpret the four scalars in place as a vec_float4 (no copy is made).
        SIMD_FORCE_INLINE const vec_float4& get128() const
        {
                return *((const vec_float4*)&m_floats[0]);
        }

public:
#else                                            //__CELLOS_LV2__ __SPU__
#if defined(BT_USE_SSE) || defined(BT_USE_NEON)  // _WIN32 || ARM
        // The SIMD register and the scalar array share the same storage.
        union {
                btSimdFloat4 mVec128;
                btScalar m_floats[4];
        };
        /// Return the components as one SIMD register value.
        SIMD_FORCE_INLINE btSimdFloat4 get128() const
        {
                return mVec128;
        }
        /// Overwrite all four components from a SIMD register value.
        SIMD_FORCE_INLINE void set128(btSimdFloat4 v128)
        {
                mVec128 = v128;
        }
#else
        btScalar m_floats[4];
#endif
#endif  //__CELLOS_LV2__ __SPU__
 114
 115 public:
        /**@brief No initialization constructor.
         * The body is empty, so no component is assigned here; set the vector
         * (e.g. with setValue() or setZero()) before reading it. */
        SIMD_FORCE_INLINE btVector3()
        {
        }
 120
 121         /**@brief Constructor from scalars
 122    * @param x X value
 123    * @param y Y value
 124    * @param z Z value
 125    */
 126         SIMD_FORCE_INLINE btVector3(const btScalar& _x, const btScalar& _y, const btScalar& _z)
 127         {
 128                 m_floats[0] = _x;
 129                 m_floats[1] = _y;
 130                 m_floats[2] = _z;
 131                 m_floats[3] = btScalar(0.f);
 132         }
 133
// SIMD-only constructors and assignment: each one moves the whole 128-bit
// register, so all four lanes (including w) are copied.
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
        /// Construct from a SIMD register; all four lanes are taken as given.
        SIMD_FORCE_INLINE btVector3(btSimdFloat4 v)
        {
                mVec128 = v;
        }

        /// Copy constructor: copies the source's SIMD register.
        SIMD_FORCE_INLINE btVector3(const btVector3& rhs)
        {
                mVec128 = rhs.mVec128;
        }

        /// Assignment operator: copies the source's SIMD register and returns *this.
        SIMD_FORCE_INLINE btVector3&
        operator=(const btVector3& v)
        {
                mVec128 = v.mVec128;

                return *this;
        }
#endif  // (BT_USE_SSE_IN_API && BT_USE_SSE) || BT_USE_NEON
 156
 157         /**@brief Add a vector to this one
 158  * @param The vector to add to this one */
 159         SIMD_FORCE_INLINE btVector3& operator+=(const btVector3& v)
 160         {
 161 #if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
 162                 mVec128 = _mm_add_ps(mVec128, v.mVec128);
 163 #elif defined(BT_USE_NEON)
 164                 mVec128 = vaddq_f32(mVec128, v.mVec128);
 165 #else
 166                 m_floats[0] += v.m_floats[0];
 167                 m_floats[1] += v.m_floats[1];
 168                 m_floats[2] += v.m_floats[2];
 169 #endif
 170                 return *this;
 171         }
 172
 173         /**@brief Subtract a vector from this one
 174    * @param The vector to subtract */
 175         SIMD_FORCE_INLINE btVector3& operator-=(const btVector3& v)
 176         {
 177 #if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
 178                 mVec128 = _mm_sub_ps(mVec128, v.mVec128);
 179 #elif defined(BT_USE_NEON)
 180                 mVec128 = vsubq_f32(mVec128, v.mVec128);
 181 #else
 182                 m_floats[0] -= v.m_floats[0];
 183                 m_floats[1] -= v.m_floats[1];
 184                 m_floats[2] -= v.m_floats[2];
 185 #endif
 186                 return *this;
 187         }
 188
 189         /**@brief Scale the vector
 190    * @param s Scale factor */
 191         SIMD_FORCE_INLINE btVector3& operator*=(const btScalar& s)
 192         {
 193 #if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
 194                 __m128 vs = _mm_load_ss(&s);  //        (S 0 0 0)
 195                 vs = bt_pshufd_ps(vs, 0x80);  //        (S S S 0.0)
 196                 mVec128 = _mm_mul_ps(mVec128, vs);
 197 #elif defined(BT_USE_NEON)
 198                 mVec128 = vmulq_n_f32(mVec128, s);
 199 #else
 200                 m_floats[0] *= s;
 201                 m_floats[1] *= s;
 202                 m_floats[2] *= s;
 203 #endif
 204                 return *this;
 205         }
 206
 207         /**@brief Inversely scale the vector
 208    * @param s Scale factor to divide by */
 209         SIMD_FORCE_INLINE btVector3& operator/=(const btScalar& s)
 210         {
 211                 btFullAssert(s != btScalar(0.0));
 212
 213 #if 0  //defined(BT_USE_SSE_IN_API)
 214 // this code is not faster !
 215                 __m128 vs = _mm_load_ss(&s);
 216                 vs = _mm_div_ss(v1110, vs);
 217                 vs = bt_pshufd_ps(vs, 0x00);    //      (S S S S)
 218
 219                 mVec128 = _mm_mul_ps(mVec128, vs);
 220
 221                 return *this;
 222 #else
 223                 return *this *= btScalar(1.0) / s;
 224 #endif
 225         }
 226
        /**@brief Return the dot product of this vector and v
         * Only the x, y and z components contribute; w is ignored in every code path.
         * @param v The other vector in the dot product
         * @return x*v.x + y*v.y + z*v.z */
        SIMD_FORCE_INLINE btScalar dot(const btVector3& v) const
        {
#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
                __m128 vd = _mm_mul_ps(mVec128, v.mVec128);  // per-lane products
                __m128 z = _mm_movehl_ps(vd, vd);            // lane 0 <- z product
                __m128 y = _mm_shuffle_ps(vd, vd, 0x55);     // every lane <- y product
                // Accumulate into lane 0 only: x + y, then + z.
                vd = _mm_add_ss(vd, y);
                vd = _mm_add_ss(vd, z);
                return _mm_cvtss_f32(vd);
#elif defined(BT_USE_NEON)
                float32x4_t vd = vmulq_f32(mVec128, v.mVec128);                 // per-lane products
                float32x2_t x = vpadd_f32(vget_low_f32(vd), vget_low_f32(vd));  // lane 0 = x + y products
                x = vadd_f32(x, vget_high_f32(vd));                             // lane 0 += z product
                return vget_lane_f32(x, 0);
#else
                return m_floats[0] * v.m_floats[0] +
                           m_floats[1] * v.m_floats[1] +
                           m_floats[2] * v.m_floats[2];
#endif
        }
 249
 250         /**@brief Return the length of the vector squared */
 251         SIMD_FORCE_INLINE btScalar length2() const
 252         {
 253                 return dot(*this);
 254         }
 255
 256         /**@brief Return the length of the vector */
 257         SIMD_FORCE_INLINE btScalar length() const
 258         {
 259                 return btSqrt(length2());
 260         }
 261
 262         /**@brief Return the norm (length) of the vector */
 263         SIMD_FORCE_INLINE btScalar norm() const
 264         {
 265                 return length();
 266         }
 267
 268         /**@brief Return the norm (length) of the vector */
 269         SIMD_FORCE_INLINE btScalar safeNorm() const
 270         {
 271                 btScalar d = length2();
 272                 //workaround for some clang/gcc issue of sqrtf(tiny number) = -INF
 273                 if (d > SIMD_EPSILON)
 274                         return btSqrt(d);
 275                 return btScalar(0);
 276         }
 277
        /**@brief Return the distance squared between the ends of this and another vector
         * This is semantically treating the vector like a point
         * @param v The other vector (point) */
        SIMD_FORCE_INLINE btScalar distance2(const btVector3& v) const;

        /**@brief Return the distance between the ends of this and another vector
         * This is semantically treating the vector like a point
         * @param v The other vector (point) */
        SIMD_FORCE_INLINE btScalar distance(const btVector3& v) const;
 285
 286         SIMD_FORCE_INLINE btVector3& safeNormalize()
 287         {
 288                 btScalar l2 = length2();
 289                 //triNormal.normalize();
 290                 if (l2 >= SIMD_EPSILON * SIMD_EPSILON)
 291                 {
 292                         (*this) /= btSqrt(l2);
 293                 }
 294                 else
 295                 {
 296                         setValue(1, 0, 0);
 297                 }
 298                 return *this;
 299         }
 300
        /**@brief Normalize this vector in place so that
         * x^2 + y^2 + z^2 = 1
         * Asserts (btAssert) that the vector is not fuzzyZero(). The SSE path uses a
         * reciprocal-square-root estimate refined by one Newton-Raphson step; the other
         * paths divide by length().
         * @return Reference to this vector */
        SIMD_FORCE_INLINE btVector3& normalize()
        {
                btAssert(!fuzzyZero());

#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
                // dot product first: lane 0 of vd ends up as x*x + y*y + z*z
                __m128 vd = _mm_mul_ps(mVec128, mVec128);
                __m128 z = _mm_movehl_ps(vd, vd);
                __m128 y = _mm_shuffle_ps(vd, vd, 0x55);
                vd = _mm_add_ss(vd, y);
                vd = _mm_add_ss(vd, z);

#if 0
                // Disabled alternative: exact sqrt followed by a divide.
        vd = _mm_sqrt_ss(vd);
                vd = _mm_div_ss(v1110, vd);
                vd = bt_splat_ps(vd, 0x80);
                mVec128 = _mm_mul_ps(mVec128, vd);
#else

                // NR step 1/sqrt(x) - vd is x, y is output
                y = _mm_rsqrt_ss(vd);  // estimate

                //  one step NR: y1 = y0 * (1.5 - 0.5 * x * y0 * y0)
                z = v1_5;
                vd = _mm_mul_ss(vd, vHalf);  // vd * 0.5
                //x2 = vd;
                vd = _mm_mul_ss(vd, y);  // vd * 0.5 * y0
                vd = _mm_mul_ss(vd, y);  // vd * 0.5 * y0 * y0
                z = _mm_sub_ss(z, vd);   // 1.5 - vd * 0.5 * y0 * y0

                y = _mm_mul_ss(y, z);  // y0 * (1.5 - vd * 0.5 * y0 * y0)

                // NOTE(review): BT_SHUFFLE(0x80, 0x80, 0x80, 0x80) & 0xff evaluates to 0x80,
                // so lanes 0..2 receive lane 0 of y (the scale) while lane 3 receives lane 2
                // of y. w is therefore multiplied by something other than the scale; this
                // looks harmless only while w is treated as unused — confirm.
                y = bt_splat_ps(y, 0x80);
                mVec128 = _mm_mul_ps(mVec128, y);

#endif

                return *this;
#else
                return *this /= length();
#endif
        }
 345
        /**@brief Return a normalized version of this vector
         * Declared here; the definition is outside the class body. */
        SIMD_FORCE_INLINE btVector3 normalized() const;

        /**@brief Return a rotated version of this vector
         * Declared here; the definition is outside the class body.
         * @param wAxis The axis to rotate about
         * @param angle The angle to rotate by */
        SIMD_FORCE_INLINE btVector3 rotate(const btVector3& wAxis, const btScalar angle) const;
 353
 354         /**@brief Return the angle between this and another vector
 355    * @param v The other vector */
 356         SIMD_FORCE_INLINE btScalar angle(const btVector3& v) const
 357         {
 358                 btScalar s = btSqrt(length2() * v.length2());
 359                 btFullAssert(s != btScalar(0.0));
 360                 return btAcos(dot(v) / s);
 361         }
 362
 363         /**@brief Return a vector with the absolute values of each element */
 364         SIMD_FORCE_INLINE btVector3 absolute() const
 365         {
 366 #if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
 367                 return btVector3(_mm_and_ps(mVec128, btv3AbsfMask));
 368 #elif defined(BT_USE_NEON)
 369                 return btVector3(vabsq_f32(mVec128));
 370 #else
 371                 return btVector3(
 372                         btFabs(m_floats[0]),
 373                         btFabs(m_floats[1]),
 374                         btFabs(m_floats[2]));
 375 #endif
 376         }
 377
        /**@brief Return the cross product between this and another vector (this x v)
         * @param v The other vector
         * @return (y*v.z - z*v.y, z*v.x - x*v.z, x*v.y - y*v.x) */
        SIMD_FORCE_INLINE btVector3 cross(const btVector3& v) const
        {
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
                __m128 T, V;

                T = bt_pshufd_ps(mVec128, BT_SHUFFLE(1, 2, 0, 3));    //        (Y Z X 0)
                V = bt_pshufd_ps(v.mVec128, BT_SHUFFLE(1, 2, 0, 3));  //        (Y Z X 0)

                // V - T holds the result rotated by one lane: (cross.z, cross.x, cross.y, _).
                V = _mm_mul_ps(V, mVec128);
                T = _mm_mul_ps(T, v.mVec128);
                V = _mm_sub_ps(V, T);

                // Rotate the lanes back into (x, y, z) order.
                V = bt_pshufd_ps(V, BT_SHUFFLE(1, 2, 0, 3));
                return btVector3(V);
#elif defined(BT_USE_NEON)
                float32x4_t T, V;
                // form (Y, Z, X, _) of mVec128 and v.mVec128
                float32x2_t Tlow = vget_low_f32(mVec128);
                float32x2_t Vlow = vget_low_f32(v.mVec128);
                T = vcombine_f32(vext_f32(Tlow, vget_high_f32(mVec128), 1), Tlow);
                V = vcombine_f32(vext_f32(Vlow, vget_high_f32(v.mVec128), 1), Vlow);

                V = vmulq_f32(V, mVec128);
                T = vmulq_f32(T, v.mVec128);
                V = vsubq_f32(V, T);
                Vlow = vget_low_f32(V);
                // form (Y, Z, X, _);
                V = vcombine_f32(vext_f32(Vlow, vget_high_f32(V), 1), Vlow);
                // Clear the fourth lane, which holds a leftover value after the rotation.
                V = (float32x4_t)vandq_s32((int32x4_t)V, btvFFF0Mask);

                return btVector3(V);
#else
                return btVector3(
                        m_floats[1] * v.m_floats[2] - m_floats[2] * v.m_floats[1],
                        m_floats[2] * v.m_floats[0] - m_floats[0] * v.m_floats[2],
                        m_floats[0] * v.m_floats[1] - m_floats[1] * v.m_floats[0]);
#endif
        }
 418
        /**@brief Return the scalar triple product this . (v1 x v2)
         * @param v1 Left operand of the cross product
         * @param v2 Right operand of the cross product */
        SIMD_FORCE_INLINE btScalar triple(const btVector3& v1, const btVector3& v2) const
        {
#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
                // cross: V becomes v1 x v2 (same lane-rotation scheme as cross())
                __m128 T = _mm_shuffle_ps(v1.mVec128, v1.mVec128, BT_SHUFFLE(1, 2, 0, 3));  //  (Y Z X 0)
                __m128 V = _mm_shuffle_ps(v2.mVec128, v2.mVec128, BT_SHUFFLE(1, 2, 0, 3));  //  (Y Z X 0)

                V = _mm_mul_ps(V, v1.mVec128);
                T = _mm_mul_ps(T, v2.mVec128);
                V = _mm_sub_ps(V, T);

                V = _mm_shuffle_ps(V, V, BT_SHUFFLE(1, 2, 0, 3));

                // dot: sum the x, y, z products into lane 0
                V = _mm_mul_ps(V, mVec128);
                __m128 z = _mm_movehl_ps(V, V);
                __m128 y = _mm_shuffle_ps(V, V, 0x55);
                V = _mm_add_ss(V, y);
                V = _mm_add_ss(V, z);
                return _mm_cvtss_f32(V);

#elif defined(BT_USE_NEON)
                // cross:
                float32x4_t T, V;
                // form (Y, Z, X, _) of v1.mVec128 and v2.mVec128
                float32x2_t Tlow = vget_low_f32(v1.mVec128);
                float32x2_t Vlow = vget_low_f32(v2.mVec128);
                T = vcombine_f32(vext_f32(Tlow, vget_high_f32(v1.mVec128), 1), Tlow);
                V = vcombine_f32(vext_f32(Vlow, vget_high_f32(v2.mVec128), 1), Vlow);

                V = vmulq_f32(V, v1.mVec128);
                T = vmulq_f32(T, v2.mVec128);
                V = vsubq_f32(V, T);
                Vlow = vget_low_f32(V);
                // form (Y, Z, X, _);
                V = vcombine_f32(vext_f32(Vlow, vget_high_f32(V), 1), Vlow);

                // dot: lane 0 of x accumulates the x, y and z products (lane 3 is not read)
                V = vmulq_f32(mVec128, V);
                float32x2_t x = vpadd_f32(vget_low_f32(V), vget_low_f32(V));
                x = vadd_f32(x, vget_high_f32(V));
                return vget_lane_f32(x, 0);
#else
                return m_floats[0] * (v1.m_floats[1] * v2.m_floats[2] - v1.m_floats[2] * v2.m_floats[1]) +
                           m_floats[1] * (v1.m_floats[2] * v2.m_floats[0] - v1.m_floats[0] * v2.m_floats[2]) +
                           m_floats[2] * (v1.m_floats[0] * v2.m_floats[1] - v1.m_floats[1] * v2.m_floats[0]);
#endif
        }
 467
 468         /**@brief Return the axis with the smallest value
 469    * Note return values are 0,1,2 for x, y, or z */
 470         SIMD_FORCE_INLINE int minAxis() const
 471         {
 472                 return m_floats[0] < m_floats[1] ? (m_floats[0] < m_floats[2] ? 0 : 2) : (m_floats[1] < m_floats[2] ? 1 : 2);
 473         }
 474
 475         /**@brief Return the axis with the largest value
 476    * Note return values are 0,1,2 for x, y, or z */
 477         SIMD_FORCE_INLINE int maxAxis() const
 478         {
 479                 return m_floats[0] < m_floats[1] ? (m_floats[1] < m_floats[2] ? 2 : 1) : (m_floats[0] < m_floats[2] ? 2 : 0);
 480         }
 481
 482         SIMD_FORCE_INLINE int furthestAxis() const
 483         {
 484                 return absolute().minAxis();
 485         }
 486
 487         SIMD_FORCE_INLINE int closestAxis() const
 488         {
 489                 return absolute().maxAxis();
 490         }
 491
        /**@brief Set this vector to the linear interpolation between v0 and v1
         * The SSE and scalar paths compute (1 - rt) * v0 + rt * v1; the NEON path
         * computes the algebraically equivalent v0 + (v1 - v0) * rt.
         * @param v0 Value at rt = 0
         * @param v1 Value at rt = 1
         * @param rt Interpolation factor */
        SIMD_FORCE_INLINE void setInterpolate3(const btVector3& v0, const btVector3& v1, btScalar rt)
        {
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
                __m128 vrt = _mm_load_ss(&rt);  //      (rt 0 0 0)
                btScalar s = btScalar(1.0) - rt;
                __m128 vs = _mm_load_ss(&s);  //        (S 0 0 0)
                vs = bt_pshufd_ps(vs, 0x80);  //        (S S S 0.0)
                __m128 r0 = _mm_mul_ps(v0.mVec128, vs);
                vrt = bt_pshufd_ps(vrt, 0x80);  //      (rt rt rt 0.0)
                __m128 r1 = _mm_mul_ps(v1.mVec128, vrt);
                __m128 tmp3 = _mm_add_ps(r0, r1);
                mVec128 = tmp3;
#elif defined(BT_USE_NEON)
                // All four lanes are interpolated here, w included.
                float32x4_t vl = vsubq_f32(v1.mVec128, v0.mVec128);
                vl = vmulq_n_f32(vl, rt);
                mVec128 = vaddq_f32(vl, v0.mVec128);
#else
                btScalar s = btScalar(1.0) - rt;
                m_floats[0] = s * v0.m_floats[0] + rt * v1.m_floats[0];
                m_floats[1] = s * v0.m_floats[1] + rt * v1.m_floats[1];
                m_floats[2] = s * v0.m_floats[2] + rt * v1.m_floats[2];
                //don't do the unused w component
                //              m_co[3] = s * v0[3] + rt * v1[3];
#endif
        }
 517
        /**@brief Return the linear interpolation between this and another vector
         * Computed as this + (v - this) * t in every code path.
         * @param v The other vector
         * @param t The ratio of this to v (t = 0 => return this, t=1 => return other) */
        SIMD_FORCE_INLINE btVector3 lerp(const btVector3& v, const btScalar& t) const
        {
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
                __m128 vt = _mm_load_ss(&t);  //        (t 0 0 0)
                vt = bt_pshufd_ps(vt, 0x80);  //        (t t t 0.0)
                __m128 vl = _mm_sub_ps(v.mVec128, mVec128);
                vl = _mm_mul_ps(vl, vt);
                vl = _mm_add_ps(vl, mVec128);

                return btVector3(vl);
#elif defined(BT_USE_NEON)
                float32x4_t vl = vsubq_f32(v.mVec128, mVec128);
                vl = vmulq_n_f32(vl, t);
                vl = vaddq_f32(vl, mVec128);

                return btVector3(vl);
#else
                return btVector3(m_floats[0] + (v.m_floats[0] - m_floats[0]) * t,
                                                 m_floats[1] + (v.m_floats[1] - m_floats[1]) * t,
                                                 m_floats[2] + (v.m_floats[2] - m_floats[2]) * t);
#endif
        }
 543
 544         /**@brief Elementwise multiply this vector by the other
 545    * @param v The other vector */
 546         SIMD_FORCE_INLINE btVector3& operator*=(const btVector3& v)
 547         {
 548 #if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
 549                 mVec128 = _mm_mul_ps(mVec128, v.mVec128);
 550 #elif defined(BT_USE_NEON)
 551                 mVec128 = vmulq_f32(mVec128, v.mVec128);
 552 #else
 553                 m_floats[0] *= v.m_floats[0];
 554                 m_floats[1] *= v.m_floats[1];
 555                 m_floats[2] *= v.m_floats[2];
 556 #endif
 557                 return *this;
 558         }
 559
 560         /**@brief Return the x value */
 561         SIMD_FORCE_INLINE const btScalar& getX() const { return m_floats[0]; }
 562         /**@brief Return the y value */
 563         SIMD_FORCE_INLINE const btScalar& getY() const { return m_floats[1]; }
 564         /**@brief Return the z value */
 565         SIMD_FORCE_INLINE const btScalar& getZ() const { return m_floats[2]; }
 566         /**@brief Set the x value */
 567         SIMD_FORCE_INLINE void setX(btScalar _x) { m_floats[0] = _x; };
 568         /**@brief Set the y value */
 569         SIMD_FORCE_INLINE void setY(btScalar _y) { m_floats[1] = _y; };
 570         /**@brief Set the z value */
 571         SIMD_FORCE_INLINE void setZ(btScalar _z) { m_floats[2] = _z; };
 572         /**@brief Set the w value */
 573         SIMD_FORCE_INLINE void setW(btScalar _w) { m_floats[3] = _w; };
 574         /**@brief Return the x value */
 575         SIMD_FORCE_INLINE const btScalar& x() const { return m_floats[0]; }
 576         /**@brief Return the y value */
 577         SIMD_FORCE_INLINE const btScalar& y() const { return m_floats[1]; }
 578         /**@brief Return the z value */
 579         SIMD_FORCE_INLINE const btScalar& z() const { return m_floats[2]; }
 580         /**@brief Return the w value */
 581         SIMD_FORCE_INLINE const btScalar& w() const { return m_floats[3]; }
 582
 583         //SIMD_FORCE_INLINE btScalar&       operator[](int i)       { return (&m_floats[0])[i]; }
 584         //SIMD_FORCE_INLINE const btScalar& operator[](int i) const { return (&m_floats[0])[i]; }
 585         ///operator btScalar*() replaces operator[], using implicit conversion. We added operator != and operator == to avoid pointer comparisons.
 586         SIMD_FORCE_INLINE operator btScalar*() { return &m_floats[0]; }
 587         SIMD_FORCE_INLINE operator const btScalar*() const { return &m_floats[0]; }
 588
 589         SIMD_FORCE_INLINE bool operator==(const btVector3& other) const
 590         {
 591 #if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
 592                 return (0xf == _mm_movemask_ps((__m128)_mm_cmpeq_ps(mVec128, other.mVec128)));
 593 #else
 594                 return ((m_floats[3] == other.m_floats[3]) &&
 595                                 (m_floats[2] == other.m_floats[2]) &&
 596                                 (m_floats[1] == other.m_floats[1]) &&
 597                                 (m_floats[0] == other.m_floats[0]));
 598 #endif
 599         }
 600
 601         SIMD_FORCE_INLINE bool operator!=(const btVector3& other) const
 602         {
 603                 return !(*this == other);
 604         }
 605
 606         /**@brief Set each element to the max of the current values and the values of another btVector3
 607    * @param other The other btVector3 to compare with
 608    */
 609         SIMD_FORCE_INLINE void setMax(const btVector3& other)
 610         {
 611 #if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
 612                 mVec128 = _mm_max_ps(mVec128, other.mVec128);
 613 #elif defined(BT_USE_NEON)
 614                 mVec128 = vmaxq_f32(mVec128, other.mVec128);
 615 #else
 616                 btSetMax(m_floats[0], other.m_floats[0]);
 617                 btSetMax(m_floats[1], other.m_floats[1]);
 618                 btSetMax(m_floats[2], other.m_floats[2]);
 619                 btSetMax(m_floats[3], other.w());
 620 #endif
 621         }
 622
 623         /**@brief Set each element to the min of the current values and the values of another btVector3
 624    * @param other The other btVector3 to compare with
 625    */
 626         SIMD_FORCE_INLINE void setMin(const btVector3& other)
 627         {
 628 #if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
 629                 mVec128 = _mm_min_ps(mVec128, other.mVec128);
 630 #elif defined(BT_USE_NEON)
 631                 mVec128 = vminq_f32(mVec128, other.mVec128);
 632 #else
 633                 btSetMin(m_floats[0], other.m_floats[0]);
 634                 btSetMin(m_floats[1], other.m_floats[1]);
 635                 btSetMin(m_floats[2], other.m_floats[2]);
 636                 btSetMin(m_floats[3], other.w());
 637 #endif
 638         }
 639
 640         SIMD_FORCE_INLINE void setValue(const btScalar& _x, const btScalar& _y, const btScalar& _z)
 641         {
 642                 m_floats[0] = _x;
 643                 m_floats[1] = _y;
 644                 m_floats[2] = _z;
 645                 m_floats[3] = btScalar(0.f);
 646         }
 647
        /**@brief Write the rows of the skew-symmetric (cross-product) matrix of this vector
         * The rows are v0 = (0, -z, y), v1 = (z, 0, -x), v2 = (-y, x, 0), so that
         * (v0.dot(u), v1.dot(u), v2.dot(u)) equals this->cross(u).
         * All three pointers are dereferenced unconditionally and must be valid.
         * @param v0 Receives the first row
         * @param v1 Receives the second row
         * @param v2 Receives the third row */
        void getSkewSymmetricMatrix(btVector3 * v0, btVector3 * v1, btVector3 * v2) const
        {
#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)

                __m128 V = _mm_and_ps(mVec128, btvFFF0fMask);  // (x, y, z, 0)
                __m128 V0 = _mm_xor_ps(btvMzeroMask, V);       // sign bits flipped: the negated copy of V
                __m128 V2 = _mm_movelh_ps(V0, V);              // low half of V0 followed by low half of V

                // The shuffles below pick lanes from V and its negated copy to assemble each row.
                __m128 V1 = _mm_shuffle_ps(V, V0, 0xCE);

                V0 = _mm_shuffle_ps(V0, V, 0xDB);
                V2 = _mm_shuffle_ps(V2, V, 0xF9);

                v0->mVec128 = V0;
                v1->mVec128 = V1;
                v2->mVec128 = V2;
#else
                v0->setValue(0., -z(), y());
                v1->setValue(z(), 0., -x());
                v2->setValue(-y(), x(), 0.);
#endif
        }
 670
 671         void setZero()
 672         {
 673 #if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
 674                 mVec128 = (__m128)_mm_xor_ps(mVec128, mVec128);
 675 #elif defined(BT_USE_NEON)
 676                 int32x4_t vi = vdupq_n_s32(0);
 677                 mVec128 = vreinterpretq_f32_s32(vi);
 678 #else
 679                 setValue(btScalar(0.), btScalar(0.), btScalar(0.));
 680 #endif
 681         }
 682
 683         SIMD_FORCE_INLINE bool isZero() const
 684         {
 685                 return m_floats[0] == btScalar(0) && m_floats[1] == btScalar(0) && m_floats[2] == btScalar(0);
 686         }
 687
 688         SIMD_FORCE_INLINE bool fuzzyZero() const
 689         {
 690                 return length2() < SIMD_EPSILON * SIMD_EPSILON;
 691         }
 692
        // Serialization interface. Only declarations appear here; the definitions and
        // the btVector3FloatData / btVector3DoubleData structs live outside the class body.
        // btVector3Data is the macro alias chosen by BT_USE_DOUBLE_PRECISION near the top
        // of this header.

        /// Write this vector into dataOut, using the build's default data layout.
        SIMD_FORCE_INLINE void serialize(struct btVector3Data & dataOut) const;

        /// Read this vector from double-layout data.
        SIMD_FORCE_INLINE void deSerialize(const struct btVector3DoubleData& dataIn);

        /// Read this vector from float-layout data.
        SIMD_FORCE_INLINE void deSerialize(const struct btVector3FloatData& dataIn);

        /// Write this vector into float-layout data.
        SIMD_FORCE_INLINE void serializeFloat(struct btVector3FloatData & dataOut) const;

        /// Read this vector from float-layout data.
        SIMD_FORCE_INLINE void deSerializeFloat(const struct btVector3FloatData& dataIn);

        /// Write this vector into double-layout data.
        SIMD_FORCE_INLINE void serializeDouble(struct btVector3DoubleData & dataOut) const;

        /// Read this vector from double-layout data.
        SIMD_FORCE_INLINE void deSerializeDouble(const struct btVector3DoubleData& dataIn);
 706
        /**@brief Return the index of the maximum dot product between this and the vectors in array[]
         * Declared here; the definition is outside the class body.
         * @param array The other vectors
         * @param array_count The number of other vectors
         * @param dotOut Receives the maximum dot product
         * @return Index into array of the vector giving the maximum */
        SIMD_FORCE_INLINE long maxDot(const btVector3* array, long array_count, btScalar& dotOut) const;

        /**@brief Return the index of the minimum dot product between this and the vectors in array[]
         * Declared here; the definition is outside the class body.
         * @param array The other vectors
         * @param array_count The number of other vectors
         * @param dotOut Receives the minimum dot product
         * @return Index into array of the vector giving the minimum */
        SIMD_FORCE_INLINE long minDot(const btVector3* array, long array_count, btScalar& dotOut) const;
 718
        /**@brief Compute three dot products at once
         * Creates a vector as btVector3(this->dot(v0), this->dot(v1), this->dot(v2)).
         * For finite inputs the w component of the result is 0 in every code path.
         * @param v0 Vector whose dot product with this becomes the x component
         * @param v1 Vector whose dot product with this becomes the y component
         * @param v2 Vector whose dot product with this becomes the z component */
        SIMD_FORCE_INLINE btVector3 dot3(const btVector3& v0, const btVector3& v1, const btVector3& v2) const
        {
#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)

                // Per-lane products of this with each input.
                __m128 a0 = _mm_mul_ps(v0.mVec128, this->mVec128);
                __m128 a1 = _mm_mul_ps(v1.mVec128, this->mVec128);
                __m128 a2 = _mm_mul_ps(v2.mVec128, this->mVec128);
                // Transpose so that r gathers the x products, then the y products, then the z products.
                __m128 b0 = _mm_unpacklo_ps(a0, a1);
                __m128 b1 = _mm_unpackhi_ps(a0, a1);
                __m128 b2 = _mm_unpacklo_ps(a2, _mm_setzero_ps());
                __m128 r = _mm_movelh_ps(b0, b2);          // x products: (a0.x, a1.x, a2.x, 0)
                r = _mm_add_ps(r, _mm_movehl_ps(b2, b0));  // + y products: (a0.y, a1.y, a2.y, 0)
                a2 = _mm_and_ps(a2, btvxyzMaskf);          // drop a2's w product
                r = _mm_add_ps(r, btCastdTo128f(_mm_move_sd(btCastfTo128d(a2), btCastfTo128d(b1))));  // + z products
                return btVector3(r);

#elif defined(BT_USE_NEON)
                static const uint32x4_t xyzMask = (const uint32x4_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), 0};
                float32x4_t a0 = vmulq_f32(v0.mVec128, this->mVec128);
                float32x4_t a1 = vmulq_f32(v1.mVec128, this->mVec128);
                float32x4_t a2 = vmulq_f32(v2.mVec128, this->mVec128);
                float32x2x2_t zLo = vtrn_f32(vget_high_f32(a0), vget_high_f32(a1));  // val[0] = (a0.z, a1.z)
                a2 = (float32x4_t)vandq_u32((uint32x4_t)a2, xyzMask);               // drop a2's w product
                // b0 = (dot with v0, dot with v1); b1 = (dot with v2, 0)
                float32x2_t b0 = vadd_f32(vpadd_f32(vget_low_f32(a0), vget_low_f32(a1)), zLo.val[0]);
                float32x2_t b1 = vpadd_f32(vpadd_f32(vget_low_f32(a2), vget_high_f32(a2)), vdup_n_f32(0.0f));
                return btVector3(vcombine_f32(b0, b1));
#else
                return btVector3(dot(v0), dot(v1), dot(v2));
#endif
        }
 750 };
 751
 752 /**@brief Return the sum of two vectors (Point symantics)*/
 753 SIMD_FORCE_INLINE btVector3
 754 operator+(const btVector3& v1, const btVector3& v2)
 755 {
 756 #if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
 757         return btVector3(_mm_add_ps(v1.mVec128, v2.mVec128));
 758 #elif defined(BT_USE_NEON)
 759         return btVector3(vaddq_f32(v1.mVec128, v2.mVec128));
 760 #else
 761         return btVector3(
 762                 v1.m_floats[0] + v2.m_floats[0],
 763                 v1.m_floats[1] + v2.m_floats[1],
 764                 v1.m_floats[2] + v2.m_floats[2]);
 765 #endif
 766 }
 767
 768 /**@brief Return the elementwise product of two vectors */
 769 SIMD_FORCE_INLINE btVector3
 770 operator*(const btVector3& v1, const btVector3& v2)
 771 {
 772 #if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
 773         return btVector3(_mm_mul_ps(v1.mVec128, v2.mVec128));
 774 #elif defined(BT_USE_NEON)
 775         return btVector3(vmulq_f32(v1.mVec128, v2.mVec128));
 776 #else
 777         return btVector3(
 778                 v1.m_floats[0] * v2.m_floats[0],
 779                 v1.m_floats[1] * v2.m_floats[1],
 780                 v1.m_floats[2] * v2.m_floats[2]);
 781 #endif
 782 }
 783
 784 /**@brief Return the difference between two vectors */
 785 SIMD_FORCE_INLINE btVector3
 786 operator-(const btVector3& v1, const btVector3& v2)
 787 {
 788 #if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
 789
 790         //      without _mm_and_ps this code causes slowdown in Concave moving
 791         __m128 r = _mm_sub_ps(v1.mVec128, v2.mVec128);
 792         return btVector3(_mm_and_ps(r, btvFFF0fMask));
 793 #elif defined(BT_USE_NEON)
 794         float32x4_t r = vsubq_f32(v1.mVec128, v2.mVec128);
 795         return btVector3((float32x4_t)vandq_s32((int32x4_t)r, btvFFF0Mask));
 796 #else
 797         return btVector3(
 798                 v1.m_floats[0] - v2.m_floats[0],
 799                 v1.m_floats[1] - v2.m_floats[1],
 800                 v1.m_floats[2] - v2.m_floats[2]);
 801 #endif
 802 }
 803
 804 /**@brief Return the negative of the vector */
 805 SIMD_FORCE_INLINE btVector3
 806 operator-(const btVector3& v)
 807 {
 808 #if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
 809         __m128 r = _mm_xor_ps(v.mVec128, btvMzeroMask);
 810         return btVector3(_mm_and_ps(r, btvFFF0fMask));
 811 #elif defined(BT_USE_NEON)
 812         return btVector3((btSimdFloat4)veorq_s32((int32x4_t)v.mVec128, (int32x4_t)btvMzeroMask));
 813 #else
 814         return btVector3(-v.m_floats[0], -v.m_floats[1], -v.m_floats[2]);
 815 #endif
 816 }
 817
 818 /**@brief Return the vector scaled by s */
 819 SIMD_FORCE_INLINE btVector3
 820 operator*(const btVector3& v, const btScalar& s)
 821 {
 822 #if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
 823         __m128 vs = _mm_load_ss(&s);  //        (S 0 0 0)
 824         vs = bt_pshufd_ps(vs, 0x80);  //        (S S S 0.0)
 825         return btVector3(_mm_mul_ps(v.mVec128, vs));
 826 #elif defined(BT_USE_NEON)
 827         float32x4_t r = vmulq_n_f32(v.mVec128, s);
 828         return btVector3((float32x4_t)vandq_s32((int32x4_t)r, btvFFF0Mask));
 829 #else
 830         return btVector3(v.m_floats[0] * s, v.m_floats[1] * s, v.m_floats[2] * s);
 831 #endif
 832 }
 833
 834 /**@brief Return the vector scaled by s */
 835 SIMD_FORCE_INLINE btVector3
 836 operator*(const btScalar& s, const btVector3& v)
 837 {
 838         return v * s;
 839 }
 840
 841 /**@brief Return the vector inversely scaled by s */
 842 SIMD_FORCE_INLINE btVector3
 843 operator/(const btVector3& v, const btScalar& s)
 844 {
 845         btFullAssert(s != btScalar(0.0));
 846 #if 0  //defined(BT_USE_SSE_IN_API)
 847 // this code is not faster !
 848         __m128 vs = _mm_load_ss(&s);
 849     vs = _mm_div_ss(v1110, vs);
 850         vs = bt_pshufd_ps(vs, 0x00);    //      (S S S S)
 851
 852         return btVector3(_mm_mul_ps(v.mVec128, vs));
 853 #else
 854         return v * (btScalar(1.0) / s);
 855 #endif
 856 }
 857
 858 /**@brief Return the component-wise quotient v1 / v2 (each of x, y and z is divided independently; no check for zero divisors is made here) */
 859 SIMD_FORCE_INLINE btVector3
 860 operator/(const btVector3& v1, const btVector3& v2)
 861 {
 862 #if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
              // divide all four lanes, then AND the result with btvFFF0fMask
 863         __m128 vec = _mm_div_ps(v1.mVec128, v2.mVec128);
 864         vec = _mm_and_ps(vec, btvFFF0fMask);
 865         return btVector3(vec);
 866 #elif defined(BT_USE_NEON)
              // approximate x / y as x * (1 / y): start from the reciprocal estimate and refine it twice
 867         float32x4_t x, y, v, m;
 868
 869         x = v1.mVec128;
 870         y = v2.mVec128;
 871
 872         v = vrecpeq_f32(y);     // v ~ 1/y
 873         m = vrecpsq_f32(y, v);  // m = (2-v*y)
 874         v = vmulq_f32(v, m);    // vv = v*m ~~ 1/y
 875         m = vrecpsq_f32(y, v);  // mm = (2-vv*y)
 876         v = vmulq_f32(v, x);    // x*vv
 877         v = vmulq_f32(v, m);    // (x*vv)*(2-vv*y) = x*(vv(2-vv*y)) ~~~ x/y
 878
 879         return btVector3(v);
 880 #else
              // portable fallback: plain per-component division
 881         return btVector3(
 882                 v1.m_floats[0] / v2.m_floats[0],
 883                 v1.m_floats[1] / v2.m_floats[1],
 884                 v1.m_floats[2] / v2.m_floats[2]);
 885 #endif
 886 }
 887
 888 /**@brief Return the dot product between two vectors */
 889 SIMD_FORCE_INLINE btScalar
 890 btDot(const btVector3& v1, const btVector3& v2)
 891 {
 892         return v1.dot(v2);
 893 }
 894
 895 /**@brief Return the distance squared between two vectors */
 896 SIMD_FORCE_INLINE btScalar
 897 btDistance2(const btVector3& v1, const btVector3& v2)
 898 {
 899         return v1.distance2(v2);
 900 }
 901
 902 /**@brief Return the distance between two vectors */
 903 SIMD_FORCE_INLINE btScalar
 904 btDistance(const btVector3& v1, const btVector3& v2)
 905 {
 906         return v1.distance(v2);
 907 }
 908
 909 /**@brief Return the angle between two vectors */
 910 SIMD_FORCE_INLINE btScalar
 911 btAngle(const btVector3& v1, const btVector3& v2)
 912 {
 913         return v1.angle(v2);
 914 }
 915
 916 /**@brief Return the cross product of two vectors */
 917 SIMD_FORCE_INLINE btVector3
 918 btCross(const btVector3& v1, const btVector3& v2)
 919 {
 920         return v1.cross(v2);
 921 }
 922
 923 SIMD_FORCE_INLINE btScalar
 924 btTriple(const btVector3& v1, const btVector3& v2, const btVector3& v3)
 925 {
 926         return v1.triple(v2, v3);
 927 }
 928
 929 /**@brief Return the linear interpolation between two vectors
 930  * @param v1 One vector
 931  * @param v2 The other vector
 932  * @param t The ration of this to v (t = 0 => return v1, t=1 => return v2) */
 933 SIMD_FORCE_INLINE btVector3
 934 lerp(const btVector3& v1, const btVector3& v2, const btScalar& t)
 935 {
 936         return v1.lerp(v2, t);
 937 }
 938
 939 SIMD_FORCE_INLINE btScalar btVector3::distance2(const btVector3& v) const
 940 {
 941         return (v - *this).length2();
 942 }
 943
 944 SIMD_FORCE_INLINE btScalar btVector3::distance(const btVector3& v) const
 945 {
 946         return (v - *this).length();
 947 }
 948
 949 SIMD_FORCE_INLINE btVector3 btVector3::normalized() const
 950 {
 951         btVector3 nrm = *this;
 952
 953         return nrm.normalize();
 954 }
 955
 956 SIMD_FORCE_INLINE btVector3 btVector3::rotate(const btVector3& wAxis, const btScalar _angle) const
 957 {
 958         // Rotate this vector about wAxis by _angle (passed to btSin/btCos). wAxis must be a unit length vector.
              // Result = o + x * cos(_angle) + y * sin(_angle), where o is the part of this along wAxis,
              // x = this - o is the remainder, and y = wAxis cross this.
 959
 960 #if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
 961
              // O starts as the lane-wise products wAxis * this; the shuffles and adds below sum them across lanes
 962         __m128 O = _mm_mul_ps(wAxis.mVec128, mVec128);
 963         btScalar ssin = btSin(_angle);
 964         __m128 C = wAxis.cross(mVec128).mVec128;
 965         O = _mm_and_ps(O, btvFFF0fMask);
 966         btScalar scos = btCos(_angle);
 967
 968         __m128 vsin = _mm_load_ss(&ssin);  //   (S 0 0 0)
 969         __m128 vcos = _mm_load_ss(&scos);  //   (S 0 0 0)
 970
 971         __m128 Y = bt_pshufd_ps(O, 0xC9);  //   (Y Z X 0)
 972         __m128 Z = bt_pshufd_ps(O, 0xD2);  //   (Z X Y 0)
 973         O = _mm_add_ps(O, Y);
 974         vsin = bt_pshufd_ps(vsin, 0x80);  //    (S S S 0)
 975         O = _mm_add_ps(O, Z);
 976         vcos = bt_pshufd_ps(vcos, 0x80);  //    (S S S 0)
 977
              // vsin = sin * (wAxis cross this); O = wAxis * dot; X = this - O
 978         vsin = vsin * C;
 979         O = O * wAxis.mVec128;
 980         __m128 X = mVec128 - O;
 981
              // accumulate o + y * sin + x * cos
 982         O = O + vsin;
 983         vcos = vcos * X;
 984         O = O + vcos;
 985
 986         return btVector3(O);
 987 #else
              // o: component of this along wAxis; _x: remainder; _y: wAxis cross this
 988         btVector3 o = wAxis * wAxis.dot(*this);
 989         btVector3 _x = *this - o;
 990         btVector3 _y;
 991
 992         _y = wAxis.cross(*this);
 993
 994         return (o + _x * btCos(_angle) + _y * btSin(_angle));
 995 #endif
 996 }
 997
 998 SIMD_FORCE_INLINE long btVector3::maxDot(const btVector3* array, long array_count, btScalar& dotOut) const
 999 {
1000 #if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined(BT_USE_NEON)
1001 #if defined _WIN32 || defined(BT_USE_SSE)
1002         const long scalar_cutoff = 10;
1003         long _maxdot_large(const float* array, const float* vec, unsigned long array_count, float* dotOut);
1004 #elif defined BT_USE_NEON
1005         const long scalar_cutoff = 4;
1006         extern long (*_maxdot_large)(const float* array, const float* vec, unsigned long array_count, float* dotOut);
1007 #endif
1008         if (array_count < scalar_cutoff)
1009 #endif
1010         {
1011                 btScalar maxDot1 = -SIMD_INFINITY;
1012                 int i = 0;
1013                 int ptIndex = -1;
1014                 for (i = 0; i < array_count; i++)
1015                 {
1016                         btScalar dot = array[i].dot(*this);
1017
1018                         if (dot > maxDot1)
1019                         {
1020                                 maxDot1 = dot;
1021                                 ptIndex = i;
1022                         }
1023                 }
1024
1025                 dotOut = maxDot1;
1026                 return ptIndex;
1027         }
1028 #if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined(BT_USE_NEON)
1029         return _maxdot_large((float*)array, (float*)&m_floats[0], array_count, &dotOut);
1030 #endif
1031 }
1032
1033 SIMD_FORCE_INLINE long btVector3::minDot(const btVector3* array, long array_count, btScalar& dotOut) const
1034 {
1035 #if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined(BT_USE_NEON)
1036 #if defined BT_USE_SSE
1037         const long scalar_cutoff = 10;
1038         long _mindot_large(const float* array, const float* vec, unsigned long array_count, float* dotOut);
1039 #elif defined BT_USE_NEON
1040         const long scalar_cutoff = 4;
1041         extern long (*_mindot_large)(const float* array, const float* vec, unsigned long array_count, float* dotOut);
1042 #else
1043 #error unhandled arch!
1044 #endif
1045
1046         if (array_count < scalar_cutoff)
1047 #endif
1048         {
1049                 btScalar minDot = SIMD_INFINITY;
1050                 int i = 0;
1051                 int ptIndex = -1;
1052
1053                 for (i = 0; i < array_count; i++)
1054                 {
1055                         btScalar dot = array[i].dot(*this);
1056
1057                         if (dot < minDot)
1058                         {
1059                                 minDot = dot;
1060                                 ptIndex = i;
1061                         }
1062                 }
1063
1064                 dotOut = minDot;
1065
1066                 return ptIndex;
1067         }
1068 #if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined(BT_USE_NEON)
1069         return _mindot_large((float*)array, (float*)&m_floats[0], array_count, &dotOut);
1070 #endif  //BT_USE_SIMD_VECTOR3
1071 }
1072
1073 class btVector4 : public btVector3
1074 {
1075 public:
1076         SIMD_FORCE_INLINE btVector4() {}
1077
1078         SIMD_FORCE_INLINE btVector4(const btScalar& _x, const btScalar& _y, const btScalar& _z, const btScalar& _w)
1079                 : btVector3(_x, _y, _z)
1080         {
1081                 m_floats[3] = _w;
1082         }
1083
1084 #if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
1085         SIMD_FORCE_INLINE btVector4(const btSimdFloat4 vec)
1086         {
1087                 mVec128 = vec;
1088         }
1089
1090         SIMD_FORCE_INLINE btVector4(const btVector3& rhs)
1091         {
1092                 mVec128 = rhs.mVec128;
1093         }
1094
1095         SIMD_FORCE_INLINE btVector4&
1096         operator=(const btVector4& v)
1097         {
1098                 mVec128 = v.mVec128;
1099                 return *this;
1100         }
1101 #endif  // #if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
1102
1103         SIMD_FORCE_INLINE btVector4 absolute4() const
1104         {
1105 #if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
1106                 return btVector4(_mm_and_ps(mVec128, btvAbsfMask));
1107 #elif defined(BT_USE_NEON)
1108                 return btVector4(vabsq_f32(mVec128));
1109 #else
1110                 return btVector4(
1111                         btFabs(m_floats[0]),
1112                         btFabs(m_floats[1]),
1113                         btFabs(m_floats[2]),
1114                         btFabs(m_floats[3]));
1115 #endif
1116         }
1117
1118         btScalar getW() const { return m_floats[3]; }
1119
1120         SIMD_FORCE_INLINE int maxAxis4() const
1121         {
1122                 int maxIndex = -1;
1123                 btScalar maxVal = btScalar(-BT_LARGE_FLOAT);
1124                 if (m_floats[0] > maxVal)
1125                 {
1126                         maxIndex = 0;
1127                         maxVal = m_floats[0];
1128                 }
1129                 if (m_floats[1] > maxVal)
1130                 {
1131                         maxIndex = 1;
1132                         maxVal = m_floats[1];
1133                 }
1134                 if (m_floats[2] > maxVal)
1135                 {
1136                         maxIndex = 2;
1137                         maxVal = m_floats[2];
1138                 }
1139                 if (m_floats[3] > maxVal)
1140                 {
1141                         maxIndex = 3;
1142                 }
1143
1144                 return maxIndex;
1145         }
1146
1147         SIMD_FORCE_INLINE int minAxis4() const
1148         {
1149                 int minIndex = -1;
1150                 btScalar minVal = btScalar(BT_LARGE_FLOAT);
1151                 if (m_floats[0] < minVal)
1152                 {
1153                         minIndex = 0;
1154                         minVal = m_floats[0];
1155                 }
1156                 if (m_floats[1] < minVal)
1157                 {
1158                         minIndex = 1;
1159                         minVal = m_floats[1];
1160                 }
1161                 if (m_floats[2] < minVal)
1162                 {
1163                         minIndex = 2;
1164                         minVal = m_floats[2];
1165                 }
1166                 if (m_floats[3] < minVal)
1167                 {
1168                         minIndex = 3;
1169                 }
1170
1171                 return minIndex;
1172         }
1173
1174         SIMD_FORCE_INLINE int closestAxis4() const
1175         {
1176                 return absolute4().maxAxis4();
1177         }
1178
1179         /**@brief Set x,y,z and zero w
1180    * @param x Value of x
1181    * @param y Value of y
1182    * @param z Value of z
1183    */
1184
1185         /*              void getValue(btScalar *m) const
1186                 {
1187                         m[0] = m_floats[0];
1188                         m[1] = m_floats[1];
1189                         m[2] =m_floats[2];
1190                 }
1191 */
1192         /**@brief Set the values
1193    * @param x Value of x
1194    * @param y Value of y
1195    * @param z Value of z
1196    * @param w Value of w
1197    */
1198         SIMD_FORCE_INLINE void setValue(const btScalar& _x, const btScalar& _y, const btScalar& _z, const btScalar& _w)
1199         {
1200                 m_floats[0] = _x;
1201                 m_floats[1] = _y;
1202                 m_floats[2] = _z;
1203                 m_floats[3] = _w;
1204         }
1205 };
1206
1207 ///btSwapVector3Endian swaps vector endianness, useful for network and cross-platform serialization
1208 SIMD_FORCE_INLINE void btSwapScalarEndian(const btScalar& sourceVal, btScalar& destVal)
1209 {
1210 #ifdef BT_USE_DOUBLE_PRECISION
1211         unsigned char* dest = (unsigned char*)&destVal;
1212         const unsigned char* src = (const unsigned char*)&sourceVal;
1213         dest[0] = src[7];
1214         dest[1] = src[6];
1215         dest[2] = src[5];
1216         dest[3] = src[4];
1217         dest[4] = src[3];
1218         dest[5] = src[2];
1219         dest[6] = src[1];
1220         dest[7] = src[0];
1221 #else
1222         unsigned char* dest = (unsigned char*)&destVal;
1223         const unsigned char* src = (const unsigned char*)&sourceVal;
1224         dest[0] = src[3];
1225         dest[1] = src[2];
1226         dest[2] = src[1];
1227         dest[3] = src[0];
1228 #endif  //BT_USE_DOUBLE_PRECISION
1229 }
1230 ///btSwapVector3Endian swaps vector endianness, useful for network and cross-platform serialization
1231 SIMD_FORCE_INLINE void btSwapVector3Endian(const btVector3& sourceVec, btVector3& destVec)
1232 {
1233         for (int i = 0; i < 4; i++)
1234         {
1235                 btSwapScalarEndian(sourceVec[i], destVec[i]);
1236         }
1237 }
1238
1239 ///btUnSwapVector3Endian swaps vector endianness, useful for network and cross-platform serialization
1240 SIMD_FORCE_INLINE void btUnSwapVector3Endian(btVector3& vector)
1241 {
1242         btVector3 swappedVec;
1243         for (int i = 0; i < 4; i++)
1244         {
1245                 btSwapScalarEndian(vector[i], swappedVec[i]);
1246         }
1247         vector = swappedVec;
1248 }
1249
1250 template <class T>
1251 SIMD_FORCE_INLINE void btPlaneSpace1(const T& n, T& p, T& q)
1252 {
1253         if (btFabs(n[2]) > SIMDSQRT12)
1254         {
1255                 // choose p in y-z plane
1256                 btScalar a = n[1] * n[1] + n[2] * n[2];
1257                 btScalar k = btRecipSqrt(a);
1258                 p[0] = 0;
1259                 p[1] = -n[2] * k;
1260                 p[2] = n[1] * k;
1261                 // set q = n x p
1262                 q[0] = a * k;
1263                 q[1] = -n[0] * p[2];
1264                 q[2] = n[0] * p[1];
1265         }
1266         else
1267         {
1268                 // choose p in x-y plane
1269                 btScalar a = n[0] * n[0] + n[1] * n[1];
1270                 btScalar k = btRecipSqrt(a);
1271                 p[0] = -n[1] * k;
1272                 p[1] = n[0] * k;
1273                 p[2] = 0;
1274                 // set q = n x p
1275                 q[0] = -n[2] * p[1];
1276                 q[1] = n[2] * p[0];
1277                 q[2] = a * k;
1278         }
1279 }
1280
/**@brief Plain single-precision storage used when serializing a btVector3 */
struct btVector3FloatData
{
        float m_floats[4];  // four floats, filled index by index by the serialize routines
};
1285
/**@brief Plain double-precision storage used when serializing a btVector3 */
struct btVector3DoubleData
{
        double m_floats[4];  // four doubles, filled index by index by the serialize routines
};
1290
1291 SIMD_FORCE_INLINE void btVector3::serializeFloat(struct btVector3FloatData& dataOut) const
1292 {
1293         ///could also do a memcpy, check if it is worth it
1294         for (int i = 0; i < 4; i++)
1295                 dataOut.m_floats[i] = float(m_floats[i]);
1296 }
1297
1298 SIMD_FORCE_INLINE void btVector3::deSerializeFloat(const struct btVector3FloatData& dataIn)
1299 {
1300         for (int i = 0; i < 4; i++)
1301                 m_floats[i] = btScalar(dataIn.m_floats[i]);
1302 }
1303
1304 SIMD_FORCE_INLINE void btVector3::serializeDouble(struct btVector3DoubleData& dataOut) const
1305 {
1306         ///could also do a memcpy, check if it is worth it
1307         for (int i = 0; i < 4; i++)
1308                 dataOut.m_floats[i] = double(m_floats[i]);
1309 }
1310
1311 SIMD_FORCE_INLINE void btVector3::deSerializeDouble(const struct btVector3DoubleData& dataIn)
1312 {
1313         for (int i = 0; i < 4; i++)
1314                 m_floats[i] = btScalar(dataIn.m_floats[i]);
1315 }
1316
1317 SIMD_FORCE_INLINE void btVector3::serialize(struct btVector3Data& dataOut) const
1318 {
1319         ///could also do a memcpy, check if it is worth it
1320         for (int i = 0; i < 4; i++)
1321                 dataOut.m_floats[i] = m_floats[i];
1322 }
1323
1324 SIMD_FORCE_INLINE void btVector3::deSerialize(const struct btVector3FloatData& dataIn)
1325 {
1326         for (int i = 0; i < 4; i++)
1327                 m_floats[i] = (btScalar)dataIn.m_floats[i];
1328 }
1329
1330 SIMD_FORCE_INLINE void btVector3::deSerialize(const struct btVector3DoubleData& dataIn)
1331 {
1332         for (int i = 0; i < 4; i++)
1333                 m_floats[i] = (btScalar)dataIn.m_floats[i];
1334 }
1335
1336 #endif  //BT_VECTOR3_H