src/LinearMath/btMatrix3x3.h

   1 /*
   2 Copyright (c) 2003-2006 Gino van den Bergen / Erwin Coumans  http://continuousphysics.com/Bullet/
   3
   4 This software is provided 'as-is', without any express or implied warranty.
   5 In no event will the authors be held liable for any damages arising from the use of this software.
   6 Permission is granted to anyone to use this software for any purpose,
   7 including commercial applications, and to alter it and redistribute it freely,
   8 subject to the following restrictions:
   9
  10 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
  11 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
  12 3. This notice may not be removed or altered from any source distribution.
  13 */
  14
  15
  16 #ifndef BT_MATRIX3x3_H
  17 #define BT_MATRIX3x3_H
  18
  19 #include "btVector3.h"
  20 #include "btQuaternion.h"
  21 #include <stdio.h>
  22
  23 #ifdef BT_USE_SSE
  24 //const __m128 ATTRIBUTE_ALIGNED16(v2220) = {2.0f, 2.0f, 2.0f, 0.0f};
  25 const __m128 ATTRIBUTE_ALIGNED16(vMPPP) = {-0.0f, +0.0f, +0.0f, +0.0f};
  26 #endif
  27
  28 #if defined(BT_USE_SSE) || defined(BT_USE_NEON)
  29 const btSimdFloat4 ATTRIBUTE_ALIGNED16(v1000) = {1.0f, 0.0f, 0.0f, 0.0f};
  30 const btSimdFloat4 ATTRIBUTE_ALIGNED16(v0100) = {0.0f, 1.0f, 0.0f, 0.0f};
  31 const btSimdFloat4 ATTRIBUTE_ALIGNED16(v0010) = {0.0f, 0.0f, 1.0f, 0.0f};
  32 #endif
  33
  34 #ifdef BT_USE_DOUBLE_PRECISION
  35 #define btMatrix3x3Data btMatrix3x3DoubleData
  36 #else
  37 #define btMatrix3x3Data btMatrix3x3FloatData
  38 #endif //BT_USE_DOUBLE_PRECISION
  39
  40
  41 /**@brief The btMatrix3x3 class implements a 3x3 rotation matrix, to perform linear algebra in combination with btQuaternion, btTransform and btVector3.
  42 * Make sure to only include a pure orthogonal matrix without scaling. */
  43 ATTRIBUTE_ALIGNED16(class) btMatrix3x3 {
  44
  45         ///Data storage for the matrix, each vector is a row of the matrix
  46         btVector3 m_el[3];
  47
  48 public:
  49         /** @brief No initializaion constructor */
  50         btMatrix3x3 () {}
  51
  52         //              explicit btMatrix3x3(const btScalar *m) { setFromOpenGLSubMatrix(m); }
  53
  54         /**@brief Constructor from Quaternion */
  55         explicit btMatrix3x3(const btQuaternion& q) { setRotation(q); }
  56         /*
  57         template <typename btScalar>
  58         Matrix3x3(const btScalar& yaw, const btScalar& pitch, const btScalar& roll)
  59         {
  60         setEulerYPR(yaw, pitch, roll);
  61         }
  62         */
  63         /** @brief Constructor with row major formatting */
  64         btMatrix3x3(const btScalar& xx, const btScalar& xy, const btScalar& xz,
  65                 const btScalar& yx, const btScalar& yy, const btScalar& yz,
  66                 const btScalar& zx, const btScalar& zy, const btScalar& zz)
  67         {
  68                 setValue(xx, xy, xz,
  69                         yx, yy, yz,
  70                         zx, zy, zz);
  71         }
  72
  73 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
  74         SIMD_FORCE_INLINE btMatrix3x3 (const btSimdFloat4 v0, const btSimdFloat4 v1, const btSimdFloat4 v2 )
  75         {
  76         m_el[0].mVec128 = v0;
  77         m_el[1].mVec128 = v1;
  78         m_el[2].mVec128 = v2;
  79         }
  80
  81         SIMD_FORCE_INLINE btMatrix3x3 (const btVector3& v0, const btVector3& v1, const btVector3& v2 )
  82         {
  83         m_el[0] = v0;
  84         m_el[1] = v1;
  85         m_el[2] = v2;
  86         }
  87
  88         // Copy constructor
  89         SIMD_FORCE_INLINE btMatrix3x3(const btMatrix3x3& rhs)
  90         {
  91                 m_el[0].mVec128 = rhs.m_el[0].mVec128;
  92                 m_el[1].mVec128 = rhs.m_el[1].mVec128;
  93                 m_el[2].mVec128 = rhs.m_el[2].mVec128;
  94         }
  95
  96         // Assignment Operator
  97         SIMD_FORCE_INLINE btMatrix3x3& operator=(const btMatrix3x3& m)
  98         {
  99                 m_el[0].mVec128 = m.m_el[0].mVec128;
 100                 m_el[1].mVec128 = m.m_el[1].mVec128;
 101                 m_el[2].mVec128 = m.m_el[2].mVec128;
 102
 103                 return *this;
 104         }
 105
 106 #else
 107
 108         /** @brief Copy constructor */
 109         SIMD_FORCE_INLINE btMatrix3x3 (const btMatrix3x3& other)
 110         {
 111                 m_el[0] = other.m_el[0];
 112                 m_el[1] = other.m_el[1];
 113                 m_el[2] = other.m_el[2];
 114         }
 115
 116         /** @brief Assignment Operator */
 117         SIMD_FORCE_INLINE btMatrix3x3& operator=(const btMatrix3x3& other)
 118         {
 119                 m_el[0] = other.m_el[0];
 120                 m_el[1] = other.m_el[1];
 121                 m_el[2] = other.m_el[2];
 122                 return *this;
 123         }
 124
 125 #endif
 126
 127         /** @brief Get a column of the matrix as a vector
 128         *  @param i Column number 0 indexed */
 129         SIMD_FORCE_INLINE btVector3 getColumn(int i) const
 130         {
 131                 return btVector3(m_el[0][i],m_el[1][i],m_el[2][i]);
 132         }
 133
 134
 135         /** @brief Get a row of the matrix as a vector
 136         *  @param i Row number 0 indexed */
 137         SIMD_FORCE_INLINE const btVector3& getRow(int i) const
 138         {
 139                 btFullAssert(0 <= i && i < 3);
 140                 return m_el[i];
 141         }
 142
 143         /** @brief Get a mutable reference to a row of the matrix as a vector
 144         *  @param i Row number 0 indexed */
 145         SIMD_FORCE_INLINE btVector3&  operator[](int i)
 146         {
 147                 btFullAssert(0 <= i && i < 3);
 148                 return m_el[i];
 149         }
 150
 151         /** @brief Get a const reference to a row of the matrix as a vector
 152         *  @param i Row number 0 indexed */
 153         SIMD_FORCE_INLINE const btVector3& operator[](int i) const
 154         {
 155                 btFullAssert(0 <= i && i < 3);
 156                 return m_el[i];
 157         }
 158
 159         /** @brief Multiply by the target matrix on the right
 160         *  @param m Rotation matrix to be applied
 161         * Equivilant to this = this * m */
 162         btMatrix3x3& operator*=(const btMatrix3x3& m);
 163
 164         /** @brief Adds by the target matrix on the right
 165         *  @param m matrix to be applied
 166         * Equivilant to this = this + m */
 167         btMatrix3x3& operator+=(const btMatrix3x3& m);
 168
 169         /** @brief Substractss by the target matrix on the right
 170         *  @param m matrix to be applied
 171         * Equivilant to this = this - m */
 172         btMatrix3x3& operator-=(const btMatrix3x3& m);
 173
 174         /** @brief Set from the rotational part of a 4x4 OpenGL matrix
 175         *  @param m A pointer to the beginning of the array of scalars*/
 176         void setFromOpenGLSubMatrix(const btScalar *m)
 177         {
 178                 m_el[0].setValue(m[0],m[4],m[8]);
 179                 m_el[1].setValue(m[1],m[5],m[9]);
 180                 m_el[2].setValue(m[2],m[6],m[10]);
 181
 182         }
 183         /** @brief Set the values of the matrix explicitly (row major)
 184         *  @param xx Top left
 185         *  @param xy Top Middle
 186         *  @param xz Top Right
 187         *  @param yx Middle Left
 188         *  @param yy Middle Middle
 189         *  @param yz Middle Right
 190         *  @param zx Bottom Left
 191         *  @param zy Bottom Middle
 192         *  @param zz Bottom Right*/
 193         void setValue(const btScalar& xx, const btScalar& xy, const btScalar& xz,
 194                 const btScalar& yx, const btScalar& yy, const btScalar& yz,
 195                 const btScalar& zx, const btScalar& zy, const btScalar& zz)
 196         {
 197                 m_el[0].setValue(xx,xy,xz);
 198                 m_el[1].setValue(yx,yy,yz);
 199                 m_el[2].setValue(zx,zy,zz);
 200         }
 201
 202         /** @brief Set the matrix from a quaternion
 203         *  @param q The Quaternion to match */
 204         void setRotation(const btQuaternion& q)
 205         {
 206                 btScalar d = q.length2();
 207                 btFullAssert(d != btScalar(0.0));
 208                 btScalar s = btScalar(2.0) / d;
 209
 210     #if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
 211         __m128  vs, Q = q.get128();
 212                 __m128i Qi = btCastfTo128i(Q);
 213         __m128  Y, Z;
 214         __m128  V1, V2, V3;
 215         __m128  V11, V21, V31;
 216         __m128  NQ = _mm_xor_ps(Q, btvMzeroMask);
 217                 __m128i NQi = btCastfTo128i(NQ);
 218
 219         V1 = btCastiTo128f(_mm_shuffle_epi32 (Qi, BT_SHUFFLE(1,0,2,3)));        // Y X Z W
 220                 V2 = _mm_shuffle_ps(NQ, Q, BT_SHUFFLE(0,0,1,3));     // -X -X  Y  W
 221         V3 = btCastiTo128f(_mm_shuffle_epi32 (Qi, BT_SHUFFLE(2,1,0,3)));        // Z Y X W
 222         V1 = _mm_xor_ps(V1, vMPPP);     //      change the sign of the first element
 223
 224         V11     = btCastiTo128f(_mm_shuffle_epi32 (Qi, BT_SHUFFLE(1,1,0,3)));   // Y Y X W
 225                 V21 = _mm_unpackhi_ps(Q, Q);                    //  Z  Z  W  W
 226                 V31 = _mm_shuffle_ps(Q, NQ, BT_SHUFFLE(0,2,0,3));       //  X  Z -X -W
 227
 228                 V2 = V2 * V1;   //
 229                 V1 = V1 * V11;  //
 230                 V3 = V3 * V31;  //
 231
 232         V11 = _mm_shuffle_ps(NQ, Q, BT_SHUFFLE(2,3,1,3));       //      -Z -W  Y  W
 233                 V11 = V11 * V21;        //
 234         V21 = _mm_xor_ps(V21, vMPPP);   //      change the sign of the first element
 235                 V31 = _mm_shuffle_ps(Q, NQ, BT_SHUFFLE(3,3,1,3));       //       W  W -Y -W
 236         V31 = _mm_xor_ps(V31, vMPPP);   //      change the sign of the first element
 237                 Y = btCastiTo128f(_mm_shuffle_epi32 (NQi, BT_SHUFFLE(3,2,0,3)));        // -W -Z -X -W
 238                 Z = btCastiTo128f(_mm_shuffle_epi32 (Qi, BT_SHUFFLE(1,0,1,3))); //  Y  X  Y  W
 239
 240                 vs = _mm_load_ss(&s);
 241                 V21 = V21 * Y;
 242                 V31 = V31 * Z;
 243
 244                 V1 = V1 + V11;
 245         V2 = V2 + V21;
 246         V3 = V3 + V31;
 247
 248         vs = bt_splat3_ps(vs, 0);
 249             //  s ready
 250         V1 = V1 * vs;
 251         V2 = V2 * vs;
 252         V3 = V3 * vs;
 253
 254         V1 = V1 + v1000;
 255         V2 = V2 + v0100;
 256         V3 = V3 + v0010;
 257
 258         m_el[0] = V1;
 259         m_el[1] = V2;
 260         m_el[2] = V3;
 261     #else
 262                 btScalar xs = q.x() * s,   ys = q.y() * s,   zs = q.z() * s;
 263                 btScalar wx = q.w() * xs,  wy = q.w() * ys,  wz = q.w() * zs;
 264                 btScalar xx = q.x() * xs,  xy = q.x() * ys,  xz = q.x() * zs;
 265                 btScalar yy = q.y() * ys,  yz = q.y() * zs,  zz = q.z() * zs;
 266                 setValue(
 267             btScalar(1.0) - (yy + zz), xy - wz, xz + wy,
 268                         xy + wz, btScalar(1.0) - (xx + zz), yz - wx,
 269                         xz - wy, yz + wx, btScalar(1.0) - (xx + yy));
 270         #endif
 271     }
 272
 273
 274         /** @brief Set the matrix from euler angles using YPR around YXZ respectively
 275         *  @param yaw Yaw about Y axis
 276         *  @param pitch Pitch about X axis
 277         *  @param roll Roll about Z axis
 278         */
 279         void setEulerYPR(const btScalar& yaw, const btScalar& pitch, const btScalar& roll)
 280         {
 281                 setEulerZYX(roll, pitch, yaw);
 282         }
 283
 284         /** @brief Set the matrix from euler angles YPR around ZYX axes
 285         * @param eulerX Roll about X axis
 286         * @param eulerY Pitch around Y axis
 287         * @param eulerZ Yaw aboud Z axis
 288         *
 289         * These angles are used to produce a rotation matrix. The euler
 290         * angles are applied in ZYX order. I.e a vector is first rotated
 291         * about X then Y and then Z
 292         **/
 293         void setEulerZYX(btScalar eulerX,btScalar eulerY,btScalar eulerZ) {
 294                 ///@todo proposed to reverse this since it's labeled zyx but takes arguments xyz and it will match all other parts of the code
 295                 btScalar ci ( btCos(eulerX));
 296                 btScalar cj ( btCos(eulerY));
 297                 btScalar ch ( btCos(eulerZ));
 298                 btScalar si ( btSin(eulerX));
 299                 btScalar sj ( btSin(eulerY));
 300                 btScalar sh ( btSin(eulerZ));
 301                 btScalar cc = ci * ch;
 302                 btScalar cs = ci * sh;
 303                 btScalar sc = si * ch;
 304                 btScalar ss = si * sh;
 305
 306                 setValue(cj * ch, sj * sc - cs, sj * cc + ss,
 307                         cj * sh, sj * ss + cc, sj * cs - sc,
 308                         -sj,      cj * si,      cj * ci);
 309         }
 310
 311         /**@brief Set the matrix to the identity */
 312         void setIdentity()
 313         {
 314 #if (defined(BT_USE_SSE_IN_API)&& defined (BT_USE_SSE)) || defined(BT_USE_NEON)
 315                         m_el[0] = v1000;
 316                         m_el[1] = v0100;
 317                         m_el[2] = v0010;
 318 #else
 319                 setValue(btScalar(1.0), btScalar(0.0), btScalar(0.0),
 320                         btScalar(0.0), btScalar(1.0), btScalar(0.0),
 321                         btScalar(0.0), btScalar(0.0), btScalar(1.0));
 322 #endif
 323         }
 324
 325         static const btMatrix3x3&       getIdentity()
 326         {
 327 #if (defined(BT_USE_SSE_IN_API)&& defined (BT_USE_SSE)) || defined(BT_USE_NEON)
 328         static const btMatrix3x3
 329         identityMatrix(v1000, v0100, v0010);
 330 #else
 331                 static const btMatrix3x3
 332         identityMatrix(
 333             btScalar(1.0), btScalar(0.0), btScalar(0.0),
 334                         btScalar(0.0), btScalar(1.0), btScalar(0.0),
 335                         btScalar(0.0), btScalar(0.0), btScalar(1.0));
 336 #endif
 337                 return identityMatrix;
 338         }
 339
 340         /**@brief Fill the rotational part of an OpenGL matrix and clear the shear/perspective
 341         * @param m The array to be filled */
 342         void getOpenGLSubMatrix(btScalar *m) const
 343         {
 344 #if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
 345         __m128 v0 = m_el[0].mVec128;
 346         __m128 v1 = m_el[1].mVec128;
 347         __m128 v2 = m_el[2].mVec128;    //  x2 y2 z2 w2
 348         __m128 *vm = (__m128 *)m;
 349         __m128 vT;
 350
 351         v2 = _mm_and_ps(v2, btvFFF0fMask);  //  x2 y2 z2 0
 352
 353         vT = _mm_unpackhi_ps(v0, v1);   //      z0 z1 * *
 354         v0 = _mm_unpacklo_ps(v0, v1);   //      x0 x1 y0 y1
 355
 356         v1 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(2, 3, 1, 3) );   // y0 y1 y2 0
 357         v0 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(0, 1, 0, 3) );   // x0 x1 x2 0
 358         v2 = btCastdTo128f(_mm_move_sd(btCastfTo128d(v2), btCastfTo128d(vT)));  // z0 z1 z2 0
 359
 360         vm[0] = v0;
 361         vm[1] = v1;
 362         vm[2] = v2;
 363 #elif defined(BT_USE_NEON)
 364         // note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions.
 365         static const uint32x2_t zMask = (const uint32x2_t) {-1, 0 };
 366         float32x4_t *vm = (float32x4_t *)m;
 367         float32x4x2_t top = vtrnq_f32( m_el[0].mVec128, m_el[1].mVec128 );  // {x0 x1 z0 z1}, {y0 y1 w0 w1}
 368         float32x2x2_t bl = vtrn_f32( vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f) );       // {x2  0 }, {y2 0}
 369         float32x4_t v0 = vcombine_f32( vget_low_f32(top.val[0]), bl.val[0] );
 370         float32x4_t v1 = vcombine_f32( vget_low_f32(top.val[1]), bl.val[1] );
 371         float32x2_t q = (float32x2_t) vand_u32( (uint32x2_t) vget_high_f32( m_el[2].mVec128), zMask );
 372         float32x4_t v2 = vcombine_f32( vget_high_f32(top.val[0]), q );       // z0 z1 z2  0
 373
 374         vm[0] = v0;
 375         vm[1] = v1;
 376         vm[2] = v2;
 377 #else
 378                 m[0]  = btScalar(m_el[0].x());
 379                 m[1]  = btScalar(m_el[1].x());
 380                 m[2]  = btScalar(m_el[2].x());
 381                 m[3]  = btScalar(0.0);
 382                 m[4]  = btScalar(m_el[0].y());
 383                 m[5]  = btScalar(m_el[1].y());
 384                 m[6]  = btScalar(m_el[2].y());
 385                 m[7]  = btScalar(0.0);
 386                 m[8]  = btScalar(m_el[0].z());
 387                 m[9]  = btScalar(m_el[1].z());
 388                 m[10] = btScalar(m_el[2].z());
 389                 m[11] = btScalar(0.0);
 390 #endif
 391         }
 392
 393         /**@brief Get the matrix represented as a quaternion
 394         * @param q The quaternion which will be set */
 395         void getRotation(btQuaternion& q) const
 396         {
 397 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
 398         btScalar trace = m_el[0].x() + m_el[1].y() + m_el[2].z();
 399         btScalar s, x;
 400
 401         union {
 402             btSimdFloat4 vec;
 403             btScalar f[4];
 404         } temp;
 405
 406         if (trace > btScalar(0.0))
 407         {
 408             x = trace + btScalar(1.0);
 409
 410             temp.f[0]=m_el[2].y() - m_el[1].z();
 411             temp.f[1]=m_el[0].z() - m_el[2].x();
 412             temp.f[2]=m_el[1].x() - m_el[0].y();
 413             temp.f[3]=x;
 414             //temp.f[3]= s * btScalar(0.5);
 415         }
 416         else
 417         {
 418             int i, j, k;
 419             if(m_el[0].x() < m_el[1].y())
 420             {
 421                 if( m_el[1].y() < m_el[2].z() )
 422                     { i = 2; j = 0; k = 1; }
 423                 else
 424                     { i = 1; j = 2; k = 0; }
 425             }
 426             else
 427             {
 428                 if( m_el[0].x() < m_el[2].z())
 429                     { i = 2; j = 0; k = 1; }
 430                 else
 431                     { i = 0; j = 1; k = 2; }
 432             }
 433
 434             x = m_el[i][i] - m_el[j][j] - m_el[k][k] + btScalar(1.0);
 435
 436             temp.f[3] = (m_el[k][j] - m_el[j][k]);
 437             temp.f[j] = (m_el[j][i] + m_el[i][j]);
 438             temp.f[k] = (m_el[k][i] + m_el[i][k]);
 439             temp.f[i] = x;
 440             //temp.f[i] = s * btScalar(0.5);
 441         }
 442
 443         s = btSqrt(x);
 444         q.set128(temp.vec);
 445         s = btScalar(0.5) / s;
 446
 447         q *= s;
 448 #else
 449                 btScalar trace = m_el[0].x() + m_el[1].y() + m_el[2].z();
 450
 451                 btScalar temp[4];
 452
 453                 if (trace > btScalar(0.0))
 454                 {
 455                         btScalar s = btSqrt(trace + btScalar(1.0));
 456                         temp[3]=(s * btScalar(0.5));
 457                         s = btScalar(0.5) / s;
 458
 459                         temp[0]=((m_el[2].y() - m_el[1].z()) * s);
 460                         temp[1]=((m_el[0].z() - m_el[2].x()) * s);
 461                         temp[2]=((m_el[1].x() - m_el[0].y()) * s);
 462                 }
 463                 else
 464                 {
 465                         int i = m_el[0].x() < m_el[1].y() ?
 466                                 (m_el[1].y() < m_el[2].z() ? 2 : 1) :
 467                                 (m_el[0].x() < m_el[2].z() ? 2 : 0);
 468                         int j = (i + 1) % 3;
 469                         int k = (i + 2) % 3;
 470
 471                         btScalar s = btSqrt(m_el[i][i] - m_el[j][j] - m_el[k][k] + btScalar(1.0));
 472                         temp[i] = s * btScalar(0.5);
 473                         s = btScalar(0.5) / s;
 474
 475                         temp[3] = (m_el[k][j] - m_el[j][k]) * s;
 476                         temp[j] = (m_el[j][i] + m_el[i][j]) * s;
 477                         temp[k] = (m_el[k][i] + m_el[i][k]) * s;
 478                 }
 479                 q.setValue(temp[0],temp[1],temp[2],temp[3]);
 480 #endif
 481         }
 482
 483         /**@brief Get the matrix represented as euler angles around YXZ, roundtrip with setEulerYPR
 484         * @param yaw Yaw around Y axis
 485         * @param pitch Pitch around X axis
 486         * @param roll around Z axis */
 487         void getEulerYPR(btScalar& yaw, btScalar& pitch, btScalar& roll) const
 488         {
 489
 490                 // first use the normal calculus
 491                 yaw = btScalar(btAtan2(m_el[1].x(), m_el[0].x()));
 492                 pitch = btScalar(btAsin(-m_el[2].x()));
 493                 roll = btScalar(btAtan2(m_el[2].y(), m_el[2].z()));
 494
 495                 // on pitch = +/-HalfPI
 496                 if (btFabs(pitch)==SIMD_HALF_PI)
 497                 {
 498                         if (yaw>0)
 499                                 yaw-=SIMD_PI;
 500                         else
 501                                 yaw+=SIMD_PI;
 502
 503                         if (roll>0)
 504                                 roll-=SIMD_PI;
 505                         else
 506                                 roll+=SIMD_PI;
 507                 }
 508         };
 509
 510
 511         /**@brief Get the matrix represented as euler angles around ZYX
 512         * @param yaw Yaw around X axis
 513         * @param pitch Pitch around Y axis
 514         * @param roll around X axis
 515         * @param solution_number Which solution of two possible solutions ( 1 or 2) are possible values*/
 516         void getEulerZYX(btScalar& yaw, btScalar& pitch, btScalar& roll, unsigned int solution_number = 1) const
 517         {
 518                 struct Euler
 519                 {
 520                         btScalar yaw;
 521                         btScalar pitch;
 522                         btScalar roll;
 523                 };
 524
 525                 Euler euler_out;
 526                 Euler euler_out2; //second solution
 527                 //get the pointer to the raw data
 528
 529                 // Check that pitch is not at a singularity
 530                 if (btFabs(m_el[2].x()) >= 1)
 531                 {
 532                         euler_out.yaw = 0;
 533                         euler_out2.yaw = 0;
 534
 535                         // From difference of angles formula
 536                         btScalar delta = btAtan2(m_el[0].x(),m_el[0].z());
 537                         if (m_el[2].x() > 0)  //gimbal locked up
 538                         {
 539                                 euler_out.pitch = SIMD_PI / btScalar(2.0);
 540                                 euler_out2.pitch = SIMD_PI / btScalar(2.0);
 541                                 euler_out.roll = euler_out.pitch + delta;
 542                                 euler_out2.roll = euler_out.pitch + delta;
 543                         }
 544                         else // gimbal locked down
 545                         {
 546                                 euler_out.pitch = -SIMD_PI / btScalar(2.0);
 547                                 euler_out2.pitch = -SIMD_PI / btScalar(2.0);
 548                                 euler_out.roll = -euler_out.pitch + delta;
 549                                 euler_out2.roll = -euler_out.pitch + delta;
 550                         }
 551                 }
 552                 else
 553                 {
 554                         euler_out.pitch = - btAsin(m_el[2].x());
 555                         euler_out2.pitch = SIMD_PI - euler_out.pitch;
 556
 557                         euler_out.roll = btAtan2(m_el[2].y()/btCos(euler_out.pitch),
 558                                 m_el[2].z()/btCos(euler_out.pitch));
 559                         euler_out2.roll = btAtan2(m_el[2].y()/btCos(euler_out2.pitch),
 560                                 m_el[2].z()/btCos(euler_out2.pitch));
 561
 562                         euler_out.yaw = btAtan2(m_el[1].x()/btCos(euler_out.pitch),
 563                                 m_el[0].x()/btCos(euler_out.pitch));
 564                         euler_out2.yaw = btAtan2(m_el[1].x()/btCos(euler_out2.pitch),
 565                                 m_el[0].x()/btCos(euler_out2.pitch));
 566                 }
 567
 568                 if (solution_number == 1)
 569                 {
 570                         yaw = euler_out.yaw;
 571                         pitch = euler_out.pitch;
 572                         roll = euler_out.roll;
 573                 }
 574                 else
 575                 {
 576                         yaw = euler_out2.yaw;
 577                         pitch = euler_out2.pitch;
 578                         roll = euler_out2.roll;
 579                 }
 580         }
 581
 582         /**@brief Create a scaled copy of the matrix
 583         * @param s Scaling vector The elements of the vector will scale each column */
 584
 585         btMatrix3x3 scaled(const btVector3& s) const
 586         {
 587 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
 588                 return btMatrix3x3(m_el[0] * s, m_el[1] * s, m_el[2] * s);
 589 #else
 590                 return btMatrix3x3(
 591             m_el[0].x() * s.x(), m_el[0].y() * s.y(), m_el[0].z() * s.z(),
 592                         m_el[1].x() * s.x(), m_el[1].y() * s.y(), m_el[1].z() * s.z(),
 593                         m_el[2].x() * s.x(), m_el[2].y() * s.y(), m_el[2].z() * s.z());
 594 #endif
 595         }
 596
 597         /**@brief Return the determinant of the matrix */
 598         btScalar            determinant() const;
 599         /**@brief Return the adjoint of the matrix */
 600         btMatrix3x3 adjoint() const;
 601         /**@brief Return the matrix with all values non negative */
 602         btMatrix3x3 absolute() const;
 603         /**@brief Return the transpose of the matrix */
 604         btMatrix3x3 transpose() const;
 605         /**@brief Return the inverse of the matrix */
 606         btMatrix3x3 inverse() const;
 607
 608         btMatrix3x3 transposeTimes(const btMatrix3x3& m) const;
 609         btMatrix3x3 timesTranspose(const btMatrix3x3& m) const;
 610
 611         SIMD_FORCE_INLINE btScalar tdotx(const btVector3& v) const
 612         {
 613                 return m_el[0].x() * v.x() + m_el[1].x() * v.y() + m_el[2].x() * v.z();
 614         }
 615         SIMD_FORCE_INLINE btScalar tdoty(const btVector3& v) const
 616         {
 617                 return m_el[0].y() * v.x() + m_el[1].y() * v.y() + m_el[2].y() * v.z();
 618         }
 619         SIMD_FORCE_INLINE btScalar tdotz(const btVector3& v) const
 620         {
 621                 return m_el[0].z() * v.x() + m_el[1].z() * v.y() + m_el[2].z() * v.z();
 622         }
 623
 624
 625         /**@brief diagonalizes this matrix by the Jacobi method.
 626         * @param rot stores the rotation from the coordinate system in which the matrix is diagonal to the original
 627         * coordinate system, i.e., old_this = rot * new_this * rot^T.
 628         * @param threshold See iteration
 629         * @param iteration The iteration stops when all off-diagonal elements are less than the threshold multiplied
 630         * by the sum of the absolute values of the diagonal, or when maxSteps have been executed.
 631         *
 632         * Note that this matrix is assumed to be symmetric.
 633         */
 634         void diagonalize(btMatrix3x3& rot, btScalar threshold, int maxSteps)
 635         {
 636                 rot.setIdentity();
 637                 for (int step = maxSteps; step > 0; step--)
 638                 {
 639                         // find off-diagonal element [p][q] with largest magnitude
 640                         int p = 0;
 641                         int q = 1;
 642                         int r = 2;
 643                         btScalar max = btFabs(m_el[0][1]);
 644                         btScalar v = btFabs(m_el[0][2]);
 645                         if (v > max)
 646                         {
 647                                 q = 2;
 648                                 r = 1;
 649                                 max = v;
 650                         }
 651                         v = btFabs(m_el[1][2]);
 652                         if (v > max)
 653                         {
 654                                 p = 1;
 655                                 q = 2;
 656                                 r = 0;
 657                                 max = v;
 658                         }
 659
 660                         btScalar t = threshold * (btFabs(m_el[0][0]) + btFabs(m_el[1][1]) + btFabs(m_el[2][2]));
 661                         if (max <= t)
 662                         {
 663                                 if (max <= SIMD_EPSILON * t)
 664                                 {
 665                                         return;
 666                                 }
 667                                 step = 1;
 668                         }
 669
 670                         // compute Jacobi rotation J which leads to a zero for element [p][q]
 671                         btScalar mpq = m_el[p][q];
 672                         btScalar theta = (m_el[q][q] - m_el[p][p]) / (2 * mpq);
 673                         btScalar theta2 = theta * theta;
 674                         btScalar cos;
 675                         btScalar sin;
 676                         if (theta2 * theta2 < btScalar(10 / SIMD_EPSILON))
 677                         {
 678                                 t = (theta >= 0) ? 1 / (theta + btSqrt(1 + theta2))
 679                                         : 1 / (theta - btSqrt(1 + theta2));
 680                                 cos = 1 / btSqrt(1 + t * t);
 681                                 sin = cos * t;
 682                         }
 683                         else
 684                         {
 685                                 // approximation for large theta-value, i.e., a nearly diagonal matrix
 686                                 t = 1 / (theta * (2 + btScalar(0.5) / theta2));
 687                                 cos = 1 - btScalar(0.5) * t * t;
 688                                 sin = cos * t;
 689                         }
 690
 691                         // apply rotation to matrix (this = J^T * this * J)
 692                         m_el[p][q] = m_el[q][p] = 0;
 693                         m_el[p][p] -= t * mpq;
 694                         m_el[q][q] += t * mpq;
 695                         btScalar mrp = m_el[r][p];
 696                         btScalar mrq = m_el[r][q];
 697                         m_el[r][p] = m_el[p][r] = cos * mrp - sin * mrq;
 698                         m_el[r][q] = m_el[q][r] = cos * mrq + sin * mrp;
 699
 700                         // apply rotation to rot (rot = rot * J)
 701                         for (int i = 0; i < 3; i++)
 702                         {
 703                                 btVector3& row = rot[i];
 704                                 mrp = row[p];
 705                                 mrq = row[q];
 706                                 row[p] = cos * mrp - sin * mrq;
 707                                 row[q] = cos * mrq + sin * mrp;
 708                         }
 709                 }
 710         }
 711
 712
 713
 714
 715         /**@brief Calculate the matrix cofactor
 716         * @param r1 The first row to use for calculating the cofactor
 717         * @param c1 The first column to use for calculating the cofactor
 718         * @param r1 The second row to use for calculating the cofactor
 719         * @param c1 The second column to use for calculating the cofactor
 720         * See http://en.wikipedia.org/wiki/Cofactor_(linear_algebra) for more details
 721         */
 722         btScalar cofac(int r1, int c1, int r2, int c2) const
 723         {
 724                 return m_el[r1][c1] * m_el[r2][c2] - m_el[r1][c2] * m_el[r2][c1];
 725         }
 726
 727         void    serialize(struct        btMatrix3x3Data& dataOut) const;
 728
 729         void    serializeFloat(struct   btMatrix3x3FloatData& dataOut) const;
 730
 731         void    deSerialize(const struct        btMatrix3x3Data& dataIn);
 732
 733         void    deSerializeFloat(const struct   btMatrix3x3FloatData& dataIn);
 734
 735         void    deSerializeDouble(const struct  btMatrix3x3DoubleData& dataIn);
 736
 737 };
 738
 739
 740 SIMD_FORCE_INLINE btMatrix3x3&
 741 btMatrix3x3::operator*=(const btMatrix3x3& m)
 742 {
 743 #if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
 744     __m128 rv00, rv01, rv02;
 745     __m128 rv10, rv11, rv12;
 746     __m128 rv20, rv21, rv22;
 747     __m128 mv0, mv1, mv2;
 748
 749     rv02 = m_el[0].mVec128;
 750     rv12 = m_el[1].mVec128;
 751     rv22 = m_el[2].mVec128;
 752
 753     mv0 = _mm_and_ps(m[0].mVec128, btvFFF0fMask);
 754     mv1 = _mm_and_ps(m[1].mVec128, btvFFF0fMask);
 755     mv2 = _mm_and_ps(m[2].mVec128, btvFFF0fMask);
 756
 757     // rv0
 758     rv00 = bt_splat_ps(rv02, 0);
 759     rv01 = bt_splat_ps(rv02, 1);
 760     rv02 = bt_splat_ps(rv02, 2);
 761
 762     rv00 = _mm_mul_ps(rv00, mv0);
 763     rv01 = _mm_mul_ps(rv01, mv1);
 764     rv02 = _mm_mul_ps(rv02, mv2);
 765
 766     // rv1
 767     rv10 = bt_splat_ps(rv12, 0);
 768     rv11 = bt_splat_ps(rv12, 1);
 769     rv12 = bt_splat_ps(rv12, 2);
 770
 771     rv10 = _mm_mul_ps(rv10, mv0);
 772     rv11 = _mm_mul_ps(rv11, mv1);
 773     rv12 = _mm_mul_ps(rv12, mv2);
 774
 775     // rv2
 776     rv20 = bt_splat_ps(rv22, 0);
 777     rv21 = bt_splat_ps(rv22, 1);
 778     rv22 = bt_splat_ps(rv22, 2);
 779
 780     rv20 = _mm_mul_ps(rv20, mv0);
 781     rv21 = _mm_mul_ps(rv21, mv1);
 782     rv22 = _mm_mul_ps(rv22, mv2);
 783
 784     rv00 = _mm_add_ps(rv00, rv01);
 785     rv10 = _mm_add_ps(rv10, rv11);
 786     rv20 = _mm_add_ps(rv20, rv21);
 787
 788     m_el[0].mVec128 = _mm_add_ps(rv00, rv02);
 789     m_el[1].mVec128 = _mm_add_ps(rv10, rv12);
 790     m_el[2].mVec128 = _mm_add_ps(rv20, rv22);
 791
 792 #elif defined(BT_USE_NEON)
 793
 794     float32x4_t rv0, rv1, rv2;
 795     float32x4_t v0, v1, v2;
 796     float32x4_t mv0, mv1, mv2;
 797
 798     v0 = m_el[0].mVec128;
 799     v1 = m_el[1].mVec128;
 800     v2 = m_el[2].mVec128;
 801
 802     mv0 = (float32x4_t) vandq_s32((int32x4_t)m[0].mVec128, btvFFF0Mask);
 803     mv1 = (float32x4_t) vandq_s32((int32x4_t)m[1].mVec128, btvFFF0Mask);
 804     mv2 = (float32x4_t) vandq_s32((int32x4_t)m[2].mVec128, btvFFF0Mask);
 805
 806     rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
 807     rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
 808     rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);
 809
 810     rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
 811     rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
 812     rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);
 813
 814     rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
 815     rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
 816     rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);
 817
 818     m_el[0].mVec128 = rv0;
 819     m_el[1].mVec128 = rv1;
 820     m_el[2].mVec128 = rv2;
 821 #else
 822         setValue(
 823         m.tdotx(m_el[0]), m.tdoty(m_el[0]), m.tdotz(m_el[0]),
 824                 m.tdotx(m_el[1]), m.tdoty(m_el[1]), m.tdotz(m_el[1]),
 825                 m.tdotx(m_el[2]), m.tdoty(m_el[2]), m.tdotz(m_el[2]));
 826 #endif
 827         return *this;
 828 }
 829
 830 SIMD_FORCE_INLINE btMatrix3x3&
 831 btMatrix3x3::operator+=(const btMatrix3x3& m)
 832 {
 833 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
 834     m_el[0].mVec128 = m_el[0].mVec128 + m.m_el[0].mVec128;
 835     m_el[1].mVec128 = m_el[1].mVec128 + m.m_el[1].mVec128;
 836     m_el[2].mVec128 = m_el[2].mVec128 + m.m_el[2].mVec128;
 837 #else
 838         setValue(
 839                 m_el[0][0]+m.m_el[0][0],
 840                 m_el[0][1]+m.m_el[0][1],
 841                 m_el[0][2]+m.m_el[0][2],
 842                 m_el[1][0]+m.m_el[1][0],
 843                 m_el[1][1]+m.m_el[1][1],
 844                 m_el[1][2]+m.m_el[1][2],
 845                 m_el[2][0]+m.m_el[2][0],
 846                 m_el[2][1]+m.m_el[2][1],
 847                 m_el[2][2]+m.m_el[2][2]);
 848 #endif
 849         return *this;
 850 }
 851
 852 SIMD_FORCE_INLINE btMatrix3x3
 853 operator*(const btMatrix3x3& m, const btScalar & k)
 854 {
 855 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
 856     __m128 vk = bt_splat_ps(_mm_load_ss((float *)&k), 0x80);
 857     return btMatrix3x3(
 858                 _mm_mul_ps(m[0].mVec128, vk),
 859                 _mm_mul_ps(m[1].mVec128, vk),
 860                 _mm_mul_ps(m[2].mVec128, vk));
 861 #elif defined(BT_USE_NEON)
 862     return btMatrix3x3(
 863                 vmulq_n_f32(m[0].mVec128, k),
 864                 vmulq_n_f32(m[1].mVec128, k),
 865                 vmulq_n_f32(m[2].mVec128, k));
 866 #else
 867         return btMatrix3x3(
 868                 m[0].x()*k,m[0].y()*k,m[0].z()*k,
 869                 m[1].x()*k,m[1].y()*k,m[1].z()*k,
 870                 m[2].x()*k,m[2].y()*k,m[2].z()*k);
 871 #endif
 872 }
 873
 874 SIMD_FORCE_INLINE btMatrix3x3
 875 operator+(const btMatrix3x3& m1, const btMatrix3x3& m2)
 876 {
 877 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
 878         return btMatrix3x3(
 879         m1[0].mVec128 + m2[0].mVec128,
 880         m1[1].mVec128 + m2[1].mVec128,
 881         m1[2].mVec128 + m2[2].mVec128);
 882 #else
 883         return btMatrix3x3(
 884         m1[0][0]+m2[0][0],
 885         m1[0][1]+m2[0][1],
 886         m1[0][2]+m2[0][2],
 887
 888         m1[1][0]+m2[1][0],
 889         m1[1][1]+m2[1][1],
 890         m1[1][2]+m2[1][2],
 891
 892         m1[2][0]+m2[2][0],
 893         m1[2][1]+m2[2][1],
 894         m1[2][2]+m2[2][2]);
 895 #endif
 896 }
 897
 898 SIMD_FORCE_INLINE btMatrix3x3
 899 operator-(const btMatrix3x3& m1, const btMatrix3x3& m2)
 900 {
 901 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
 902         return btMatrix3x3(
 903         m1[0].mVec128 - m2[0].mVec128,
 904         m1[1].mVec128 - m2[1].mVec128,
 905         m1[2].mVec128 - m2[2].mVec128);
 906 #else
 907         return btMatrix3x3(
 908         m1[0][0]-m2[0][0],
 909         m1[0][1]-m2[0][1],
 910         m1[0][2]-m2[0][2],
 911
 912         m1[1][0]-m2[1][0],
 913         m1[1][1]-m2[1][1],
 914         m1[1][2]-m2[1][2],
 915
 916         m1[2][0]-m2[2][0],
 917         m1[2][1]-m2[2][1],
 918         m1[2][2]-m2[2][2]);
 919 #endif
 920 }
 921
 922
 923 SIMD_FORCE_INLINE btMatrix3x3&
 924 btMatrix3x3::operator-=(const btMatrix3x3& m)
 925 {
 926 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
 927     m_el[0].mVec128 = m_el[0].mVec128 - m.m_el[0].mVec128;
 928     m_el[1].mVec128 = m_el[1].mVec128 - m.m_el[1].mVec128;
 929     m_el[2].mVec128 = m_el[2].mVec128 - m.m_el[2].mVec128;
 930 #else
 931         setValue(
 932         m_el[0][0]-m.m_el[0][0],
 933         m_el[0][1]-m.m_el[0][1],
 934         m_el[0][2]-m.m_el[0][2],
 935         m_el[1][0]-m.m_el[1][0],
 936         m_el[1][1]-m.m_el[1][1],
 937         m_el[1][2]-m.m_el[1][2],
 938         m_el[2][0]-m.m_el[2][0],
 939         m_el[2][1]-m.m_el[2][1],
 940         m_el[2][2]-m.m_el[2][2]);
 941 #endif
 942         return *this;
 943 }
 944
 945
 946 SIMD_FORCE_INLINE btScalar
 947 btMatrix3x3::determinant() const
 948 {
 949         return btTriple((*this)[0], (*this)[1], (*this)[2]);
 950 }
 951
 952
 953 SIMD_FORCE_INLINE btMatrix3x3
 954 btMatrix3x3::absolute() const
 955 {
 956 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
 957     return btMatrix3x3(
 958             _mm_and_ps(m_el[0].mVec128, btvAbsfMask),
 959             _mm_and_ps(m_el[1].mVec128, btvAbsfMask),
 960             _mm_and_ps(m_el[2].mVec128, btvAbsfMask));
 961 #elif defined(BT_USE_NEON)
 962     return btMatrix3x3(
 963             (float32x4_t)vandq_s32((int32x4_t)m_el[0].mVec128, btv3AbsMask),
 964             (float32x4_t)vandq_s32((int32x4_t)m_el[1].mVec128, btv3AbsMask),
 965             (float32x4_t)vandq_s32((int32x4_t)m_el[2].mVec128, btv3AbsMask));
 966 #else
 967         return btMatrix3x3(
 968             btFabs(m_el[0].x()), btFabs(m_el[0].y()), btFabs(m_el[0].z()),
 969             btFabs(m_el[1].x()), btFabs(m_el[1].y()), btFabs(m_el[1].z()),
 970             btFabs(m_el[2].x()), btFabs(m_el[2].y()), btFabs(m_el[2].z()));
 971 #endif
 972 }
 973
 974 SIMD_FORCE_INLINE btMatrix3x3
 975 btMatrix3x3::transpose() const
 976 {
 977 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
 978     __m128 v0 = m_el[0].mVec128;
 979     __m128 v1 = m_el[1].mVec128;
 980     __m128 v2 = m_el[2].mVec128;    //  x2 y2 z2 w2
 981     __m128 vT;
 982
 983     v2 = _mm_and_ps(v2, btvFFF0fMask);  //  x2 y2 z2 0
 984
 985     vT = _mm_unpackhi_ps(v0, v1);       //      z0 z1 * *
 986     v0 = _mm_unpacklo_ps(v0, v1);       //      x0 x1 y0 y1
 987
 988     v1 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(2, 3, 1, 3) );       // y0 y1 y2 0
 989     v0 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(0, 1, 0, 3) );       // x0 x1 x2 0
 990     v2 = btCastdTo128f(_mm_move_sd(btCastfTo128d(v2), btCastfTo128d(vT)));      // z0 z1 z2 0
 991
 992
 993     return btMatrix3x3( v0, v1, v2 );
 994 #elif defined(BT_USE_NEON)
 995     // note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions.
 996     static const uint32x2_t zMask = (const uint32x2_t) {-1, 0 };
 997     float32x4x2_t top = vtrnq_f32( m_el[0].mVec128, m_el[1].mVec128 );  // {x0 x1 z0 z1}, {y0 y1 w0 w1}
 998     float32x2x2_t bl = vtrn_f32( vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f) );       // {x2  0 }, {y2 0}
 999     float32x4_t v0 = vcombine_f32( vget_low_f32(top.val[0]), bl.val[0] );
1000     float32x4_t v1 = vcombine_f32( vget_low_f32(top.val[1]), bl.val[1] );
1001     float32x2_t q = (float32x2_t) vand_u32( (uint32x2_t) vget_high_f32( m_el[2].mVec128), zMask );
1002     float32x4_t v2 = vcombine_f32( vget_high_f32(top.val[0]), q );       // z0 z1 z2  0
1003     return btMatrix3x3( v0, v1, v2 );
1004 #else
1005         return btMatrix3x3( m_el[0].x(), m_el[1].x(), m_el[2].x(),
1006                         m_el[0].y(), m_el[1].y(), m_el[2].y(),
1007                         m_el[0].z(), m_el[1].z(), m_el[2].z());
1008 #endif
1009 }
1010
1011 SIMD_FORCE_INLINE btMatrix3x3
1012 btMatrix3x3::adjoint() const
1013 {
1014         return btMatrix3x3(cofac(1, 1, 2, 2), cofac(0, 2, 2, 1), cofac(0, 1, 1, 2),
1015                 cofac(1, 2, 2, 0), cofac(0, 0, 2, 2), cofac(0, 2, 1, 0),
1016                 cofac(1, 0, 2, 1), cofac(0, 1, 2, 0), cofac(0, 0, 1, 1));
1017 }
1018
1019 SIMD_FORCE_INLINE btMatrix3x3
1020 btMatrix3x3::inverse() const
1021 {
1022         btVector3 co(cofac(1, 1, 2, 2), cofac(1, 2, 2, 0), cofac(1, 0, 2, 1));
1023         btScalar det = (*this)[0].dot(co);
1024         btFullAssert(det != btScalar(0.0));
1025         btScalar s = btScalar(1.0) / det;
1026         return btMatrix3x3(co.x() * s, cofac(0, 2, 2, 1) * s, cofac(0, 1, 1, 2) * s,
1027                 co.y() * s, cofac(0, 0, 2, 2) * s, cofac(0, 2, 1, 0) * s,
1028                 co.z() * s, cofac(0, 1, 2, 0) * s, cofac(0, 0, 1, 1) * s);
1029 }
1030
1031 SIMD_FORCE_INLINE btMatrix3x3
1032 btMatrix3x3::transposeTimes(const btMatrix3x3& m) const
1033 {
1034 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
1035     // zeros w
1036 //    static const __m128i xyzMask = (const __m128i){ -1ULL, 0xffffffffULL };
1037     __m128 row = m_el[0].mVec128;
1038     __m128 m0 = _mm_and_ps( m.getRow(0).mVec128, btvFFF0fMask );
1039     __m128 m1 = _mm_and_ps( m.getRow(1).mVec128, btvFFF0fMask);
1040     __m128 m2 = _mm_and_ps( m.getRow(2).mVec128, btvFFF0fMask );
1041     __m128 r0 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0));
1042     __m128 r1 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0x55));
1043     __m128 r2 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0xaa));
1044     row = m_el[1].mVec128;
1045     r0 = _mm_add_ps( r0, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0)));
1046     r1 = _mm_add_ps( r1, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0x55)));
1047     r2 = _mm_add_ps( r2, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0xaa)));
1048     row = m_el[2].mVec128;
1049     r0 = _mm_add_ps( r0, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0)));
1050     r1 = _mm_add_ps( r1, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0x55)));
1051     r2 = _mm_add_ps( r2, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0xaa)));
1052     return btMatrix3x3( r0, r1, r2 );
1053
1054 #elif defined BT_USE_NEON
1055     // zeros w
1056     static const uint32x4_t xyzMask = (const uint32x4_t){ -1, -1, -1, 0 };
1057     float32x4_t m0 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(0).mVec128, xyzMask );
1058     float32x4_t m1 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(1).mVec128, xyzMask );
1059     float32x4_t m2 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(2).mVec128, xyzMask );
1060     float32x4_t row = m_el[0].mVec128;
1061     float32x4_t r0 = vmulq_lane_f32( m0, vget_low_f32(row), 0);
1062     float32x4_t r1 = vmulq_lane_f32( m0, vget_low_f32(row), 1);
1063     float32x4_t r2 = vmulq_lane_f32( m0, vget_high_f32(row), 0);
1064     row = m_el[1].mVec128;
1065     r0 = vmlaq_lane_f32( r0, m1, vget_low_f32(row), 0);
1066     r1 = vmlaq_lane_f32( r1, m1, vget_low_f32(row), 1);
1067     r2 = vmlaq_lane_f32( r2, m1, vget_high_f32(row), 0);
1068     row = m_el[2].mVec128;
1069     r0 = vmlaq_lane_f32( r0, m2, vget_low_f32(row), 0);
1070     r1 = vmlaq_lane_f32( r1, m2, vget_low_f32(row), 1);
1071     r2 = vmlaq_lane_f32( r2, m2, vget_high_f32(row), 0);
1072     return btMatrix3x3( r0, r1, r2 );
1073 #else
1074     return btMatrix3x3(
1075                 m_el[0].x() * m[0].x() + m_el[1].x() * m[1].x() + m_el[2].x() * m[2].x(),
1076                 m_el[0].x() * m[0].y() + m_el[1].x() * m[1].y() + m_el[2].x() * m[2].y(),
1077                 m_el[0].x() * m[0].z() + m_el[1].x() * m[1].z() + m_el[2].x() * m[2].z(),
1078                 m_el[0].y() * m[0].x() + m_el[1].y() * m[1].x() + m_el[2].y() * m[2].x(),
1079                 m_el[0].y() * m[0].y() + m_el[1].y() * m[1].y() + m_el[2].y() * m[2].y(),
1080                 m_el[0].y() * m[0].z() + m_el[1].y() * m[1].z() + m_el[2].y() * m[2].z(),
1081                 m_el[0].z() * m[0].x() + m_el[1].z() * m[1].x() + m_el[2].z() * m[2].x(),
1082                 m_el[0].z() * m[0].y() + m_el[1].z() * m[1].y() + m_el[2].z() * m[2].y(),
1083                 m_el[0].z() * m[0].z() + m_el[1].z() * m[1].z() + m_el[2].z() * m[2].z());
1084 #endif
1085 }
1086
1087 SIMD_FORCE_INLINE btMatrix3x3
1088 btMatrix3x3::timesTranspose(const btMatrix3x3& m) const
1089 {
1090 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
1091     __m128 a0 = m_el[0].mVec128;
1092     __m128 a1 = m_el[1].mVec128;
1093     __m128 a2 = m_el[2].mVec128;
1094
1095     btMatrix3x3 mT = m.transpose(); // we rely on transpose() zeroing w channel so that we don't have to do it here
1096     __m128 mx = mT[0].mVec128;
1097     __m128 my = mT[1].mVec128;
1098     __m128 mz = mT[2].mVec128;
1099
1100     __m128 r0 = _mm_mul_ps(mx, _mm_shuffle_ps(a0, a0, 0x00));
1101     __m128 r1 = _mm_mul_ps(mx, _mm_shuffle_ps(a1, a1, 0x00));
1102     __m128 r2 = _mm_mul_ps(mx, _mm_shuffle_ps(a2, a2, 0x00));
1103     r0 = _mm_add_ps(r0, _mm_mul_ps(my, _mm_shuffle_ps(a0, a0, 0x55)));
1104     r1 = _mm_add_ps(r1, _mm_mul_ps(my, _mm_shuffle_ps(a1, a1, 0x55)));
1105     r2 = _mm_add_ps(r2, _mm_mul_ps(my, _mm_shuffle_ps(a2, a2, 0x55)));
1106     r0 = _mm_add_ps(r0, _mm_mul_ps(mz, _mm_shuffle_ps(a0, a0, 0xaa)));
1107     r1 = _mm_add_ps(r1, _mm_mul_ps(mz, _mm_shuffle_ps(a1, a1, 0xaa)));
1108     r2 = _mm_add_ps(r2, _mm_mul_ps(mz, _mm_shuffle_ps(a2, a2, 0xaa)));
1109     return btMatrix3x3( r0, r1, r2);
1110
1111 #elif defined BT_USE_NEON
1112     float32x4_t a0 = m_el[0].mVec128;
1113     float32x4_t a1 = m_el[1].mVec128;
1114     float32x4_t a2 = m_el[2].mVec128;
1115
1116     btMatrix3x3 mT = m.transpose(); // we rely on transpose() zeroing w channel so that we don't have to do it here
1117     float32x4_t mx = mT[0].mVec128;
1118     float32x4_t my = mT[1].mVec128;
1119     float32x4_t mz = mT[2].mVec128;
1120
1121     float32x4_t r0 = vmulq_lane_f32( mx, vget_low_f32(a0), 0);
1122     float32x4_t r1 = vmulq_lane_f32( mx, vget_low_f32(a1), 0);
1123     float32x4_t r2 = vmulq_lane_f32( mx, vget_low_f32(a2), 0);
1124     r0 = vmlaq_lane_f32( r0, my, vget_low_f32(a0), 1);
1125     r1 = vmlaq_lane_f32( r1, my, vget_low_f32(a1), 1);
1126     r2 = vmlaq_lane_f32( r2, my, vget_low_f32(a2), 1);
1127     r0 = vmlaq_lane_f32( r0, mz, vget_high_f32(a0), 0);
1128     r1 = vmlaq_lane_f32( r1, mz, vget_high_f32(a1), 0);
1129     r2 = vmlaq_lane_f32( r2, mz, vget_high_f32(a2), 0);
1130     return btMatrix3x3( r0, r1, r2 );
1131
1132 #else
1133         return btMatrix3x3(
1134                 m_el[0].dot(m[0]), m_el[0].dot(m[1]), m_el[0].dot(m[2]),
1135                 m_el[1].dot(m[0]), m_el[1].dot(m[1]), m_el[1].dot(m[2]),
1136                 m_el[2].dot(m[0]), m_el[2].dot(m[1]), m_el[2].dot(m[2]));
1137 #endif
1138 }
1139
1140 SIMD_FORCE_INLINE btVector3
1141 operator*(const btMatrix3x3& m, const btVector3& v)
1142 {
1143 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
1144     return v.dot3(m[0], m[1], m[2]);
1145 #else
1146         return btVector3(m[0].dot(v), m[1].dot(v), m[2].dot(v));
1147 #endif
1148 }
1149
1150
1151 SIMD_FORCE_INLINE btVector3
1152 operator*(const btVector3& v, const btMatrix3x3& m)
1153 {
1154 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
1155
1156     const __m128 vv = v.mVec128;
1157
1158     __m128 c0 = bt_splat_ps( vv, 0);
1159     __m128 c1 = bt_splat_ps( vv, 1);
1160     __m128 c2 = bt_splat_ps( vv, 2);
1161
1162     c0 = _mm_mul_ps(c0, _mm_and_ps(m[0].mVec128, btvFFF0fMask) );
1163     c1 = _mm_mul_ps(c1, _mm_and_ps(m[1].mVec128, btvFFF0fMask) );
1164     c0 = _mm_add_ps(c0, c1);
1165     c2 = _mm_mul_ps(c2, _mm_and_ps(m[2].mVec128, btvFFF0fMask) );
1166
1167     return btVector3(_mm_add_ps(c0, c2));
1168 #elif defined(BT_USE_NEON)
1169     const float32x4_t vv = v.mVec128;
1170     const float32x2_t vlo = vget_low_f32(vv);
1171     const float32x2_t vhi = vget_high_f32(vv);
1172
1173     float32x4_t c0, c1, c2;
1174
1175     c0 = (float32x4_t) vandq_s32((int32x4_t)m[0].mVec128, btvFFF0Mask);
1176     c1 = (float32x4_t) vandq_s32((int32x4_t)m[1].mVec128, btvFFF0Mask);
1177     c2 = (float32x4_t) vandq_s32((int32x4_t)m[2].mVec128, btvFFF0Mask);
1178
1179     c0 = vmulq_lane_f32(c0, vlo, 0);
1180     c1 = vmulq_lane_f32(c1, vlo, 1);
1181     c2 = vmulq_lane_f32(c2, vhi, 0);
1182     c0 = vaddq_f32(c0, c1);
1183     c0 = vaddq_f32(c0, c2);
1184
1185     return btVector3(c0);
1186 #else
1187         return btVector3(m.tdotx(v), m.tdoty(v), m.tdotz(v));
1188 #endif
1189 }
1190
1191 SIMD_FORCE_INLINE btMatrix3x3
1192 operator*(const btMatrix3x3& m1, const btMatrix3x3& m2)
1193 {
1194 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
1195
1196     __m128 m10 = m1[0].mVec128;
1197     __m128 m11 = m1[1].mVec128;
1198     __m128 m12 = m1[2].mVec128;
1199
1200     __m128 m2v = _mm_and_ps(m2[0].mVec128, btvFFF0fMask);
1201
1202     __m128 c0 = bt_splat_ps( m10, 0);
1203     __m128 c1 = bt_splat_ps( m11, 0);
1204     __m128 c2 = bt_splat_ps( m12, 0);
1205
1206     c0 = _mm_mul_ps(c0, m2v);
1207     c1 = _mm_mul_ps(c1, m2v);
1208     c2 = _mm_mul_ps(c2, m2v);
1209
1210     m2v = _mm_and_ps(m2[1].mVec128, btvFFF0fMask);
1211
1212     __m128 c0_1 = bt_splat_ps( m10, 1);
1213     __m128 c1_1 = bt_splat_ps( m11, 1);
1214     __m128 c2_1 = bt_splat_ps( m12, 1);
1215
1216     c0_1 = _mm_mul_ps(c0_1, m2v);
1217     c1_1 = _mm_mul_ps(c1_1, m2v);
1218     c2_1 = _mm_mul_ps(c2_1, m2v);
1219
1220     m2v = _mm_and_ps(m2[2].mVec128, btvFFF0fMask);
1221
1222     c0 = _mm_add_ps(c0, c0_1);
1223     c1 = _mm_add_ps(c1, c1_1);
1224     c2 = _mm_add_ps(c2, c2_1);
1225
1226     m10 = bt_splat_ps( m10, 2);
1227     m11 = bt_splat_ps( m11, 2);
1228     m12 = bt_splat_ps( m12, 2);
1229
1230     m10 = _mm_mul_ps(m10, m2v);
1231     m11 = _mm_mul_ps(m11, m2v);
1232     m12 = _mm_mul_ps(m12, m2v);
1233
1234     c0 = _mm_add_ps(c0, m10);
1235     c1 = _mm_add_ps(c1, m11);
1236     c2 = _mm_add_ps(c2, m12);
1237
1238     return btMatrix3x3(c0, c1, c2);
1239
1240 #elif defined(BT_USE_NEON)
1241
1242     float32x4_t rv0, rv1, rv2;
1243     float32x4_t v0, v1, v2;
1244     float32x4_t mv0, mv1, mv2;
1245
1246     v0 = m1[0].mVec128;
1247     v1 = m1[1].mVec128;
1248     v2 = m1[2].mVec128;
1249
1250     mv0 = (float32x4_t) vandq_s32((int32x4_t)m2[0].mVec128, btvFFF0Mask);
1251     mv1 = (float32x4_t) vandq_s32((int32x4_t)m2[1].mVec128, btvFFF0Mask);
1252     mv2 = (float32x4_t) vandq_s32((int32x4_t)m2[2].mVec128, btvFFF0Mask);
1253
1254     rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
1255     rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
1256     rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);
1257
1258     rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
1259     rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
1260     rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);
1261
1262     rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
1263     rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
1264     rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);
1265
1266         return btMatrix3x3(rv0, rv1, rv2);
1267
1268 #else
1269         return btMatrix3x3(
1270                 m2.tdotx( m1[0]), m2.tdoty( m1[0]), m2.tdotz( m1[0]),
1271                 m2.tdotx( m1[1]), m2.tdoty( m1[1]), m2.tdotz( m1[1]),
1272                 m2.tdotx( m1[2]), m2.tdoty( m1[2]), m2.tdotz( m1[2]));
1273 #endif
1274 }
1275
1276 /*
1277 SIMD_FORCE_INLINE btMatrix3x3 btMultTransposeLeft(const btMatrix3x3& m1, const btMatrix3x3& m2) {
1278 return btMatrix3x3(
1279 m1[0][0] * m2[0][0] + m1[1][0] * m2[1][0] + m1[2][0] * m2[2][0],
1280 m1[0][0] * m2[0][1] + m1[1][0] * m2[1][1] + m1[2][0] * m2[2][1],
1281 m1[0][0] * m2[0][2] + m1[1][0] * m2[1][2] + m1[2][0] * m2[2][2],
1282 m1[0][1] * m2[0][0] + m1[1][1] * m2[1][0] + m1[2][1] * m2[2][0],
1283 m1[0][1] * m2[0][1] + m1[1][1] * m2[1][1] + m1[2][1] * m2[2][1],
1284 m1[0][1] * m2[0][2] + m1[1][1] * m2[1][2] + m1[2][1] * m2[2][2],
1285 m1[0][2] * m2[0][0] + m1[1][2] * m2[1][0] + m1[2][2] * m2[2][0],
1286 m1[0][2] * m2[0][1] + m1[1][2] * m2[1][1] + m1[2][2] * m2[2][1],
1287 m1[0][2] * m2[0][2] + m1[1][2] * m2[1][2] + m1[2][2] * m2[2][2]);
1288 }
1289 */
1290
1291 /**@brief Equality operator between two matrices
1292 * It will test all elements are equal.  */
1293 SIMD_FORCE_INLINE bool operator==(const btMatrix3x3& m1, const btMatrix3x3& m2)
1294 {
1295 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
1296
1297     __m128 c0, c1, c2;
1298
1299     c0 = _mm_cmpeq_ps(m1[0].mVec128, m2[0].mVec128);
1300     c1 = _mm_cmpeq_ps(m1[1].mVec128, m2[1].mVec128);
1301     c2 = _mm_cmpeq_ps(m1[2].mVec128, m2[2].mVec128);
1302
1303     c0 = _mm_and_ps(c0, c1);
1304     c0 = _mm_and_ps(c0, c2);
1305
1306     return (0x7 == _mm_movemask_ps((__m128)c0));
1307 #else
1308         return
1309     (   m1[0][0] == m2[0][0] && m1[1][0] == m2[1][0] && m1[2][0] == m2[2][0] &&
1310                 m1[0][1] == m2[0][1] && m1[1][1] == m2[1][1] && m1[2][1] == m2[2][1] &&
1311                 m1[0][2] == m2[0][2] && m1[1][2] == m2[1][2] && m1[2][2] == m2[2][2] );
1312 #endif
1313 }
1314
1315 ///for serialization
1316 struct  btMatrix3x3FloatData
1317 {
1318         btVector3FloatData m_el[3];
1319 };
1320
1321 ///for serialization
1322 struct  btMatrix3x3DoubleData
1323 {
1324         btVector3DoubleData m_el[3];
1325 };
1326
1327
1328
1329
1330 SIMD_FORCE_INLINE       void    btMatrix3x3::serialize(struct   btMatrix3x3Data& dataOut) const
1331 {
1332         for (int i=0;i<3;i++)
1333                 m_el[i].serialize(dataOut.m_el[i]);
1334 }
1335
1336 SIMD_FORCE_INLINE       void    btMatrix3x3::serializeFloat(struct      btMatrix3x3FloatData& dataOut) const
1337 {
1338         for (int i=0;i<3;i++)
1339                 m_el[i].serializeFloat(dataOut.m_el[i]);
1340 }
1341
1342
1343 SIMD_FORCE_INLINE       void    btMatrix3x3::deSerialize(const struct   btMatrix3x3Data& dataIn)
1344 {
1345         for (int i=0;i<3;i++)
1346                 m_el[i].deSerialize(dataIn.m_el[i]);
1347 }
1348
1349 SIMD_FORCE_INLINE       void    btMatrix3x3::deSerializeFloat(const struct      btMatrix3x3FloatData& dataIn)
1350 {
1351         for (int i=0;i<3;i++)
1352                 m_el[i].deSerializeFloat(dataIn.m_el[i]);
1353 }
1354
1355 SIMD_FORCE_INLINE       void    btMatrix3x3::deSerializeDouble(const struct     btMatrix3x3DoubleData& dataIn)
1356 {
1357         for (int i=0;i<3;i++)
1358                 m_el[i].deSerializeDouble(dataIn.m_el[i]);
1359 }
1360
1361 #endif //BT_MATRIX3x3_H
1362