2 Copyright (c) 2003-2013 Gino van den Bergen / Erwin Coumans http://bulletphysics.org
4 This software is provided 'as-is', without any express or implied warranty.
5 In no event will the authors be held liable for any damages arising from the use of this software.
6 Permission is granted to anyone to use this software for any purpose,
7 including commercial applications, and to alter it and redistribute it freely,
8 subject to the following restrictions:
10 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
11 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
12 3. This notice may not be removed or altered from any source distribution.
15 #ifndef B3_MATRIX3x3_H
16 #define B3_MATRIX3x3_H
18 #include "b3Vector3.h"
19 #include "b3Quaternion.h"
23 //const __m128 B3_ATTRIBUTE_ALIGNED16(b3v2220) = {2.0f, 2.0f, 2.0f, 0.0f};
24 const __m128 B3_ATTRIBUTE_ALIGNED16(b3vMPPP) = {-0.0f, +0.0f, +0.0f, +0.0f};
27 #if defined(B3_USE_SSE) || defined(B3_USE_NEON)
28 const b3SimdFloat4 B3_ATTRIBUTE_ALIGNED16(b3v1000) = {1.0f, 0.0f, 0.0f, 0.0f};
29 const b3SimdFloat4 B3_ATTRIBUTE_ALIGNED16(b3v0100) = {0.0f, 1.0f, 0.0f, 0.0f};
30 const b3SimdFloat4 B3_ATTRIBUTE_ALIGNED16(b3v0010) = {0.0f, 0.0f, 1.0f, 0.0f};
33 #ifdef B3_USE_DOUBLE_PRECISION
34 #define b3Matrix3x3Data b3Matrix3x3DoubleData
36 #define b3Matrix3x3Data b3Matrix3x3FloatData
37 #endif //B3_USE_DOUBLE_PRECISION
39 /**@brief The b3Matrix3x3 class implements a 3x3 rotation matrix, to perform linear algebra in combination with b3Quaternion, b3Transform and b3Vector3.
40 * Make sure to only include a pure orthogonal matrix without scaling. */
41 B3_ATTRIBUTE_ALIGNED16(class)
///Data storage for the matrix, each vector is a row of the matrix
/** @brief No initialization constructor (matrix contents are left undefined) */
// explicit b3Matrix3x3(const b3Scalar *m) { setFromOpenGLSubMatrix(m); }
/**@brief Constructor from Quaternion
 * @param q Rotation quaternion; the matrix is filled via setRotation(q) */
explicit b3Matrix3x3(const b3Quaternion& q) { setRotation(q); }
56 template <typename b3Scalar>
57 Matrix3x3(const b3Scalar& yaw, const b3Scalar& pitch, const b3Scalar& roll)
59 setEulerYPR(yaw, pitch, roll);
/** @brief Constructor with row major formatting */
b3Matrix3x3(const b3Scalar& xx, const b3Scalar& xy, const b3Scalar& xz,
const b3Scalar& yx, const b3Scalar& yy, const b3Scalar& yz,
const b3Scalar& zx, const b3Scalar& zy, const b3Scalar& zz)
#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
// SIMD builds: rows can be constructed directly from 128-bit lanes/vectors.
B3_FORCE_INLINE b3Matrix3x3(const b3SimdFloat4 v0, const b3SimdFloat4 v1, const b3SimdFloat4 v2)
B3_FORCE_INLINE b3Matrix3x3(const b3Vector3& v0, const b3Vector3& v1, const b3Vector3& v2)
/** @brief Copy constructor (SIMD path): each row copied as a whole 128-bit register */
B3_FORCE_INLINE b3Matrix3x3(const b3Matrix3x3& rhs)
m_el[0].mVec128 = rhs.m_el[0].mVec128;
m_el[1].mVec128 = rhs.m_el[1].mVec128;
m_el[2].mVec128 = rhs.m_el[2].mVec128;
// Assignment Operator (SIMD path)
B3_FORCE_INLINE b3Matrix3x3& operator=(const b3Matrix3x3& m)
m_el[0].mVec128 = m.m_el[0].mVec128;
m_el[1].mVec128 = m.m_el[1].mVec128;
m_el[2].mVec128 = m.m_el[2].mVec128;
/** @brief Copy constructor (scalar path): rows copied element-wise */
B3_FORCE_INLINE b3Matrix3x3(const b3Matrix3x3& other)
m_el[0] = other.m_el[0];
m_el[1] = other.m_el[1];
m_el[2] = other.m_el[2];
/** @brief Assignment Operator (scalar path) */
B3_FORCE_INLINE b3Matrix3x3& operator=(const b3Matrix3x3& other)
m_el[0] = other.m_el[0];
m_el[1] = other.m_el[1];
m_el[2] = other.m_el[2];
/** @brief Get a column of the matrix as a vector
* @param i Column number 0 indexed */
B3_FORCE_INLINE b3Vector3 getColumn(int i) const
// Columns are not stored directly (rows are); build one from element i of each row.
return b3MakeVector3(m_el[0][i], m_el[1][i], m_el[2][i]);
/** @brief Get a row of the matrix as a vector
* @param i Row number 0 indexed */
B3_FORCE_INLINE const b3Vector3& getRow(int i) const
b3FullAssert(0 <= i && i < 3);
/** @brief Get a mutable reference to a row of the matrix as a vector
* @param i Row number 0 indexed */
B3_FORCE_INLINE b3Vector3& operator[](int i)
b3FullAssert(0 <= i && i < 3);
/** @brief Get a const reference to a row of the matrix as a vector
* @param i Row number 0 indexed */
B3_FORCE_INLINE const b3Vector3& operator[](int i) const
b3FullAssert(0 <= i && i < 3);
/** @brief Multiply by the target matrix on the right
* @param m Rotation matrix to be applied
* Equivalent to this = this * m */
b3Matrix3x3& operator*=(const b3Matrix3x3& m);
/** @brief Adds by the target matrix on the right
* @param m matrix to be applied
* Equivalent to this = this + m */
b3Matrix3x3& operator+=(const b3Matrix3x3& m);
/** @brief Subtracts by the target matrix on the right
* @param m matrix to be applied
* Equivalent to this = this - m */
b3Matrix3x3& operator-=(const b3Matrix3x3& m);
172 /** @brief Set from the rotational part of a 4x4 OpenGL matrix
173 * @param m A pointer to the beginning of the array of scalars*/
174 void setFromOpenGLSubMatrix(const b3Scalar* m)
176 m_el[0].setValue(m[0], m[4], m[8]);
177 m_el[1].setValue(m[1], m[5], m[9]);
178 m_el[2].setValue(m[2], m[6], m[10]);
/** @brief Set the values of the matrix explicitly (row major)
* @param xx Top Left
* @param xy Top Middle
* @param xz Top Right
* @param yx Middle Left
* @param yy Middle Middle
* @param yz Middle Right
* @param zx Bottom Left
* @param zy Bottom Middle
* @param zz Bottom Right*/
void setValue(const b3Scalar& xx, const b3Scalar& xy, const b3Scalar& xz,
const b3Scalar& yx, const b3Scalar& yy, const b3Scalar& yz,
const b3Scalar& zx, const b3Scalar& zy, const b3Scalar& zz)
m_el[0].setValue(xx, xy, xz);
m_el[1].setValue(yx, yy, yz);
m_el[2].setValue(zx, zy, zz);
/** @brief Set the matrix from a quaternion
* @param q The Quaternion to match; must have non-zero length (asserted) */
void setRotation(const b3Quaternion& q)
b3Scalar d = q.length2();
b3FullAssert(d != b3Scalar(0.0));
// s = 2/|q|^2 normalizes a possibly non-unit quaternion in the formulas below.
b3Scalar s = b3Scalar(2.0) / d;
#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
// SSE path: build the three rows with shuffles and sign flips of (x,y,z,w).
__m128 vs, Q = q.get128();
__m128i Qi = b3CastfTo128i(Q);
__m128 V11, V21, V31;
__m128 NQ = _mm_xor_ps(Q, b3vMzeroMask);
__m128i NQi = b3CastfTo128i(NQ);
V1 = b3CastiTo128f(_mm_shuffle_epi32(Qi, B3_SHUFFLE(1, 0, 2, 3))); // Y X Z W
V2 = _mm_shuffle_ps(NQ, Q, B3_SHUFFLE(0, 0, 1, 3)); // -X -X Y W
V3 = b3CastiTo128f(_mm_shuffle_epi32(Qi, B3_SHUFFLE(2, 1, 0, 3))); // Z Y X W
V1 = _mm_xor_ps(V1, b3vMPPP); // change the sign of the first element
V11 = b3CastiTo128f(_mm_shuffle_epi32(Qi, B3_SHUFFLE(1, 1, 0, 3))); // Y Y X W
V21 = _mm_unpackhi_ps(Q, Q); // Z Z W W
V31 = _mm_shuffle_ps(Q, NQ, B3_SHUFFLE(0, 2, 0, 3)); // X Z -X -W
V11 = _mm_shuffle_ps(NQ, Q, B3_SHUFFLE(2, 3, 1, 3)); // -Z -W Y W
V21 = _mm_xor_ps(V21, b3vMPPP); // change the sign of the first element
V31 = _mm_shuffle_ps(Q, NQ, B3_SHUFFLE(3, 3, 1, 3)); // W W -Y -W
V31 = _mm_xor_ps(V31, b3vMPPP); // change the sign of the first element
Y = b3CastiTo128f(_mm_shuffle_epi32(NQi, B3_SHUFFLE(3, 2, 0, 3))); // -W -Z -X -W
Z = b3CastiTo128f(_mm_shuffle_epi32(Qi, B3_SHUFFLE(1, 0, 1, 3))); // Y X Y W
// Broadcast the scale factor s across all three lanes.
vs = _mm_load_ss(&s);
vs = b3_splat3_ps(vs, 0);
m_el[0] = b3MakeVector3(V1);
m_el[1] = b3MakeVector3(V2);
m_el[2] = b3MakeVector3(V3);
// Scalar path: standard quaternion-to-rotation-matrix expansion.
b3Scalar xs = q.getX() * s, ys = q.getY() * s, zs = q.getZ() * s;
b3Scalar wx = q.getW() * xs, wy = q.getW() * ys, wz = q.getW() * zs;
b3Scalar xx = q.getX() * xs, xy = q.getX() * ys, xz = q.getX() * zs;
b3Scalar yy = q.getY() * ys, yz = q.getY() * zs, zz = q.getZ() * zs;
b3Scalar(1.0) - (yy + zz), xy - wz, xz + wy,
xy + wz, b3Scalar(1.0) - (xx + zz), yz - wx,
xz - wy, yz + wx, b3Scalar(1.0) - (xx + yy));
/** @brief Set the matrix from euler angles using YPR around YXZ respectively
* @param yaw Yaw about Y axis
* @param pitch Pitch about X axis
* @param roll Roll about Z axis */
void setEulerYPR(const b3Scalar& yaw, const b3Scalar& pitch, const b3Scalar& roll)
// Delegates to setEulerZYX with the argument order swapped (roll <-> yaw).
setEulerZYX(roll, pitch, yaw);
280 /** @brief Set the matrix from euler angles YPR around ZYX axes
281 * @param eulerX Roll about X axis
282 * @param eulerY Pitch around Y axis
283 * @param eulerZ Yaw aboud Z axis
285 * These angles are used to produce a rotation matrix. The euler
286 * angles are applied in ZYX order. I.e a vector is first rotated
287 * about X then Y and then Z
289 void setEulerZYX(b3Scalar eulerX, b3Scalar eulerY, b3Scalar eulerZ)
291 ///@todo proposed to reverse this since it's labeled zyx but takes arguments xyz and it will match all other parts of the code
292 b3Scalar ci(b3Cos(eulerX));
293 b3Scalar cj(b3Cos(eulerY));
294 b3Scalar ch(b3Cos(eulerZ));
295 b3Scalar si(b3Sin(eulerX));
296 b3Scalar sj(b3Sin(eulerY));
297 b3Scalar sh(b3Sin(eulerZ));
298 b3Scalar cc = ci * ch;
299 b3Scalar cs = ci * sh;
300 b3Scalar sc = si * ch;
301 b3Scalar ss = si * sh;
303 setValue(cj * ch, sj * sc - cs, sj * cc + ss,
304 cj * sh, sj * ss + cc, sj * cs - sc,
305 -sj, cj * si, cj * ci);
/**@brief Set the matrix to the identity */
#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
// SIMD: load the precomputed unit-axis row constants.
m_el[0] = b3MakeVector3(b3v1000);
m_el[1] = b3MakeVector3(b3v0100);
m_el[2] = b3MakeVector3(b3v0010);
setValue(b3Scalar(1.0), b3Scalar(0.0), b3Scalar(0.0),
b3Scalar(0.0), b3Scalar(1.0), b3Scalar(0.0),
b3Scalar(0.0), b3Scalar(0.0), b3Scalar(1.0));
/**@brief Return a reference to a shared, lazily-initialized static identity matrix */
static const b3Matrix3x3& getIdentity()
#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
static const b3Matrix3x3
identityMatrix(b3v1000, b3v0100, b3v0010);
static const b3Matrix3x3
b3Scalar(1.0), b3Scalar(0.0), b3Scalar(0.0),
b3Scalar(0.0), b3Scalar(1.0), b3Scalar(0.0),
b3Scalar(0.0), b3Scalar(0.0), b3Scalar(1.0));
return identityMatrix;
/**@brief Fill the rotational part of an OpenGL matrix and clear the shear/perspective
* @param m The array to be filled (column-major; entries m[3], m[7], m[11] are zeroed) */
void getOpenGLSubMatrix(b3Scalar * m) const
#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
// SSE path: transpose the three rows into columns with unpack/shuffle.
__m128 v0 = m_el[0].mVec128;
__m128 v1 = m_el[1].mVec128;
__m128 v2 = m_el[2].mVec128; // x2 y2 z2 w2
__m128* vm = (__m128*)m;
v2 = _mm_and_ps(v2, b3vFFF0fMask); // x2 y2 z2 0
vT = _mm_unpackhi_ps(v0, v1); // z0 z1 * *
v0 = _mm_unpacklo_ps(v0, v1); // x0 x1 y0 y1
v1 = _mm_shuffle_ps(v0, v2, B3_SHUFFLE(2, 3, 1, 3)); // y0 y1 y2 0
v0 = _mm_shuffle_ps(v0, v2, B3_SHUFFLE(0, 1, 0, 3)); // x0 x1 x2 0
v2 = b3CastdTo128f(_mm_move_sd(b3CastfTo128d(v2), b3CastfTo128d(vT))); // z0 z1 z2 0
#elif defined(B3_USE_NEON)
// note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions.
static const uint32x2_t zMask = (const uint32x2_t){-1, 0};
float32x4_t* vm = (float32x4_t*)m;
float32x4x2_t top = vtrnq_f32(m_el[0].mVec128, m_el[1].mVec128); // {x0 x1 z0 z1}, {y0 y1 w0 w1}
float32x2x2_t bl = vtrn_f32(vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f)); // {x2 0 }, {y2 0}
float32x4_t v0 = vcombine_f32(vget_low_f32(top.val[0]), bl.val[0]);
float32x4_t v1 = vcombine_f32(vget_low_f32(top.val[1]), bl.val[1]);
float32x2_t q = (float32x2_t)vand_u32((uint32x2_t)vget_high_f32(m_el[2].mVec128), zMask);
float32x4_t v2 = vcombine_f32(vget_high_f32(top.val[0]), q); // z0 z1 z2 0
// Scalar fallback: write each column of the row-major storage with a zeroed 4th entry.
m[0] = b3Scalar(m_el[0].getX());
m[1] = b3Scalar(m_el[1].getX());
m[2] = b3Scalar(m_el[2].getX());
m[3] = b3Scalar(0.0);
m[4] = b3Scalar(m_el[0].getY());
m[5] = b3Scalar(m_el[1].getY());
m[6] = b3Scalar(m_el[2].getY());
m[7] = b3Scalar(0.0);
m[8] = b3Scalar(m_el[0].getZ());
m[9] = b3Scalar(m_el[1].getZ());
m[10] = b3Scalar(m_el[2].getZ());
m[11] = b3Scalar(0.0);
/**@brief Get the matrix represented as a quaternion
* @param q The quaternion which will be set
* Uses the standard trace-based extraction: the branch taken depends on which
* diagonal element (or the trace) is largest, to keep the square root well-conditioned. */
void getRotation(b3Quaternion & q) const
#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
b3Scalar trace = m_el[0].getX() + m_el[1].getY() + m_el[2].getZ();
if (trace > b3Scalar(0.0))
x = trace + b3Scalar(1.0);
// temp.f collects the quaternion components (x, y, z, w) before scaling.
temp.f[0] = m_el[2].getY() - m_el[1].getZ();
temp.f[1] = m_el[0].getZ() - m_el[2].getX();
temp.f[2] = m_el[1].getX() - m_el[0].getY();
//temp.f[3]= s * b3Scalar(0.5);
if (m_el[0].getX() < m_el[1].getY())
if (m_el[1].getY() < m_el[2].getZ())
if (m_el[0].getX() < m_el[2].getZ())
x = m_el[i][i] - m_el[j][j] - m_el[k][k] + b3Scalar(1.0);
temp.f[3] = (m_el[k][j] - m_el[j][k]);
temp.f[j] = (m_el[j][i] + m_el[i][j]);
temp.f[k] = (m_el[k][i] + m_el[i][k]);
//temp.f[i] = s * b3Scalar(0.5);
s = b3Scalar(0.5) / s;
// Scalar path.
b3Scalar trace = m_el[0].getX() + m_el[1].getY() + m_el[2].getZ();
if (trace > b3Scalar(0.0))
b3Scalar s = b3Sqrt(trace + b3Scalar(1.0));
temp[3] = (s * b3Scalar(0.5));
s = b3Scalar(0.5) / s;
temp[0] = ((m_el[2].getY() - m_el[1].getZ()) * s);
temp[1] = ((m_el[0].getZ() - m_el[2].getX()) * s);
temp[2] = ((m_el[1].getX() - m_el[0].getY()) * s);
// i is the index of the largest diagonal element.
int i = m_el[0].getX() < m_el[1].getY() ? (m_el[1].getY() < m_el[2].getZ() ? 2 : 1) : (m_el[0].getX() < m_el[2].getZ() ? 2 : 0);
b3Scalar s = b3Sqrt(m_el[i][i] - m_el[j][j] - m_el[k][k] + b3Scalar(1.0));
temp[i] = s * b3Scalar(0.5);
s = b3Scalar(0.5) / s;
temp[3] = (m_el[k][j] - m_el[j][k]) * s;
temp[j] = (m_el[j][i] + m_el[i][j]) * s;
temp[k] = (m_el[k][i] + m_el[i][k]) * s;
q.setValue(temp[0], temp[1], temp[2], temp[3]);
/**@brief Get the matrix represented as euler angles around YXZ, roundtrip with setEulerYPR
* @param yaw Yaw around Y axis
* @param pitch Pitch around X axis
* @param roll around Z axis */
void getEulerYPR(b3Scalar & yaw, b3Scalar & pitch, b3Scalar & roll) const
// first use the normal calculus
yaw = b3Scalar(b3Atan2(m_el[1].getX(), m_el[0].getX()));
pitch = b3Scalar(b3Asin(-m_el[2].getX()));
roll = b3Scalar(b3Atan2(m_el[2].getY(), m_el[2].getZ()));
// on pitch = +/-HalfPI (gimbal-lock case; exact float comparison is intentional here)
if (b3Fabs(pitch) == B3_HALF_PI)
/**@brief Get the matrix represented as euler angles around ZYX
* @param yaw Yaw around Z axis
* @param pitch Pitch around Y axis
* @param roll Roll around X axis
* @param solution_number Which solution of two possible solutions ( 1 or 2) are possible values*/
void getEulerZYX(b3Scalar & yaw, b3Scalar & pitch, b3Scalar & roll, unsigned int solution_number = 1) const
Euler euler_out2; //second solution
//get the pointer to the raw data
// Check that pitch is not at a singularity
if (b3Fabs(m_el[2].getX()) >= 1)
// From difference of angles formula
b3Scalar delta = b3Atan2(m_el[0].getX(), m_el[0].getZ());
if (m_el[2].getX() > 0) //gimbal locked up
euler_out.pitch = B3_PI / b3Scalar(2.0);
euler_out2.pitch = B3_PI / b3Scalar(2.0);
euler_out.roll = euler_out.pitch + delta;
euler_out2.roll = euler_out.pitch + delta;
else // gimbal locked down
euler_out.pitch = -B3_PI / b3Scalar(2.0);
euler_out2.pitch = -B3_PI / b3Scalar(2.0);
euler_out.roll = -euler_out.pitch + delta;
euler_out2.roll = -euler_out.pitch + delta;
// Regular (non-singular) case: two valid solutions exist.
euler_out.pitch = -b3Asin(m_el[2].getX());
euler_out2.pitch = B3_PI - euler_out.pitch;
euler_out.roll = b3Atan2(m_el[2].getY() / b3Cos(euler_out.pitch),
m_el[2].getZ() / b3Cos(euler_out.pitch));
euler_out2.roll = b3Atan2(m_el[2].getY() / b3Cos(euler_out2.pitch),
m_el[2].getZ() / b3Cos(euler_out2.pitch));
euler_out.yaw = b3Atan2(m_el[1].getX() / b3Cos(euler_out.pitch),
m_el[0].getX() / b3Cos(euler_out.pitch));
euler_out2.yaw = b3Atan2(m_el[1].getX() / b3Cos(euler_out2.pitch),
m_el[0].getX() / b3Cos(euler_out2.pitch));
if (solution_number == 1)
pitch = euler_out.pitch;
roll = euler_out.roll;
yaw = euler_out2.yaw;
pitch = euler_out2.pitch;
roll = euler_out2.roll;
/**@brief Create a scaled copy of the matrix
* @param s Scaling vector The elements of the vector will scale each column */
b3Matrix3x3 scaled(const b3Vector3& s) const
#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
// SIMD: component-wise row * s does the per-column scaling in one op per row.
return b3Matrix3x3(m_el[0] * s, m_el[1] * s, m_el[2] * s);
m_el[0].getX() * s.getX(), m_el[0].getY() * s.getY(), m_el[0].getZ() * s.getZ(),
m_el[1].getX() * s.getX(), m_el[1].getY() * s.getY(), m_el[1].getZ() * s.getZ(),
m_el[2].getX() * s.getX(), m_el[2].getY() * s.getY(), m_el[2].getZ() * s.getZ());
/**@brief Return the determinant of the matrix */
b3Scalar determinant() const;
/**@brief Return the adjoint of the matrix */
b3Matrix3x3 adjoint() const;
/**@brief Return the matrix with all values non negative */
b3Matrix3x3 absolute() const;
/**@brief Return the transpose of the matrix */
b3Matrix3x3 transpose() const;
/**@brief Return the inverse of the matrix */
b3Matrix3x3 inverse() const;
/**@brief Return transpose(this) * m (see definitions below) */
b3Matrix3x3 transposeTimes(const b3Matrix3x3& m) const;
/**@brief Return this * transpose(m) (see definitions below) */
b3Matrix3x3 timesTranspose(const b3Matrix3x3& m) const;
/**@brief Dot product of v with column 0 of this matrix (a row of the transpose) */
B3_FORCE_INLINE b3Scalar tdotx(const b3Vector3& v) const
return m_el[0].getX() * v.getX() + m_el[1].getX() * v.getY() + m_el[2].getX() * v.getZ();
/**@brief Dot product of v with column 1 of this matrix */
B3_FORCE_INLINE b3Scalar tdoty(const b3Vector3& v) const
return m_el[0].getY() * v.getX() + m_el[1].getY() * v.getY() + m_el[2].getY() * v.getZ();
/**@brief Dot product of v with column 2 of this matrix */
B3_FORCE_INLINE b3Scalar tdotz(const b3Vector3& v) const
return m_el[0].getZ() * v.getX() + m_el[1].getZ() * v.getY() + m_el[2].getZ() * v.getZ();
/**@brief diagonalizes this matrix by the Jacobi method.
* @param rot stores the rotation from the coordinate system in which the matrix is diagonal to the original
* coordinate system, i.e., old_this = rot * new_this * rot^T.
* @param threshold See maxSteps
* @param maxSteps The iteration stops when all off-diagonal elements are less than the threshold multiplied
* by the sum of the absolute values of the diagonal, or when maxSteps have been executed.
*
* Note that this matrix is assumed to be symmetric. */
void diagonalize(b3Matrix3x3 & rot, b3Scalar threshold, int maxSteps)
for (int step = maxSteps; step > 0; step--)
// find off-diagonal element [p][q] with largest magnitude
b3Scalar max = b3Fabs(m_el[0][1]);
b3Scalar v = b3Fabs(m_el[0][2]);
v = b3Fabs(m_el[1][2]);
// Convergence test: all off-diagonals below threshold * sum(|diagonal|).
b3Scalar t = threshold * (b3Fabs(m_el[0][0]) + b3Fabs(m_el[1][1]) + b3Fabs(m_el[2][2]));
if (max <= B3_EPSILON * t)
// compute Jacobi rotation J which leads to a zero for element [p][q]
b3Scalar mpq = m_el[p][q];
b3Scalar theta = (m_el[q][q] - m_el[p][p]) / (2 * mpq);
b3Scalar theta2 = theta * theta;
if (theta2 * theta2 < b3Scalar(10 / B3_EPSILON))
// Numerically stable form of tan of the rotation angle.
t = (theta >= 0) ? 1 / (theta + b3Sqrt(1 + theta2))
: 1 / (theta - b3Sqrt(1 + theta2));
cos = 1 / b3Sqrt(1 + t * t);
// approximation for large theta-value, i.e., a nearly diagonal matrix
t = 1 / (theta * (2 + b3Scalar(0.5) / theta2));
cos = 1 - b3Scalar(0.5) * t * t;
// apply rotation to matrix (this = J^T * this * J)
m_el[p][q] = m_el[q][p] = 0;
m_el[p][p] -= t * mpq;
m_el[q][q] += t * mpq;
b3Scalar mrp = m_el[r][p];
b3Scalar mrq = m_el[r][q];
m_el[r][p] = m_el[p][r] = cos * mrp - sin * mrq;
m_el[r][q] = m_el[q][r] = cos * mrq + sin * mrp;
// apply rotation to rot (rot = rot * J)
for (int i = 0; i < 3; i++)
b3Vector3& row = rot[i];
row[p] = cos * mrp - sin * mrq;
row[q] = cos * mrq + sin * mrp;
/**@brief Calculate the matrix cofactor
* @param r1 The first row to use for calculating the cofactor
* @param c1 The first column to use for calculating the cofactor
* @param r2 The second row to use for calculating the cofactor
* @param c2 The second column to use for calculating the cofactor
* See http://en.wikipedia.org/wiki/Cofactor_(linear_algebra) for more details */
b3Scalar cofac(int r1, int c1, int r2, int c2) const
// 2x2 determinant of the sub-matrix selected by the two rows/columns.
return m_el[r1][c1] * m_el[r2][c2] - m_el[r1][c2] * m_el[r2][c1];
/**@brief Write this matrix into the precision-selected serialization struct */
void serialize(struct b3Matrix3x3Data & dataOut) const;
/**@brief Write this matrix into the float serialization struct */
void serializeFloat(struct b3Matrix3x3FloatData & dataOut) const;
/**@brief Read this matrix from the precision-selected serialization struct */
void deSerialize(const struct b3Matrix3x3Data& dataIn);
/**@brief Read this matrix from the float serialization struct */
void deSerializeFloat(const struct b3Matrix3x3FloatData& dataIn);
/**@brief Read this matrix from the double serialization struct */
void deSerializeDouble(const struct b3Matrix3x3DoubleData& dataIn);
/**@brief In-place matrix multiply: this = this * m */
B3_FORCE_INLINE b3Matrix3x3&
b3Matrix3x3::operator*=(const b3Matrix3x3& m)
#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
// SSE path: each result row = sum over k of this[row][k] * m[k],
// implemented by splatting each row component across a register.
__m128 rv00, rv01, rv02;
__m128 rv10, rv11, rv12;
__m128 rv20, rv21, rv22;
__m128 mv0, mv1, mv2;
rv02 = m_el[0].mVec128;
rv12 = m_el[1].mVec128;
rv22 = m_el[2].mVec128;
// Mask off the unused w lane of m's rows.
mv0 = _mm_and_ps(m[0].mVec128, b3vFFF0fMask);
mv1 = _mm_and_ps(m[1].mVec128, b3vFFF0fMask);
mv2 = _mm_and_ps(m[2].mVec128, b3vFFF0fMask);
rv00 = b3_splat_ps(rv02, 0);
rv01 = b3_splat_ps(rv02, 1);
rv02 = b3_splat_ps(rv02, 2);
rv00 = _mm_mul_ps(rv00, mv0);
rv01 = _mm_mul_ps(rv01, mv1);
rv02 = _mm_mul_ps(rv02, mv2);
rv10 = b3_splat_ps(rv12, 0);
rv11 = b3_splat_ps(rv12, 1);
rv12 = b3_splat_ps(rv12, 2);
rv10 = _mm_mul_ps(rv10, mv0);
rv11 = _mm_mul_ps(rv11, mv1);
rv12 = _mm_mul_ps(rv12, mv2);
rv20 = b3_splat_ps(rv22, 0);
rv21 = b3_splat_ps(rv22, 1);
rv22 = b3_splat_ps(rv22, 2);
rv20 = _mm_mul_ps(rv20, mv0);
rv21 = _mm_mul_ps(rv21, mv1);
rv22 = _mm_mul_ps(rv22, mv2);
rv00 = _mm_add_ps(rv00, rv01);
rv10 = _mm_add_ps(rv10, rv11);
rv20 = _mm_add_ps(rv20, rv21);
m_el[0].mVec128 = _mm_add_ps(rv00, rv02);
m_el[1].mVec128 = _mm_add_ps(rv10, rv12);
m_el[2].mVec128 = _mm_add_ps(rv20, rv22);
#elif defined(B3_USE_NEON)
// NEON path: same accumulation using fused multiply-accumulate per lane.
float32x4_t rv0, rv1, rv2;
float32x4_t v0, v1, v2;
float32x4_t mv0, mv1, mv2;
v0 = m_el[0].mVec128;
v1 = m_el[1].mVec128;
v2 = m_el[2].mVec128;
mv0 = (float32x4_t)vandq_s32((int32x4_t)m[0].mVec128, b3vFFF0Mask);
mv1 = (float32x4_t)vandq_s32((int32x4_t)m[1].mVec128, b3vFFF0Mask);
mv2 = (float32x4_t)vandq_s32((int32x4_t)m[2].mVec128, b3vFFF0Mask);
rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);
rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);
rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);
m_el[0].mVec128 = rv0;
m_el[1].mVec128 = rv1;
m_el[2].mVec128 = rv2;
// Scalar path: row i of the product is row i of this dotted with columns of m.
m.tdotx(m_el[0]), m.tdoty(m_el[0]), m.tdotz(m_el[0]),
m.tdotx(m_el[1]), m.tdoty(m_el[1]), m.tdotz(m_el[1]),
m.tdotx(m_el[2]), m.tdoty(m_el[2]), m.tdotz(m_el[2]));
/**@brief In-place element-wise matrix addition: this = this + m */
B3_FORCE_INLINE b3Matrix3x3&
b3Matrix3x3::operator+=(const b3Matrix3x3& m)
#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
// SIMD: whole-row vector additions.
m_el[0].mVec128 = m_el[0].mVec128 + m.m_el[0].mVec128;
m_el[1].mVec128 = m_el[1].mVec128 + m.m_el[1].mVec128;
m_el[2].mVec128 = m_el[2].mVec128 + m.m_el[2].mVec128;
// Scalar fallback: element-wise addition.
m_el[0][0] + m.m_el[0][0],
m_el[0][1] + m.m_el[0][1],
m_el[0][2] + m.m_el[0][2],
m_el[1][0] + m.m_el[1][0],
m_el[1][1] + m.m_el[1][1],
m_el[1][2] + m.m_el[1][2],
m_el[2][0] + m.m_el[2][0],
m_el[2][1] + m.m_el[2][1],
m_el[2][2] + m.m_el[2][2]);
/**@brief Return the matrix m with every element multiplied by scalar k */
B3_FORCE_INLINE b3Matrix3x3
operator*(const b3Matrix3x3& m, const b3Scalar& k)
#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
// Broadcast k into all lanes, then scale each row with one multiply.
__m128 vk = b3_splat_ps(_mm_load_ss((float*)&k), 0x80);
_mm_mul_ps(m[0].mVec128, vk),
_mm_mul_ps(m[1].mVec128, vk),
_mm_mul_ps(m[2].mVec128, vk));
#elif defined(B3_USE_NEON)
vmulq_n_f32(m[0].mVec128, k),
vmulq_n_f32(m[1].mVec128, k),
vmulq_n_f32(m[2].mVec128, k));
// Scalar fallback.
m[0].getX() * k, m[0].getY() * k, m[0].getZ() * k,
m[1].getX() * k, m[1].getY() * k, m[1].getZ() * k,
m[2].getX() * k, m[2].getY() * k, m[2].getZ() * k);
/**@brief Component-wise matrix addition */
B3_FORCE_INLINE b3Matrix3x3
operator+(const b3Matrix3x3& m1, const b3Matrix3x3& m2)
#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
m1[0].mVec128 + m2[0].mVec128,
m1[1].mVec128 + m2[1].mVec128,
m1[2].mVec128 + m2[2].mVec128);
m1[2][2] + m2[2][2]);
/**@brief Component-wise matrix subtraction */
B3_FORCE_INLINE b3Matrix3x3
operator-(const b3Matrix3x3& m1, const b3Matrix3x3& m2)
#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
m1[0].mVec128 - m2[0].mVec128,
m1[1].mVec128 - m2[1].mVec128,
m1[2].mVec128 - m2[2].mVec128);
m1[2][2] - m2[2][2]);
/**@brief In-place element-wise matrix subtraction: this = this - m */
B3_FORCE_INLINE b3Matrix3x3&
b3Matrix3x3::operator-=(const b3Matrix3x3& m)
#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
// SIMD: whole-row vector subtractions.
m_el[0].mVec128 = m_el[0].mVec128 - m.m_el[0].mVec128;
m_el[1].mVec128 = m_el[1].mVec128 - m.m_el[1].mVec128;
m_el[2].mVec128 = m_el[2].mVec128 - m.m_el[2].mVec128;
// Scalar fallback: element-wise subtraction.
m_el[0][0] - m.m_el[0][0],
m_el[0][1] - m.m_el[0][1],
m_el[0][2] - m.m_el[0][2],
m_el[1][0] - m.m_el[1][0],
m_el[1][1] - m.m_el[1][1],
m_el[1][2] - m.m_el[1][2],
m_el[2][0] - m.m_el[2][0],
m_el[2][1] - m.m_el[2][1],
m_el[2][2] - m.m_el[2][2]);
/**@brief Determinant, computed as the scalar triple product of the three rows */
B3_FORCE_INLINE b3Scalar
b3Matrix3x3::determinant() const
return b3Triple((*this)[0], (*this)[1], (*this)[2]);
/**@brief Return a copy with the absolute value of every element */
B3_FORCE_INLINE b3Matrix3x3
b3Matrix3x3::absolute() const
#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
// Clear the IEEE sign bit of every lane with a mask.
_mm_and_ps(m_el[0].mVec128, b3vAbsfMask),
_mm_and_ps(m_el[1].mVec128, b3vAbsfMask),
_mm_and_ps(m_el[2].mVec128, b3vAbsfMask));
#elif defined(B3_USE_NEON)
(float32x4_t)vandq_s32((int32x4_t)m_el[0].mVec128, b3v3AbsMask),
(float32x4_t)vandq_s32((int32x4_t)m_el[1].mVec128, b3v3AbsMask),
(float32x4_t)vandq_s32((int32x4_t)m_el[2].mVec128, b3v3AbsMask));
b3Fabs(m_el[0].getX()), b3Fabs(m_el[0].getY()), b3Fabs(m_el[0].getZ()),
b3Fabs(m_el[1].getX()), b3Fabs(m_el[1].getY()), b3Fabs(m_el[1].getZ()),
b3Fabs(m_el[2].getX()), b3Fabs(m_el[2].getY()), b3Fabs(m_el[2].getZ()));
/**@brief Return the transpose of this matrix (SIMD paths also zero the padding w lane) */
B3_FORCE_INLINE b3Matrix3x3
b3Matrix3x3::transpose() const
#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
__m128 v0 = m_el[0].mVec128;
__m128 v1 = m_el[1].mVec128;
__m128 v2 = m_el[2].mVec128; // x2 y2 z2 w2
v2 = _mm_and_ps(v2, b3vFFF0fMask); // x2 y2 z2 0
vT = _mm_unpackhi_ps(v0, v1); // z0 z1 * *
v0 = _mm_unpacklo_ps(v0, v1); // x0 x1 y0 y1
v1 = _mm_shuffle_ps(v0, v2, B3_SHUFFLE(2, 3, 1, 3)); // y0 y1 y2 0
v0 = _mm_shuffle_ps(v0, v2, B3_SHUFFLE(0, 1, 0, 3)); // x0 x1 x2 0
v2 = b3CastdTo128f(_mm_move_sd(b3CastfTo128d(v2), b3CastfTo128d(vT))); // z0 z1 z2 0
return b3Matrix3x3(v0, v1, v2);
#elif defined(B3_USE_NEON)
// note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions.
static const uint32x2_t zMask = (const uint32x2_t){-1, 0};
float32x4x2_t top = vtrnq_f32(m_el[0].mVec128, m_el[1].mVec128); // {x0 x1 z0 z1}, {y0 y1 w0 w1}
float32x2x2_t bl = vtrn_f32(vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f)); // {x2 0 }, {y2 0}
float32x4_t v0 = vcombine_f32(vget_low_f32(top.val[0]), bl.val[0]);
float32x4_t v1 = vcombine_f32(vget_low_f32(top.val[1]), bl.val[1]);
float32x2_t q = (float32x2_t)vand_u32((uint32x2_t)vget_high_f32(m_el[2].mVec128), zMask);
float32x4_t v2 = vcombine_f32(vget_high_f32(top.val[0]), q); // z0 z1 z2 0
return b3Matrix3x3(v0, v1, v2);
// Scalar fallback: build the transpose row-by-row from columns.
return b3Matrix3x3(m_el[0].getX(), m_el[1].getX(), m_el[2].getX(),
m_el[0].getY(), m_el[1].getY(), m_el[2].getY(),
m_el[0].getZ(), m_el[1].getZ(), m_el[2].getZ());
/**@brief Adjoint (adjugate): each entry is a cofactor of the transposed position */
B3_FORCE_INLINE b3Matrix3x3
b3Matrix3x3::adjoint() const
return b3Matrix3x3(cofac(1, 1, 2, 2), cofac(0, 2, 2, 1), cofac(0, 1, 1, 2),
cofac(1, 2, 2, 0), cofac(0, 0, 2, 2), cofac(0, 2, 1, 0),
cofac(1, 0, 2, 1), cofac(0, 1, 2, 0), cofac(0, 0, 1, 1));
/**@brief Inverse via the adjugate divided by the determinant (asserts det != 0) */
B3_FORCE_INLINE b3Matrix3x3
b3Matrix3x3::inverse() const
// First column of cofactors; dotting it with row 0 gives the determinant (cofactor expansion).
b3Vector3 co = b3MakeVector3(cofac(1, 1, 2, 2), cofac(1, 2, 2, 0), cofac(1, 0, 2, 1));
b3Scalar det = (*this)[0].dot(co);
b3FullAssert(det != b3Scalar(0.0));
b3Scalar s = b3Scalar(1.0) / det;
return b3Matrix3x3(co.getX() * s, cofac(0, 2, 2, 1) * s, cofac(0, 1, 1, 2) * s,
co.getY() * s, cofac(0, 0, 2, 2) * s, cofac(0, 2, 1, 0) * s,
co.getZ() * s, cofac(0, 1, 2, 0) * s, cofac(0, 0, 1, 1) * s);
/**@brief Return transpose(this) * m */
B3_FORCE_INLINE b3Matrix3x3
b3Matrix3x3::transposeTimes(const b3Matrix3x3& m) const
#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
// SSE path: accumulate m's rows weighted by the components of each row of this.
// static const __m128i xyzMask = (const __m128i){ -1ULL, 0xffffffffULL };
__m128 row = m_el[0].mVec128;
__m128 m0 = _mm_and_ps(m.getRow(0).mVec128, b3vFFF0fMask);
__m128 m1 = _mm_and_ps(m.getRow(1).mVec128, b3vFFF0fMask);
__m128 m2 = _mm_and_ps(m.getRow(2).mVec128, b3vFFF0fMask);
__m128 r0 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0));
__m128 r1 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0x55));
__m128 r2 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0xaa));
row = m_el[1].mVec128;
r0 = _mm_add_ps(r0, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0)));
r1 = _mm_add_ps(r1, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0x55)));
r2 = _mm_add_ps(r2, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0xaa)));
row = m_el[2].mVec128;
r0 = _mm_add_ps(r0, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0)));
r1 = _mm_add_ps(r1, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0x55)));
r2 = _mm_add_ps(r2, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0xaa)));
return b3Matrix3x3(r0, r1, r2);
#elif defined B3_USE_NEON
// NEON path: same accumulation with lane-wise multiply-accumulate.
static const uint32x4_t xyzMask = (const uint32x4_t){-1, -1, -1, 0};
float32x4_t m0 = (float32x4_t)vandq_u32((uint32x4_t)m.getRow(0).mVec128, xyzMask);
float32x4_t m1 = (float32x4_t)vandq_u32((uint32x4_t)m.getRow(1).mVec128, xyzMask);
float32x4_t m2 = (float32x4_t)vandq_u32((uint32x4_t)m.getRow(2).mVec128, xyzMask);
float32x4_t row = m_el[0].mVec128;
float32x4_t r0 = vmulq_lane_f32(m0, vget_low_f32(row), 0);
float32x4_t r1 = vmulq_lane_f32(m0, vget_low_f32(row), 1);
float32x4_t r2 = vmulq_lane_f32(m0, vget_high_f32(row), 0);
row = m_el[1].mVec128;
r0 = vmlaq_lane_f32(r0, m1, vget_low_f32(row), 0);
r1 = vmlaq_lane_f32(r1, m1, vget_low_f32(row), 1);
r2 = vmlaq_lane_f32(r2, m1, vget_high_f32(row), 0);
row = m_el[2].mVec128;
r0 = vmlaq_lane_f32(r0, m2, vget_low_f32(row), 0);
r1 = vmlaq_lane_f32(r1, m2, vget_low_f32(row), 1);
r2 = vmlaq_lane_f32(r2, m2, vget_high_f32(row), 0);
return b3Matrix3x3(r0, r1, r2);
// Scalar path: entry (i,j) = column i of this dotted with column j of m.
m_el[0].getX() * m[0].getX() + m_el[1].getX() * m[1].getX() + m_el[2].getX() * m[2].getX(),
m_el[0].getX() * m[0].getY() + m_el[1].getX() * m[1].getY() + m_el[2].getX() * m[2].getY(),
m_el[0].getX() * m[0].getZ() + m_el[1].getX() * m[1].getZ() + m_el[2].getX() * m[2].getZ(),
m_el[0].getY() * m[0].getX() + m_el[1].getY() * m[1].getX() + m_el[2].getY() * m[2].getX(),
m_el[0].getY() * m[0].getY() + m_el[1].getY() * m[1].getY() + m_el[2].getY() * m[2].getY(),
m_el[0].getY() * m[0].getZ() + m_el[1].getY() * m[1].getZ() + m_el[2].getY() * m[2].getZ(),
m_el[0].getZ() * m[0].getX() + m_el[1].getZ() * m[1].getX() + m_el[2].getZ() * m[2].getX(),
m_el[0].getZ() * m[0].getY() + m_el[1].getZ() * m[1].getY() + m_el[2].getZ() * m[2].getY(),
m_el[0].getZ() * m[0].getZ() + m_el[1].getZ() * m[1].getZ() + m_el[2].getZ() * m[2].getZ());
1086 B3_FORCE_INLINE b3Matrix3x3
1087 b3Matrix3x3::timesTranspose(const b3Matrix3x3& m) const
1089 #if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
1090 __m128 a0 = m_el[0].mVec128;
1091 __m128 a1 = m_el[1].mVec128;
1092 __m128 a2 = m_el[2].mVec128;
1094 b3Matrix3x3 mT = m.transpose(); // we rely on transpose() zeroing w channel so that we don't have to do it here
1095 __m128 mx = mT[0].mVec128;
1096 __m128 my = mT[1].mVec128;
1097 __m128 mz = mT[2].mVec128;
1099 __m128 r0 = _mm_mul_ps(mx, _mm_shuffle_ps(a0, a0, 0x00));
1100 __m128 r1 = _mm_mul_ps(mx, _mm_shuffle_ps(a1, a1, 0x00));
1101 __m128 r2 = _mm_mul_ps(mx, _mm_shuffle_ps(a2, a2, 0x00));
1102 r0 = _mm_add_ps(r0, _mm_mul_ps(my, _mm_shuffle_ps(a0, a0, 0x55)));
1103 r1 = _mm_add_ps(r1, _mm_mul_ps(my, _mm_shuffle_ps(a1, a1, 0x55)));
1104 r2 = _mm_add_ps(r2, _mm_mul_ps(my, _mm_shuffle_ps(a2, a2, 0x55)));
1105 r0 = _mm_add_ps(r0, _mm_mul_ps(mz, _mm_shuffle_ps(a0, a0, 0xaa)));
1106 r1 = _mm_add_ps(r1, _mm_mul_ps(mz, _mm_shuffle_ps(a1, a1, 0xaa)));
1107 r2 = _mm_add_ps(r2, _mm_mul_ps(mz, _mm_shuffle_ps(a2, a2, 0xaa)));
1108 return b3Matrix3x3(r0, r1, r2);
1110 #elif defined B3_USE_NEON
1111 float32x4_t a0 = m_el[0].mVec128;
1112 float32x4_t a1 = m_el[1].mVec128;
1113 float32x4_t a2 = m_el[2].mVec128;
1115 b3Matrix3x3 mT = m.transpose(); // we rely on transpose() zeroing w channel so that we don't have to do it here
1116 float32x4_t mx = mT[0].mVec128;
1117 float32x4_t my = mT[1].mVec128;
1118 float32x4_t mz = mT[2].mVec128;
1120 float32x4_t r0 = vmulq_lane_f32(mx, vget_low_f32(a0), 0);
1121 float32x4_t r1 = vmulq_lane_f32(mx, vget_low_f32(a1), 0);
1122 float32x4_t r2 = vmulq_lane_f32(mx, vget_low_f32(a2), 0);
1123 r0 = vmlaq_lane_f32(r0, my, vget_low_f32(a0), 1);
1124 r1 = vmlaq_lane_f32(r1, my, vget_low_f32(a1), 1);
1125 r2 = vmlaq_lane_f32(r2, my, vget_low_f32(a2), 1);
1126 r0 = vmlaq_lane_f32(r0, mz, vget_high_f32(a0), 0);
1127 r1 = vmlaq_lane_f32(r1, mz, vget_high_f32(a1), 0);
1128 r2 = vmlaq_lane_f32(r2, mz, vget_high_f32(a2), 0);
1129 return b3Matrix3x3(r0, r1, r2);
1133 m_el[0].dot(m[0]), m_el[0].dot(m[1]), m_el[0].dot(m[2]),
1134 m_el[1].dot(m[0]), m_el[1].dot(m[1]), m_el[1].dot(m[2]),
1135 m_el[2].dot(m[0]), m_el[2].dot(m[1]), m_el[2].dot(m[2]));
1139 B3_FORCE_INLINE b3Vector3
1140 operator*(const b3Matrix3x3& m, const b3Vector3& v)
1142 #if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
1143 return v.dot3(m[0], m[1], m[2]);
1145 return b3MakeVector3(m[0].dot(v), m[1].dot(v), m[2].dot(v));
1149 B3_FORCE_INLINE b3Vector3
1150 operator*(const b3Vector3& v, const b3Matrix3x3& m)
1152 #if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
1154 const __m128 vv = v.mVec128;
1156 __m128 c0 = b3_splat_ps(vv, 0);
1157 __m128 c1 = b3_splat_ps(vv, 1);
1158 __m128 c2 = b3_splat_ps(vv, 2);
1160 c0 = _mm_mul_ps(c0, _mm_and_ps(m[0].mVec128, b3vFFF0fMask));
1161 c1 = _mm_mul_ps(c1, _mm_and_ps(m[1].mVec128, b3vFFF0fMask));
1162 c0 = _mm_add_ps(c0, c1);
1163 c2 = _mm_mul_ps(c2, _mm_and_ps(m[2].mVec128, b3vFFF0fMask));
1165 return b3MakeVector3(_mm_add_ps(c0, c2));
1166 #elif defined(B3_USE_NEON)
1167 const float32x4_t vv = v.mVec128;
1168 const float32x2_t vlo = vget_low_f32(vv);
1169 const float32x2_t vhi = vget_high_f32(vv);
1171 float32x4_t c0, c1, c2;
1173 c0 = (float32x4_t)vandq_s32((int32x4_t)m[0].mVec128, b3vFFF0Mask);
1174 c1 = (float32x4_t)vandq_s32((int32x4_t)m[1].mVec128, b3vFFF0Mask);
1175 c2 = (float32x4_t)vandq_s32((int32x4_t)m[2].mVec128, b3vFFF0Mask);
1177 c0 = vmulq_lane_f32(c0, vlo, 0);
1178 c1 = vmulq_lane_f32(c1, vlo, 1);
1179 c2 = vmulq_lane_f32(c2, vhi, 0);
1180 c0 = vaddq_f32(c0, c1);
1181 c0 = vaddq_f32(c0, c2);
1183 return b3MakeVector3(c0);
1185 return b3MakeVector3(m.tdotx(v), m.tdoty(v), m.tdotz(v));
1189 B3_FORCE_INLINE b3Matrix3x3
1190 operator*(const b3Matrix3x3& m1, const b3Matrix3x3& m2)
1192 #if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
1194 __m128 m10 = m1[0].mVec128;
1195 __m128 m11 = m1[1].mVec128;
1196 __m128 m12 = m1[2].mVec128;
1198 __m128 m2v = _mm_and_ps(m2[0].mVec128, b3vFFF0fMask);
1200 __m128 c0 = b3_splat_ps(m10, 0);
1201 __m128 c1 = b3_splat_ps(m11, 0);
1202 __m128 c2 = b3_splat_ps(m12, 0);
1204 c0 = _mm_mul_ps(c0, m2v);
1205 c1 = _mm_mul_ps(c1, m2v);
1206 c2 = _mm_mul_ps(c2, m2v);
1208 m2v = _mm_and_ps(m2[1].mVec128, b3vFFF0fMask);
1210 __m128 c0_1 = b3_splat_ps(m10, 1);
1211 __m128 c1_1 = b3_splat_ps(m11, 1);
1212 __m128 c2_1 = b3_splat_ps(m12, 1);
1214 c0_1 = _mm_mul_ps(c0_1, m2v);
1215 c1_1 = _mm_mul_ps(c1_1, m2v);
1216 c2_1 = _mm_mul_ps(c2_1, m2v);
1218 m2v = _mm_and_ps(m2[2].mVec128, b3vFFF0fMask);
1220 c0 = _mm_add_ps(c0, c0_1);
1221 c1 = _mm_add_ps(c1, c1_1);
1222 c2 = _mm_add_ps(c2, c2_1);
1224 m10 = b3_splat_ps(m10, 2);
1225 m11 = b3_splat_ps(m11, 2);
1226 m12 = b3_splat_ps(m12, 2);
1228 m10 = _mm_mul_ps(m10, m2v);
1229 m11 = _mm_mul_ps(m11, m2v);
1230 m12 = _mm_mul_ps(m12, m2v);
1232 c0 = _mm_add_ps(c0, m10);
1233 c1 = _mm_add_ps(c1, m11);
1234 c2 = _mm_add_ps(c2, m12);
1236 return b3Matrix3x3(c0, c1, c2);
1238 #elif defined(B3_USE_NEON)
1240 float32x4_t rv0, rv1, rv2;
1241 float32x4_t v0, v1, v2;
1242 float32x4_t mv0, mv1, mv2;
1248 mv0 = (float32x4_t)vandq_s32((int32x4_t)m2[0].mVec128, b3vFFF0Mask);
1249 mv1 = (float32x4_t)vandq_s32((int32x4_t)m2[1].mVec128, b3vFFF0Mask);
1250 mv2 = (float32x4_t)vandq_s32((int32x4_t)m2[2].mVec128, b3vFFF0Mask);
1252 rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
1253 rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
1254 rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);
1256 rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
1257 rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
1258 rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);
1260 rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
1261 rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
1262 rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);
1264 return b3Matrix3x3(rv0, rv1, rv2);
1268 m2.tdotx(m1[0]), m2.tdoty(m1[0]), m2.tdotz(m1[0]),
1269 m2.tdotx(m1[1]), m2.tdoty(m1[1]), m2.tdotz(m1[1]),
1270 m2.tdotx(m1[2]), m2.tdoty(m1[2]), m2.tdotz(m1[2]));
1275 B3_FORCE_INLINE b3Matrix3x3 b3MultTransposeLeft(const b3Matrix3x3& m1, const b3Matrix3x3& m2) {
1277 m1[0][0] * m2[0][0] + m1[1][0] * m2[1][0] + m1[2][0] * m2[2][0],
1278 m1[0][0] * m2[0][1] + m1[1][0] * m2[1][1] + m1[2][0] * m2[2][1],
1279 m1[0][0] * m2[0][2] + m1[1][0] * m2[1][2] + m1[2][0] * m2[2][2],
1280 m1[0][1] * m2[0][0] + m1[1][1] * m2[1][0] + m1[2][1] * m2[2][0],
1281 m1[0][1] * m2[0][1] + m1[1][1] * m2[1][1] + m1[2][1] * m2[2][1],
1282 m1[0][1] * m2[0][2] + m1[1][1] * m2[1][2] + m1[2][1] * m2[2][2],
1283 m1[0][2] * m2[0][0] + m1[1][2] * m2[1][0] + m1[2][2] * m2[2][0],
1284 m1[0][2] * m2[0][1] + m1[1][2] * m2[1][1] + m1[2][2] * m2[2][1],
1285 m1[0][2] * m2[0][2] + m1[1][2] * m2[1][2] + m1[2][2] * m2[2][2]);
1289 /**@brief Equality operator between two matrices
1290 * It will test all elements are equal. */
1291 B3_FORCE_INLINE bool operator==(const b3Matrix3x3& m1, const b3Matrix3x3& m2)
1293 #if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
1297 c0 = _mm_cmpeq_ps(m1[0].mVec128, m2[0].mVec128);
1298 c1 = _mm_cmpeq_ps(m1[1].mVec128, m2[1].mVec128);
1299 c2 = _mm_cmpeq_ps(m1[2].mVec128, m2[2].mVec128);
1301 c0 = _mm_and_ps(c0, c1);
1302 c0 = _mm_and_ps(c0, c2);
1304 return (0x7 == _mm_movemask_ps((__m128)c0));
1306 return (m1[0][0] == m2[0][0] && m1[1][0] == m2[1][0] && m1[2][0] == m2[2][0] &&
1307 m1[0][1] == m2[0][1] && m1[1][1] == m2[1][1] && m1[2][1] == m2[2][1] &&
1308 m1[0][2] == m2[0][2] && m1[1][2] == m2[1][2] && m1[2][2] == m2[2][2]);
///for serialization: plain-old-data mirror of b3Matrix3x3 with single-precision rows
struct b3Matrix3x3FloatData
{
	b3Vector3FloatData m_el[3];  // the three matrix rows
};
///for serialization: plain-old-data mirror of b3Matrix3x3 with double-precision rows
struct b3Matrix3x3DoubleData
{
	b3Vector3DoubleData m_el[3];  // the three matrix rows
};
1324 B3_FORCE_INLINE void b3Matrix3x3::serialize(struct b3Matrix3x3Data& dataOut) const
1326 for (int i = 0; i < 3; i++)
1327 m_el[i].serialize(dataOut.m_el[i]);
1330 B3_FORCE_INLINE void b3Matrix3x3::serializeFloat(struct b3Matrix3x3FloatData& dataOut) const
1332 for (int i = 0; i < 3; i++)
1333 m_el[i].serializeFloat(dataOut.m_el[i]);
1336 B3_FORCE_INLINE void b3Matrix3x3::deSerialize(const struct b3Matrix3x3Data& dataIn)
1338 for (int i = 0; i < 3; i++)
1339 m_el[i].deSerialize(dataIn.m_el[i]);
1342 B3_FORCE_INLINE void b3Matrix3x3::deSerializeFloat(const struct b3Matrix3x3FloatData& dataIn)
1344 for (int i = 0; i < 3; i++)
1345 m_el[i].deSerializeFloat(dataIn.m_el[i]);
1348 B3_FORCE_INLINE void b3Matrix3x3::deSerializeDouble(const struct b3Matrix3x3DoubleData& dataIn)
1350 for (int i = 0; i < 3; i++)
1351 m_el[i].deSerializeDouble(dataIn.m_el[i]);
1354 #endif //B3_MATRIX3x3_H