dali/internal/common/matrix-utils.cpp

   1 /*
   2  * Copyright (c) 2023 Samsung Electronics Co., Ltd.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  * http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  *
  16  */
  17
  18 // CLASS HEADERS
  19 #include <dali/internal/common/matrix-utils.h>
  20
  21 // EXTERNAL INCLUDES
  22 #include <cstdint> // uint32_t
  23 #include <cstring> // memcpy
  24
  25 // INTERNAL INCLUDE
  26 #include <dali/internal/render/common/performance-monitor.h>
  27 #include <dali/public-api/math/matrix.h>
  28 #include <dali/public-api/math/matrix3.h>
  29 #include <dali/public-api/math/quaternion.h>
  30
  31 namespace
  32 {
  33 const uint32_t NUM_BYTES_IN_MATRIX(16 * sizeof(float));
  34 const uint32_t NUM_BYTES_IN_MATRIX3(9 * sizeof(float));
  35
  36 } // namespace
  37
  38 namespace Dali::Internal
  39 {
  40 using Internal::PerformanceMonitor;
  41
  42 namespace MatrixUtils
  43 {
  44 // Dali::Quaternion
  45
  46 void ConvertQuaternion(float*& result, const Dali::Quaternion& rotation)
  47 {
  48   MATH_INCREASE_COUNTER(PerformanceMonitor::QUATERNION_TO_MATRIX);
  49
  50   const float xx = rotation.mVector.x * rotation.mVector.x;
  51   const float yy = rotation.mVector.y * rotation.mVector.y;
  52   const float zz = rotation.mVector.z * rotation.mVector.z;
  53   const float xy = rotation.mVector.x * rotation.mVector.y;
  54   const float xz = rotation.mVector.x * rotation.mVector.z;
  55   const float wx = rotation.mVector.w * rotation.mVector.x;
  56   const float wy = rotation.mVector.w * rotation.mVector.y;
  57   const float wz = rotation.mVector.w * rotation.mVector.z;
  58   const float yz = rotation.mVector.y * rotation.mVector.z;
  59
  60   // clang-format off
  61   result[0] = 1.0f - 2.0f * (yy + zz);
  62   result[1] =        2.0f * (xy + wz);
  63   result[2] =        2.0f * (xz - wy);
  64   result[3] = 0.0f;
  65
  66   result[4] =        2.0f * (xy - wz);
  67   result[5] = 1.0f - 2.0f * (xx + zz);
  68   result[6] =        2.0f * (yz + wx);
  69   result[7] = 0.0f;
  70
  71   result[8] =        2.0f * (xz + wy);
  72   result[9] =        2.0f * (yz - wx);
  73   result[10]= 1.0f - 2.0f * (xx + yy);
  74   result[11]= 0.0f;
  75
  76   result[12]= 0.0f;
  77   result[13]= 0.0f;
  78   result[14]= 0.0f;
  79   result[15]= 1.0f;
  80   // clang-format on
  81 }
  82
  83 // Dali::Matrix
  84
  85 void Multiply(Dali::Matrix& result, const Dali::Matrix& lhs, const Dali::Matrix& rhs)
  86 {
  87   MATH_INCREASE_COUNTER(PerformanceMonitor::MATRIX_MULTIPLYS);
  88   MATH_INCREASE_BY(PerformanceMonitor::FLOAT_POINT_MULTIPLY, 64); // 64 = 16*4
  89
  90   float*       temp   = result.AsFloat();
  91   const float* rhsPtr = rhs.AsFloat();
  92   const float* lhsPtr = lhs.AsFloat();
  93
  94 #ifndef __ARM_NEON__
  95
  96   for(int32_t i = 0; i < 4; i++)
  97   {
  98     // i<<2 gives the first vector / column
  99     const int32_t loc0 = i << 2;
 100     const int32_t loc1 = loc0 + 1;
 101     const int32_t loc2 = loc0 + 2;
 102     const int32_t loc3 = loc0 + 3;
 103
 104     const float value0 = lhsPtr[loc0];
 105     const float value1 = lhsPtr[loc1];
 106     const float value2 = lhsPtr[loc2];
 107     const float value3 = lhsPtr[loc3];
 108
 109     temp[loc0] = (value0 * rhsPtr[0]) +
 110                  (value1 * rhsPtr[4]) +
 111                  (value2 * rhsPtr[8]) +
 112                  (value3 * rhsPtr[12]);
 113
 114     temp[loc1] = (value0 * rhsPtr[1]) +
 115                  (value1 * rhsPtr[5]) +
 116                  (value2 * rhsPtr[9]) +
 117                  (value3 * rhsPtr[13]);
 118
 119     temp[loc2] = (value0 * rhsPtr[2]) +
 120                  (value1 * rhsPtr[6]) +
 121                  (value2 * rhsPtr[10]) +
 122                  (value3 * rhsPtr[14]);
 123
 124     temp[loc3] = (value0 * rhsPtr[3]) +
 125                  (value1 * rhsPtr[7]) +
 126                  (value2 * rhsPtr[11]) +
 127                  (value3 * rhsPtr[15]);
 128   }
 129
 130 #else
 131
 132   // 64 32bit registers,
 133   // aliased to
 134   // s = 32 bit single-word s0 -s63
 135   // d = 64 bit double-word d0 -d31
 136   // q =128 bit quad-word   q0 -q15  (enough to handle a column of 4 floats in a matrix)
 137   // e.g. q0 = d0 and d1 = s0, s1, s2, and s3
 138
 139   // load and stores interleaved as NEON can load and store while calculating
 140   asm volatile(
 141     "VLDM         %1,  {q0-q3}        \n\t" // load matrix 1 (lhsPtr) q[q0-q3]
 142     "VLDM         %0,  {q8-q11}       \n\t" // load matrix 2 (rhsPtr) q[q8-q11]
 143     "VMUL.F32     q12, q8, d0[0]      \n\t" // column 0 = rhsPtr[0..3] * lhsPtr[0]
 144     "VMUL.F32     q13, q8, d2[0]      \n\t" // column 1 = rhsPtr[0..3] * lhsPtr[4]
 145     "VMUL.F32     q14, q8, d4[0]      \n\t" // column 2 = rhsPtr[0..3] * lhsPtr[8]
 146     "VMUL.F32     q15, q8, d6[0]      \n\t" // column 3 = rhsPtr[0..3] * lhsPtr[12]
 147
 148     "VMLA.F32     q12, q9, d0[1]      \n\t" // column 0 += rhsPtr[4..7] * lhsPtr[1]
 149     "VMLA.F32     q13, q9, d2[1]      \n\t" // column 1 += rhsPtr[4..7] * lhsPtr[5]
 150     "VMLA.F32     q14, q9, d4[1]      \n\t" // column 2 += rhsPtr[4..7] * lhsPtr[9]
 151     "VMLA.F32     q15, q9, d6[1]      \n\t" // column 3 += rhsPtr[4..7] * lhsPtr[13]
 152
 153     "VMLA.F32     q12, q10, d1[0]     \n\t" // column 0 += rhsPtr[8..11] * lhsPtr[2]
 154     "VMLA.F32     q13, q10, d3[0]     \n\t" // column 1 += rhsPtr[8..11] * lhsPtr[6]
 155     "VMLA.F32     q14, q10, d5[0]     \n\t" // column 2 += rhsPtr[8..11] * lhsPtr[10]
 156     "VMLA.F32     q15, q10, d7[0]     \n\t" // column 3 += rhsPtr[8..11] * lhsPtr[14]
 157
 158     "VMLA.F32     q12, q11, d1[1]     \n\t" // column 0 += rhsPtr[12..15] * lhsPtr[3]
 159     "VMLA.F32     q13, q11, d3[1]     \n\t" // column 1 += rhsPtr[12..15] * lhsPtr[7]
 160     "VMLA.F32     q14, q11, d5[1]     \n\t" // column 2 += rhsPtr[12..15] * lhsPtr[11]
 161     "VMLA.F32     q15, q11, d7[1]     \n\t" // column 3 += rhsPtr[12..15] * lhsPtr[15]
 162     "VSTM         %2,  {q12-q15}      \n\t" // store entire output matrix.
 163     : "+r"(rhsPtr), "+r"(lhsPtr), "+r"(temp)
 164     :
 165     : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "memory");
 166
 167 #endif
 168 }
 169
 170 void Multiply(Dali::Matrix& result, const Dali::Matrix& lhs, const Dali::Quaternion& rhs)
 171 {
 172   MATH_INCREASE_COUNTER(PerformanceMonitor::MATRIX_MULTIPLYS);
 173   MATH_INCREASE_BY(PerformanceMonitor::FLOAT_POINT_MULTIPLY, 54); // 54 = 36+18
 174
 175   float  matrix[16];
 176   float* rhsPtr = &matrix[0];
 177   ConvertQuaternion(rhsPtr, rhs);
 178
 179   // quaternion contains just rotation so it really only needs 3x3 matrix
 180
 181   float*       temp   = result.AsFloat();
 182   const float* lhsPtr = lhs.AsFloat();
 183
 184 #ifndef __ARM_NEON__
 185
 186   for(int32_t i = 0; i < 4; i++)
 187   {
 188     // i<<2 gives the first vector / column
 189     const int32_t loc0 = i << 2;
 190     const int32_t loc1 = loc0 + 1;
 191     const int32_t loc2 = loc0 + 2;
 192     const int32_t loc3 = loc0 + 3;
 193
 194     const float value0 = lhsPtr[loc0];
 195     const float value1 = lhsPtr[loc1];
 196     const float value2 = lhsPtr[loc2];
 197     const float value3 = lhsPtr[loc3];
 198
 199     temp[loc0] = (value0 * rhsPtr[0]) +
 200                  (value1 * rhsPtr[4]) +
 201                  (value2 * rhsPtr[8]) +
 202                  (0.0f); //value3 * rhsPtr[12] is 0.0f
 203
 204     temp[loc1] = (value0 * rhsPtr[1]) +
 205                  (value1 * rhsPtr[5]) +
 206                  (value2 * rhsPtr[9]) +
 207                  (0.0f); //value3 * rhsPtr[13] is 0.0f
 208
 209     temp[loc2] = (value0 * rhsPtr[2]) +
 210                  (value1 * rhsPtr[6]) +
 211                  (value2 * rhsPtr[10]) +
 212                  (0.0f); //value3 * rhsPtr[14] is 0.0f
 213
 214     temp[loc3] = (0.0f) +  //value0 * rhsPtr[3] is 0.0f
 215                  (0.0f) +  //value1 * rhsPtr[7] is 0.0f
 216                  (0.0f) +  //value2 * rhsPtr[11] is 0.0f
 217                  (value3); // rhsPtr[15] is 1.0f
 218   }
 219
 220 #else
 221   // Store 4th row values that might be overwrited.
 222   const float value0 = lhsPtr[3];
 223   const float value1 = lhsPtr[7];
 224   const float value2 = lhsPtr[11];
 225   const float value3 = lhsPtr[15];
 226
 227   // 64 32bit registers,
 228   // aliased to
 229   // s = 32 bit single-word s0 -s63
 230   // d = 64 bit double-word d0 -d31
 231   // q =128 bit quad-word   q0 -q15  (enough to handle a column of 4 floats in a matrix)
 232   // e.g. q0 = d0 and d1 = s0, s1, s2, and s3
 233
 234   // load and stores interleaved as NEON can load and store while calculating
 235   asm volatile(
 236     "VLDM         %1,   {q0-q3}       \n\t" // load matrix 1 (lhsPtr) q[q0-q3]
 237     "VLD1.F32     {q8}, [%2]!         \n\t" // load matrix 2 (rhsPtr) [0..3]
 238     "VMUL.F32     q4,   q8,   d0[0]   \n\t" // column 0 = rhsPtr[0..3] * lhsPtr[0]
 239     "VMUL.F32     q5,   q8,   d2[0]   \n\t" // column 1 = rhsPtr[0..3] * lhsPtr[4]
 240     "VMUL.F32     q6,   q8,   d4[0]   \n\t" // column 2 = rhsPtr[0..3] * lhsPtr[8]
 241     "VMUL.F32     q7,   q8,   d6[0]   \n\t" // column 3 = rhsPtr[0..3] * lhsPtr[12]
 242     "VLD1.F32     {q8}, [%2]!         \n\t" // load matrix 2 (rhsPtr) [4..7]
 243     "VMLA.F32     q4,   q8,   d0[1]   \n\t" // column 0 += rhsPtr[4..7] * lhsPtr[1]
 244     "VMLA.F32     q5,   q8,   d2[1]   \n\t" // column 1 += rhsPtr[4..7] * lhsPtr[5]
 245     "VMLA.F32     q6,   q8,   d4[1]   \n\t" // column 2 += rhsPtr[4..7] * lhsPtr[9]
 246     "VMLA.F32     q7,   q8,   d6[1]   \n\t" // column 3 += rhsPtr[4..7] * lhsPtr[13]
 247     "VLD1.F32     {q8}, [%2]!         \n\t" // load matrix 2 (rhsPtr) [8..11]
 248     "VMLA.F32     q4,   q8,   d1[0]   \n\t" // column 0 += rhsPtr[8..11] * lhsPtr[2]
 249     "VMLA.F32     q5,   q8,   d3[0]   \n\t" // column 1 += rhsPtr[8..11] * lhsPtr[6]
 250     "VMLA.F32     q6,   q8,   d5[0]   \n\t" // column 2 += rhsPtr[8..11] * lhsPtr[10]
 251     "VMLA.F32     q7,   q8,   d7[0]   \n\t" // column 3 += rhsPtr[8..11] * lhsPtr[14]
 252     "VSTM         %0,   {q4-q7}       \n\t" // store entire output matrix.
 253     :
 254     : "r"(temp), "r"(lhsPtr), "r"(rhsPtr)
 255     : "%r0", "%q0", "%q1", "%q2", "%q3", "%q4", "%q5", "%q6", "%q7", "%q8", "memory");
 256
 257   // Restore 4th row values.
 258   temp[3]  = value0;
 259   temp[7]  = value1;
 260   temp[11] = value2;
 261   temp[15] = value3;
 262 #endif
 263 }
 264
 265 void MultiplyTransformMatrix(Dali::Matrix& result, const Dali::Matrix& lhs, const Dali::Matrix& rhs)
 266 {
 267   MATH_INCREASE_COUNTER(PerformanceMonitor::MATRIX_MULTIPLYS);
 268   MATH_INCREASE_BY(PerformanceMonitor::FLOAT_POINT_MULTIPLY, 36); // 36 = 9*4
 269
 270   float*       temp   = result.AsFloat();
 271   const float* rhsPtr = rhs.AsFloat();
 272   const float* lhsPtr = lhs.AsFloat();
 273
 274 #ifndef __ARM_NEON__
 275
 276   for(int32_t i = 0; i < 4; i++)
 277   {
 278     // i<<2 gives the first vector / column
 279     const int32_t loc0 = i << 2;
 280     const int32_t loc1 = loc0 + 1;
 281     const int32_t loc2 = loc0 + 2;
 282
 283     const float value0 = lhsPtr[loc0];
 284     const float value1 = lhsPtr[loc1];
 285     const float value2 = lhsPtr[loc2];
 286
 287     temp[loc0] = (value0 * rhsPtr[0]) +
 288                  (value1 * rhsPtr[4]) +
 289                  (value2 * rhsPtr[8]) +
 290                  (i == 3 ? rhsPtr[12] : 0.0f); // lhsPtr[loc3] is 0.0f, or 1.0f only if i == 3
 291
 292     temp[loc1] = (value0 * rhsPtr[1]) +
 293                  (value1 * rhsPtr[5]) +
 294                  (value2 * rhsPtr[9]) +
 295                  (i == 3 ? rhsPtr[13] : 0.0f); // lhsPtr[loc3] is 0.0f, or 1.0f only if i == 3
 296
 297     temp[loc2] = (value0 * rhsPtr[2]) +
 298                  (value1 * rhsPtr[6]) +
 299                  (value2 * rhsPtr[10]) +
 300                  (i == 3 ? rhsPtr[14] : 0.0f); // lhsPtr[loc3] is 0.0f, or 1.0f only if i == 3
 301   }
 302   temp[3] = temp[7] = temp[11] = 0.0f;
 303   temp[15]                     = 1.0f;
 304
 305 #else
 306
 307   // 64 32bit registers,
 308   // aliased to
 309   // s = 32 bit single-word s0 -s63
 310   // d = 64 bit double-word d0 -d31
 311   // q =128 bit quad-word   q0 -q15  (enough to handle a column of 4 floats in a matrix)
 312   // e.g. q0 = d0 and d1 = s0, s1, s2, and s3
 313
 314   // load and stores interleaved as NEON can load and store while calculating
 315   asm volatile(
 316     "VLDM         %1,  {q0-q3}        \n\t" // load matrix 1 (lhsPtr) q[q0-q3]
 317     "VLD1.F32     {q8}, [%2]!         \n\t" // load matrix 2 (rhsPtr) [0..3]
 318     "VMUL.F32     q12, q8, d0[0]      \n\t" // column 0 = rhsPtr[0..3] * lhsPtr[0]
 319     "VMUL.F32     q13, q8, d2[0]      \n\t" // column 1 = rhsPtr[0..3] * lhsPtr[4]
 320     "VMUL.F32     q14, q8, d4[0]      \n\t" // column 2 = rhsPtr[0..3] * lhsPtr[8]
 321     "VMUL.F32     q15, q8, d6[0]      \n\t" // column 3 = rhsPtr[0..3] * lhsPtr[12]
 322
 323     "VLD1.F32     {q8}, [%2]!         \n\t" // load matrix 2 (rhsPtr) [4..7]
 324     "VMLA.F32     q12, q8, d0[1]      \n\t" // column 0 += rhsPtr[4..7] * lhsPtr[1]
 325     "VMLA.F32     q13, q8, d2[1]      \n\t" // column 1 += rhsPtr[4..7] * lhsPtr[5]
 326     "VMLA.F32     q14, q8, d4[1]      \n\t" // column 2 += rhsPtr[4..7] * lhsPtr[9]
 327     "VMLA.F32     q15, q8, d6[1]      \n\t" // column 3 += rhsPtr[4..7] * lhsPtr[13]
 328
 329     "VLD1.F32     {q8}, [%2]!         \n\t" // load matrix 2 (rhsPtr) [8..11]
 330     "VMLA.F32     q12, q8, d1[0]      \n\t" // column 0 += rhsPtr[8..11] * lhsPtr[2]
 331     "VMLA.F32     q13, q8, d3[0]      \n\t" // column 1 += rhsPtr[8..11] * lhsPtr[6]
 332     "VMLA.F32     q14, q8, d5[0]      \n\t" // column 2 += rhsPtr[8..11] * lhsPtr[10]
 333     "VMLA.F32     q15, q8, d7[0]      \n\t" // column 3 += rhsPtr[8..11] * lhsPtr[14]
 334
 335     "VLD1.F32     {q8}, [%2]!         \n\t" // load matrix 2 (rhsPtr) [12..15]
 336     "VADD.F32     q15, q15, q8        \n\t" // column 3 = column3 + rhsPtr[12..15]
 337     "VSTM         %0,  {q12-q15}      \n\t" // store entire output matrix.
 338     :
 339     : "r"(temp), "r"(lhsPtr), "r"(rhsPtr)
 340     : "%r0", "q0", "q1", "q2", "q3", "q8", "q12", "q13", "q14", "q15", "memory");
 341
 342 #endif
 343 }
 344
 345 void MultiplyProjectionMatrix(Dali::Matrix& result, const Dali::Matrix& lhs, const Dali::Matrix& projection)
 346 {
 347   MATH_INCREASE_COUNTER(PerformanceMonitor::MATRIX_MULTIPLYS);
 348   MATH_INCREASE_BY(PerformanceMonitor::FLOAT_POINT_MULTIPLY, 32); // 32 = 8*4
 349
 350   float*       temp   = result.AsFloat();
 351   const float* rhsPtr = projection.AsFloat();
 352   const float* lhsPtr = lhs.AsFloat();
 353
 354 #ifndef __ARM_NEON__
 355
 356   // We only use rhsPtr's 0, 1, 2, 4, 5, 6, 10, 11, 14, 15 index.
 357   const float rhs0  = rhsPtr[0];
 358   const float rhs1  = rhsPtr[1];
 359   const float rhs2  = rhsPtr[2];
 360   const float rhs4  = rhsPtr[4];
 361   const float rhs5  = rhsPtr[5];
 362   const float rhs6  = rhsPtr[6];
 363   const float rhs10 = rhsPtr[10];
 364   const float rhs11 = rhsPtr[11];
 365   const float rhs14 = rhsPtr[14];
 366   const float rhs15 = rhsPtr[15];
 367
 368   for(int32_t i = 0; i < 4; i++)
 369   {
 370     // i<<2 gives the first vector / column
 371     const int32_t loc0 = i << 2;
 372     const int32_t loc1 = loc0 + 1;
 373     const int32_t loc2 = loc0 + 2;
 374     const int32_t loc3 = loc0 + 3;
 375
 376     const float value0 = lhsPtr[loc0];
 377     const float value1 = lhsPtr[loc1];
 378     const float value2 = lhsPtr[loc2];
 379
 380     temp[loc0] = (value0 * rhs0) + (value1 * rhs4);
 381     temp[loc1] = (value0 * rhs1) + (value1 * rhs5);
 382     temp[loc2] = (value0 * rhs2) + (value1 * rhs6) + (value2 * rhs10) + (i == 3 ? rhs14 : 0.0f);
 383     temp[loc3] = (value2 * rhs11) + (i == 3 ? rhs15 : 0.0f);
 384   }
 385
 386 #else
 387
 388   // 64 32bit registers,
 389   // aliased to
 390   // s = 32 bit single-word s0 -s63
 391   // d = 64 bit double-word d0 -d31
 392   // q =128 bit quad-word   q0 -q15  (enough to handle a column of 4 floats in a matrix)
 393   // e.g. q0 = d0 and d1 = s0, s1, s2, and s3
 394
 395   // load and stores interleaved as NEON can load and store while calculating
 396   asm volatile(
 397     "VLDM         %1,  {q0-q3}        \n\t" // load matrix 1 (lhsPtr) q[q0-q3]
 398     "VLD1.F32     {q8}, [%2]!         \n\t" // load matrix 2 (rhsPtr) [0..3]
 399     "VMUL.F32     q12, q8, d0[0]      \n\t" // column 0 = rhsPtr[0..3] * lhsPtr[0]
 400     "VMUL.F32     q13, q8, d2[0]      \n\t" // column 1 = rhsPtr[0..3] * lhsPtr[4]
 401     "VMUL.F32     q14, q8, d4[0]      \n\t" // column 2 = rhsPtr[0..3] * lhsPtr[8]
 402     "VMUL.F32     q15, q8, d6[0]      \n\t" // column 3 = rhsPtr[0..3] * lhsPtr[12]
 403
 404     "VLD1.F32     {q8}, [%2]!         \n\t" // load matrix 2 (rhsPtr) [4..7]
 405     "VMLA.F32     q12, q8, d0[1]      \n\t" // column 0 += rhsPtr[4..7] * lhsPtr[1]
 406     "VMLA.F32     q13, q8, d2[1]      \n\t" // column 1 += rhsPtr[4..7] * lhsPtr[5]
 407     "VMLA.F32     q14, q8, d4[1]      \n\t" // column 2 += rhsPtr[4..7] * lhsPtr[9]
 408     "VMLA.F32     q15, q8, d6[1]      \n\t" // column 3 += rhsPtr[4..7] * lhsPtr[13]
 409
 410     "VLD1.F32     {q8}, [%2]!         \n\t" // load matrix 2 (rhsPtr) [8..11]
 411     "VMLA.F32     d25, d17, d1[0]     \n\t" // column 0[2,3] += rhsPtr[10,11] * lhsPtr[2]
 412     "VMLA.F32     d27, d17, d3[0]     \n\t" // column 1[2,3] += rhsPtr[10,11] * lhsPtr[6]
 413     "VMLA.F32     d29, d17, d5[0]     \n\t" // column 2[2,3] += rhsPtr[10,11] * lhsPtr[10]
 414     "VMLA.F32     d31, d17, d7[0]     \n\t" // column 3[2,3] += rhsPtr[10,11] * lhsPtr[14]
 415
 416     "VLD1.F32     {q8}, [%2]!         \n\t" // load matrix 2 (rhsPtr) [12..15]
 417     "VADD.F32     d31, d31, d17       \n\t" // column 3[2,3] = column3[2,3] + rhsPtr[14,15]
 418     "VSTM         %0,  {q12-q15}      \n\t" // store entire output matrix.
 419     :
 420     : "r"(temp), "r"(lhsPtr), "r"(rhsPtr)
 421     : "%r0", "q0", "q1", "q2", "q3", "q8", "q12", "q13", "q14", "q15", "memory");
 422
 423 #endif
 424 }
 425
 426 void MultiplyAssign(Dali::Matrix& result, const Dali::Matrix& rhs)
 427 {
 428   MATH_INCREASE_COUNTER(PerformanceMonitor::MATRIX_MULTIPLYS);
 429   MATH_INCREASE_BY(PerformanceMonitor::FLOAT_POINT_MULTIPLY, 64); // 64 = 16*4
 430
 431 #ifndef __ARM_NEON__
 432
 433   float*       lhsPtr = result.AsFloat();
 434   const float* rhsPtr = rhs.AsFloat();
 435   float*       temp   = nullptr;
 436
 437   if(lhsPtr == rhsPtr)
 438   {
 439     // If rhs is same matrix with result, we need to copy temperal vaules.
 440     temp = static_cast<float*>(malloc(NUM_BYTES_IN_MATRIX));
 441     memcpy(temp, rhsPtr, NUM_BYTES_IN_MATRIX);
 442     rhsPtr = temp;
 443   }
 444
 445   // Calculate and store as row major.
 446   for(int32_t i = 0; i < 4; i++)
 447   {
 448     const int32_t loc0 = i;
 449     const int32_t loc1 = loc0 | 4;
 450     const int32_t loc2 = loc0 | 8;
 451     const int32_t loc3 = loc0 | 12;
 452
 453     const float value0 = lhsPtr[loc0];
 454     const float value1 = lhsPtr[loc1];
 455     const float value2 = lhsPtr[loc2];
 456     const float value3 = lhsPtr[loc3];
 457
 458     lhsPtr[loc0] = (value0 * rhsPtr[0]) +
 459                    (value1 * rhsPtr[1]) +
 460                    (value2 * rhsPtr[2]) +
 461                    (value3 * rhsPtr[3]);
 462
 463     lhsPtr[loc1] = (value0 * rhsPtr[4]) +
 464                    (value1 * rhsPtr[5]) +
 465                    (value2 * rhsPtr[6]) +
 466                    (value3 * rhsPtr[7]);
 467
 468     lhsPtr[loc2] = (value0 * rhsPtr[8]) +
 469                    (value1 * rhsPtr[9]) +
 470                    (value2 * rhsPtr[10]) +
 471                    (value3 * rhsPtr[11]);
 472
 473     lhsPtr[loc3] = (value0 * rhsPtr[12]) +
 474                    (value1 * rhsPtr[13]) +
 475                    (value2 * rhsPtr[14]) +
 476                    (value3 * rhsPtr[15]);
 477   }
 478
 479   if(temp)
 480   {
 481     // If we allocate temperal memory, we should free it.
 482     free(temp);
 483   }
 484
 485 #else
 486   // We store temperal values into register. Don't worry about overlap.
 487   // Copy normal Multiply code.
 488   // Becareful the name of pointer is crossed!
 489
 490   float*       temp   = result.AsFloat();
 491   const float* rhsPtr = result.AsFloat();
 492   const float* lhsPtr = rhs.AsFloat();
 493
 494   // 64 32bit registers,
 495   // aliased to
 496   // s = 32 bit single-word s0 -s63
 497   // d = 64 bit double-word d0 -d31
 498   // q =128 bit quad-word   q0 -q15  (enough to handle a column of 4 floats in a matrix)
 499   // e.g. q0 = d0 and d1 = s0, s1, s2, and s3
 500
 501   // load and stores interleaved as NEON can load and store while calculating
 502   asm volatile(
 503     "VLDM         %1,  {q0-q3}        \n\t" // load matrix 1 (lhsPtr) q[q0-q3]
 504     "VLDM         %0,  {q8-q11}       \n\t" // load matrix 2 (rhsPtr) q[q8-q11]
 505     "VMUL.F32     q12, q8, d0[0]      \n\t" // column 0 = rhsPtr[0..3] * lhsPtr[0]
 506     "VMUL.F32     q13, q8, d2[0]      \n\t" // column 1 = rhsPtr[0..3] * lhsPtr[4]
 507     "VMUL.F32     q14, q8, d4[0]      \n\t" // column 2 = rhsPtr[0..3] * lhsPtr[8]
 508     "VMUL.F32     q15, q8, d6[0]      \n\t" // column 3 = rhsPtr[0..3] * lhsPtr[12]
 509
 510     "VMLA.F32     q12, q9, d0[1]      \n\t" // column 0 += rhsPtr[4..7] * lhsPtr[1]
 511     "VMLA.F32     q13, q9, d2[1]      \n\t" // column 1 += rhsPtr[4..7] * lhsPtr[5]
 512     "VMLA.F32     q14, q9, d4[1]      \n\t" // column 2 += rhsPtr[4..7] * lhsPtr[9]
 513     "VMLA.F32     q15, q9, d6[1]      \n\t" // column 3 += rhsPtr[4..7] * lhsPtr[13]
 514
 515     "VMLA.F32     q12, q10, d1[0]     \n\t" // column 0 += rhsPtr[8..11] * lhsPtr[2]
 516     "VMLA.F32     q13, q10, d3[0]     \n\t" // column 1 += rhsPtr[8..11] * lhsPtr[6]
 517     "VMLA.F32     q14, q10, d5[0]     \n\t" // column 2 += rhsPtr[8..11] * lhsPtr[10]
 518     "VMLA.F32     q15, q10, d7[0]     \n\t" // column 3 += rhsPtr[8..11] * lhsPtr[14]
 519
 520     "VMLA.F32     q12, q11, d1[1]     \n\t" // column 0 += rhsPtr[12..15] * lhsPtr[3]
 521     "VMLA.F32     q13, q11, d3[1]     \n\t" // column 1 += rhsPtr[12..15] * lhsPtr[7]
 522     "VMLA.F32     q14, q11, d5[1]     \n\t" // column 2 += rhsPtr[12..15] * lhsPtr[11]
 523     "VMLA.F32     q15, q11, d7[1]     \n\t" // column 3 += rhsPtr[12..15] * lhsPtr[15]
 524     "VSTM         %2,  {q12-q15}      \n\t" // store entire output matrix.
 525     : "+r"(rhsPtr), "+r"(lhsPtr), "+r"(temp)
 526     :
 527     : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "memory");
 528
 529 #endif
 530 }
 531
 532 // Dali::Matrix3
 533
 534 void Multiply(Dali::Matrix3& result, const Dali::Matrix3& lhs, const Dali::Matrix3& rhs)
 535 {
 536   float*       temp   = result.AsFloat();
 537   const float* rhsPtr = rhs.AsFloat();
 538   const float* lhsPtr = lhs.AsFloat();
 539
 540   for(int32_t i = 0; i < 3; i++)
 541   {
 542     const int32_t loc0 = i * 3;
 543     const int32_t loc1 = loc0 + 1;
 544     const int32_t loc2 = loc0 + 2;
 545
 546     const float value0 = lhsPtr[loc0];
 547     const float value1 = lhsPtr[loc1];
 548     const float value2 = lhsPtr[loc2];
 549
 550     temp[loc0] = (value0 * rhsPtr[0]) +
 551                  (value1 * rhsPtr[3]) +
 552                  (value2 * rhsPtr[6]);
 553
 554     temp[loc1] = (value0 * rhsPtr[1]) +
 555                  (value1 * rhsPtr[4]) +
 556                  (value2 * rhsPtr[7]);
 557
 558     temp[loc2] = (value0 * rhsPtr[2]) +
 559                  (value1 * rhsPtr[5]) +
 560                  (value2 * rhsPtr[8]);
 561   }
 562 }
 563
 564 void MultiplyAssign(Dali::Matrix3& result, const Dali::Matrix3& rhs)
 565 {
 566   float*       lhsPtr = result.AsFloat();
 567   const float* rhsPtr = rhs.AsFloat();
 568   float*       temp   = nullptr;
 569
 570   if(lhsPtr == rhsPtr)
 571   {
 572     // If rhs is same matrix with result, we need to copy temperal vaules.
 573     temp = static_cast<float*>(malloc(NUM_BYTES_IN_MATRIX3));
 574     memcpy(temp, rhsPtr, NUM_BYTES_IN_MATRIX3);
 575     rhsPtr = temp;
 576   }
 577
 578   // Calculate and store as row major.
 579   for(int32_t i = 0; i < 3; i++)
 580   {
 581     const int32_t loc0 = i;
 582     const int32_t loc1 = loc0 + 3;
 583     const int32_t loc2 = loc0 + 6;
 584
 585     const float value0 = lhsPtr[loc0];
 586     const float value1 = lhsPtr[loc1];
 587     const float value2 = lhsPtr[loc2];
 588
 589     lhsPtr[loc0] = (value0 * rhsPtr[0]) +
 590                    (value1 * rhsPtr[1]) +
 591                    (value2 * rhsPtr[2]);
 592
 593     lhsPtr[loc1] = (value0 * rhsPtr[3]) +
 594                    (value1 * rhsPtr[4]) +
 595                    (value2 * rhsPtr[5]);
 596
 597     lhsPtr[loc2] = (value0 * rhsPtr[6]) +
 598                    (value1 * rhsPtr[7]) +
 599                    (value2 * rhsPtr[8]);
 600   }
 601
 602   if(temp)
 603   {
 604     // If we allocate temperal memory, we should free it.
 605     free(temp);
 606   }
 607 }
 608
 609 } // namespace MatrixUtils
 610 } // namespace Dali::Internal