2 * Copyright (c) 2023 Samsung Electronics Co., Ltd.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include <dali/internal/common/matrix-utils.h>
22 #include <cstdint> // uint32_t
23 #include <cstring> // memcpy
26 #include <dali/internal/render/common/performance-monitor.h>
27 #include <dali/public-api/math/matrix.h>
28 #include <dali/public-api/math/matrix3.h>
29 #include <dali/public-api/math/quaternion.h>
/// Number of bytes occupied by the 16 floats of a 4x4 matrix.
constexpr uint32_t NUM_BYTES_IN_MATRIX = static_cast<uint32_t>(16u * sizeof(float));
/// Number of bytes occupied by the 9 floats of a 3x3 matrix.
constexpr uint32_t NUM_BYTES_IN_MATRIX3 = static_cast<uint32_t>(9u * sizeof(float));
38 namespace Dali::Internal
40 using Internal::PerformanceMonitor;
46 void ConvertQuaternion(float*& result, const Dali::Quaternion& rotation)
48 MATH_INCREASE_COUNTER(PerformanceMonitor::QUATERNION_TO_MATRIX);
50 const float xx = rotation.mVector.x * rotation.mVector.x;
51 const float yy = rotation.mVector.y * rotation.mVector.y;
52 const float zz = rotation.mVector.z * rotation.mVector.z;
53 const float xy = rotation.mVector.x * rotation.mVector.y;
54 const float xz = rotation.mVector.x * rotation.mVector.z;
55 const float wx = rotation.mVector.w * rotation.mVector.x;
56 const float wy = rotation.mVector.w * rotation.mVector.y;
57 const float wz = rotation.mVector.w * rotation.mVector.z;
58 const float yz = rotation.mVector.y * rotation.mVector.z;
61 result[0] = 1.0f - 2.0f * (yy + zz);
62 result[1] = 2.0f * (xy + wz);
63 result[2] = 2.0f * (xz - wy);
66 result[4] = 2.0f * (xy - wz);
67 result[5] = 1.0f - 2.0f * (xx + zz);
68 result[6] = 2.0f * (yz + wx);
71 result[8] = 2.0f * (xz + wy);
72 result[9] = 2.0f * (yz - wx);
73 result[10]= 1.0f - 2.0f * (xx + yy);
85 void Multiply(Dali::Matrix& result, const Dali::Matrix& lhs, const Dali::Matrix& rhs)
87 MATH_INCREASE_COUNTER(PerformanceMonitor::MATRIX_MULTIPLYS);
88 MATH_INCREASE_BY(PerformanceMonitor::FLOAT_POINT_MULTIPLY, 64); // 64 = 16*4
90 float* temp = result.AsFloat();
91 const float* rhsPtr = rhs.AsFloat();
92 const float* lhsPtr = lhs.AsFloat();
96 for(int32_t i = 0; i < 4; i++)
98 // i<<2 gives the first vector / column
99 const int32_t loc0 = i << 2;
100 const int32_t loc1 = loc0 + 1;
101 const int32_t loc2 = loc0 + 2;
102 const int32_t loc3 = loc0 + 3;
104 const float value0 = lhsPtr[loc0];
105 const float value1 = lhsPtr[loc1];
106 const float value2 = lhsPtr[loc2];
107 const float value3 = lhsPtr[loc3];
109 temp[loc0] = (value0 * rhsPtr[0]) +
110 (value1 * rhsPtr[4]) +
111 (value2 * rhsPtr[8]) +
112 (value3 * rhsPtr[12]);
114 temp[loc1] = (value0 * rhsPtr[1]) +
115 (value1 * rhsPtr[5]) +
116 (value2 * rhsPtr[9]) +
117 (value3 * rhsPtr[13]);
119 temp[loc2] = (value0 * rhsPtr[2]) +
120 (value1 * rhsPtr[6]) +
121 (value2 * rhsPtr[10]) +
122 (value3 * rhsPtr[14]);
124 temp[loc3] = (value0 * rhsPtr[3]) +
125 (value1 * rhsPtr[7]) +
126 (value2 * rhsPtr[11]) +
127 (value3 * rhsPtr[15]);
132 // 64 32bit registers,
134 // d = 64 bit double-word d0 -d31
135 // q =128 bit quad-word q0 -q15 (enough to handle a column of 4 floats in a matrix)
136 // e.g. q0 = d0 and d1
138 // load and stores interleaved as NEON can load and store while calculating
140 "VLDM %1, {q0-q3} \n\t" // load matrix 1 (lhsPtr) q[0..q3]
141 "VLDM %0, {q8-q11} \n\t" // load matrix 2 (rhsPtr) q[q8-q11]
142 "VMUL.F32 q12, q8, d0[0] \n\t" // column 0 = rhsPtr[0..3] * lhsPtr[0..3]
143 "VMUL.F32 q13, q8, d2[0] \n\t" // column 1 = rhsPtr[0..3] * lhsPtr[4..7]
144 "VMUL.F32 q14, q8, d4[0] \n\t" // column 2 = rhsPtr[0..3] * lhsPtr[8..11]
145 "VMUL.F32 q15, q8, d6[0] \n\t" // column 3 = rhsPtr[0..3] * lhsPtr[12..15]
147 "VMLA.F32 q12, q9, d0[1] \n\t" // column 0 += rhsPtr[4..7] * lhsPtr[0..3]
148 "VMLA.F32 q13, q9, d2[1] \n\t" // column 1 += rhsPtr[4..7] * lhsPtr[4..7]
149 "VMLA.F32 q14, q9, d4[1] \n\t" // column 2 += rhsPtr[4..7] * lhsPtr[8..11]
150 "VMLA.F32 q15, q9, d6[1] \n\t" // column 3 += rhsPtr[4..7] * lhsPtr[12..15]
152 "VMLA.F32 q12, q10, d1[0] \n\t" // column 0 += rhsPtr[8..11] * lhsPtr[0..3]
153 "VMLA.F32 q13, q10, d3[0] \n\t" // column 1 += rhsPtr[8..11] * lhsPtr[4..7]
154 "VMLA.F32 q14, q10, d5[0] \n\t" // column 2 += rhsPtr[8..11] * lhsPtr[8..11]
155 "VMLA.F32 q15, q10, d7[0] \n\t" // column 3 += rhsPtr[8..11] * lhsPtr[12..15]
157 "VMLA.F32 q12, q11, d1[1] \n\t" // column 0 += rhsPtr[12..15] * lhsPtr[0..3]
158 "VMLA.F32 q13, q11, d3[1] \n\t" // column 1 += rhsPtr[12..15] * lhsPtr[4..7]
159 "VMLA.F32 q14, q11, d5[1] \n\t" // column 2 += rhsPtr[12..15] * lhsPtr[8..11]
160 "VMLA.F32 q15, q11, d7[1] \n\t" // column 3 += rhsPtr[12..15] * lhsPtr[12..15]
161 "VSTM %2, {q12-q15} \n\t" // store entire output matrix.
162 : "+r"(rhsPtr), "+r"(lhsPtr), "+r"(temp)
164 : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "memory");
169 void Multiply(Dali::Matrix& result, const Dali::Matrix& lhs, const Dali::Quaternion& rhs)
171 MATH_INCREASE_COUNTER(PerformanceMonitor::MATRIX_MULTIPLYS);
172 MATH_INCREASE_BY(PerformanceMonitor::FLOAT_POINT_MULTIPLY, 54); // 54 = 36+18
175 float* rhsPtr = &matrix[0];
176 ConvertQuaternion(rhsPtr, rhs);
178 // quaternion contains just rotation so it really only needs 3x3 matrix
180 float* temp = result.AsFloat();
181 const float* lhsPtr = lhs.AsFloat();
185 for(int32_t i = 0; i < 4; i++)
187 // i<<2 gives the first vector / column
188 const int32_t loc0 = i << 2;
189 const int32_t loc1 = loc0 + 1;
190 const int32_t loc2 = loc0 + 2;
191 const int32_t loc3 = loc0 + 3;
193 const float value0 = lhsPtr[loc0];
194 const float value1 = lhsPtr[loc1];
195 const float value2 = lhsPtr[loc2];
196 const float value3 = lhsPtr[loc3];
198 temp[loc0] = (value0 * rhsPtr[0]) +
199 (value1 * rhsPtr[4]) +
200 (value2 * rhsPtr[8]) +
201 (0.0f); //value3 * rhsPtr[12] is 0.0f
203 temp[loc1] = (value0 * rhsPtr[1]) +
204 (value1 * rhsPtr[5]) +
205 (value2 * rhsPtr[9]) +
206 (0.0f); //value3 * rhsPtr[13] is 0.0f
208 temp[loc2] = (value0 * rhsPtr[2]) +
209 (value1 * rhsPtr[6]) +
210 (value2 * rhsPtr[10]) +
211 (0.0f); //value3 * rhsPtr[14] is 0.0f
213 temp[loc3] = (0.0f) + //value0 * rhsPtr[3] is 0.0f
214 (0.0f) + //value1 * rhsPtr[7] is 0.0f
215 (0.0f) + //value2 * rhsPtr[11] is 0.0f
216 (value3); // rhsPtr[15] is 1.0f
221 // 64 32bit registers,
223 // d = 64 bit double-word d0 -d31
224 // q =128 bit quad-word q0 -q15 (enough to handle a column of 4 floats in a matrix)
225 // e.g. q0 = d0 and d1
226 // load and stores interleaved as NEON can load and store while calculating
228 "VLDM %1, {q4-q6} \n\t" // load matrix 1 (lhsPtr)
229 "VLD1.F32 {q7}, [%2]! \n\t" // load matrix 2 (rhsPtr) [0..3]
230 "VMUL.F32 q0, q7, d8[0] \n\t" // column 0 = rhsPtr[0..3] * lhsPtr[0..3]
231 "VMUL.F32 q1, q7, d10[0] \n\t" // column 1 = rhsPtr[0..3] * lhsPtr[4..7]
232 "VMUL.F32 q2, q7, d12[0] \n\t" // column 2 = rhsPtr[0..3] * lhsPtr[8..11]
233 "VLD1.F32 {q7}, [%2]! \n\t" // load matrix 2 (rhsPtr) [4..7]
234 "VMLA.F32 q0, q7, d8[1] \n\t" // column 0+= rhsPtr[4..7] * lhsPtr[0..3]
235 "VMLA.F32 q1, q7, d10[1] \n\t" // column 1+= rhsPtr[4..7] * lhsPtr[4..7]
236 "VMLA.F32 q2, q7, d12[1] \n\t" // column 2+= rhsPtr[4..7] * lhsPtr[8..11]
237 "VLD1.F32 {q7}, [%2]! \n\t" // load matrix 2 (rhsPtr) [8..11]
238 "VMLA.F32 q0, q7, d9[0] \n\t" // column 0+= rhsPtr[8..11] * lhsPtr[0..3]
239 "VMLA.F32 q1, q7, d11[0] \n\t" // column 1+= rhsPtr[8..11] * lhsPtr[4..7]
240 "VMLA.F32 q2, q7, d13[0] \n\t" // column 2+= rhsPtr[8..11] * lhsPtr[8..11]
241 "VSTM %0, {q0-q2} \n\t" // store entire output matrix.
243 : "r"(temp), "r"(lhsPtr), "r"(rhsPtr)
244 : "%r0", "%q0", "%q1", "%q2", "%q4", "%q5", "%q6", "%q7", "memory");
253 void MultiplyProjectionMatrix(Dali::Matrix& result, const Dali::Matrix& lhs, const Dali::Matrix& projection)
255 // TODO : Implement with NEON.
256 // Current NEON code is copy of Multiply.
258 MATH_INCREASE_COUNTER(PerformanceMonitor::MATRIX_MULTIPLYS);
259 MATH_INCREASE_BY(PerformanceMonitor::FLOAT_POINT_MULTIPLY, 40); // 40 = 10*4
261 float* temp = result.AsFloat();
262 const float* rhsPtr = projection.AsFloat();
263 const float* lhsPtr = lhs.AsFloat();
267 // We only use rhsPtr's 0, 1, 2, 4, 5, 6, 10, 11, 14, 15 index.
268 const float rhs0 = rhsPtr[0];
269 const float rhs1 = rhsPtr[1];
270 const float rhs2 = rhsPtr[2];
271 const float rhs4 = rhsPtr[4];
272 const float rhs5 = rhsPtr[5];
273 const float rhs6 = rhsPtr[6];
274 const float rhs10 = rhsPtr[10];
275 const float rhs11 = rhsPtr[11];
276 const float rhs14 = rhsPtr[14];
277 const float rhs15 = rhsPtr[15];
279 for(int32_t i = 0; i < 4; i++)
281 // i<<2 gives the first vector / column
282 const int32_t loc0 = i << 2;
283 const int32_t loc1 = loc0 + 1;
284 const int32_t loc2 = loc0 + 2;
285 const int32_t loc3 = loc0 + 3;
287 const float value0 = lhsPtr[loc0];
288 const float value1 = lhsPtr[loc1];
289 const float value2 = lhsPtr[loc2];
290 const float value3 = lhsPtr[loc3];
292 temp[loc0] = (value0 * rhs0) + (value1 * rhs4);
293 temp[loc1] = (value0 * rhs1) + (value1 * rhs5);
294 temp[loc2] = (value0 * rhs2) + (value1 * rhs6) + (value2 * rhs10) + (value3 * rhs14);
295 temp[loc3] = (value2 * rhs11) + (value3 * rhs15);
300 // 64 32bit registers,
302 // d = 64 bit double-word d0 -d31
303 // q =128 bit quad-word q0 -q15 (enough to handle a column of 4 floats in a matrix)
304 // e.g. q0 = d0 and d1
306 // load and stores interleaved as NEON can load and store while calculating
308 "VLDM %1, {q0-q3} \n\t" // load matrix 1 (lhsPtr) q[0..q3]
309 "VLDM %0, {q8-q11} \n\t" // load matrix 2 (rhsPtr) q[q8-q11]
310 "VMUL.F32 q12, q8, d0[0] \n\t" // column 0 = rhsPtr[0..3] * lhsPtr[0..3]
311 "VMUL.F32 q13, q8, d2[0] \n\t" // column 1 = rhsPtr[0..3] * lhsPtr[4..7]
312 "VMUL.F32 q14, q8, d4[0] \n\t" // column 2 = rhsPtr[0..3] * lhsPtr[8..11]
313 "VMUL.F32 q15, q8, d6[0] \n\t" // column 3 = rhsPtr[0..3] * lhsPtr[12..15]
315 "VMLA.F32 q12, q9, d0[1] \n\t" // column 0 += rhsPtr[4..7] * lhsPtr[0..3]
316 "VMLA.F32 q13, q9, d2[1] \n\t" // column 1 += rhsPtr[4..7] * lhsPtr[4..7]
317 "VMLA.F32 q14, q9, d4[1] \n\t" // column 2 += rhsPtr[4..7] * lhsPtr[8..11]
318 "VMLA.F32 q15, q9, d6[1] \n\t" // column 3 += rhsPtr[4..7] * lhsPtr[12..15]
320 "VMLA.F32 q12, q10, d1[0] \n\t" // column 0 += rhsPtr[8..11] * lhsPtr[0..3]
321 "VMLA.F32 q13, q10, d3[0] \n\t" // column 1 += rhsPtr[8..11] * lhsPtr[4..7]
322 "VMLA.F32 q14, q10, d5[0] \n\t" // column 2 += rhsPtr[8..11] * lhsPtr[8..11]
323 "VMLA.F32 q15, q10, d7[0] \n\t" // column 3 += rhsPtr[8..11] * lhsPtr[12..15]
325 "VMLA.F32 q12, q11, d1[1] \n\t" // column 0 += rhsPtr[12..15] * lhsPtr[0..3]
326 "VMLA.F32 q13, q11, d3[1] \n\t" // column 1 += rhsPtr[12..15] * lhsPtr[4..7]
327 "VMLA.F32 q14, q11, d5[1] \n\t" // column 2 += rhsPtr[12..15] * lhsPtr[8..11]
328 "VMLA.F32 q15, q11, d7[1] \n\t" // column 3 += rhsPtr[12..15] * lhsPtr[12..15]
329 "VSTM %2, {q12-q15} \n\t" // store entire output matrix.
330 : "+r"(rhsPtr), "+r"(lhsPtr), "+r"(temp)
332 : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "memory");
337 void MultiplyAssign(Dali::Matrix& result, const Dali::Matrix& rhs)
339 MATH_INCREASE_COUNTER(PerformanceMonitor::MATRIX_MULTIPLYS);
340 MATH_INCREASE_BY(PerformanceMonitor::FLOAT_POINT_MULTIPLY, 64); // 64 = 16*4
342 // TODO : Implement with NEON.
344 float* lhsPtr = result.AsFloat();
345 const float* rhsPtr = rhs.AsFloat();
346 float* temp = nullptr;
350 // If rhs is same matrix with result, we need to copy temperal vaules.
351 temp = static_cast<float*>(malloc(NUM_BYTES_IN_MATRIX));
352 memcpy(temp, rhsPtr, NUM_BYTES_IN_MATRIX);
356 // Calculate and store as row major.
357 for(int32_t i = 0; i < 4; i++)
359 const int32_t loc0 = i;
360 const int32_t loc1 = loc0 | 4;
361 const int32_t loc2 = loc0 | 8;
362 const int32_t loc3 = loc0 | 12;
364 const float value0 = lhsPtr[loc0];
365 const float value1 = lhsPtr[loc1];
366 const float value2 = lhsPtr[loc2];
367 const float value3 = lhsPtr[loc3];
369 lhsPtr[loc0] = (value0 * rhsPtr[0]) +
370 (value1 * rhsPtr[1]) +
371 (value2 * rhsPtr[2]) +
372 (value3 * rhsPtr[3]);
374 lhsPtr[loc1] = (value0 * rhsPtr[4]) +
375 (value1 * rhsPtr[5]) +
376 (value2 * rhsPtr[6]) +
377 (value3 * rhsPtr[7]);
379 lhsPtr[loc2] = (value0 * rhsPtr[8]) +
380 (value1 * rhsPtr[9]) +
381 (value2 * rhsPtr[10]) +
382 (value3 * rhsPtr[11]);
384 lhsPtr[loc3] = (value0 * rhsPtr[12]) +
385 (value1 * rhsPtr[13]) +
386 (value2 * rhsPtr[14]) +
387 (value3 * rhsPtr[15]);
392 // If we allocate temperal memory, we should free it.
399 void Multiply(Dali::Matrix3& result, const Dali::Matrix3& lhs, const Dali::Matrix3& rhs)
401 float* temp = result.AsFloat();
402 const float* rhsPtr = rhs.AsFloat();
403 const float* lhsPtr = lhs.AsFloat();
405 for(int32_t i = 0; i < 3; i++)
407 const int32_t loc0 = i * 3;
408 const int32_t loc1 = loc0 + 1;
409 const int32_t loc2 = loc0 + 2;
411 const float value0 = lhsPtr[loc0];
412 const float value1 = lhsPtr[loc1];
413 const float value2 = lhsPtr[loc2];
415 temp[loc0] = (value0 * rhsPtr[0]) +
416 (value1 * rhsPtr[3]) +
417 (value2 * rhsPtr[6]);
419 temp[loc1] = (value0 * rhsPtr[1]) +
420 (value1 * rhsPtr[4]) +
421 (value2 * rhsPtr[7]);
423 temp[loc2] = (value0 * rhsPtr[2]) +
424 (value1 * rhsPtr[5]) +
425 (value2 * rhsPtr[8]);
429 void MultiplyAssign(Dali::Matrix3& result, const Dali::Matrix3& rhs)
431 float* lhsPtr = result.AsFloat();
432 const float* rhsPtr = rhs.AsFloat();
433 float* temp = nullptr;
437 // If rhs is same matrix with result, we need to copy temperal vaules.
438 temp = static_cast<float*>(malloc(NUM_BYTES_IN_MATRIX3));
439 memcpy(temp, rhsPtr, NUM_BYTES_IN_MATRIX3);
443 // Calculate and store as row major.
444 for(int32_t i = 0; i < 3; i++)
446 const int32_t loc0 = i;
447 const int32_t loc1 = loc0 + 3;
448 const int32_t loc2 = loc0 + 6;
450 const float value0 = lhsPtr[loc0];
451 const float value1 = lhsPtr[loc1];
452 const float value2 = lhsPtr[loc2];
454 lhsPtr[loc0] = (value0 * rhsPtr[0]) +
455 (value1 * rhsPtr[1]) +
456 (value2 * rhsPtr[2]);
458 lhsPtr[loc1] = (value0 * rhsPtr[3]) +
459 (value1 * rhsPtr[4]) +
460 (value2 * rhsPtr[5]);
462 lhsPtr[loc2] = (value0 * rhsPtr[6]) +
463 (value1 * rhsPtr[7]) +
464 (value2 * rhsPtr[8]);
469 // If we allocate temperal memory, we should free it.
474 } // namespace MatrixUtils
475 } // namespace Dali::Internal