2 * Copyright (c) 2023 Samsung Electronics Co., Ltd.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include <dali/internal/common/matrix-utils.h>
22 #include <cstdint> // uint32_t
23 #include <cstring> // memcpy
26 #include <dali/internal/render/common/performance-monitor.h>
27 #include <dali/public-api/math/matrix.h>
28 #include <dali/public-api/math/matrix3.h>
29 #include <dali/public-api/math/quaternion.h>
// Size in bytes of the float storage of a 4x4 matrix (16 floats) and of a 3x3 matrix (9 floats).
// Both are used as the copy length when MultiplyAssign has to duplicate an aliased operand.
33 const uint32_t NUM_BYTES_IN_MATRIX(16 * sizeof(float));
34 const uint32_t NUM_BYTES_IN_MATRIX3(9 * sizeof(float));
38 namespace Dali::Internal
40 using Internal::PerformanceMonitor; // lets the MATH_INCREASE_* macro arguments name PerformanceMonitor unqualified
/**
 * @brief Writes the rotation described by a quaternion into a float array that is indexed
 * with a stride of four floats per group (the layout the 4x4 multiply routines in this file use).
 *
 * The statements visible here fill indices 0-2, 4-6 and 8-10, i.e. the 3x3 rotation part.
 * NOTE(review): no writes to indices 3, 7, 11 or 12-15 are visible in this excerpt, yet the NEON
 * quaternion multiply loads four floats per group from this array — confirm those elements are set.
 *
 * @param[in,out] result Destination float array; the visible code writes up to index 10
 *                       (presumably a 16-float buffer — confirm with callers).
 * @param[in] rotation Quaternion whose mVector.x/y/z/w components are read.
 *                     Presumably expected to be normalised — TODO confirm.
 */
46 void ConvertQuaternion(float*& result, const Dali::Quaternion& rotation)
48 MATH_INCREASE_COUNTER(PerformanceMonitor::QUATERNION_TO_MATRIX);
// Pairwise products of the quaternion components, each computed once and reused below.
50 const float xx = rotation.mVector.x * rotation.mVector.x;
51 const float yy = rotation.mVector.y * rotation.mVector.y;
52 const float zz = rotation.mVector.z * rotation.mVector.z;
53 const float xy = rotation.mVector.x * rotation.mVector.y;
54 const float xz = rotation.mVector.x * rotation.mVector.z;
55 const float wx = rotation.mVector.w * rotation.mVector.x;
56 const float wy = rotation.mVector.w * rotation.mVector.y;
57 const float wz = rotation.mVector.w * rotation.mVector.z;
58 const float yz = rotation.mVector.y * rotation.mVector.z;
// First group of three rotation elements (indices 0-2).
61 result[0] = 1.0f - 2.0f * (yy + zz);
62 result[1] = 2.0f * (xy + wz);
63 result[2] = 2.0f * (xz - wy);
// Second group (indices 4-6).
66 result[4] = 2.0f * (xy - wz);
67 result[5] = 1.0f - 2.0f * (xx + zz);
68 result[6] = 2.0f * (yz + wx);
// Third group (indices 8-10).
71 result[8] = 2.0f * (xz + wy);
72 result[9] = 2.0f * (yz - wx);
73 result[10]= 1.0f - 2.0f * (xx + yy);
/**
 * @brief General 4x4 matrix multiply: result[4*i + j] = sum over k of lhs[4*i + k] * rhs[4*k + j].
 *
 * Two implementations are visible: a scalar loop and a block of ARM NEON assembly that performs
 * the same sixteen 4-term sums using quad-word registers.
 * NOTE(review): the lines that select between the two paths, and the statement that wraps the
 * assembly strings, are not visible in this excerpt.
 *
 * In the scalar loop each group of four lhs values is read before the matching four result values
 * are written, so result may share storage with lhs. rhs is re-read on every iteration after
 * earlier results were stored, so result must not share storage with rhs on that path.
 *
 * @param[out] result Receives the 16 product elements.
 * @param[in] lhs Matrix whose elements are read in groups of four consecutive floats.
 * @param[in] rhs Matrix whose elements are read with a stride of four.
 */
85 void Multiply(Dali::Matrix& result, const Dali::Matrix& lhs, const Dali::Matrix& rhs)
87 MATH_INCREASE_COUNTER(PerformanceMonitor::MATRIX_MULTIPLYS);
88 MATH_INCREASE_BY(PerformanceMonitor::FLOAT_POINT_MULTIPLY, 64); // 64 = 16*4
90 float* temp = result.AsFloat();
91 const float* rhsPtr = rhs.AsFloat();
92 const float* lhsPtr = lhs.AsFloat();
// Scalar path: one iteration per group of four consecutive floats.
96 for(int32_t i = 0; i < 4; i++)
98 // i<<2 gives the first vector / column
99 const int32_t loc0 = i << 2;
100 const int32_t loc1 = loc0 + 1;
101 const int32_t loc2 = loc0 + 2;
102 const int32_t loc3 = loc0 + 3;
// Cache the four lhs values first so that writing temp[loc0..loc3] cannot disturb them.
104 const float value0 = lhsPtr[loc0];
105 const float value1 = lhsPtr[loc1];
106 const float value2 = lhsPtr[loc2];
107 const float value3 = lhsPtr[loc3];
109 temp[loc0] = (value0 * rhsPtr[0]) +
110 (value1 * rhsPtr[4]) +
111 (value2 * rhsPtr[8]) +
112 (value3 * rhsPtr[12]);
114 temp[loc1] = (value0 * rhsPtr[1]) +
115 (value1 * rhsPtr[5]) +
116 (value2 * rhsPtr[9]) +
117 (value3 * rhsPtr[13]);
119 temp[loc2] = (value0 * rhsPtr[2]) +
120 (value1 * rhsPtr[6]) +
121 (value2 * rhsPtr[10]) +
122 (value3 * rhsPtr[14]);
124 temp[loc3] = (value0 * rhsPtr[3]) +
125 (value1 * rhsPtr[7]) +
126 (value2 * rhsPtr[11]) +
127 (value3 * rhsPtr[15]);
// NEON path: both inputs are loaded completely into registers before the single store,
// so the output storage is not written until all reads are done.
132 // 64 32bit registers,
134 // s = 32 bit single-word s0 -s63
135 // d = 64 bit double-word d0 -d31
136 // q =128 bit quad-word q0 -q15 (enough to handle a column of 4 floats in a matrix)
137 // e.g. q0 = d0 and d1 = s0, s1, s2, and s3
139 // load and stores interleaved as NEON can load and store while calculating
141 "VLDM %1, {q0-q3} \n\t" // load matrix 1 (lhsPtr) q[q0-q3]
142 "VLDM %0, {q8-q11} \n\t" // load matrix 2 (rhsPtr) q[q8-q11]
143 "VMUL.F32 q12, q8, d0[0] \n\t" // column 0 = rhsPtr[0..3] * lhsPtr[0]
144 "VMUL.F32 q13, q8, d2[0] \n\t" // column 1 = rhsPtr[0..3] * lhsPtr[4]
145 "VMUL.F32 q14, q8, d4[0] \n\t" // column 2 = rhsPtr[0..3] * lhsPtr[8]
146 "VMUL.F32 q15, q8, d6[0] \n\t" // column 3 = rhsPtr[0..3] * lhsPtr[12]
148 "VMLA.F32 q12, q9, d0[1] \n\t" // column 0 += rhsPtr[4..7] * lhsPtr[1]
149 "VMLA.F32 q13, q9, d2[1] \n\t" // column 1 += rhsPtr[4..7] * lhsPtr[5]
150 "VMLA.F32 q14, q9, d4[1] \n\t" // column 2 += rhsPtr[4..7] * lhsPtr[9]
151 "VMLA.F32 q15, q9, d6[1] \n\t" // column 3 += rhsPtr[4..7] * lhsPtr[13]
153 "VMLA.F32 q12, q10, d1[0] \n\t" // column 0 += rhsPtr[8..11] * lhsPtr[2]
154 "VMLA.F32 q13, q10, d3[0] \n\t" // column 1 += rhsPtr[8..11] * lhsPtr[6]
155 "VMLA.F32 q14, q10, d5[0] \n\t" // column 2 += rhsPtr[8..11] * lhsPtr[10]
156 "VMLA.F32 q15, q10, d7[0] \n\t" // column 3 += rhsPtr[8..11] * lhsPtr[14]
158 "VMLA.F32 q12, q11, d1[1] \n\t" // column 0 += rhsPtr[12..15] * lhsPtr[3]
159 "VMLA.F32 q13, q11, d3[1] \n\t" // column 1 += rhsPtr[12..15] * lhsPtr[7]
160 "VMLA.F32 q14, q11, d5[1] \n\t" // column 2 += rhsPtr[12..15] * lhsPtr[11]
161 "VMLA.F32 q15, q11, d7[1] \n\t" // column 3 += rhsPtr[12..15] * lhsPtr[15]
162 "VSTM %2, {q12-q15} \n\t" // store entire output matrix.
// Operand order fixes the numbering used above: %0 = rhsPtr, %1 = lhsPtr, %2 = temp.
163 : "+r"(rhsPtr), "+r"(lhsPtr), "+r"(temp)
// Every NEON register touched above is listed as clobbered, plus "memory" for the store.
165 : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "memory");
/**
 * @brief Multiplies a 4x4 matrix by the rotation described by a quaternion.
 *
 * The quaternion is first expanded by ConvertQuaternion() into a float array, then combined with
 * lhs. Only the 3x3 rotation part of that array takes part in the sums; the fourth element of
 * every group of four in lhs is carried over to result unchanged.
 *
 * Two implementations are visible: a scalar loop and a block of ARM NEON assembly.
 * NOTE(review): the declaration of `matrix`, the lines selecting between the two paths, the
 * statement wrapping the assembly strings and the statements that restore the saved 4th-row
 * values are not visible in this excerpt.
 *
 * @param[out] result Receives the 16 product elements.
 * @param[in] lhs Left-hand 4x4 matrix.
 * @param[in] rhs Rotation to apply.
 */
170 void Multiply(Dali::Matrix& result, const Dali::Matrix& lhs, const Dali::Quaternion& rhs)
172 MATH_INCREASE_COUNTER(PerformanceMonitor::MATRIX_MULTIPLYS);
173 MATH_INCREASE_BY(PerformanceMonitor::FLOAT_POINT_MULTIPLY, 54); // 54 = 36+18
// rhsPtr addresses the scratch array that ConvertQuaternion fills with the rotation.
176 float* rhsPtr = &matrix[0];
177 ConvertQuaternion(rhsPtr, rhs);
179 // quaternion contains just rotation so it really only needs 3x3 matrix
181 float* temp = result.AsFloat();
182 const float* lhsPtr = lhs.AsFloat();
// Scalar path: one iteration per group of four consecutive floats.
186 for(int32_t i = 0; i < 4; i++)
188 // i<<2 gives the first vector / column
189 const int32_t loc0 = i << 2;
190 const int32_t loc1 = loc0 + 1;
191 const int32_t loc2 = loc0 + 2;
192 const int32_t loc3 = loc0 + 3;
// Cache the four lhs values first so that writing temp[loc0..loc3] cannot disturb them.
194 const float value0 = lhsPtr[loc0];
195 const float value1 = lhsPtr[loc1];
196 const float value2 = lhsPtr[loc2];
197 const float value3 = lhsPtr[loc3];
199 temp[loc0] = (value0 * rhsPtr[0]) +
200 (value1 * rhsPtr[4]) +
201 (value2 * rhsPtr[8]) +
202 (0.0f); //value3 * rhsPtr[12] is 0.0f
204 temp[loc1] = (value0 * rhsPtr[1]) +
205 (value1 * rhsPtr[5]) +
206 (value2 * rhsPtr[9]) +
207 (0.0f); //value3 * rhsPtr[13] is 0.0f
209 temp[loc2] = (value0 * rhsPtr[2]) +
210 (value1 * rhsPtr[6]) +
211 (value2 * rhsPtr[10]) +
212 (0.0f); //value3 * rhsPtr[14] is 0.0f
214 temp[loc3] = (0.0f) + //value0 * rhsPtr[3] is 0.0f
215 (0.0f) + //value1 * rhsPtr[7] is 0.0f
216 (0.0f) + //value2 * rhsPtr[11] is 0.0f
217 (value3); // rhsPtr[15] is 1.0f
// NEON path: the assembly below never adds a rhsPtr[12..15] term, so the lhs values at
// indices 3, 7, 11 and 15 are saved here before the output store can overwrite them.
221 // Store 4th row values that might be overwritten.
222 const float value0 = lhsPtr[3];
223 const float value1 = lhsPtr[7];
224 const float value2 = lhsPtr[11];
225 const float value3 = lhsPtr[15];
227 // 64 32bit registers,
229 // s = 32 bit single-word s0 -s63
230 // d = 64 bit double-word d0 -d31
231 // q =128 bit quad-word q0 -q15 (enough to handle a column of 4 floats in a matrix)
232 // e.g. q0 = d0 and d1 = s0, s1, s2, and s3
234 // load and stores interleaved as NEON can load and store while calculating
// NOTE(review): "[%2]!" post-increments the register holding rhsPtr, but rhsPtr is passed with a
// plain "r" (input-only) constraint below; GCC documents that input-only operands must not be
// modified. Consider making it a "+r" operand — confirm on the target toolchain.
236 "VLDM %1, {q0-q3} \n\t" // load matrix 1 (lhsPtr) q[q0-q3]
237 "VLD1.F32 {q8}, [%2]! \n\t" // load matrix 2 (rhsPtr) [0..3]
238 "VMUL.F32 q4, q8, d0[0] \n\t" // column 0 = rhsPtr[0..3] * lhsPtr[0]
239 "VMUL.F32 q5, q8, d2[0] \n\t" // column 1 = rhsPtr[0..3] * lhsPtr[4]
240 "VMUL.F32 q6, q8, d4[0] \n\t" // column 2 = rhsPtr[0..3] * lhsPtr[8]
241 "VMUL.F32 q7, q8, d6[0] \n\t" // column 3 = rhsPtr[0..3] * lhsPtr[12]
242 "VLD1.F32 {q8}, [%2]! \n\t" // load matrix 2 (rhsPtr) [4..7]
243 "VMLA.F32 q4, q8, d0[1] \n\t" // column 0 += rhsPtr[4..7] * lhsPtr[1]
244 "VMLA.F32 q5, q8, d2[1] \n\t" // column 1 += rhsPtr[4..7] * lhsPtr[5]
245 "VMLA.F32 q6, q8, d4[1] \n\t" // column 2 += rhsPtr[4..7] * lhsPtr[9]
246 "VMLA.F32 q7, q8, d6[1] \n\t" // column 3 += rhsPtr[4..7] * lhsPtr[13]
247 "VLD1.F32 {q8}, [%2]! \n\t" // load matrix 2 (rhsPtr) [8..11]
248 "VMLA.F32 q4, q8, d1[0] \n\t" // column 0 += rhsPtr[8..11] * lhsPtr[2]
249 "VMLA.F32 q5, q8, d3[0] \n\t" // column 1 += rhsPtr[8..11] * lhsPtr[6]
250 "VMLA.F32 q6, q8, d5[0] \n\t" // column 2 += rhsPtr[8..11] * lhsPtr[10]
251 "VMLA.F32 q7, q8, d7[0] \n\t" // column 3 += rhsPtr[8..11] * lhsPtr[14]
252 "VSTM %0, {q4-q7} \n\t" // store entire output matrix.
// Operand order fixes the numbering used above: %0 = temp, %1 = lhsPtr, %2 = rhsPtr.
254 : "r"(temp), "r"(lhsPtr), "r"(rhsPtr)
255 : "%r0", "%q0", "%q1", "%q2", "%q3", "%q4", "%q5", "%q6", "%q7", "%q8", "memory");
257 // Restore 4th row values.
/**
 * @brief 4x4 multiply specialised for the case where the fourth element of each lhs group is
 * 0.0f, except in the last group where it is 1.0f (see the per-line comments in the loop).
 *
 * Under that assumption only three products per output element are needed; the rhs elements
 * 12..14 are added directly for the last group instead of being multiplied.
 *
 * Two implementations are visible: a scalar loop and a block of ARM NEON assembly.
 * NOTE(review): the lines selecting between the two paths and the statement wrapping the assembly
 * strings are not visible in this excerpt, and no assignment to temp[15] is visible on the scalar
 * path — confirm it is set.
 *
 * @param[out] result Receives the product.
 * @param[in] lhs Matrix assumed to have the fourth-element pattern described above.
 * @param[in] rhs Right-hand matrix; elements 0-2, 4-6, 8-10 and 12-14 are read on the scalar path.
 */
265 void MultiplyTransformMatrix(Dali::Matrix& result, const Dali::Matrix& lhs, const Dali::Matrix& rhs)
267 MATH_INCREASE_COUNTER(PerformanceMonitor::MATRIX_MULTIPLYS);
268 MATH_INCREASE_BY(PerformanceMonitor::FLOAT_POINT_MULTIPLY, 36); // 36 = 9*4
270 float* temp = result.AsFloat();
271 const float* rhsPtr = rhs.AsFloat();
272 const float* lhsPtr = lhs.AsFloat();
// Scalar path: one iteration per group of four consecutive floats; only three are computed here.
276 for(int32_t i = 0; i < 4; i++)
278 // i<<2 gives the first vector / column
279 const int32_t loc0 = i << 2;
280 const int32_t loc1 = loc0 + 1;
281 const int32_t loc2 = loc0 + 2;
// Cache the lhs values first so that writing temp[loc0..loc2] cannot disturb them.
283 const float value0 = lhsPtr[loc0];
284 const float value1 = lhsPtr[loc1];
285 const float value2 = lhsPtr[loc2];
287 temp[loc0] = (value0 * rhsPtr[0]) +
288 (value1 * rhsPtr[4]) +
289 (value2 * rhsPtr[8]) +
290 (i == 3 ? rhsPtr[12] : 0.0f); // lhsPtr[loc3] is 0.0f, or 1.0f only if i == 3
292 temp[loc1] = (value0 * rhsPtr[1]) +
293 (value1 * rhsPtr[5]) +
294 (value2 * rhsPtr[9]) +
295 (i == 3 ? rhsPtr[13] : 0.0f); // lhsPtr[loc3] is 0.0f, or 1.0f only if i == 3
297 temp[loc2] = (value0 * rhsPtr[2]) +
298 (value1 * rhsPtr[6]) +
299 (value2 * rhsPtr[10]) +
300 (i == 3 ? rhsPtr[14] : 0.0f); // lhsPtr[loc3] is 0.0f, or 1.0f only if i == 3
// The fourth element of the first three groups is written as a constant instead of being computed.
302 temp[3] = temp[7] = temp[11] = 0.0f;
307 // 64 32bit registers,
309 // s = 32 bit single-word s0 -s63
310 // d = 64 bit double-word d0 -d31
311 // q =128 bit quad-word q0 -q15 (enough to handle a column of 4 floats in a matrix)
312 // e.g. q0 = d0 and d1 = s0, s1, s2, and s3
314 // load and stores interleaved as NEON can load and store while calculating
// NOTE(review): "[%2]!" post-increments the register holding rhsPtr, but rhsPtr is passed with a
// plain "r" (input-only) constraint below; GCC documents that input-only operands must not be
// modified. Consider making it a "+r" operand — confirm on the target toolchain.
316 "VLDM %1, {q0-q3} \n\t" // load matrix 1 (lhsPtr) q[q0-q3]
317 "VLD1.F32 {q8}, [%2]! \n\t" // load matrix 2 (rhsPtr) [0..3]
318 "VMUL.F32 q12, q8, d0[0] \n\t" // column 0 = rhsPtr[0..3] * lhsPtr[0]
319 "VMUL.F32 q13, q8, d2[0] \n\t" // column 1 = rhsPtr[0..3] * lhsPtr[4]
320 "VMUL.F32 q14, q8, d4[0] \n\t" // column 2 = rhsPtr[0..3] * lhsPtr[8]
321 "VMUL.F32 q15, q8, d6[0] \n\t" // column 3 = rhsPtr[0..3] * lhsPtr[12]
323 "VLD1.F32 {q8}, [%2]! \n\t" // load matrix 2 (rhsPtr) [4..7]
324 "VMLA.F32 q12, q8, d0[1] \n\t" // column 0 += rhsPtr[4..7] * lhsPtr[1]
325 "VMLA.F32 q13, q8, d2[1] \n\t" // column 1 += rhsPtr[4..7] * lhsPtr[5]
326 "VMLA.F32 q14, q8, d4[1] \n\t" // column 2 += rhsPtr[4..7] * lhsPtr[9]
327 "VMLA.F32 q15, q8, d6[1] \n\t" // column 3 += rhsPtr[4..7] * lhsPtr[13]
329 "VLD1.F32 {q8}, [%2]! \n\t" // load matrix 2 (rhsPtr) [8..11]
330 "VMLA.F32 q12, q8, d1[0] \n\t" // column 0 += rhsPtr[8..11] * lhsPtr[2]
331 "VMLA.F32 q13, q8, d3[0] \n\t" // column 1 += rhsPtr[8..11] * lhsPtr[6]
332 "VMLA.F32 q14, q8, d5[0] \n\t" // column 2 += rhsPtr[8..11] * lhsPtr[10]
333 "VMLA.F32 q15, q8, d7[0] \n\t" // column 3 += rhsPtr[8..11] * lhsPtr[14]
335 "VLD1.F32 {q8}, [%2]! \n\t" // load matrix 2 (rhsPtr) [12..15]
336 "VADD.F32 q15, q15, q8 \n\t" // column 3 = column3 + rhsPtr[12..15]
337 "VSTM %0, {q12-q15} \n\t" // store entire output matrix.
// Operand order fixes the numbering used above: %0 = temp, %1 = lhsPtr, %2 = rhsPtr.
339 : "r"(temp), "r"(lhsPtr), "r"(rhsPtr)
340 : "%r0", "q0", "q1", "q2", "q3", "q8", "q12", "q13", "q14", "q15", "memory");
/**
 * @brief 4x4 multiply specialised for a right-hand matrix of which only elements
 * 0, 1, 2, 4, 5, 6, 10, 11, 14 and 15 are used (the other elements are never read on the scalar
 * path, i.e. they are treated as zero).
 *
 * The fourth element of each lhs group is also not read: the `i == 3` terms add rhs 14 / 15
 * directly, which matches an lhs whose fourth elements are 0, 0, 0, 1 — presumably a transform
 * matrix; confirm with callers.
 *
 * Two implementations are visible: a scalar loop and a block of ARM NEON assembly.
 * NOTE(review): the lines selecting between the two paths and the statement wrapping the assembly
 * strings are not visible in this excerpt.
 *
 * @param[out] result Receives the 16 product elements.
 * @param[in] lhs Left-hand matrix (see the assumption above).
 * @param[in] projection Right-hand matrix with the sparse layout described above.
 */
345 void MultiplyProjectionMatrix(Dali::Matrix& result, const Dali::Matrix& lhs, const Dali::Matrix& projection)
347 MATH_INCREASE_COUNTER(PerformanceMonitor::MATRIX_MULTIPLYS);
348 MATH_INCREASE_BY(PerformanceMonitor::FLOAT_POINT_MULTIPLY, 32); // 32 = 8*4
350 float* temp = result.AsFloat();
351 const float* rhsPtr = projection.AsFloat();
352 const float* lhsPtr = lhs.AsFloat();
// Scalar path: the ten rhs values are copied into locals before the loop writes to temp.
356 // We only use rhsPtr's 0, 1, 2, 4, 5, 6, 10, 11, 14, 15 index.
357 const float rhs0 = rhsPtr[0];
358 const float rhs1 = rhsPtr[1];
359 const float rhs2 = rhsPtr[2];
360 const float rhs4 = rhsPtr[4];
361 const float rhs5 = rhsPtr[5];
362 const float rhs6 = rhsPtr[6];
363 const float rhs10 = rhsPtr[10];
364 const float rhs11 = rhsPtr[11];
365 const float rhs14 = rhsPtr[14];
366 const float rhs15 = rhsPtr[15];
368 for(int32_t i = 0; i < 4; i++)
370 // i<<2 gives the first vector / column
371 const int32_t loc0 = i << 2;
372 const int32_t loc1 = loc0 + 1;
373 const int32_t loc2 = loc0 + 2;
374 const int32_t loc3 = loc0 + 3;
// Cache the lhs values first so that writing temp[loc0..loc3] cannot disturb them.
376 const float value0 = lhsPtr[loc0];
377 const float value1 = lhsPtr[loc1];
378 const float value2 = lhsPtr[loc2];
// Terms whose rhs factor is outside the used set are simply omitted.
380 temp[loc0] = (value0 * rhs0) + (value1 * rhs4);
381 temp[loc1] = (value0 * rhs1) + (value1 * rhs5);
382 temp[loc2] = (value0 * rhs2) + (value1 * rhs6) + (value2 * rhs10) + (i == 3 ? rhs14 : 0.0f);
383 temp[loc3] = (value2 * rhs11) + (i == 3 ? rhs15 : 0.0f);
388 // 64 32bit registers,
390 // s = 32 bit single-word s0 -s63
391 // d = 64 bit double-word d0 -d31
392 // q =128 bit quad-word q0 -q15 (enough to handle a column of 4 floats in a matrix)
393 // e.g. q0 = d0 and d1 = s0, s1, s2, and s3
395 // load and stores interleaved as NEON can load and store while calculating
// NOTE(review): "[%2]!" post-increments the register holding rhsPtr, but rhsPtr is passed with a
// plain "r" (input-only) constraint below; GCC documents that input-only operands must not be
// modified. Consider making it a "+r" operand — confirm on the target toolchain.
397 "VLDM %1, {q0-q3} \n\t" // load matrix 1 (lhsPtr) q[q0-q3]
398 "VLD1.F32 {q8}, [%2]! \n\t" // load matrix 2 (rhsPtr) [0..3]
399 "VMUL.F32 q12, q8, d0[0] \n\t" // column 0 = rhsPtr[0..3] * lhsPtr[0]
400 "VMUL.F32 q13, q8, d2[0] \n\t" // column 1 = rhsPtr[0..3] * lhsPtr[4]
401 "VMUL.F32 q14, q8, d4[0] \n\t" // column 2 = rhsPtr[0..3] * lhsPtr[8]
402 "VMUL.F32 q15, q8, d6[0] \n\t" // column 3 = rhsPtr[0..3] * lhsPtr[12]
404 "VLD1.F32 {q8}, [%2]! \n\t" // load matrix 2 (rhsPtr) [4..7]
405 "VMLA.F32 q12, q8, d0[1] \n\t" // column 0 += rhsPtr[4..7] * lhsPtr[1]
406 "VMLA.F32 q13, q8, d2[1] \n\t" // column 1 += rhsPtr[4..7] * lhsPtr[5]
407 "VMLA.F32 q14, q8, d4[1] \n\t" // column 2 += rhsPtr[4..7] * lhsPtr[9]
408 "VMLA.F32 q15, q8, d6[1] \n\t" // column 3 += rhsPtr[4..7] * lhsPtr[13]
// From here on only the upper half of each quad register (elements 2 and 3) is updated.
410 "VLD1.F32 {q8}, [%2]! \n\t" // load matrix 2 (rhsPtr) [8..11]
411 "VMLA.F32 d25, d17, d1[0] \n\t" // column 0[2,3] += rhsPtr[10,11] * lhsPtr[2]
412 "VMLA.F32 d27, d17, d3[0] \n\t" // column 1[2,3] += rhsPtr[10,11] * lhsPtr[6]
413 "VMLA.F32 d29, d17, d5[0] \n\t" // column 2[2,3] += rhsPtr[10,11] * lhsPtr[10]
414 "VMLA.F32 d31, d17, d7[0] \n\t" // column 3[2,3] += rhsPtr[10,11] * lhsPtr[14]
416 "VLD1.F32 {q8}, [%2]! \n\t" // load matrix 2 (rhsPtr) [12..15]
417 "VADD.F32 d31, d31, d17 \n\t" // column 3[2,3] = column3[2,3] + rhsPtr[14,15]
418 "VSTM %0, {q12-q15} \n\t" // store entire output matrix.
// Operand order fixes the numbering used above: %0 = temp, %1 = lhsPtr, %2 = rhsPtr.
420 : "r"(temp), "r"(lhsPtr), "r"(rhsPtr)
421 : "%r0", "q0", "q1", "q2", "q3", "q8", "q12", "q13", "q14", "q15", "memory");
426 void MultiplyAssign(Dali::Matrix& result, const Dali::Matrix& rhs)
428 MATH_INCREASE_COUNTER(PerformanceMonitor::MATRIX_MULTIPLYS);
429 MATH_INCREASE_BY(PerformanceMonitor::FLOAT_POINT_MULTIPLY, 64); // 64 = 16*4
433 float* lhsPtr = result.AsFloat();
434 const float* rhsPtr = rhs.AsFloat();
435 float* temp = nullptr;
439 // If rhs is same matrix with result, we need to copy temperal vaules.
440 temp = static_cast<float*>(malloc(NUM_BYTES_IN_MATRIX));
441 memcpy(temp, rhsPtr, NUM_BYTES_IN_MATRIX);
445 // Calculate and store as row major.
446 for(int32_t i = 0; i < 4; i++)
448 const int32_t loc0 = i;
449 const int32_t loc1 = loc0 | 4;
450 const int32_t loc2 = loc0 | 8;
451 const int32_t loc3 = loc0 | 12;
453 const float value0 = lhsPtr[loc0];
454 const float value1 = lhsPtr[loc1];
455 const float value2 = lhsPtr[loc2];
456 const float value3 = lhsPtr[loc3];
458 lhsPtr[loc0] = (value0 * rhsPtr[0]) +
459 (value1 * rhsPtr[1]) +
460 (value2 * rhsPtr[2]) +
461 (value3 * rhsPtr[3]);
463 lhsPtr[loc1] = (value0 * rhsPtr[4]) +
464 (value1 * rhsPtr[5]) +
465 (value2 * rhsPtr[6]) +
466 (value3 * rhsPtr[7]);
468 lhsPtr[loc2] = (value0 * rhsPtr[8]) +
469 (value1 * rhsPtr[9]) +
470 (value2 * rhsPtr[10]) +
471 (value3 * rhsPtr[11]);
473 lhsPtr[loc3] = (value0 * rhsPtr[12]) +
474 (value1 * rhsPtr[13]) +
475 (value2 * rhsPtr[14]) +
476 (value3 * rhsPtr[15]);
481 // If we allocate temperal memory, we should free it.
486 // We store temperal values into register. Don't worry about overlap.
487 // Copy normal Multiply code.
488 // Becareful the name of pointer is crossed!
490 float* temp = result.AsFloat();
491 const float* rhsPtr = result.AsFloat();
492 const float* lhsPtr = rhs.AsFloat();
494 // 64 32bit registers,
496 // s = 32 bit single-word s0 -s63
497 // d = 64 bit double-word d0 -d31
498 // q =128 bit quad-word q0 -q15 (enough to handle a column of 4 floats in a matrix)
499 // e.g. q0 = d0 and d1 = s0, s1, s2, and s3
501 // load and stores interleaved as NEON can load and store while calculating
503 "VLDM %1, {q0-q3} \n\t" // load matrix 1 (lhsPtr) q[q0-q3]
504 "VLDM %0, {q8-q11} \n\t" // load matrix 2 (rhsPtr) q[q8-q11]
505 "VMUL.F32 q12, q8, d0[0] \n\t" // column 0 = rhsPtr[0..3] * lhsPtr[0]
506 "VMUL.F32 q13, q8, d2[0] \n\t" // column 1 = rhsPtr[0..3] * lhsPtr[4]
507 "VMUL.F32 q14, q8, d4[0] \n\t" // column 2 = rhsPtr[0..3] * lhsPtr[8]
508 "VMUL.F32 q15, q8, d6[0] \n\t" // column 3 = rhsPtr[0..3] * lhsPtr[12]
510 "VMLA.F32 q12, q9, d0[1] \n\t" // column 0 += rhsPtr[4..7] * lhsPtr[1]
511 "VMLA.F32 q13, q9, d2[1] \n\t" // column 1 += rhsPtr[4..7] * lhsPtr[5]
512 "VMLA.F32 q14, q9, d4[1] \n\t" // column 2 += rhsPtr[4..7] * lhsPtr[9]
513 "VMLA.F32 q15, q9, d6[1] \n\t" // column 3 += rhsPtr[4..7] * lhsPtr[13]
515 "VMLA.F32 q12, q10, d1[0] \n\t" // column 0 += rhsPtr[8..11] * lhsPtr[2]
516 "VMLA.F32 q13, q10, d3[0] \n\t" // column 1 += rhsPtr[8..11] * lhsPtr[6]
517 "VMLA.F32 q14, q10, d5[0] \n\t" // column 2 += rhsPtr[8..11] * lhsPtr[10]
518 "VMLA.F32 q15, q10, d7[0] \n\t" // column 3 += rhsPtr[8..11] * lhsPtr[14]
520 "VMLA.F32 q12, q11, d1[1] \n\t" // column 0 += rhsPtr[12..15] * lhsPtr[3]
521 "VMLA.F32 q13, q11, d3[1] \n\t" // column 1 += rhsPtr[12..15] * lhsPtr[7]
522 "VMLA.F32 q14, q11, d5[1] \n\t" // column 2 += rhsPtr[12..15] * lhsPtr[11]
523 "VMLA.F32 q15, q11, d7[1] \n\t" // column 3 += rhsPtr[12..15] * lhsPtr[15]
524 "VSTM %2, {q12-q15} \n\t" // store entire output matrix.
525 : "+r"(rhsPtr), "+r"(lhsPtr), "+r"(temp)
527 : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "memory");
534 void Multiply(Dali::Matrix3& result, const Dali::Matrix3& lhs, const Dali::Matrix3& rhs)
536 float* temp = result.AsFloat();
537 const float* rhsPtr = rhs.AsFloat();
538 const float* lhsPtr = lhs.AsFloat();
540 for(int32_t i = 0; i < 3; i++)
542 const int32_t loc0 = i * 3;
543 const int32_t loc1 = loc0 + 1;
544 const int32_t loc2 = loc0 + 2;
546 const float value0 = lhsPtr[loc0];
547 const float value1 = lhsPtr[loc1];
548 const float value2 = lhsPtr[loc2];
550 temp[loc0] = (value0 * rhsPtr[0]) +
551 (value1 * rhsPtr[3]) +
552 (value2 * rhsPtr[6]);
554 temp[loc1] = (value0 * rhsPtr[1]) +
555 (value1 * rhsPtr[4]) +
556 (value2 * rhsPtr[7]);
558 temp[loc2] = (value0 * rhsPtr[2]) +
559 (value1 * rhsPtr[5]) +
560 (value2 * rhsPtr[8]);
564 void MultiplyAssign(Dali::Matrix3& result, const Dali::Matrix3& rhs)
566 float* lhsPtr = result.AsFloat();
567 const float* rhsPtr = rhs.AsFloat();
568 float* temp = nullptr;
572 // If rhs is same matrix with result, we need to copy temperal vaules.
573 temp = static_cast<float*>(malloc(NUM_BYTES_IN_MATRIX3));
574 memcpy(temp, rhsPtr, NUM_BYTES_IN_MATRIX3);
578 // Calculate and store as row major.
579 for(int32_t i = 0; i < 3; i++)
581 const int32_t loc0 = i;
582 const int32_t loc1 = loc0 + 3;
583 const int32_t loc2 = loc0 + 6;
585 const float value0 = lhsPtr[loc0];
586 const float value1 = lhsPtr[loc1];
587 const float value2 = lhsPtr[loc2];
589 lhsPtr[loc0] = (value0 * rhsPtr[0]) +
590 (value1 * rhsPtr[1]) +
591 (value2 * rhsPtr[2]);
593 lhsPtr[loc1] = (value0 * rhsPtr[3]) +
594 (value1 * rhsPtr[4]) +
595 (value2 * rhsPtr[5]);
597 lhsPtr[loc2] = (value0 * rhsPtr[6]) +
598 (value1 * rhsPtr[7]) +
599 (value2 * rhsPtr[8]);
604 // If we allocate temperal memory, we should free it.
609 } // namespace MatrixUtils
610 } // namespace Dali::Internal