From: Eunki, Hong Date: Wed, 15 Feb 2023 07:34:37 +0000 (+0900) Subject: Fix matrix multiply with quaternion bug in ARM X-Git-Tag: dali_2.2.14~1 X-Git-Url: http://review.tizen.org/git/?p=platform%2Fcore%2Fuifw%2Fdali-core.git;a=commitdiff_plain;h=5b32be114e4ec0be0ff7348dbf45e1d5d640d152 Fix matrix multiply with quaternion bug in ARM There was a bug when using MatrixUtils::Multiply on ARM devices: the NEON code path computed only the first three result columns and then overwrote temp[12..15] with (0, 0, 0, 1). This change also computes the fourth column and afterwards restores the lhs values at indices 3, 7, 11 and 15. Change-Id: I976e03573cbafacbd30fce2b6e2ae73c89b50e83 Signed-off-by: Eunki, Hong --- diff --git a/dali/internal/common/matrix-utils.cpp b/dali/internal/common/matrix-utils.cpp index e5c7566..00bccce 100644 --- a/dali/internal/common/matrix-utils.cpp +++ b/dali/internal/common/matrix-utils.cpp @@ -217,6 +217,11 @@ void Multiply(Dali::Matrix& result, const Dali::Matrix& lhs, const Dali::Quatern } #else + // Store 4th row values that might be overwritten. + const float value0 = lhsPtr[3]; + const float value1 = lhsPtr[7]; + const float value2 = lhsPtr[11]; + const float value3 = lhsPtr[15]; // 64 32bit registers, // aliased to @@ -225,28 +230,32 @@ void Multiply(Dali::Matrix& result, const Dali::Matrix& lhs, const Dali::Quatern // e.g. q0 = d0 and d1 // load and stores interleaved as NEON can load and store while calculating asm volatile( - "VLDM %1, {q4-q6} \n\t" // load matrix 1 (lhsPtr) - "VLD1.F32 {q7}, [%2]! \n\t" // load matrix 2 (rhsPtr) [0..3] - "VMUL.F32 q0, q7, d8[0] \n\t" // column 0 = rhsPtr[0..3] * lhsPtr[0..3] - "VMUL.F32 q1, q7, d10[0] \n\t" // column 1 = rhsPtr[0..3] * lhsPtr[4..7] - "VMUL.F32 q2, q7, d12[0] \n\t" // column 2 = rhsPtr[0..3] * lhsPtr[8..11] - "VLD1.F32 {q7}, [%2]! \n\t" // load matrix 2 (rhsPtr) [4..7] - "VMLA.F32 q0, q7, d8[1] \n\t" // column 0+= rhsPtr[4..7] * lhsPtr[0..3] - "VMLA.F32 q1, q7, d10[1] \n\t" // column 1+= rhsPtr[4..7] * lhsPtr[4..7] - "VMLA.F32 q2, q7, d12[1] \n\t" // column 2+= rhsPtr[4..7] * lhsPtr[8..11] - "VLD1.F32 {q7}, [%2]! 
\n\t" // load matrix 2 (rhsPtr) [8..11] - "VMLA.F32 q0, q7, d9[0] \n\t" // column 0+= rhsPtr[8..11] * lhsPtr[0..3] - "VMLA.F32 q1, q7, d11[0] \n\t" // column 1+= rhsPtr[8..11] * lhsPtr[4..7] - "VMLA.F32 q2, q7, d13[0] \n\t" // column 2+= rhsPtr[8..11] * lhsPtr[8..11] - "VSTM %0, {q0-q2} \n\t" // store entire output matrix. + "VLDM %1, {q4-q7} \n\t" // load matrix 1 (lhsPtr) + "VLD1.F32 {q8}, [%2]! \n\t" // load matrix 2 (rhsPtr) [0..3] + "VMUL.F32 q0, q8, d8[0] \n\t" // column 0 = rhsPtr[0..3] * lhsPtr[0] + "VMUL.F32 q1, q8, d10[0] \n\t" // column 1 = rhsPtr[0..3] * lhsPtr[4] + "VMUL.F32 q2, q8, d12[0] \n\t" // column 2 = rhsPtr[0..3] * lhsPtr[8] + "VMUL.F32 q3, q8, d14[0] \n\t" // column 3 = rhsPtr[0..3] * lhsPtr[12] + "VLD1.F32 {q8}, [%2]! \n\t" // load matrix 2 (rhsPtr) [4..7] + "VMLA.F32 q0, q8, d8[1] \n\t" // column 0+= rhsPtr[4..7] * lhsPtr[1] + "VMLA.F32 q1, q8, d10[1] \n\t" // column 1+= rhsPtr[4..7] * lhsPtr[5] + "VMLA.F32 q2, q8, d12[1] \n\t" // column 2+= rhsPtr[4..7] * lhsPtr[9] + "VMLA.F32 q3, q8, d14[1] \n\t" // column 3+= rhsPtr[4..7] * lhsPtr[13] + "VLD1.F32 {q8}, [%2]! \n\t" // load matrix 2 (rhsPtr) [8..11] + "VMLA.F32 q0, q8, d9[0] \n\t" // column 0+= rhsPtr[8..11] * lhsPtr[2] + "VMLA.F32 q1, q8, d11[0] \n\t" // column 1+= rhsPtr[8..11] * lhsPtr[6] + "VMLA.F32 q2, q8, d13[0] \n\t" // column 2+= rhsPtr[8..11] * lhsPtr[10] + "VMLA.F32 q3, q8, d15[0] \n\t" // column 3+= rhsPtr[8..11] * lhsPtr[14] + "VSTM %0, {q0-q3} \n\t" // store entire output matrix. : : "r"(temp), "r"(lhsPtr), "r"(rhsPtr) - : "%r0", "%q0", "%q1", "%q2", "%q4", "%q5", "%q6", "%q7", "memory"); + : "%r0", "%q0", "%q1", "%q2", "%q3", "%q4", "%q5", "%q6", "%q7", "%q8", "memory"); - temp[12] = 0.0f; - temp[13] = 0.0f; - temp[14] = 0.0f; - temp[15] = 1.0f; + // Restore 4th row values. + temp[3] = value0; + temp[7] = value1; + temp[11] = value2; + temp[15] = value3; #endif }