Multiply only for Transform Matrix + NEON comment clean up

author Eunki Hong <eunkiki.hong@samsung.com>

Sat, 18 Feb 2023 07:05:50 +0000 (16:05 +0900)

committer Eunki, Hong <eunkiki.hong@samsung.com>

Mon, 20 Feb 2023 12:22:05 +0000 (21:22 +0900)
author Eunki Hong <eunkiki.hong@samsung.com>
Sat, 18 Feb 2023 07:05:50 +0000 (16:05 +0900)
committer Eunki, Hong <eunkiki.hong@samsung.com>
Mon, 20 Feb 2023 12:22:05 +0000 (21:22 +0900)
diff --git a/automated-tests/src/dali-internal/utc-Dali-Internal-MatrixUtils.cpp b/automated-tests/src/dali-internal/utc-Dali-Internal-MatrixUtils.cpp

index 8b6ce8f..3f5b2c1 100644 (file)
--- a/automated-tests/src/dali-internal/utc-Dali-Internal-MatrixUtils.cpp
+++ b/automated-tests/src/dali-internal/utc-Dali-Internal-MatrixUtils.cpp
@@ -112,9 +112,61 @@ int UtcDaliMatrixUtilsMultiplyMatrixQuaternionP(void)
    END_TEST;
  }
  
+int UtcDaliMatrixUtilsMultiplyTransformMatrix(void)
+{
+  tet_infoline("Multiplication two transform matrixs\n");
+
+  Matrix expectMatrix;
+  Matrix resultMatrix;
+  for(int32_t repeatCount = 0; repeatCount < 10; repeatCount++) // Repeat the comparison for 10 random transform pairs
+  {
+    Vector3    lpos         = Vector3(Dali::Random::Range(-50.0f, 50.0f), Dali::Random::Range(-50.0f, 50.0f), Dali::Random::Range(-50.0f, 50.0f));
+    Vector3    laxis        = Vector3(Dali::Random::Range(1.0f, 50.0f), Dali::Random::Range(-50.0f, 50.0f), Dali::Random::Range(-50.0f, 50.0f)); // x is drawn from Range(1.0f, 50.0f), keeping the axis away from the zero vector
+    float      lradian      = Dali::Random::Range(0.0f, 5.0f);
+    Quaternion lorientation = Quaternion(Radian(lradian), laxis);
+    Vector3    lscale       = Vector3(Dali::Random::Range(-50.0f, 50.0f), Dali::Random::Range(-50.0f, 50.0f), Dali::Random::Range(-50.0f, 50.0f));
+
+    Vector3    rpos         = Vector3(Dali::Random::Range(-50.0f, 50.0f), Dali::Random::Range(-50.0f, 50.0f), Dali::Random::Range(-50.0f, 50.0f));
+    Vector3    raxis        = Vector3(Dali::Random::Range(1.0f, 50.0f), Dali::Random::Range(-50.0f, 50.0f), Dali::Random::Range(-50.0f, 50.0f)); // x is drawn from Range(1.0f, 50.0f), keeping the axis away from the zero vector
+    float      rradian      = Dali::Random::Range(0.0f, 5.0f);
+    Quaternion rorientation = Quaternion(Radian(rradian), raxis);
+    Vector3    rscale       = Vector3(Dali::Random::Range(-50.0f, 50.0f), Dali::Random::Range(-50.0f, 50.0f), Dali::Random::Range(-50.0f, 50.0f));
+
+    Matrix lhs, rhs;
+    lhs.SetTransformComponents(lscale, lorientation, lpos);
+    rhs.SetTransformComponents(rscale, rorientation, rpos);
+
+    // Reference result, computed with the general-purpose Multiply API
+    Internal::MatrixUtils::Multiply(expectMatrix, lhs, rhs);
+    // Result under test, computed with the transform-only MultiplyTransformMatrix API
+    Internal::MatrixUtils::MultiplyTransformMatrix(resultMatrix, lhs, rhs);
+
+    { // Print both inputs (matrix and the components it was built from) and both results via tet_printf
+      std::ostringstream oss;
+      oss << "lhs          : " << lhs << "\n";
+      oss << "lpos         : " << lpos << "\n";
+      oss << "lorientation : " << lorientation << "\n";
+      oss << "lscale       : " << lscale << "\n";
+
+      oss << "rhs          : " << rhs << "\n";
+      oss << "rpos         : " << rpos << "\n";
+      oss << "rorientation : " << rorientation << "\n";
+      oss << "rscale       : " << rscale << "\n";
+
+      oss << "expect     : " << expectMatrix << "\n";
+      oss << "result     : " << resultMatrix << "\n";
+      tet_printf("test result : \n%s\n", oss.str().c_str());
+    }
+
+    DALI_TEST_EQUALS(expectMatrix, resultMatrix, 0.01f, TEST_LOCATION); // Both APIs must agree within the 0.01f tolerance
+  }
+
+  END_TEST;
+}
+
  int UtcDaliMatrixUtilsMultiplyProjectionMatrix(void)
  {
-  tet_infoline("Multiplication Assign operator with self matrix\n");
+  tet_infoline("Multiplication projection matrix and random matrix\n");
  
    Matrix viewMatrix;
    Matrix projectionMatrix;
diff --git a/dali/internal/common/matrix-utils.cpp b/dali/internal/common/matrix-utils.cpp

index 00bccce..6ad3d7d 100644 (file)
--- a/dali/internal/common/matrix-utils.cpp
+++ b/dali/internal/common/matrix-utils.cpp
@@ -131,33 +131,34 @@ void Multiply(Dali::Matrix& result, const Dali::Matrix& lhs, const Dali::Matrix&
  
    // 64 32bit registers,
    // aliased to
+  // s = 32 bit single-word s0 -s63
    // d = 64 bit double-word d0 -d31
    // q =128 bit quad-word   q0 -q15  (enough to handle a column of 4 floats in a matrix)
-  // e.g. q0 = d0 and d1
+  // e.g. q0 = d0 and d1 = s0, s1, s2, and s3
  
    // load and stores interleaved as NEON can load and store while calculating
    asm volatile(
-    "VLDM         %1,  {q0-q3}        \n\t" // load matrix 1 (lhsPtr) q[0..q3]
+    "VLDM         %1,  {q0-q3}        \n\t" // load matrix 1 (lhsPtr) q[q0-q3]
      "VLDM         %0,  {q8-q11}       \n\t" // load matrix 2 (rhsPtr) q[q8-q11]
-    "VMUL.F32     q12, q8, d0[0]      \n\t" // column 0 = rhsPtr[0..3] * lhsPtr[0..3]
-    "VMUL.F32     q13, q8, d2[0]      \n\t" // column 1 = rhsPtr[0..3] * lhsPtr[4..7]
-    "VMUL.F32     q14, q8, d4[0]      \n\t" // column 2 = rhsPtr[0..3] * lhsPtr[8..11]
-    "VMUL.F32     q15, q8, d6[0]      \n\t" // column 3 = rhsPtr[0..3] * lhsPtr[12..15]
-
-    "VMLA.F32     q12, q9, d0[1]      \n\t" // column 0 += rhsPtr[4..7] * lhsPtr[0..3]
-    "VMLA.F32     q13, q9, d2[1]      \n\t" // column 1 += rhsPtr[4..7] * lhsPtr[4..7]
-    "VMLA.F32     q14, q9, d4[1]      \n\t" // column 2 += rhsPtr[4..7] * lhsPtr[8..11]
-    "VMLA.F32     q15, q9, d6[1]      \n\t" // column 3 += rhsPtr[4..7] * lhsPtr[12..15]
-
-    "VMLA.F32     q12, q10, d1[0]     \n\t" // column 0 += rhsPtr[8..11] * lhsPtr[0..3]
-    "VMLA.F32     q13, q10, d3[0]     \n\t" // column 1 += rhsPtr[8..11] * lhsPtr[4..7]
-    "VMLA.F32     q14, q10, d5[0]     \n\t" // column 2 += rhsPtr[8..11] * lhsPtr[8..11]
-    "VMLA.F32     q15, q10, d7[0]     \n\t" // column 3 += rhsPtr[8..11] * lhsPtr[12..15]
-
-    "VMLA.F32     q12, q11, d1[1]     \n\t" // column 0 += rhsPtr[12..15] * lhsPtr[0..3]
-    "VMLA.F32     q13, q11, d3[1]     \n\t" // column 1 += rhsPtr[12..15] * lhsPtr[4..7]
-    "VMLA.F32     q14, q11, d5[1]     \n\t" // column 2 += rhsPtr[12..15] * lhsPtr[8..11]
-    "VMLA.F32     q15, q11, d7[1]     \n\t" // column 3 += rhsPtr[12..15] * lhsPtr[12..15]
+    "VMUL.F32     q12, q8, d0[0]      \n\t" // column 0 = rhsPtr[0..3] * lhsPtr[0]
+    "VMUL.F32     q13, q8, d2[0]      \n\t" // column 1 = rhsPtr[0..3] * lhsPtr[4]
+    "VMUL.F32     q14, q8, d4[0]      \n\t" // column 2 = rhsPtr[0..3] * lhsPtr[8]
+    "VMUL.F32     q15, q8, d6[0]      \n\t" // column 3 = rhsPtr[0..3] * lhsPtr[12]
+
+    "VMLA.F32     q12, q9, d0[1]      \n\t" // column 0 += rhsPtr[4..7] * lhsPtr[1]
+    "VMLA.F32     q13, q9, d2[1]      \n\t" // column 1 += rhsPtr[4..7] * lhsPtr[5]
+    "VMLA.F32     q14, q9, d4[1]      \n\t" // column 2 += rhsPtr[4..7] * lhsPtr[9]
+    "VMLA.F32     q15, q9, d6[1]      \n\t" // column 3 += rhsPtr[4..7] * lhsPtr[13]
+
+    "VMLA.F32     q12, q10, d1[0]     \n\t" // column 0 += rhsPtr[8..11] * lhsPtr[2]
+    "VMLA.F32     q13, q10, d3[0]     \n\t" // column 1 += rhsPtr[8..11] * lhsPtr[6]
+    "VMLA.F32     q14, q10, d5[0]     \n\t" // column 2 += rhsPtr[8..11] * lhsPtr[10]
+    "VMLA.F32     q15, q10, d7[0]     \n\t" // column 3 += rhsPtr[8..11] * lhsPtr[14]
+
+    "VMLA.F32     q12, q11, d1[1]     \n\t" // column 0 += rhsPtr[12..15] * lhsPtr[3]
+    "VMLA.F32     q13, q11, d3[1]     \n\t" // column 1 += rhsPtr[12..15] * lhsPtr[7]
+    "VMLA.F32     q14, q11, d5[1]     \n\t" // column 2 += rhsPtr[12..15] * lhsPtr[11]
+    "VMLA.F32     q15, q11, d7[1]     \n\t" // column 3 += rhsPtr[12..15] * lhsPtr[15]
      "VSTM         %2,  {q12-q15}      \n\t" // store entire output matrix.
      : "+r"(rhsPtr), "+r"(lhsPtr), "+r"(temp)
      :
@@ -225,28 +226,30 @@ void Multiply(Dali::Matrix& result, const Dali::Matrix& lhs, const Dali::Quatern
  
    // 64 32bit registers,
    // aliased to
+  // s = 32 bit single-word s0 -s63
    // d = 64 bit double-word d0 -d31
    // q =128 bit quad-word   q0 -q15  (enough to handle a column of 4 floats in a matrix)
-  // e.g. q0 = d0 and d1
+  // e.g. q0 = d0 and d1 = s0, s1, s2, and s3
+
    // load and stores interleaved as NEON can load and store while calculating
    asm volatile(
-    "VLDM         %1,   {q4-q7}       \n\t" // load matrix 1 (lhsPtr)
+    "VLDM         %1,   {q0-q3}       \n\t" // load matrix 1 (lhsPtr) q[q0-q3]
      "VLD1.F32     {q8}, [%2]!         \n\t" // load matrix 2 (rhsPtr) [0..3]
-    "VMUL.F32     q0,   q8,   d8[0]   \n\t" // column 0 = rhsPtr[0..3] * lhsPtr[0]
-    "VMUL.F32     q1,   q8,   d10[0]  \n\t" // column 1 = rhsPtr[0..3] * lhsPtr[4]
-    "VMUL.F32     q2,   q8,   d12[0]  \n\t" // column 2 = rhsPtr[0..3] * lhsPtr[8]
-    "VMUL.F32     q3,   q8,   d14[0]  \n\t" // column 3 = rhsPtr[0..3] * lhsPtr[12]
+    "VMUL.F32     q4,   q8,   d0[0]   \n\t" // column 0 = rhsPtr[0..3] * lhsPtr[0]
+    "VMUL.F32     q5,   q8,   d2[0]   \n\t" // column 1 = rhsPtr[0..3] * lhsPtr[4]
+    "VMUL.F32     q6,   q8,   d4[0]   \n\t" // column 2 = rhsPtr[0..3] * lhsPtr[8]
+    "VMUL.F32     q7,   q8,   d6[0]   \n\t" // column 3 = rhsPtr[0..3] * lhsPtr[12]
      "VLD1.F32     {q8}, [%2]!         \n\t" // load matrix 2 (rhsPtr) [4..7]
-    "VMLA.F32     q0,   q8,   d8[1]   \n\t" // column 0+= rhsPtr[4..7] * lhsPtr[1]
-    "VMLA.F32     q1,   q8,   d10[1]  \n\t" // column 1+= rhsPtr[4..7] * lhsPtr[5]
-    "VMLA.F32     q2,   q8,   d12[1]  \n\t" // column 2+= rhsPtr[4..7] * lhsPtr[9]
-    "VMLA.F32     q3,   q8,   d14[1]  \n\t" // column 3+= rhsPtr[4..7] * lhsPtr[13]
+    "VMLA.F32     q4,   q8,   d0[1]   \n\t" // column 0 += rhsPtr[4..7] * lhsPtr[1]
+    "VMLA.F32     q5,   q8,   d2[1]   \n\t" // column 1 += rhsPtr[4..7] * lhsPtr[5]
+    "VMLA.F32     q6,   q8,   d4[1]   \n\t" // column 2 += rhsPtr[4..7] * lhsPtr[9]
+    "VMLA.F32     q7,   q8,   d6[1]   \n\t" // column 3 += rhsPtr[4..7] * lhsPtr[13]
      "VLD1.F32     {q8}, [%2]!         \n\t" // load matrix 2 (rhsPtr) [8..11]
-    "VMLA.F32     q0,   q8,   d9[0]   \n\t" // column 0+= rhsPtr[8..11] * lhsPtr[2]
-    "VMLA.F32     q1,   q8,   d11[0]  \n\t" // column 1+= rhsPtr[8..11] * lhsPtr[6]
-    "VMLA.F32     q2,   q8,   d13[0]  \n\t" // column 2+= rhsPtr[8..11] * lhsPtr[10]
-    "VMLA.F32     q3,   q8,   d15[0]  \n\t" // column 3+= rhsPtr[8..11] * lhsPtr[14]
-    "VSTM         %0,   {q0-q3}       \n\t" // store entire output matrix.
+    "VMLA.F32     q4,   q8,   d1[0]   \n\t" // column 0 += rhsPtr[8..11] * lhsPtr[2]
+    "VMLA.F32     q5,   q8,   d3[0]   \n\t" // column 1 += rhsPtr[8..11] * lhsPtr[6]
+    "VMLA.F32     q6,   q8,   d5[0]   \n\t" // column 2 += rhsPtr[8..11] * lhsPtr[10]
+    "VMLA.F32     q7,   q8,   d7[0]   \n\t" // column 3 += rhsPtr[8..11] * lhsPtr[14]
+    "VSTM         %0,   {q4-q7}       \n\t" // store entire output matrix.
      :
      : "r"(temp), "r"(lhsPtr), "r"(rhsPtr)
      : "%r0", "%q0", "%q1", "%q2", "%q3", "%q4", "%q5", "%q6", "%q7", "%q8", "memory");
@@ -259,13 +262,90 @@ void Multiply(Dali::Matrix& result, const Dali::Matrix& lhs, const Dali::Quatern
  #endif
  }
  
-void MultiplyProjectionMatrix(Dali::Matrix& result, const Dali::Matrix& lhs, const Dali::Matrix& projection)
+void MultiplyTransformMatrix(Dali::Matrix& result, const Dali::Matrix& lhs, const Dali::Matrix& rhs)
  {
-  // TODO : Implement with NEON.
-  // Current NEON code is copy of Multiply.
+  MATH_INCREASE_COUNTER(PerformanceMonitor::MATRIX_MULTIPLYS);
+  MATH_INCREASE_BY(PerformanceMonitor::FLOAT_POINT_MULTIPLY, 36); // 36 = 9*4
+
+  float*       temp   = result.AsFloat();
+  const float* rhsPtr = rhs.AsFloat();
+  const float* lhsPtr = lhs.AsFloat();
+
+#ifndef __ARM_NEON__
+
+  for(int32_t i = 0; i < 4; i++)
+  {
+    // i<<2 gives the first vector / column
+    const int32_t loc0 = i << 2;
+    const int32_t loc1 = loc0 + 1;
+    const int32_t loc2 = loc0 + 2;
+
+    const float value0 = lhsPtr[loc0];
+    const float value1 = lhsPtr[loc1];
+    const float value2 = lhsPtr[loc2];
+
+    temp[loc0] = (value0 * rhsPtr[0]) +
+                 (value1 * rhsPtr[4]) +
+                 (value2 * rhsPtr[8]) +
+                 (i == 3 ? rhsPtr[12] : 0.0f); // lhsPtr[loc3] is 0.0f, or 1.0f only if i == 3
+
+    temp[loc1] = (value0 * rhsPtr[1]) +
+                 (value1 * rhsPtr[5]) +
+                 (value2 * rhsPtr[9]) +
+                 (i == 3 ? rhsPtr[13] : 0.0f); // lhsPtr[loc3] is 0.0f, or 1.0f only if i == 3
+
+    temp[loc2] = (value0 * rhsPtr[2]) +
+                 (value1 * rhsPtr[6]) +
+                 (value2 * rhsPtr[10]) +
+                 (i == 3 ? rhsPtr[14] : 0.0f); // lhsPtr[loc3] is 0.0f, or 1.0f only if i == 3
+  }
+  temp[3] = temp[7] = temp[11] = 0.0f;
+  temp[15]                     = 1.0f;
+
+#else
+
+  // 64 32bit registers,
+  // aliased to
+  // s = 32 bit single-word s0 -s63
+  // d = 64 bit double-word d0 -d31
+  // q =128 bit quad-word   q0 -q15  (enough to handle a column of 4 floats in a matrix)
+  // e.g. q0 = d0 and d1 = s0, s1, s2, and s3
+
+  // load and stores interleaved as NEON can load and store while calculating
+  asm volatile(
+    "VLDM         %1,  {q0-q3}        \n\t" // load matrix 1 (lhsPtr) q[q0-q3]
+    "VLD1.F32     {q8}, [%2]!         \n\t" // load matrix 2 (rhsPtr) [0..3]
+    "VMUL.F32     q12, q8, d0[0]      \n\t" // column 0 = rhsPtr[0..3] * lhsPtr[0]
+    "VMUL.F32     q13, q8, d2[0]      \n\t" // column 1 = rhsPtr[0..3] * lhsPtr[4]
+    "VMUL.F32     q14, q8, d4[0]      \n\t" // column 2 = rhsPtr[0..3] * lhsPtr[8]
+    "VMUL.F32     q15, q8, d6[0]      \n\t" // column 3 = rhsPtr[0..3] * lhsPtr[12]
+
+    "VLD1.F32     {q8}, [%2]!         \n\t" // load matrix 2 (rhsPtr) [4..7]
+    "VMLA.F32     q12, q8, d0[1]      \n\t" // column 0 += rhsPtr[4..7] * lhsPtr[1]
+    "VMLA.F32     q13, q8, d2[1]      \n\t" // column 1 += rhsPtr[4..7] * lhsPtr[5]
+    "VMLA.F32     q14, q8, d4[1]      \n\t" // column 2 += rhsPtr[4..7] * lhsPtr[9]
+    "VMLA.F32     q15, q8, d6[1]      \n\t" // column 3 += rhsPtr[4..7] * lhsPtr[13]
  
+    "VLD1.F32     {q8}, [%2]!         \n\t" // load matrix 2 (rhsPtr) [8..11]
+    "VMLA.F32     q12, q8, d1[0]      \n\t" // column 0 += rhsPtr[8..11] * lhsPtr[2]
+    "VMLA.F32     q13, q8, d3[0]      \n\t" // column 1 += rhsPtr[8..11] * lhsPtr[6]
+    "VMLA.F32     q14, q8, d5[0]      \n\t" // column 2 += rhsPtr[8..11] * lhsPtr[10]
+    "VMLA.F32     q15, q8, d7[0]      \n\t" // column 3 += rhsPtr[8..11] * lhsPtr[14]
+
+    "VLD1.F32     {q8}, [%2]!         \n\t" // load matrix 2 (rhsPtr) [12..15]
+    "VADD.F32     q15, q15, q8        \n\t" // column 3 = column3 + rhsPtr[12..15]
+    "VSTM         %0,  {q12-q15}      \n\t" // store entire output matrix.
+    :
+    : "r"(temp), "r"(lhsPtr), "r"(rhsPtr)
+    : "%r0", "q0", "q1", "q2", "q3", "q8", "q12", "q13", "q14", "q15", "memory");
+
+#endif
+}
+
+void MultiplyProjectionMatrix(Dali::Matrix& result, const Dali::Matrix& lhs, const Dali::Matrix& projection)
+{
    MATH_INCREASE_COUNTER(PerformanceMonitor::MATRIX_MULTIPLYS);
-  MATH_INCREASE_BY(PerformanceMonitor::FLOAT_POINT_MULTIPLY, 40); // 40 = 10*4
+  MATH_INCREASE_BY(PerformanceMonitor::FLOAT_POINT_MULTIPLY, 32); // 32 = 8*4
  
    float*       temp   = result.AsFloat();
    const float* rhsPtr = projection.AsFloat();
@@ -296,49 +376,49 @@ void MultiplyProjectionMatrix(Dali::Matrix& result, const Dali::Matrix& lhs, con
      const float value0 = lhsPtr[loc0];
      const float value1 = lhsPtr[loc1];
      const float value2 = lhsPtr[loc2];
-    const float value3 = lhsPtr[loc3];
  
      temp[loc0] = (value0 * rhs0) + (value1 * rhs4);
      temp[loc1] = (value0 * rhs1) + (value1 * rhs5);
-    temp[loc2] = (value0 * rhs2) + (value1 * rhs6) + (value2 * rhs10) + (value3 * rhs14);
-    temp[loc3] = (value2 * rhs11) + (value3 * rhs15);
+    temp[loc2] = (value0 * rhs2) + (value1 * rhs6) + (value2 * rhs10) + (i == 3 ? rhs14 : 0.0f);
+    temp[loc3] = (value2 * rhs11) + (i == 3 ? rhs15 : 0.0f);
    }
  
  #else
  
    // 64 32bit registers,
    // aliased to
+  // s = 32 bit single-word s0 -s63
    // d = 64 bit double-word d0 -d31
    // q =128 bit quad-word   q0 -q15  (enough to handle a column of 4 floats in a matrix)
-  // e.g. q0 = d0 and d1
+  // e.g. q0 = d0 and d1 = s0, s1, s2, and s3
  
    // load and stores interleaved as NEON can load and store while calculating
    asm volatile(
-    "VLDM         %1,  {q0-q3}        \n\t" // load matrix 1 (lhsPtr) q[0..q3]
-    "VLDM         %0,  {q8-q11}       \n\t" // load matrix 2 (rhsPtr) q[q8-q11]
-    "VMUL.F32     q12, q8, d0[0]      \n\t" // column 0 = rhsPtr[0..3] * lhsPtr[0..3]
-    "VMUL.F32     q13, q8, d2[0]      \n\t" // column 1 = rhsPtr[0..3] * lhsPtr[4..7]
-    "VMUL.F32     q14, q8, d4[0]      \n\t" // column 2 = rhsPtr[0..3] * lhsPtr[8..11]
-    "VMUL.F32     q15, q8, d6[0]      \n\t" // column 3 = rhsPtr[0..3] * lhsPtr[12..15]
-
-    "VMLA.F32     q12, q9, d0[1]      \n\t" // column 0 += rhsPtr[4..7] * lhsPtr[0..3]
-    "VMLA.F32     q13, q9, d2[1]      \n\t" // column 1 += rhsPtr[4..7] * lhsPtr[4..7]
-    "VMLA.F32     q14, q9, d4[1]      \n\t" // column 2 += rhsPtr[4..7] * lhsPtr[8..11]
-    "VMLA.F32     q15, q9, d6[1]      \n\t" // column 3 += rhsPtr[4..7] * lhsPtr[12..15]
-
-    "VMLA.F32     q12, q10, d1[0]     \n\t" // column 0 += rhsPtr[8..11] * lhsPtr[0..3]
-    "VMLA.F32     q13, q10, d3[0]     \n\t" // column 1 += rhsPtr[8..11] * lhsPtr[4..7]
-    "VMLA.F32     q14, q10, d5[0]     \n\t" // column 2 += rhsPtr[8..11] * lhsPtr[8..11]
-    "VMLA.F32     q15, q10, d7[0]     \n\t" // column 3 += rhsPtr[8..11] * lhsPtr[12..15]
-
-    "VMLA.F32     q12, q11, d1[1]     \n\t" // column 0 += rhsPtr[12..15] * lhsPtr[0..3]
-    "VMLA.F32     q13, q11, d3[1]     \n\t" // column 1 += rhsPtr[12..15] * lhsPtr[4..7]
-    "VMLA.F32     q14, q11, d5[1]     \n\t" // column 2 += rhsPtr[12..15] * lhsPtr[8..11]
-    "VMLA.F32     q15, q11, d7[1]     \n\t" // column 3 += rhsPtr[12..15] * lhsPtr[12..15]
-    "VSTM         %2,  {q12-q15}      \n\t" // store entire output matrix.
-    : "+r"(rhsPtr), "+r"(lhsPtr), "+r"(temp)
+    "VLDM         %1,  {q0-q3}        \n\t" // load matrix 1 (lhsPtr) q[q0-q3]
+    "VLD1.F32     {q8}, [%2]!         \n\t" // load matrix 2 (rhsPtr) [0..3]
+    "VMUL.F32     q12, q8, d0[0]      \n\t" // column 0 = rhsPtr[0..3] * lhsPtr[0]
+    "VMUL.F32     q13, q8, d2[0]      \n\t" // column 1 = rhsPtr[0..3] * lhsPtr[4]
+    "VMUL.F32     q14, q8, d4[0]      \n\t" // column 2 = rhsPtr[0..3] * lhsPtr[8]
+    "VMUL.F32     q15, q8, d6[0]      \n\t" // column 3 = rhsPtr[0..3] * lhsPtr[12]
+
+    "VLD1.F32     {q8}, [%2]!         \n\t" // load matrix 2 (rhsPtr) [4..7]
+    "VMLA.F32     q12, q8, d0[1]      \n\t" // column 0 += rhsPtr[4..7] * lhsPtr[1]
+    "VMLA.F32     q13, q8, d2[1]      \n\t" // column 1 += rhsPtr[4..7] * lhsPtr[5]
+    "VMLA.F32     q14, q8, d4[1]      \n\t" // column 2 += rhsPtr[4..7] * lhsPtr[9]
+    "VMLA.F32     q15, q8, d6[1]      \n\t" // column 3 += rhsPtr[4..7] * lhsPtr[13]
+
+    "VLD1.F32     {q8}, [%2]!         \n\t" // load matrix 2 (rhsPtr) [8..11]
+    "VMLA.F32     d25, d17, d1[0]     \n\t" // column 0[2,3] += rhsPtr[10,11] * lhsPtr[2]
+    "VMLA.F32     d27, d17, d3[0]     \n\t" // column 1[2,3] += rhsPtr[10,11] * lhsPtr[6]
+    "VMLA.F32     d29, d17, d5[0]     \n\t" // column 2[2,3] += rhsPtr[10,11] * lhsPtr[10]
+    "VMLA.F32     d31, d17, d7[0]     \n\t" // column 3[2,3] += rhsPtr[10,11] * lhsPtr[14]
+
+    "VLD1.F32     {q8}, [%2]!         \n\t" // load matrix 2 (rhsPtr) [12..15]
+    "VADD.F32     d31, d31, d17       \n\t" // column 3[2,3] = column3[2,3] + rhsPtr[14,15]
+    "VSTM         %0,  {q12-q15}      \n\t" // store entire output matrix.
      :
-    : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "memory");
+    : "r"(temp), "r"(lhsPtr), "r"(rhsPtr)
+    : "%r0", "q0", "q1", "q2", "q3", "q8", "q12", "q13", "q14", "q15", "memory");
  
  #endif
  }
@@ -348,7 +428,7 @@ void MultiplyAssign(Dali::Matrix& result, const Dali::Matrix& rhs)
    MATH_INCREASE_COUNTER(PerformanceMonitor::MATRIX_MULTIPLYS);
    MATH_INCREASE_BY(PerformanceMonitor::FLOAT_POINT_MULTIPLY, 64); // 64 = 16*4
  
-  // TODO : Implement with NEON.
+#ifndef __ARM_NEON__
  
    float*       lhsPtr = result.AsFloat();
    const float* rhsPtr = rhs.AsFloat();
@@ -401,6 +481,52 @@ void MultiplyAssign(Dali::Matrix& result, const Dali::Matrix& rhs)
      // If we allocate temperal memory, we should free it.
      free(temp);
    }
+
+#else
+  // We store temporary values in registers, so there is no need to worry about overlap.
+  // This is a copy of the normal Multiply code.
+  // Be careful: the pointer names are swapped!
+
+  float*       temp   = result.AsFloat();
+  const float* rhsPtr = result.AsFloat();
+  const float* lhsPtr = rhs.AsFloat();
+
+  // 64 32bit registers,
+  // aliased to
+  // s = 32 bit single-word s0 -s63
+  // d = 64 bit double-word d0 -d31
+  // q =128 bit quad-word   q0 -q15  (enough to handle a column of 4 floats in a matrix)
+  // e.g. q0 = d0 and d1 = s0, s1, s2, and s3
+
+  // load and stores interleaved as NEON can load and store while calculating
+  asm volatile(
+    "VLDM         %1,  {q0-q3}        \n\t" // load matrix 1 (lhsPtr) q[q0-q3]
+    "VLDM         %0,  {q8-q11}       \n\t" // load matrix 2 (rhsPtr) q[q8-q11]
+    "VMUL.F32     q12, q8, d0[0]      \n\t" // column 0 = rhsPtr[0..3] * lhsPtr[0]
+    "VMUL.F32     q13, q8, d2[0]      \n\t" // column 1 = rhsPtr[0..3] * lhsPtr[4]
+    "VMUL.F32     q14, q8, d4[0]      \n\t" // column 2 = rhsPtr[0..3] * lhsPtr[8]
+    "VMUL.F32     q15, q8, d6[0]      \n\t" // column 3 = rhsPtr[0..3] * lhsPtr[12]
+
+    "VMLA.F32     q12, q9, d0[1]      \n\t" // column 0 += rhsPtr[4..7] * lhsPtr[1]
+    "VMLA.F32     q13, q9, d2[1]      \n\t" // column 1 += rhsPtr[4..7] * lhsPtr[5]
+    "VMLA.F32     q14, q9, d4[1]      \n\t" // column 2 += rhsPtr[4..7] * lhsPtr[9]
+    "VMLA.F32     q15, q9, d6[1]      \n\t" // column 3 += rhsPtr[4..7] * lhsPtr[13]
+
+    "VMLA.F32     q12, q10, d1[0]     \n\t" // column 0 += rhsPtr[8..11] * lhsPtr[2]
+    "VMLA.F32     q13, q10, d3[0]     \n\t" // column 1 += rhsPtr[8..11] * lhsPtr[6]
+    "VMLA.F32     q14, q10, d5[0]     \n\t" // column 2 += rhsPtr[8..11] * lhsPtr[10]
+    "VMLA.F32     q15, q10, d7[0]     \n\t" // column 3 += rhsPtr[8..11] * lhsPtr[14]
+
+    "VMLA.F32     q12, q11, d1[1]     \n\t" // column 0 += rhsPtr[12..15] * lhsPtr[3]
+    "VMLA.F32     q13, q11, d3[1]     \n\t" // column 1 += rhsPtr[12..15] * lhsPtr[7]
+    "VMLA.F32     q14, q11, d5[1]     \n\t" // column 2 += rhsPtr[12..15] * lhsPtr[11]
+    "VMLA.F32     q15, q11, d7[1]     \n\t" // column 3 += rhsPtr[12..15] * lhsPtr[15]
+    "VSTM         %2,  {q12-q15}      \n\t" // store entire output matrix.
+    : "+r"(rhsPtr), "+r"(lhsPtr), "+r"(temp)
+    :
+    : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "memory");
+
+#endif
  }
  
  // Dali::Matrix3
diff --git a/dali/internal/common/matrix-utils.h b/dali/internal/common/matrix-utils.h

index 9852f97..bd14506 100644 (file)
--- a/dali/internal/common/matrix-utils.h
+++ b/dali/internal/common/matrix-utils.h
@@ -52,7 +52,28 @@ void Multiply(Dali::Matrix& result, const Dali::Matrix& lhs, const Dali::Matrix&
  void Multiply(Dali::Matrix& result, const Dali::Matrix& lhs, const Dali::Quaternion& rhs);
  
  /**
- * @brief Function to multiply projection matrix and store the result onto third.
+ * @brief Function to multiply two transform matrices and store the result into a third.
+ *
+ * This API assumes that both lhs and rhs are Transform Matrices.
+ * Scale & Rotation only occupy the 3x3 area of the matrix, and Translation only occupies indices [12,13,14].
+ * So, if a Matrix is built from a Transform, elements 3, 7, 11 are always 0.0f, and element 15 is always 1.0f.
+ * This lets us reduce the number of multiplications.
+ *
+ * This is especially beneficial when calculating the WorldMatrix.
+ *
+ * Use this method in time critical path as it does not require temporaries.
+ *
+ * result = rhs * lhs
+ *
+ * @SINCE_2_2.15
+ * @param[out] result Result of the multiplication
+ * @param[in] lhs Transform Matrix, this can be same matrix as result
+ * @param[in] rhs Transform Matrix, this cannot be same matrix as result (the non-NEON path reads rhs while writing result)
+ */
+void MultiplyTransformMatrix(Dali::Matrix& result, const Dali::Matrix& lhs, const Dali::Matrix& rhs);
+
+/**
+ * @brief Function to multiply projection matrix x transform matrix, and store the result into a third.
   *
   * This API assume that projection is Projection Matrix which top/bottom/left/right is symmetrical.
   *
@@ -68,7 +89,7 @@ void Multiply(Dali::Matrix& result, const Dali::Matrix& lhs, const Dali::Quatern
   *
   * @SINCE_2_1.46
   * @param[out] result Result of the multiplication
- * @param[in] lhs Matrix, this cannot be same matrix as result
+ * @param[in] lhs Transform Matrix, this cannot be same matrix as result
   * @param[in] projection Projection Matrix, this can be same matrix as result
   */
  void MultiplyProjectionMatrix(Dali::Matrix& result, const Dali::Matrix& lhs, const Dali::Matrix& projection);
diff --git a/dali/internal/event/actors/actor-coords.cpp b/dali/internal/event/actors/actor-coords.cpp

index 9fa1983..7375d9d 100644 (file)
--- a/dali/internal/event/actors/actor-coords.cpp
+++ b/dali/internal/event/actors/actor-coords.cpp
@@ -79,7 +79,7 @@ bool ConvertScreenToLocal(
  {
    // Get the ModelView matrix
    Matrix modelView;
-  MatrixUtils::Multiply(modelView, worldMatrix, viewMatrix);
+  MatrixUtils::MultiplyTransformMatrix(modelView, worldMatrix, viewMatrix);
  
    // Calculate the inverted ModelViewProjection matrix; this will be used for 2 unprojects
    Matrix invertedMvp(false /*don't init*/);
@@ -528,7 +528,7 @@ Matrix CalculateActorWorldTransform(const Actor& actor)
  
          //Update the world matrix
          Matrix tempMatrix;
-        MatrixUtils::Multiply(tempMatrix, localMatrix, worldMatrix);
+        MatrixUtils::MultiplyTransformMatrix(tempMatrix, localMatrix, worldMatrix);
          worldMatrix = tempMatrix;
        }
        else
@@ -547,7 +547,7 @@ Matrix CalculateActorWorldTransform(const Actor& actor)
  
          // Compute intermediate world information
          Matrix intermediateWorldMatrix;
-        MatrixUtils::Multiply(intermediateWorldMatrix, intermediateLocalMatrix, parentMatrix);
+        MatrixUtils::MultiplyTransformMatrix(intermediateWorldMatrix, intermediateLocalMatrix, parentMatrix);
  
          Vector3    intermediateWorldPosition, intermediateWorldScale;
          Quaternion intermediateWorldOrientation;
diff --git a/dali/internal/update/manager/render-instruction-processor.cpp b/dali/internal/update/manager/render-instruction-processor.cpp

index 45e1f87..4187b8f 100644 (file)
--- a/dali/internal/update/manager/render-instruction-processor.cpp
+++ b/dali/internal/update/manager/render-instruction-processor.cpp
@@ -221,7 +221,7 @@ inline void AddRendererToRenderList(BufferIndex               updateBufferIndex,
  
        if(size.LengthSquared() > Math::MACHINE_EPSILON_1000)
        {
-        MatrixUtils::Multiply(nodeModelViewMatrix, nodeWorldMatrix, viewMatrix);
+        MatrixUtils::MultiplyTransformMatrix(nodeModelViewMatrix, nodeWorldMatrix, viewMatrix);
          nodeModelViewMatrixSet = true;
  
          // Assume actors are at z=0, compute AABB in view space & test rect intersection
@@ -288,7 +288,7 @@ inline void AddRendererToRenderList(BufferIndex               updateBufferIndex,
  
        if(!nodeModelViewMatrixSet)
        {
-        MatrixUtils::Multiply(nodeModelViewMatrix, nodeWorldMatrix, viewMatrix);
+        MatrixUtils::MultiplyTransformMatrix(nodeModelViewMatrix, nodeWorldMatrix, viewMatrix);
        }
        item.mModelViewMatrix = nodeModelViewMatrix;
  
diff --git a/dali/internal/update/manager/transform-manager.cpp b/dali/internal/update/manager/transform-manager.cpp

index 452d272..b92b13d 100644 (file)
--- a/dali/internal/update/manager/transform-manager.cpp
+++ b/dali/internal/update/manager/transform-manager.cpp
@@ -265,7 +265,7 @@ bool TransformManager::Update()
          }
  
          //Update the world matrix
-        MatrixUtils::Multiply(mWorld[i], mLocal[i], mWorld[parentIndex]);
+        MatrixUtils::MultiplyTransformMatrix(mWorld[i], mLocal[i], mWorld[parentIndex]);
        }
        else
        {
@@ -286,7 +286,7 @@ bool TransformManager::Update()
  
          // Compute intermediate world information
          Matrix intermediateWorldMatrix;
-        MatrixUtils::Multiply(intermediateWorldMatrix, intermediateLocalMatrix, mWorld[parentIndex]);
+        MatrixUtils::MultiplyTransformMatrix(intermediateWorldMatrix, intermediateLocalMatrix, mWorld[parentIndex]);
  
          Vector3       intermediateWorldPosition, intermediateWorldScale;
          Quaternion    intermediateWorldOrientation;
diff --git a/dali/internal/update/render-tasks/scene-graph-camera.cpp b/dali/internal/update/render-tasks/scene-graph-camera.cpp

index e4916b5..4557c4c 100644 (file)
--- a/dali/internal/update/render-tasks/scene-graph-camera.cpp
+++ b/dali/internal/update/render-tasks/scene-graph-camera.cpp
@@ -608,7 +608,7 @@ uint32_t Camera::UpdateViewMatrix(BufferIndex updateBufferIndex)
  
              Matrix& viewMatrix = mViewMatrix.Get(updateBufferIndex);
              Matrix  oldViewMatrix(viewMatrix);
-            MatrixUtils::Multiply(viewMatrix, oldViewMatrix, mReflectionMtx);
+            MatrixUtils::MultiplyTransformMatrix(viewMatrix, oldViewMatrix, mReflectionMtx);
            }
  
            viewMatrix.Invert();
author	Eunki Hong <eunkiki.hong@samsung.com>
	Sat, 18 Feb 2023 07:05:50 +0000 (16:05 +0900)
committer	Eunki, Hong <eunkiki.hong@samsung.com>
	Mon, 20 Feb 2023 12:22:05 +0000 (21:22 +0900)
automated-tests/src/dali-internal/utc-Dali-Internal-MatrixUtils.cpp		patch \| blob \| history
dali/internal/common/matrix-utils.cpp		patch \| blob \| history
dali/internal/common/matrix-utils.h		patch \| blob \| history
dali/internal/event/actors/actor-coords.cpp		patch \| blob \| history
dali/internal/update/manager/render-instruction-processor.cpp		patch \| blob \| history
dali/internal/update/manager/transform-manager.cpp		patch \| blob \| history
dali/internal/update/render-tasks/scene-graph-camera.cpp		patch \| blob \| history