[WIP] Optimize some matrix multiply for projection matrix

author Eunki Hong <eunkiki.hong@samsung.com>

Sat, 4 Feb 2023 04:08:53 +0000 (13:08 +0900)

committer Eunki Hong <eunkiki.hong@samsung.com>

Sat, 4 Feb 2023 05:21:54 +0000 (14:21 +0900)
author Eunki Hong <eunkiki.hong@samsung.com>
Sat, 4 Feb 2023 04:08:53 +0000 (13:08 +0900)
committer Eunki Hong <eunkiki.hong@samsung.com>
Sat, 4 Feb 2023 05:21:54 +0000 (14:21 +0900)
diff --git a/dali/internal/common/matrix-utils.cpp b/dali/internal/common/matrix-utils.cpp

index ce2eb1f..53f979d 100644 (file)
--- a/dali/internal/common/matrix-utils.cpp
+++ b/dali/internal/common/matrix-utils.cpp
@@ -332,6 +332,90 @@ void MultiplyProjectionMatrix(Dali::Matrix& result, const Dali::Matrix& lhs, con
  #endif
  }
  
+void MultiplyProjectionMatrixWithReflect(Dali::Matrix& result, const Dali::Matrix& lhs, const Dali::Matrix& projection)
+{
+  // TODO : Implement a dedicated NEON path. The current NEON code is a copy of the generic Multiply and uses all 16 projection values.
+  // The scalar path below reads only 10 of the 16 projection values; the terms for the remaining 6 are omitted from the products.
+
+  MATH_INCREASE_COUNTER(PerformanceMonitor::MATRIX_MULTIPLYS);
+  MATH_INCREASE_BY(PerformanceMonitor::FLOAT_POINT_MULTIPLY, 40); // 40 = 10 multiplies per loop iteration * 4 iterations (scalar path)
+
+  float*       temp   = result.AsFloat();
+  const float* rhsPtr = projection.AsFloat();
+  const float* lhsPtr = lhs.AsFloat();
+
+#ifndef __ARM_NEON__
+
+  // Only indices 0, 1, 2, 4, 5, 6, 10, 11, 14 and 15 of rhsPtr are used. They are copied to locals before the loop writes anything to temp.
+  const float rhs0  = rhsPtr[0];
+  const float rhs1  = rhsPtr[1];
+  const float rhs2  = rhsPtr[2];
+  const float rhs4  = rhsPtr[4];
+  const float rhs5  = rhsPtr[5];
+  const float rhs6  = rhsPtr[6];
+  const float rhs10 = rhsPtr[10];
+  const float rhs11 = rhsPtr[11];
+  const float rhs14 = rhsPtr[14];
+  const float rhs15 = rhsPtr[15];
+
+  for(int32_t i = 0; i < 4; i++)
+  {
+    // i << 2 is the index of the first element of the i-th vector / column (4 floats each)
+    const int32_t loc0 = i << 2;
+    const int32_t loc1 = loc0 + 1;
+    const int32_t loc2 = loc0 + 2;
+    const int32_t loc3 = loc0 + 3;
+
+    const float value0 = lhsPtr[loc0];
+    const float value1 = lhsPtr[loc1];
+    const float value2 = lhsPtr[loc2];
+    const float value3 = lhsPtr[loc3];
+
+    temp[loc0] = (value0 * rhs0) + (value1 * rhs4);
+    temp[loc1] = (value0 * rhs1) + (value1 * rhs5);
+    temp[loc2] = (value0 * rhs2) + (value1 * rhs6) + (value2 * rhs10) + (value3 * rhs14);
+    temp[loc3] = (value2 * rhs11) + (value3 * rhs15);
+  }
+
+#else
+
+  // 64 32-bit registers,
+  // aliased to
+  // d = 64 bit double-word d0 -d31
+  // q =128 bit quad-word   q0 -q15  (enough to handle a column of 4 floats in a matrix)
+  // e.g. q0 = d0 and d1
+
+  // loads and stores are interleaved as NEON can load and store while calculating
+  asm volatile(
+    "VLDM         %1,  {q0-q3}        \n\t" // load matrix 1 (lhsPtr) into q0..q3
+    "VLDM         %0,  {q8-q11}       \n\t" // load matrix 2 (rhsPtr) into q8..q11
+    "VMUL.F32     q12, q8, d0[0]      \n\t" // column 0 = rhsPtr[0..3] * lhsPtr[0]
+    "VMUL.F32     q13, q8, d2[0]      \n\t" // column 1 = rhsPtr[0..3] * lhsPtr[4]
+    "VMUL.F32     q14, q8, d4[0]      \n\t" // column 2 = rhsPtr[0..3] * lhsPtr[8]
+    "VMUL.F32     q15, q8, d6[0]      \n\t" // column 3 = rhsPtr[0..3] * lhsPtr[12]
+
+    "VMLA.F32     q12, q9, d0[1]      \n\t" // column 0 += rhsPtr[4..7] * lhsPtr[1]
+    "VMLA.F32     q13, q9, d2[1]      \n\t" // column 1 += rhsPtr[4..7] * lhsPtr[5]
+    "VMLA.F32     q14, q9, d4[1]      \n\t" // column 2 += rhsPtr[4..7] * lhsPtr[9]
+    "VMLA.F32     q15, q9, d6[1]      \n\t" // column 3 += rhsPtr[4..7] * lhsPtr[13]
+
+    "VMLA.F32     q12, q10, d1[0]     \n\t" // column 0 += rhsPtr[8..11] * lhsPtr[2]
+    "VMLA.F32     q13, q10, d3[0]     \n\t" // column 1 += rhsPtr[8..11] * lhsPtr[6]
+    "VMLA.F32     q14, q10, d5[0]     \n\t" // column 2 += rhsPtr[8..11] * lhsPtr[10]
+    "VMLA.F32     q15, q10, d7[0]     \n\t" // column 3 += rhsPtr[8..11] * lhsPtr[14]
+
+    "VMLA.F32     q12, q11, d1[1]     \n\t" // column 0 += rhsPtr[12..15] * lhsPtr[3]
+    "VMLA.F32     q13, q11, d3[1]     \n\t" // column 1 += rhsPtr[12..15] * lhsPtr[7]
+    "VMLA.F32     q14, q11, d5[1]     \n\t" // column 2 += rhsPtr[12..15] * lhsPtr[11]
+    "VMLA.F32     q15, q11, d7[1]     \n\t" // column 3 += rhsPtr[12..15] * lhsPtr[15]
+    "VSTM         %2,  {q12-q15}      \n\t" // store the entire output matrix to temp
+    : "+r"(rhsPtr), "+r"(lhsPtr), "+r"(temp) // operands: %0 = rhsPtr, %1 = lhsPtr, %2 = temp
+    :
+    : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "memory");
+
+#endif
+}
+
  void MultiplyAssign(Dali::Matrix& result, const Dali::Matrix& rhs)
  {
    MATH_INCREASE_COUNTER(PerformanceMonitor::MATRIX_MULTIPLYS);
diff --git a/dali/internal/common/matrix-utils.h b/dali/internal/common/matrix-utils.h

index f65687c..80da0c1 100644 (file)
--- a/dali/internal/common/matrix-utils.h
+++ b/dali/internal/common/matrix-utils.h
@@ -73,6 +73,27 @@ void Multiply(Dali::Matrix& result, const Dali::Matrix& lhs, const Dali::Quatern
  void MultiplyProjectionMatrix(Dali::Matrix& result, const Dali::Matrix& lhs, const Dali::Matrix& projection);
  
  /**
+ * @brief Function to multiply projection matrix with reflection plane and store the result into a third matrix.
+ *
+ * This API assumes that projection is a projection matrix whose top/bottom and left/right are symmetrical.
+ *
+ * A perspective matrix only has values at indices 0, 2, 5, 6, 10, 11, 14 (14 is const value, 1.0f).
+ * An orthographic matrix only has values at indices 0, 2, 5, 6, 10, 14, 15 (15 is const value, 1.0f).
+ * If the window is rotated, indices 1 and 4 are used instead of 0 and 5.
+ * So only 10 values are needed for the multiplication.
+ *
+ * Use this method in time critical path as it does not require temporaries.
+ *
+ * result = projection * lhs
+ *
+ * @SINCE_2_1.46
+ * @param[out] result Result of the multiplication
+ * @param[in] lhs Matrix, this cannot be same matrix as result
+ * @param[in] projection Projection Matrix, this can be same matrix as result
+ */
+void MultiplyProjectionMatrixWithReflect(Dali::Matrix& result, const Dali::Matrix& lhs, const Dali::Matrix& projection);
+
+/**
   * @brief Function to multiply two matrices and store the result onto first one.
   *
   * result = result * rhs
diff --git a/dali/internal/event/actors/actor-coords.cpp b/dali/internal/event/actors/actor-coords.cpp

index 9f1fefe..98c928c 100644 (file)
--- a/dali/internal/event/actors/actor-coords.cpp
+++ b/dali/internal/event/actors/actor-coords.cpp
@@ -83,7 +83,7 @@ bool ConvertScreenToLocal(
  
    // Calculate the inverted ModelViewProjection matrix; this will be used for 2 unprojects
    Matrix invertedMvp(false /*don't init*/);
-  MatrixUtils::Multiply(invertedMvp, modelView, projectionMatrix);
+  MatrixUtils::MultiplyProjectionMatrix(invertedMvp, modelView, projectionMatrix);
    bool success = invertedMvp.Invert();
  
    // Convert to GL coordinates
diff --git a/dali/internal/event/actors/camera-actor-impl.cpp b/dali/internal/event/actors/camera-actor-impl.cpp

index 1c6bb33..8b46bed 100644 (file)
--- a/dali/internal/event/actors/camera-actor-impl.cpp
+++ b/dali/internal/event/actors/camera-actor-impl.cpp
@@ -107,7 +107,7 @@ void BuildOrthoPickingRay(const Matrix&   viewMatrix,
  
    // Transforms the touch point from the screen reference system to the world reference system.
    Matrix invViewProjection(false); // Don't initialize.
-  MatrixUtils::Multiply(invViewProjection, viewMatrix, projectionMatrix);
+  MatrixUtils::MultiplyProjectionMatrix(invViewProjection, viewMatrix, projectionMatrix);
    if(!invViewProjection.Invert())
    {
      DALI_ASSERT_DEBUG(false);
diff --git a/dali/internal/event/common/projection.cpp b/dali/internal/event/common/projection.cpp

index e6cae8d..0396af0 100644 (file)
--- a/dali/internal/event/common/projection.cpp
+++ b/dali/internal/event/common/projection.cpp
@@ -73,7 +73,7 @@ bool UnprojectFull(const Vector4& windowPos,
                     Vector4&       objectPos)
  {
    Matrix invertedMvp(false); // Don't initialize.
-  MatrixUtils::Multiply(invertedMvp, modelView, projection);
+  MatrixUtils::MultiplyProjectionMatrix(invertedMvp, modelView, projection);
  
    if(invertedMvp.Invert())
    {
@@ -124,7 +124,7 @@ bool ProjectFull(const Vector4& position,
    bool ok = false;
  
    Matrix Mvp(false); // Don't initialize.
-  MatrixUtils::Multiply(Mvp, modelView, projection);
+  MatrixUtils::MultiplyProjectionMatrix(Mvp, modelView, projection);
  
    Vector4 p = Mvp * position;
  
diff --git a/dali/internal/render/renderers/render-renderer.cpp b/dali/internal/render/renderers/render-renderer.cpp

index 3bea0e5..6ef9c67 100644 (file)
--- a/dali/internal/render/renderers/render-renderer.cpp
+++ b/dali/internal/render/renderers/render-renderer.cpp
@@ -487,7 +487,7 @@ bool Renderer::Render(Graphics::CommandBuffer&                             comma
      mRenderCallbackInput->size       = size;
      mRenderCallbackInput->projection = projectionMatrix;
  
-    MatrixUtils::Multiply(mRenderCallbackInput->mvp, modelViewMatrix, projectionMatrix);
+    MatrixUtils::MultiplyProjectionMatrix(mRenderCallbackInput->mvp, modelViewMatrix, projectionMatrix);
  
      // submit draw
      commandBuffer.DrawNative(&info);
@@ -739,7 +739,7 @@ void Renderer::WriteUniformBuffer(
      if(mvpUniformInfo && !mvpUniformInfo->name.empty())
      {
        Matrix modelViewProjectionMatrix(false);
-      MatrixUtils::Multiply(modelViewProjectionMatrix, modelViewMatrix, projectionMatrix);
+      MatrixUtils::MultiplyProjectionMatrixWithReflect(modelViewProjectionMatrix, modelViewMatrix, projectionMatrix);
        WriteDefaultUniform(mvpUniformInfo, *uboView, modelViewProjectionMatrix);
      }
  
diff --git a/dali/internal/update/render-tasks/scene-graph-camera.cpp b/dali/internal/update/render-tasks/scene-graph-camera.cpp

index 586d7cc..de7bd92 100644 (file)
--- a/dali/internal/update/render-tasks/scene-graph-camera.cpp
+++ b/dali/internal/update/render-tasks/scene-graph-camera.cpp
@@ -28,6 +28,7 @@
  #include <dali/internal/update/nodes/node.h>
  #include <dali/public-api/common/dali-common.h>
  #include <dali/public-api/math/math-utils.h>
+#include <sstream>
  
  namespace // unnamed namespace
  {
@@ -416,7 +417,7 @@ void Camera::Update(BufferIndex updateBufferIndex)
    if(viewUpdateCount > COPY_PREVIOUS_MATRIX || projectionUpdateCount > COPY_PREVIOUS_MATRIX)
    {
      // either has actually changed so recalculate
-    MatrixUtils::Multiply(mInverseViewProjection[updateBufferIndex], mViewMatrix[updateBufferIndex], mProjectionMatrix[updateBufferIndex]);
+    MatrixUtils::MultiplyProjectionMatrixWithReflect(mInverseViewProjection[updateBufferIndex], mViewMatrix[updateBufferIndex], mProjectionMatrix[updateBufferIndex]);
      UpdateFrustum(updateBufferIndex);
  
      // ignore the error, if the view projection is incorrect (non inversible) then you will have tough times anyways
@@ -514,10 +515,12 @@ uint32_t Camera::UpdateViewMatrix(BufferIndex updateBufferIndex)
              upNew3           = Vector3(upNew);
              LookAt(viewMatrix, positionNew3, targetNewVector3, upNew3);
  
-            Matrix oldViewMatrix(viewMatrix);
-            Matrix tmp;
-            tmp.SetIdentityAndScale(Vector3(-1.0, 1.0, 1.0));
-            MatrixUtils::Multiply(viewMatrix, oldViewMatrix, tmp);
+            // Invert X
+            float* vZ = viewMatrix.AsFloat();
+            vZ[0] = -vZ[0];
+            vZ[4] = -vZ[4];
+            vZ[8] = -vZ[8];
+            vZ[12] = -vZ[12];
  
              mReflectionEye     = positionNew;
              mUseReflectionClip = true;
@@ -540,7 +543,7 @@ void Camera::UpdateFrustum(BufferIndex updateBufferIndex, bool normalize)
  {
    // Extract the clip matrix planes
    Matrix clipMatrix;
-  MatrixUtils::Multiply(clipMatrix, mViewMatrix[updateBufferIndex], mProjectionMatrix[updateBufferIndex]);
+  MatrixUtils::MultiplyProjectionMatrixWithReflect(clipMatrix, mViewMatrix[updateBufferIndex], mProjectionMatrix[updateBufferIndex]);
  
    const float*   cm     = clipMatrix.AsFloat();
    FrustumPlanes& planes = mFrustum[updateBufferIndex];
@@ -692,11 +695,11 @@ uint32_t Camera::UpdateProjection(BufferIndex updateBufferIndex)
              AdjustNearPlaneForPerspective(projectionMatrix, customClipping);
  
              // Invert Z
-            Matrix matZ;
-            matZ.SetIdentity();
-            float* vZ = matZ.AsFloat();
-            vZ[10]    = -vZ[10];
-            MatrixUtils::Multiply(projectionMatrix, projectionMatrix, matZ);
+            float* pZ = projectionMatrix.AsFloat();
+            pZ[2] = -pZ[2];
+            pZ[6] = -pZ[6];
+            pZ[10] = -pZ[10];
+            pZ[14] = -pZ[14];
            }
            break;
          }
@@ -739,11 +742,7 @@ uint32_t Camera::UpdateProjection(BufferIndex updateBufferIndex)
            break;
        }
  
-      Matrix rotation;
-      rotation.SetIdentity();
-      rotation.SetTransformComponents(Vector3(1.0f, 1.0f, 1.0f), rotationAngle, Vector3(0.0f, 0.0f, 0.0f));
-
-      MatrixUtils::Multiply(finalProjection, mProjectionMatrix.Get(updateBufferIndex), rotation);
+      MatrixUtils::Multiply(finalProjection, mProjectionMatrix.Get(updateBufferIndex), rotationAngle);
      }
      --mUpdateProjectionFlag;
    }
author	Eunki Hong <eunkiki.hong@samsung.com>
	Sat, 4 Feb 2023 04:08:53 +0000 (13:08 +0900)
committer	Eunki Hong <eunkiki.hong@samsung.com>
	Sat, 4 Feb 2023 05:21:54 +0000 (14:21 +0900)
dali/internal/common/matrix-utils.cpp		patch \| blob \| history
dali/internal/common/matrix-utils.h		patch \| blob \| history
dali/internal/event/actors/actor-coords.cpp		patch \| blob \| history
dali/internal/event/actors/camera-actor-impl.cpp		patch \| blob \| history
dali/internal/event/common/projection.cpp		patch \| blob \| history
dali/internal/render/renderers/render-renderer.cpp		patch \| blob \| history
dali/internal/update/render-tasks/scene-graph-camera.cpp		patch \| blob \| history