- "VLDM %1, {q4-q6} \n\t" // load matrix 1 (lhsPtr)
- "VLD1.F32 {q7}, [%2]! \n\t" // load matrix 2 (rhsPtr) [0..3]
- "VMUL.F32 q0, q7, d8[0] \n\t" // column 0 = rhsPtr[0..3] * lhsPtr[0..3]
- "VMUL.F32 q1, q7, d10[0] \n\t" // column 1 = rhsPtr[0..3] * lhsPtr[4..7]
- "VMUL.F32 q2, q7, d12[0] \n\t" // column 2 = rhsPtr[0..3] * lhsPtr[8..11]
- "VLD1.F32 {q7}, [%2]! \n\t" // load matrix 2 (rhsPtr) [4..7]
- "VMLA.F32 q0, q7, d8[1] \n\t" // column 0+= rhsPtr[4..7] * lhsPtr[0..3]
- "VMLA.F32 q1, q7, d10[1] \n\t" // column 1+= rhsPtr[4..7] * lhsPtr[4..7]
- "VMLA.F32 q2, q7, d12[1] \n\t" // column 2+= rhsPtr[4..7] * lhsPtr[8..11]
- "VLD1.F32 {q7}, [%2]! \n\t" // load matrix 2 (rhsPtr) [8..11]
- "VMLA.F32 q0, q7, d9[0] \n\t" // column 0+= rhsPtr[8..11] * lhsPtr[0..3]
- "VMLA.F32 q1, q7, d11[0] \n\t" // column 1+= rhsPtr[8..11] * lhsPtr[4..7]
- "VMLA.F32 q2, q7, d13[0] \n\t" // column 2+= rhsPtr[8..11] * lhsPtr[8..11]
- "VSTM %0, {q0-q2} \n\t" // store entire output matrix.
+ "VLDM %1, {q4-q7} \n\t" // load matrix 1 (lhsPtr): 16 floats into q4-q7 (d8-d15). NOTE(review): confirm q4-q8 are in the clobber list (not visible here)
+ "VLD1.F32 {q8}, [%2]! \n\t" // load matrix 2 (rhsPtr) [0..3] into q8; '!' post-increments %2
+ "VMUL.F32 q0, q8, d8[0] \n\t" // column 0 = rhsPtr[0..3] * lhsPtr[0] (lane d8[0])
+ "VMUL.F32 q1, q8, d10[0] \n\t" // column 1 = rhsPtr[0..3] * lhsPtr[4] (lane d10[0])
+ "VMUL.F32 q2, q8, d12[0] \n\t" // column 2 = rhsPtr[0..3] * lhsPtr[8] (lane d12[0])
+ "VMUL.F32 q3, q8, d14[0] \n\t" // column 3 = rhsPtr[0..3] * lhsPtr[12] (lane d14[0])
+ "VLD1.F32 {q8}, [%2]! \n\t" // load matrix 2 (rhsPtr) [4..7], reusing q8
+ "VMLA.F32 q0, q8, d8[1] \n\t" // column 0 += rhsPtr[4..7] * lhsPtr[1] (lane d8[1])
+ "VMLA.F32 q1, q8, d10[1] \n\t" // column 1 += rhsPtr[4..7] * lhsPtr[5] (lane d10[1])
+ "VMLA.F32 q2, q8, d12[1] \n\t" // column 2 += rhsPtr[4..7] * lhsPtr[9] (lane d12[1])
+ "VMLA.F32 q3, q8, d14[1] \n\t" // column 3 += rhsPtr[4..7] * lhsPtr[13] (lane d14[1])
+ "VLD1.F32 {q8}, [%2]! \n\t" // load matrix 2 (rhsPtr) [8..11], reusing q8
+ "VMLA.F32 q0, q8, d9[0] \n\t" // column 0 += rhsPtr[8..11] * lhsPtr[2] (lane d9[0])
+ "VMLA.F32 q1, q8, d11[0] \n\t" // column 1 += rhsPtr[8..11] * lhsPtr[6] (lane d11[0])
+ "VMLA.F32 q2, q8, d13[0] \n\t" // column 2 += rhsPtr[8..11] * lhsPtr[10] (lane d13[0])
+ "VMLA.F32 q3, q8, d15[0] \n\t" // column 3 += rhsPtr[8..11] * lhsPtr[14] (lane d15[0])
+ "VSTM %0, {q0-q3} \n\t" // store q0-q3 (16 floats) to the address in %0. NOTE(review): lhsPtr[3,7,11,15] (d9[1],d11[1],d13[1],d15[1]) and rhsPtr[12..15] are never used -- confirm a 3-term sum per column is intended