Extras/PhysicsEffects/sample/test_ARM_NEON_performance/neon_matrix3_operator_multiply.S

   1 @\r
   2 @ Applied Research Associates Inc. (c)2011\r
   3 @\r
   4 @ Redistribution and use in source and binary forms,\r
   5 @   with or without modification, are permitted provided that the\r
   6 @   following conditions are met:\r
   7 @    * Redistributions of source code must retain the above copyright\r
   8 @      notice, this list of conditions and the following disclaimer.\r
   9 @    * Redistributions in binary form must reproduce the above copyright\r
  10 @      notice, this list of conditions and the following disclaimer in the\r
  11 @      documentation and/or other materials provided with the distribution.\r
  12 @    * Neither the name of the Applied Research Associates Inc nor the names\r
  13 @      of its contributors may be used to endorse or promote products derived\r
  14 @      from this software without specific prior written permission.\r
  15 @\r
  16 @   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"\r
  17 @   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\r
  18 @   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\r
  19 @   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE\r
  20 @   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR\r
  21 @   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF\r
  22 @   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS\r
  23 @   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN\r
  24 @   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\r
  25 @   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE\r
  26 @   POSSIBILITY OF SUCH DAMAGE.\r
  27 @\r
  28         .syntax unified\r
  29         .arch armv7-a\r
  30         .fpu neon\r
  31         .thumb\r
  32         .text\r
  33         .align 2\r
  34 \r
  35 @ matrix3 operator *, result stored directly to memory\r
  36 \r
  37         .global Matrix3OperatorMultiplyNeon\r
  38         .thumb_func\r
  39 Matrix3OperatorMultiplyNeon:\r
  40         .fnstart\r
  41         vld1.32     {d16-d19}, [r0]!            @ load first eight elements of matrix 0\r
  42     vld1.32     {d20-d21}, [r0]             @ load second eight elements of matrix 0\r
  43     vld1.32     {d0-d3}, [r1]!              @ load first eight elements of matrix 1\r
  44     vld1.32     {d4-d5}, [r1]               @ load second eight elements of matrix 1\r
  45 \r
  46         vmul.f32    q12, q8, d0[0]              @ rslt col0  = (mat0 col0) * (mat1 col0 elt0)\r
  47     vmul.f32    q13, q8, d2[0]              @ rslt col1  = (mat0 col0) * (mat1 col1 elt0)\r
  48     vmul.f32    q14, q8, d4[0]              @ rslt col2  = (mat0 col0) * (mat1 col2 elt0)\r
  49 \r
  50     vmla.f32    q12, q9, d0[1]              @ rslt col0 += (mat0 col1) * (mat1 col0 elt1)\r
  51     vmla.f32    q13, q9, d2[1]              @ rslt col1 += (mat0 col1) * (mat1 col1 elt1)\r
  52     vmla.f32    q14, q9, d4[1]              @ rslt col2 += (mat0 col1) * (mat1 col2 elt1)\r
  53 \r
  54     vmla.f32    q12, q10, d1[0]             @ rslt col0 += (mat0 col2) * (mat1 col0 elt2)\r
  55     vmla.f32    q13, q10, d3[0]             @ rslt col1 += (mat0 col2) * (mat1 col1 elt2)\r
  56     vmla.f32    q14, q10, d5[0]             @ rslt col2 += (mat0 col2) * (mat1 col2 elt2)\r
  57 \r
  58     vst1.32     {d24-d27}, [r2]!            @ store first eight elements of result\r
  59     vst1.32     {d28-d29}, [r2]             @ store second eight elements of result\r
  60 \r
  61         bx      lr\r
  62         .fnend\r