Extras/PhysicsEffects/sample/test_ARM_NEON_performance/test_neon_cross_product.cpp

   1 /*\r
   2  Applied Research Associates Inc. (c)2011\r
   3 \r
   4  Redistribution and use in source and binary forms,\r
   5    with or without modification, are permitted provided that the\r
   6    following conditions are met:\r
   7     * Redistributions of source code must retain the above copyright\r
   8       notice, this list of conditions and the following disclaimer.\r
   9     * Redistributions in binary form must reproduce the above copyright\r
  10       notice, this list of conditions and the following disclaimer in the\r
  11       documentation and/or other materials provided with the distribution.\r
  12     * Neither the name of the Applied Research Associates Inc nor the names\r
  13       of its contributors may be used to endorse or promote products derived\r
  14       from this software without specific prior written permission.\r
  15 \r
  16    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"\r
  17    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\r
  18    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\r
  19    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE\r
  20    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR\r
  21    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF\r
  22    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS\r
  23    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN\r
  24    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\r
  25    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE\r
  26    POSSIBILITY OF SUCH DAMAGE.\r
  27 */\r
  28 \r
  29 #include "test_neon.h"\r
  30 #define SCE_PFX_USE_PERFCOUNTER\r
  31 #include "physics_effects.h"\r
  32 #include <arm_neon.h>\r
  33 #include <string.h>\r
  34 #include <stdio.h>\r
  35 #include <stdlib.h>\r
  36 #include <android/log.h>\r
  37 \r
  38 // This works with gcc\r
  39 #define SET_ALIGNMENT(alignment)   __attribute__((__aligned__((alignment))))\r
  40 \r
  41 //#define PRINT_COMPUTED_VECTOR_RESULTS\r
  42 \r
  43 // assembly implementations\r
  44 extern "C"\r
  45 {\r
  46         void CrossProductNeonResultInMemoryAssembly(float *a, float *b, float *pfResult);\r
  47         void CrossProductNeonResultInMemoryAssembly2(float *a, float *b, float *pfResult);\r
  48         void CrossProductNeonResultInMemoryAssembly3(float *a, float *b, float *pfResult);\r
  49 }\r
  50 \r
  51 //----------------------------------------------------------------------------\r
  52 //  CrossProductNeonResultInMemory\r
  53 //\r
  54 /// Performs a Vector3 style cross product using NEON intrinsics, storing the\r
  55 /// result directly into system memory.\r
  56 ///\r
  57 /// @param  a         Input vector 1. Must point to 4 float values\r
  58 /// @param  b         Input vector 2. Must point to 4 float values\r
  59 /// @param  pfResult  [in] must point to an array of at least *4*\r
  60 ///                   float values. [out] The result of the cross\r
  61 ///                   product is contained in the first 3 lanes.\r
  62 ///\r
  63 /// NOTE: The parameter types here are floats, not float32_t's. gcc\r
  64 /// sometimes doesn't interpret float32_t's correctly. In particular,\r
  65 /// if the type of pfResult is set to float32_t*, gcc will throw an\r
  66 /// internal compiler error (ICE) for this code. In memory, float32_t\r
  67 /// and float are equivalent, so can cast between them explicitly.\r
  68 //----------------------------------------------------------------------------\r
  69 void CrossProductNeonResultInMemory(float *a, float *b, float *pfResult)\r
  70 {\r
  71         float32x4_t v1 = {a[1],a[2],a[0], 0.0f};\r
  72         float32x4_t v2 = {b[2],b[0],b[1], 0.0f};\r
  73         float32x4_t v3 = {a[2],a[0],a[1], 0.0f};\r
  74         float32x4_t v4 = {b[1],b[2],b[0], 0.0f};\r
  75         v1 = vmulq_f32(v1, v2);\r
  76         v1 = vmlsq_f32(v1, v3, v4);\r
  77         vst1q_f32((float32_t*)pfResult, v1);\r
  78 }\r
  79 \r
  80 void CrossProductNeonResultInMemoryCPPAssembly(float *a, float *b, float *result) {\r
  81     asm volatile(\r
  82                 "vld1.32 {d18[1]}, [r1]!         \n\t"\r
  83                 "vld1.32 {d19[0]}, [r1]!         \n\t"\r
  84                 "vld1.32 {d18[0]}, [r1]!         \n\t"\r
  85                 "vld1.32 {d19[1]}, [r1]          \n\t"\r
  86                 "vld1.32 {d17[0]}, [r0]!         \n\t"\r
  87                 "vld1.32 {d16}, [r0]!            \n\t"\r
  88                 "vld1.32 {d17[1]}, [r0]          \n\t"\r
  89                 "vmul.f32 q10, q8, q9            \n\t"\r
  90                 "vtrn.32 d18,d19                         \n\t"\r
  91                 "vrev64.32 d16,d16                       \n\t"\r
  92                 "vrev64.32 d18,d18                       \n\t"\r
  93                 "vtrn.32 d16,d17                         \n\t"\r
  94                 "vmls.f32 q10, q8, q9            \n\t"\r
  95                 "vst1.32        {q10}, [r2]              \n\t"\r
  96     );\r
  97 }\r
  98 \r
  99 //----------------------------------------------------------------------------\r
 100 //  CrossProductScalarResultInMemory\r
 101 //\r
 102 /// Performs a Vector3 style cross product using scalar math, storing the\r
 103 /// result directly into system memory.\r
 104 ///\r
 105 /// @param  a         Input vector 1. Must point to 4 float values\r
 106 /// @param  b         Input vector 2. Must point to 4 float values\r
 107 /// @param  pfResult  [in] pointer to a float. [out] Contains the\r
 108 ///                   result, dotproduct(a,b)\r
 109 //----------------------------------------------------------------------------\r
 110 void CrossProductScalarResultInMemory(float *a, float *b, float *pfResult)\r
 111 {\r
 112         pfResult[0] = a[1]*b[2] - a[2]*b[1];\r
 113         pfResult[1] = a[2]*b[0] - a[0]*b[2];\r
 114         pfResult[2] = a[0]*b[1] - a[1]*b[0];\r
 115 }\r
 116 \r
 117 //----------------------------------------------------------------------------\r
 118 //  TestFastNeonCrossProduct\r
 119 //\r
 120 /// Run timing study of the cross product functions above, writing the\r
 121 /// results to the Android verbose log.\r
 122 //----------------------------------------------------------------------------\r
 123 void TestNeonCrossProduct()\r
 124 {\r
 125         float SET_ALIGNMENT(64) data[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,\r
 126                                                                                 float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f};\r
 127 \r
 128         float *a = &data[0];\r
 129         float *b = &data[4];\r
 130 \r
 131         char szMsg[256];\r
 132 \r
 133         sprintf(szMsg, "");\r
 134         __android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);\r
 135         sprintf(szMsg,"---------------------------------------");\r
 136         __android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);\r
 137         sprintf(szMsg, "Cross product test inputs A=<%f,%f,%f>, B=<%f,%f,%f>",\r
 138                                         a[0], a[1], a[2], b[0], b[1], b[2]);\r
 139         __android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);\r
 140 \r
 141         float SET_ALIGNMENT(64) fResult[4];\r
 142     \r
 143         double dTimeSpan, dRefTimeSpan;; \r
 144         unsigned int uiNumTries = 10000000;\r
 145         unsigned int i;\r
 146 \r
 147         sce::PhysicsEffects::PfxPerfCounter pc;\r
 148 \r
 149 // profile scalar cross product with direct memory return\r
 150         fResult[0] = 0.0f;\r
 151         pc.countBegin("");\r
 152         for(i = 0; i < uiNumTries; i++)\r
 153         {\r
 154                 CrossProductScalarResultInMemory(a, b, fResult);\r
 155         }\r
 156         pc.countEnd();\r
 157         dTimeSpan = pc.getCountTime(0);\r
 158         pc.resetCount();\r
 159         dRefTimeSpan = dTimeSpan;\r
 160         sprintf(szMsg, "Time to do %i calls for CrossProductScalarResultInMemory: %f secs, speedup: %5.2f, result value=<%f,%f,%f>",\r
 161                                         uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2]);\r
 162         __android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);\r
 163 \r
 164 \r
 165 \r
 166 // profile NEON assembly volatile cross product with direct memory return\r
 167         fResult[0] = 0.0f;\r
 168         pc.countBegin("");\r
 169         for(i = 0; i < uiNumTries; i++)\r
 170         {\r
 171                 CrossProductNeonResultInMemoryCPPAssembly(a, b, fResult);\r
 172         }\r
 173         pc.countEnd();\r
 174         dTimeSpan = pc.getCountTime(0);\r
 175         pc.resetCount();\r
 176         sprintf(szMsg, "Time to do %i calls for CrossProductNeonResultInMemoryFast: %f secs, speedup: %5.2f, result value=<%f,%f,%f>",\r
 177                                         uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2]);\r
 178         __android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);\r
 179 \r
 180 \r
 181 \r
 182 // profile NEON cross product with direct memory return, assembly version\r
 183         fResult[0] = 0.0f;\r
 184         pc.countBegin("");\r
 185         for(i = 0; i < uiNumTries; i++)\r
 186         {\r
 187                 CrossProductNeonResultInMemoryAssembly(a, b, fResult);\r
 188         }\r
 189         pc.countEnd();\r
 190         dTimeSpan = pc.getCountTime(0);\r
 191         pc.resetCount();\r
 192         sprintf(szMsg, "Time to do %i calls for CrossProductNeonResultInMemoryAssembly: %f secs, speedup: %5.2f, result value=<%f,%f,%f>",\r
 193                                         uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2]);\r
 194         __android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);\r
 195 \r
 196 \r
 197 \r
 198 // profile NEON cross product with direct memory return, assembly version 2\r
 199         fResult[0] = 0.0f;\r
 200         pc.countBegin("");\r
 201         for(i = 0; i < uiNumTries; i++)\r
 202         {\r
 203                 CrossProductNeonResultInMemoryAssembly2(a, b, fResult);\r
 204         }\r
 205         pc.countEnd();\r
 206         dTimeSpan = pc.getCountTime(0);\r
 207         pc.resetCount();\r
 208 \r
 209         sprintf(szMsg, "Time to do %i calls for CrossProductNeonResultInMemoryAssembly2: %f secs, speedup: %5.2f, result value=<%f,%f,%f>",\r
 210                                         uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2]);\r
 211         __android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);\r
 212 \r
 213 \r
 214 \r
 215 // profile NEON cross product with direct memory return, assembly version 3\r
 216         fResult[0] = 0.0f;\r
 217         pc.countBegin("");\r
 218         for(i = 0; i < uiNumTries; i++)\r
 219         {\r
 220                 CrossProductNeonResultInMemoryAssembly3(a, b, fResult);\r
 221         }\r
 222         pc.countEnd();\r
 223         dTimeSpan = pc.getCountTime(0);\r
 224         pc.resetCount();\r
 225 \r
 226         sprintf(szMsg, "Time to do %i calls for CrossProductNeonResultInMemoryAssembly3: %f secs, speedup: %5.2f, result value=<%f,%f,%f>",\r
 227                                         uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2]);\r
 228         __android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);\r
 229 }\r