2 Applied Research Associates Inc. (c)2011
\r
4 Redistribution and use in source and binary forms,
\r
5 with or without modification, are permitted provided that the
\r
6 following conditions are met:
\r
7 * Redistributions of source code must retain the above copyright
\r
8 notice, this list of conditions and the following disclaimer.
\r
9 * Redistributions in binary form must reproduce the above copyright
\r
10 notice, this list of conditions and the following disclaimer in the
\r
11 documentation and/or other materials provided with the distribution.
\r
12 * Neither the name of the Applied Research Associates Inc nor the names
\r
13 of its contributors may be used to endorse or promote products derived
\r
14 from this software without specific prior written permission.
\r
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
\r
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
\r
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
\r
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
\r
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
\r
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
\r
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
\r
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
\r
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
\r
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
\r
26 POSSIBILITY OF SUCH DAMAGE.
\r
29 #include "test_neon.h"
\r
30 #define SCE_PFX_USE_PERFCOUNTER
\r
31 #include "physics_effects.h"
\r
32 #include <arm_neon.h>
\r
36 #include <android/log.h>
\r
38 // This works with gcc
\r
39 #define SET_ALIGNMENT(alignment) __attribute__((__aligned__((alignment))))
\r
41 //#define PRINT_COMPUTED_VECTOR_RESULTS
\r
43 // assembly implementations
\r
46 void CrossProductNeonResultInMemoryAssembly(float *a, float *b, float *pfResult);
\r
47 void CrossProductNeonResultInMemoryAssembly2(float *a, float *b, float *pfResult);
\r
48 void CrossProductNeonResultInMemoryAssembly3(float *a, float *b, float *pfResult);
\r
//----------------------------------------------------------------------------
// CrossProductNeonResultInMemory
//
/// Performs a Vector3 style cross product (a x b) using NEON intrinsics,
/// storing the result directly into system memory.
///
/// @param a         Input vector 1. Must point to 4 float values; only the
///                  first 3 are read.
/// @param b         Input vector 2. Must point to 4 float values; only the
///                  first 3 are read.
/// @param pfResult  [in] must point to an array of at least *4* float
///                  values, because a full 4-lane vector is stored.
///                  [out] The cross product is contained in the first 3
///                  lanes; lane 3 is computed as 0*0 - 0*0.
///
/// NOTE: The parameter types here are floats, not float32_t's. gcc
///       sometimes doesn't interpret float32_t's correctly. In particular,
///       if the type of pfResult is set to float32_t*, gcc will throw an
///       internal compiler error (ICE) for this code. In memory, float32_t
///       and float are equivalent, so we can cast between them explicitly.
//----------------------------------------------------------------------------
void CrossProductNeonResultInMemory(float *a, float *b, float *pfResult)
{
    // The operands are pre-shuffled so that one multiply followed by one
    // multiply-subtract yields every component at once:
    //   lane 0: a[1]*b[2] - a[2]*b[1]
    //   lane 1: a[2]*b[0] - a[0]*b[2]
    //   lane 2: a[0]*b[1] - a[1]*b[0]
    float32x4_t v1 = {a[1], a[2], a[0], 0.0f};
    float32x4_t v2 = {b[2], b[0], b[1], 0.0f};
    float32x4_t v3 = {a[2], a[0], a[1], 0.0f};
    float32x4_t v4 = {b[1], b[2], b[0], 0.0f};

    v1 = vmulq_f32(v1, v2);       // first product of each component
    v1 = vmlsq_f32(v1, v3, v4);   // v1 - v3 * v4

    // Explicit cast: see the NOTE above about float vs. float32_t.
    vst1q_f32((float32_t*)pfResult, v1);
}
\r
80 void CrossProductNeonResultInMemoryCPPAssembly(float *a, float *b, float *result) {
\r
82 "vld1.32 {d18[1]}, [r1]! \n\t"
\r
83 "vld1.32 {d19[0]}, [r1]! \n\t"
\r
84 "vld1.32 {d18[0]}, [r1]! \n\t"
\r
85 "vld1.32 {d19[1]}, [r1] \n\t"
\r
86 "vld1.32 {d17[0]}, [r0]! \n\t"
\r
87 "vld1.32 {d16}, [r0]! \n\t"
\r
88 "vld1.32 {d17[1]}, [r0] \n\t"
\r
89 "vmul.f32 q10, q8, q9 \n\t"
\r
90 "vtrn.32 d18,d19 \n\t"
\r
91 "vrev64.32 d16,d16 \n\t"
\r
92 "vrev64.32 d18,d18 \n\t"
\r
93 "vtrn.32 d16,d17 \n\t"
\r
94 "vmls.f32 q10, q8, q9 \n\t"
\r
95 "vst1.32 {q10}, [r2] \n\t"
\r
//----------------------------------------------------------------------------
// CrossProductScalarResultInMemory
//
/// Performs a Vector3 style cross product (a x b) using scalar math,
/// storing the result directly into system memory.
///
/// @param a         Input vector 1. Must point to at least 3 float values.
/// @param b         Input vector 2. Must point to at least 3 float values.
/// @param pfResult  [in] must point to an array of at least 3 float
///                  values. [out] Contains the cross product of a and b.
///                  May alias a or b.
//----------------------------------------------------------------------------
void CrossProductScalarResultInMemory(float *a, float *b, float *pfResult)
{
    // Compute every component before storing any of them so that the
    // result is still correct when pfResult points at a or b.
    const float fX = a[1]*b[2] - a[2]*b[1];
    const float fY = a[2]*b[0] - a[0]*b[2];
    const float fZ = a[0]*b[1] - a[1]*b[0];

    pfResult[0] = fX;
    pfResult[1] = fY;
    pfResult[2] = fZ;
}
\r
117 //----------------------------------------------------------------------------
\r
118 // TestNeonCrossProduct
\r
120 /// Run timing study of the cross product functions above, writing the
\r
121 /// results to the Android verbose log.
\r
122 //----------------------------------------------------------------------------
\r
123 void TestNeonCrossProduct()
\r
125 float SET_ALIGNMENT(64) data[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
\r
126 float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f};
\r
128 float *a = &data[0];
\r
129 float *b = &data[4];
\r
133 sprintf(szMsg, "");
\r
134 __android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
\r
135 sprintf(szMsg,"---------------------------------------");
\r
136 __android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
\r
137 sprintf(szMsg, "Cross product test inputs A=<%f,%f,%f>, B=<%f,%f,%f>",
\r
138 a[0], a[1], a[2], b[0], b[1], b[2]);
\r
139 __android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
\r
141 float SET_ALIGNMENT(64) fResult[4];
\r
143 double dTimeSpan, dRefTimeSpan;;
\r
144 unsigned int uiNumTries = 10000000;
\r
147 sce::PhysicsEffects::PfxPerfCounter pc;
\r
149 // profile scalar cross product with direct memory return
\r
152 for(i = 0; i < uiNumTries; i++)
\r
154 CrossProductScalarResultInMemory(a, b, fResult);
\r
157 dTimeSpan = pc.getCountTime(0);
\r
159 dRefTimeSpan = dTimeSpan;
\r
160 sprintf(szMsg, "Time to do %i calls for CrossProductScalarResultInMemory: %f secs, speedup: %5.2f, result value=<%f,%f,%f>",
\r
161 uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2]);
\r
162 __android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
\r
166 // profile NEON assembly volatile cross product with direct memory return
\r
169 for(i = 0; i < uiNumTries; i++)
\r
171 CrossProductNeonResultInMemoryCPPAssembly(a, b, fResult);
\r
174 dTimeSpan = pc.getCountTime(0);
\r
176 sprintf(szMsg, "Time to do %i calls for CrossProductNeonResultInMemoryFast: %f secs, speedup: %5.2f, result value=<%f,%f,%f>",
\r
177 uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2]);
\r
178 __android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
\r
182 // profile NEON cross product with direct memory return, assembly version
\r
185 for(i = 0; i < uiNumTries; i++)
\r
187 CrossProductNeonResultInMemoryAssembly(a, b, fResult);
\r
190 dTimeSpan = pc.getCountTime(0);
\r
192 sprintf(szMsg, "Time to do %i calls for CrossProductNeonResultInMemoryAssembly: %f secs, speedup: %5.2f, result value=<%f,%f,%f>",
\r
193 uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2]);
\r
194 __android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
\r
198 // profile NEON cross product with direct memory return, assembly version 2
\r
201 for(i = 0; i < uiNumTries; i++)
\r
203 CrossProductNeonResultInMemoryAssembly2(a, b, fResult);
\r
206 dTimeSpan = pc.getCountTime(0);
\r
209 sprintf(szMsg, "Time to do %i calls for CrossProductNeonResultInMemoryAssembly2: %f secs, speedup: %5.2f, result value=<%f,%f,%f>",
\r
210 uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2]);
\r
211 __android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
\r
215 // profile NEON cross product with direct memory return, assembly version 3
\r
218 for(i = 0; i < uiNumTries; i++)
\r
220 CrossProductNeonResultInMemoryAssembly3(a, b, fResult);
\r
223 dTimeSpan = pc.getCountTime(0);
\r
226 sprintf(szMsg, "Time to do %i calls for CrossProductNeonResultInMemoryAssembly3: %f secs, speedup: %5.2f, result value=<%f,%f,%f>",
\r
227 uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2]);
\r
228 __android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
\r