5 // Copyright (c) 2011 Apple Inc.
9 #include "LinearMath/btScalar.h"
10 #if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
13 #include "Test_dot3.h"
20 #include <LinearMath/btVector3.h>
22 // reference code for testing purposes
// Scalar reference for btVector3::dot3.
// Takes a vector v and three further vectors v1, v2, v3 and returns a
// btVector3 whose components are v.dot(v1), v.dot(v2) and v.dot(v3), in that
// order. The first line is the forward declaration, the second the definition.
23 static btVector3 dot3_ref( const btVector3 &, const btVector3 &, const btVector3 &, const btVector3 &);
24 static btVector3 dot3_ref( const btVector3 &v, const btVector3 &v1, const btVector3 &v2, const btVector3 &v3)
// Built from three independent calls to btVector3::dot, one per component.
26 return btVector3( v.dot(v1), v.dot(v2), v.dot(v3));
// Inequality test for two btVector3 values; returns non-zero when they differ.
// Three code paths are provided (SSE, NEON, plain scalar). In each of them only
// the first three components take part in the comparison: the SSE path masks
// the result with 7, the NEON path ANDs with {-1,-1,-1,0}, and the scalar path
// reads m_floats[0..2] only.
30 SIMD_FORCE_INLINE int operator!=(const btVector3 &s, const btVector3 &v)
// SSE path: lane-wise "not equal" compare of the two 128-bit registers.
33 __m128 test = _mm_cmpneq_ps( s.mVec128, v.mVec128 );
// movemask packs one bit per lane; "& 7" keeps the bits for lanes 0..2.
34 return (_mm_movemask_ps( test ) & 7) != 0;
35 #elif defined __ARM_NEON_H
// NEON path: lane-wise equality compare, then clear lane 3 with the mask so it
// cannot influence the result. Each equal lane is all-ones, i.e. -1 as int32.
36 uint32x4_t test = vandq_u32( vceqq_f32( s.mVec128, v.mVec128 ), (uint32x4_t){-1,-1,-1,0});
// Pairwise add of the low and high halves: t = { test[0]+test[1], test[2]+test[3] }.
37 uint32x2_t t = vpadd_u32( vget_low_u32(test), vget_high_u32(test));
// Three equal lanes sum to -3, so anything else means "different".
// NOTE(review): lane 0 of t holds only test[0]+test[1] after the single
// pairwise add above; reaching -3 requires one more pairwise add of t with
// itself before this line - confirm that statement is present.
39 return -3 != (int32_t) vget_lane_u32(t, 0);
// Scalar fallback: compare the first three floats one by one.
41 return s.m_floats[0] != v.m_floats[0] ||
42 s.m_floats[1] != v.m_floats[1] ||
43 s.m_floats[2] != v.m_floats[2];
// Upper bound of the inner benchmark loops (which advance four calls at a
// time), and the divisor used when the per-call cost is printed.
50 #define LOOPCOUNT 1000
// Number of timed passes run for each implementation, and the divisor used
// when an accumulated time is turned into an average.
51 #define NUM_CYCLES 10000
// Test body for btVector3::dot3. It fills four input arrays with random
// vectors, computes both the reference result (dot3_ref) and the dot3 result
// for every element, logs an error message for a mismatch, then times the
// reference and the dot3 implementation and logs the per-call cost of each.
55 btVector3 v, v1, v2, v3;
// Element count of every test array. The timing loops wrap their index with
// "k & (DATA_SIZE-1)", which relies on this value being a power of two.
57 #define DATA_SIZE 1024
// Inputs: vec3_arr is the left-hand operand, vec3_arr1..3 are the three
// right-hand operands; res_arr receives results in the timing loops.
59 btVector3 vec3_arr[DATA_SIZE];
60 btVector3 vec3_arr1[DATA_SIZE];
61 btVector3 vec3_arr2[DATA_SIZE];
62 btVector3 vec3_arr3[DATA_SIZE];
63 btVector3 res_arr[DATA_SIZE];
68 btVector3 correct, test;
// Correctness pass over every element.
70 for( k = 0; k < DATA_SIZE; k++ )
// Random x/y/z with BT_NAN in the fourth lane - presumably so that any use of
// the fourth lane by dot3 would show up in the result; TODO confirm intent.
73 vec3_arr[k] = btVector3( btAssign128( RANDF, RANDF, RANDF, BT_NAN));
74 vec3_arr1[k] = btVector3( btAssign128( RANDF, RANDF, RANDF, BT_NAN));
75 vec3_arr2[k] = btVector3( btAssign128( RANDF, RANDF, RANDF, BT_NAN ));
76 vec3_arr3[k] = btVector3( btAssign128( RANDF, RANDF, RANDF, BT_NAN));
// Expected value from the scalar reference, then the value under test.
78 correct = dot3_ref(vec3_arr[k], vec3_arr1[k], vec3_arr2[k], vec3_arr3[k]);
79 test = vec3_arr[k].dot3( vec3_arr1[k], vec3_arr2[k], vec3_arr3[k]);
// Mismatch report: element index followed by all four components of the
// expected and the actual vector, printed as hex floats (%a).
// NOTE(review): k is paired with %ld here but is assigned to a size_t further
// down; if k is declared size_t, %zu is the matching specifier - confirm.
83 vlog( "Error (%ld) - dot3 result error! *{%a, %a, %a, %a} != {%a, %a, %a, %a} \n", k,
84 correct.x(), correct.y(), correct.z(), correct.w(),
85 test.x(), test.y(), test.z(), test.w() );
// ---- Timing of the scalar reference ----
93 uint64_t startTime, bestTime, currentTime;
// NOTE(review): bestTime is only ever lowered below, and scalarTime is only
// added to, so both need an initial value before this loop - confirm.
97 for (j = 0; j < NUM_CYCLES; j++)
99 startTime = ReadTicks();
// Four calls per iteration. k is always a multiple of 4, so after wrapping
// into the array range k32..k32+3 stays within DATA_SIZE.
100 for( k = 0; k+4 <= LOOPCOUNT; k+=4 )
102 size_t k32 = (k & (DATA_SIZE-1));
103 res_arr[k32] = dot3_ref( vec3_arr[k32], vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]); k32++;
104 res_arr[k32] = dot3_ref( vec3_arr[k32], vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]); k32++;
105 res_arr[k32] = dot3_ref( vec3_arr[k32], vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]); k32++;
106 res_arr[k32] = dot3_ref( vec3_arr[k32], vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]);
// Elapsed ticks for this pass: accumulate the total and track the minimum.
108 currentTime = ReadTicks() - startTime;
109 scalarTime += currentTime;
110 if( currentTime < bestTime )
111 bestTime = currentTime;
// Report the best pass when gReportAverageTimes is 0.
113 if( 0 == gReportAverageTimes )
114 scalarTime = bestTime;
// Average over all passes. NOTE(review): presumably the alternative branch of
// the test above (otherwise the best time would also be divided) - confirm.
116 scalarTime /= NUM_CYCLES;
// ---- Timing of btVector3::dot3, same structure as the scalar timing ----
120 uint64_t startTime, bestTime, currentTime;
// NOTE(review): as above, bestTime and vectorTime need an initial value before
// this loop - confirm.
124 for (j = 0; j < NUM_CYCLES; j++)
126 startTime = ReadTicks();
// Same four-calls-per-iteration pattern and index wrapping as the scalar loop.
127 for( k = 0; k+4 <= LOOPCOUNT; k+=4 )
129 size_t k32 = (k & (DATA_SIZE-1));
130 res_arr[k32] = vec3_arr[k32].dot3( vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]); k32++;
131 res_arr[k32] = vec3_arr[k32].dot3( vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]); k32++;
132 res_arr[k32] = vec3_arr[k32].dot3( vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]); k32++;
133 res_arr[k32] = vec3_arr[k32].dot3( vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]);
// Elapsed ticks for this pass: accumulate the total and track the minimum.
135 currentTime = ReadTicks() - startTime;
136 vectorTime += currentTime;
137 if( currentTime < bestTime )
138 bestTime = currentTime;
// Report the best pass when gReportAverageTimes is 0.
140 if( 0 == gReportAverageTimes )
141 vectorTime = bestTime;
// Average over all passes. NOTE(review): presumably the alternative branch of
// the test above - confirm.
143 vectorTime /= NUM_CYCLES;
// Column header, then both timings converted with TicksToCycles and divided by
// LOOPCOUNT to give a per-call figure.
147 vlog( " \t scalar\t vector\n" );
148 vlog( " \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, TicksToCycles( vectorTime ) / LOOPCOUNT );