2 Copyright (c) 2012 Advanced Micro Devices, Inc.
4 This software is provided 'as-is', without any express or implied warranty.
5 In no event will the authors be held liable for any damages arising from the use of this software.
6 Permission is granted to anyone to use this software for any purpose,
7 including commercial applications, and to alter it and redistribute it freely,
8 subject to the following restrictions:
10 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
11 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
12 3. This notice may not be removed or altered from any source distribution.
14 //Originally written by Takahiro Harada
17 //#pragma OPENCL EXTENSION cl_amd_printf : enable
18 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
19 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
20 #pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
21 #pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
24 #ifdef cl_ext_atomic_counters_32
25 #pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
27 #define counter32_t volatile global int*
30 typedef unsigned int u32;
31 typedef unsigned short u16;
32 typedef unsigned char u8;
34 #define GET_GROUP_IDX get_group_id(0)
35 #define GET_LOCAL_IDX get_local_id(0)
36 #define GET_GLOBAL_IDX get_global_id(0)
37 #define GET_GROUP_SIZE get_local_size(0)
38 #define GET_NUM_GROUPS get_num_groups(0)
39 #define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
40 #define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
41 #define AtomInc(x) atom_inc(&(x))
42 #define AtomInc1(x, out) out = atom_inc(&(x))
43 #define AppendInc(x, out) out = atomic_inc(x)
44 #define AtomAdd(x, value) atom_add(&(x), value)
45 #define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
46 #define AtomXhg(x, value) atom_xchg ( &(x), value )
49 #define SELECT_UINT4( b, a, condition ) select( b,a,condition )
51 #define mymake_float4 (float4)
52 //#define make_float2 (float2)
53 //#define make_uint4 (uint4)
54 //#define make_int4 (int4)
55 //#define make_uint2 (uint2)
56 //#define make_int2 (int2)
63 ///////////////////////////////////////
65 ///////////////////////////////////////
71 float4 fastNormalize4(float4 v)
73 return fast_normalize(v);
79 float4 cross3(float4 a, float4 b)
85 float dot3F4(float4 a, float4 b)
87 float4 a1 = mymake_float4(a.xyz,0.f);
88 float4 b1 = mymake_float4(b.xyz,0.f);
96 float4 normalize3(const float4 a)
98 float4 n = mymake_float4(a.x, a.y, a.z, 0.f);
99 return fastNormalize4( n );
100 // float length = sqrtf(dot3F4(a, a));
101 // return 1.f/length * a;
107 ///////////////////////////////////////
109 ///////////////////////////////////////
122 float4 mtMul1(Matrix3x3 a, float4 b);
125 float4 mtMul3(float4 a, Matrix3x3 b);
131 float4 mtMul1(Matrix3x3 a, float4 b)
134 ans.x = dot3F4( a.m_row[0], b );
135 ans.y = dot3F4( a.m_row[1], b );
136 ans.z = dot3F4( a.m_row[2], b );
142 float4 mtMul3(float4 a, Matrix3x3 b)
144 float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);
145 float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);
146 float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);
149 ans.x = dot3F4( a, colx );
150 ans.y = dot3F4( a, coly );
151 ans.z = dot3F4( a, colz );
155 ///////////////////////////////////////
157 ///////////////////////////////////////
159 typedef float4 Quaternion;
178 float m_restituitionCoeff;
179 float m_frictionCoeff;
184 Matrix3x3 m_invInertia;
185 Matrix3x3 m_initInvInertia;
191 float4 m_worldPos[4];
193 float m_jacCoeffInv[4];
195 float m_appliedRambdaDt[4];
197 float m_fJacCoeffInv[2];
198 float m_fAppliedRambdaDt[2];
215 // int m_paddings[1];
221 int m_maxBatch; // long batch really kills the performance
224 // int m_paddings[1];
225 } ConstBufferBatchSolve;
227 void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);
229 void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)
231 *linear = mymake_float4(-n.xyz,0.f);
232 *angular0 = -cross3(r0, n);
233 *angular1 = cross3(r1, n);
236 float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );
238 float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )
240 return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);
244 float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,
245 float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);
247 float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,
248 float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)
250 // linear0,1 are normlized
251 float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;
252 float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);
253 float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;
254 float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);
255 return -1.f/(jmj0+jmj1+jmj2+jmj3);
257 void btPlaneSpace1 (const float4* n, float4* p, float4* q);
258 void btPlaneSpace1 (const float4* n, float4* p, float4* q)
260 if (fabs(n[0].z) > 0.70710678f) {
261 // choose p in y-z plane
262 float a = n[0].y*n[0].y + n[0].z*n[0].z;
263 float k = 1.f/sqrt(a);
269 q[0].y = -n[0].x*p[0].z;
270 q[0].z = n[0].x*p[0].y;
273 // choose p in x-y plane
274 float a = n[0].x*n[0].x + n[0].y*n[0].y;
275 float k = 1.f/sqrt(a);
280 q[0].x = -n[0].z*p[0].y;
281 q[0].y = n[0].z*p[0].x;
287 void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);
288 void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)
290 float frictionCoeff = ldsCs[0].m_linear.w;
291 int aIdx = ldsCs[0].m_bodyA;
292 int bIdx = ldsCs[0].m_bodyB;
295 float4 posA = gBodies[aIdx].m_pos;
296 float4 linVelA = gBodies[aIdx].m_linVel;
297 float4 angVelA = gBodies[aIdx].m_angVel;
298 float invMassA = gBodies[aIdx].m_invMass;
299 Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;
301 float4 posB = gBodies[bIdx].m_pos;
302 float4 linVelB = gBodies[bIdx].m_linVel;
303 float4 angVelB = gBodies[bIdx].m_angVel;
304 float invMassB = gBodies[bIdx].m_invMass;
305 Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;
309 float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
310 float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
313 for(int j=0; j<4; j++)
315 sum +=ldsCs[0].m_appliedRambdaDt[j];
317 frictionCoeff = 0.7f;
318 for(int j=0; j<4; j++)
320 maxRambdaDt[j] = frictionCoeff*sum;
321 minRambdaDt[j] = -maxRambdaDt[j];
325 // solveFriction( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,
326 // posB, &linVelB, &angVelB, invMassB, invInertiaB, maxRambdaDt, minRambdaDt );
331 __global Constraint4* cs = ldsCs;
333 if( cs->m_fJacCoeffInv[0] == 0 && cs->m_fJacCoeffInv[0] == 0 ) return;
334 const float4 center = cs->m_center;
336 float4 n = -cs->m_linear;
339 btPlaneSpace1(&n,&tangent[0],&tangent[1]);
340 float4 angular0, angular1, linear;
341 float4 r0 = center - posA;
342 float4 r1 = center - posB;
343 for(int i=0; i<2; i++)
345 setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 );
346 float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,
347 linVelA, angVelA, linVelB, angVelB );
348 rambdaDt *= cs->m_fJacCoeffInv[i];
351 float prevSum = cs->m_fAppliedRambdaDt[i];
352 float updated = prevSum;
354 updated = max2( updated, minRambdaDt[i] );
355 updated = min2( updated, maxRambdaDt[i] );
356 rambdaDt = updated - prevSum;
357 cs->m_fAppliedRambdaDt[i] = updated;
360 float4 linImp0 = invMassA*linear*rambdaDt;
361 float4 linImp1 = invMassB*(-linear)*rambdaDt;
362 float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;
363 float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;
370 { // angular damping for point constraint
371 float4 ab = normalize3( posB - posA );
372 float4 ac = normalize3( center - posA );
373 if( dot3F4( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f))
375 float angNA = dot3F4( n, angVelA );
376 float angNB = dot3F4( n, angVelB );
378 angVelA -= (angNA*0.1f)*n;
379 angVelB -= (angNB*0.1f)*n;
388 if (gBodies[aIdx].m_invMass)
390 gBodies[aIdx].m_linVel = linVelA;
391 gBodies[aIdx].m_angVel = angVelA;
394 gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0);
395 gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0);
397 if (gBodies[bIdx].m_invMass)
399 gBodies[bIdx].m_linVel = linVelB;
400 gBodies[bIdx].m_angVel = angVelB;
403 gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0);
404 gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);
427 __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
428 void BatchSolveKernelFriction(__global Body* gBodies,
429 __global Shape* gShapes,
430 __global Constraint4* gConstraints,
432 __global int* gOffsets,
433 __global int* batchSizes,
439 //__local int ldsBatchIdx[WG_SIZE+1];
440 __local int ldsCurBatch;
441 __local int ldsNextBatch;
442 __local int ldsStart;
444 int lIdx = GET_LOCAL_IDX;
445 int wgIdx = GET_GROUP_IDX;
447 // int gIdx = GET_GLOBAL_IDX;
448 // debugInfo[gIdx].m_valInt0 = gIdx;
449 //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;
452 int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);
453 int remain= (wgIdx%((nSplit.x*nSplit.y)/4));
454 int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);
455 int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1);
456 int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y);
459 if( gN[cellIdx] == 0 )
462 int maxBatch = batchSizes[cellIdx];
464 const int start = gOffsets[cellIdx];
465 const int end = start + gN[cellIdx];
478 int idx=ldsStart+lIdx;
479 while (ldsCurBatch < maxBatch)
483 if (gConstraints[idx].m_batchIdx == ldsCurBatch)
486 solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] );
510 __kernel void solveSingleFrictionKernel(__global Body* gBodies,
511 __global Shape* gShapes,
512 __global Constraint4* gConstraints,
515 int numConstraintsInBatch
519 int index = get_global_id(0);
520 if (index < numConstraintsInBatch)
523 int idx=batchOffset+index;
525 solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] );