dali-physics/third-party/bullet3/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl

   1 /*
   2 Copyright (c) 2012 Advanced Micro Devices, Inc.
   3
   4 This software is provided 'as-is', without any express or implied warranty.
   5 In no event will the authors be held liable for any damages arising from the use of this software.
   6 Permission is granted to anyone to use this software for any purpose,
   7 including commercial applications, and to alter it and redistribute it freely,
   8 subject to the following restrictions:
   9
  10 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
  11 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
  12 3. This notice may not be removed or altered from any source distribution.
  13 */
  14 //Originally written by Takahiro Harada
  15
  16
  17 //#pragma OPENCL EXTENSION cl_amd_printf : enable
  18 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
  19 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
  20 #pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
  21 #pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
  22
  23
  24 #ifdef cl_ext_atomic_counters_32
  25 #pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
  26 #else
  27 #define counter32_t volatile global int*
  28 #endif
  29
  30 typedef unsigned int u32;
  31 typedef unsigned short u16;
  32 typedef unsigned char u8;
  33
  34 #define GET_GROUP_IDX get_group_id(0)
  35 #define GET_LOCAL_IDX get_local_id(0)
  36 #define GET_GLOBAL_IDX get_global_id(0)
  37 #define GET_GROUP_SIZE get_local_size(0)
  38 #define GET_NUM_GROUPS get_num_groups(0)
  39 #define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
  40 #define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
  41 #define AtomInc(x) atom_inc(&(x))
  42 #define AtomInc1(x, out) out = atom_inc(&(x))
  43 #define AppendInc(x, out) out = atomic_inc(x)
  44 #define AtomAdd(x, value) atom_add(&(x), value)
  45 #define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
  46 #define AtomXhg(x, value) atom_xchg ( &(x), value )
  47
  48
  49 #define SELECT_UINT4( b, a, condition ) select( b,a,condition )
  50
  51 #define mymake_float4 (float4)
  52 //#define make_float2 (float2)
  53 //#define make_uint4 (uint4)
  54 //#define make_int4 (int4)
  55 //#define make_uint2 (uint2)
  56 //#define make_int2 (int2)
  57
  58
  59 #define max2 max
  60 #define min2 min
  61
  62
  63 ///////////////////////////////////////
  64 //      Vector
  65 ///////////////////////////////////////
  66
  67
  68
  69
  70 __inline
  71 float4 fastNormalize4(float4 v)
  72 {
  73         return fast_normalize(v);
  74 }
  75
  76
  77
  78 __inline
  79 float4 cross3(float4 a, float4 b)
  80 {
  81         return cross(a,b);
  82 }
  83
  84 __inline
  85 float dot3F4(float4 a, float4 b)
  86 {
  87         float4 a1 = mymake_float4(a.xyz,0.f);
  88         float4 b1 = mymake_float4(b.xyz,0.f);
  89         return dot(a1, b1);
  90 }
  91
  92
  93
  94
  95 __inline
  96 float4 normalize3(const float4 a)
  97 {
  98         float4 n = mymake_float4(a.x, a.y, a.z, 0.f);
  99         return fastNormalize4( n );
 100 //      float length = sqrtf(dot3F4(a, a));
 101 //      return 1.f/length * a;
 102 }
 103
 104
 105
 106
 107 ///////////////////////////////////////
 108 //      Matrix3x3
 109 ///////////////////////////////////////
 110
 111 typedef struct
 112 {
 113         float4 m_row[3];
 114 }Matrix3x3;
 115
 116
 117
 118
 119
 120
 121 __inline
 122 float4 mtMul1(Matrix3x3 a, float4 b);
 123
 124 __inline
 125 float4 mtMul3(float4 a, Matrix3x3 b);
 126
 127
 128
 129
 130 __inline
 131 float4 mtMul1(Matrix3x3 a, float4 b)
 132 {
 133         float4 ans;
 134         ans.x = dot3F4( a.m_row[0], b );
 135         ans.y = dot3F4( a.m_row[1], b );
 136         ans.z = dot3F4( a.m_row[2], b );
 137         ans.w = 0.f;
 138         return ans;
 139 }
 140
 141 __inline
 142 float4 mtMul3(float4 a, Matrix3x3 b)
 143 {
 144         float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);
 145         float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);
 146         float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);
 147
 148         float4 ans;
 149         ans.x = dot3F4( a, colx );
 150         ans.y = dot3F4( a, coly );
 151         ans.z = dot3F4( a, colz );
 152         return ans;
 153 }
 154
 155 ///////////////////////////////////////
 156 //      Quaternion
 157 ///////////////////////////////////////
 158
 159 typedef float4 Quaternion;
 160
 161
 162
 163
 164
 165
 166
 167 #define WG_SIZE 64
 168
 169 typedef struct
 170 {
 171         float4 m_pos;
 172         Quaternion m_quat;
 173         float4 m_linVel;
 174         float4 m_angVel;
 175
 176         u32 m_shapeIdx;
 177         float m_invMass;
 178         float m_restituitionCoeff;
 179         float m_frictionCoeff;
 180 } Body;
 181
 182 typedef struct
 183 {
 184         Matrix3x3 m_invInertia;
 185         Matrix3x3 m_initInvInertia;
 186 } Shape;
 187
 188 typedef struct
 189 {
 190         float4 m_linear;
 191         float4 m_worldPos[4];
 192         float4 m_center;
 193         float m_jacCoeffInv[4];
 194         float m_b[4];
 195         float m_appliedRambdaDt[4];
 196
 197         float m_fJacCoeffInv[2];
 198         float m_fAppliedRambdaDt[2];
 199
 200         u32 m_bodyA;
 201         u32 m_bodyB;
 202
 203         int m_batchIdx;
 204         u32 m_paddings[1];
 205 } Constraint4;
 206
 207
 208
 209 typedef struct
 210 {
 211         int m_nConstraints;
 212         int m_start;
 213         int m_batchIdx;
 214         int m_nSplit;
 215 //      int m_paddings[1];
 216 } ConstBuffer;
 217
 218 typedef struct
 219 {
 220         int m_solveFriction;
 221         int m_maxBatch; //      long batch really kills the performance
 222         int m_batchIdx;
 223         int m_nSplit;
 224 //      int m_paddings[1];
 225 } ConstBufferBatchSolve;
 226
 227 void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);
 228
 229 void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)
 230 {
 231         *linear = mymake_float4(-n.xyz,0.f);
 232         *angular0 = -cross3(r0, n);
 233         *angular1 = cross3(r1, n);
 234 }
 235
 236 float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );
 237
 238 float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )
 239 {
 240         return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);
 241 }
 242
 243
 244 float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,
 245                                    float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);
 246
 247 float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,
 248                                         float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)
 249 {
 250         //      linear0,1 are normlized
 251         float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;
 252         float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);
 253         float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;
 254         float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);
 255         return -1.f/(jmj0+jmj1+jmj2+jmj3);
 256 }
 257 void btPlaneSpace1 (const float4* n, float4* p, float4* q);
 258  void btPlaneSpace1 (const float4* n, float4* p, float4* q)
 259 {
 260   if (fabs(n[0].z) > 0.70710678f) {
 261     // choose p in y-z plane
 262     float a = n[0].y*n[0].y + n[0].z*n[0].z;
 263     float k = 1.f/sqrt(a);
 264     p[0].x = 0;
 265         p[0].y = -n[0].z*k;
 266         p[0].z = n[0].y*k;
 267     // set q = n x p
 268     q[0].x = a*k;
 269         q[0].y = -n[0].x*p[0].z;
 270         q[0].z = n[0].x*p[0].y;
 271   }
 272   else {
 273     // choose p in x-y plane
 274     float a = n[0].x*n[0].x + n[0].y*n[0].y;
 275     float k = 1.f/sqrt(a);
 276     p[0].x = -n[0].y*k;
 277         p[0].y = n[0].x*k;
 278         p[0].z = 0;
 279     // set q = n x p
 280     q[0].x = -n[0].z*p[0].y;
 281         q[0].y = n[0].z*p[0].x;
 282         q[0].z = a*k;
 283   }
 284 }
 285
 286
 287 void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);
 288 void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)
 289 {
 290         float frictionCoeff = ldsCs[0].m_linear.w;
 291         int aIdx = ldsCs[0].m_bodyA;
 292         int bIdx = ldsCs[0].m_bodyB;
 293
 294
 295         float4 posA = gBodies[aIdx].m_pos;
 296         float4 linVelA = gBodies[aIdx].m_linVel;
 297         float4 angVelA = gBodies[aIdx].m_angVel;
 298         float invMassA = gBodies[aIdx].m_invMass;
 299         Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;
 300
 301         float4 posB = gBodies[bIdx].m_pos;
 302         float4 linVelB = gBodies[bIdx].m_linVel;
 303         float4 angVelB = gBodies[bIdx].m_angVel;
 304         float invMassB = gBodies[bIdx].m_invMass;
 305         Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;
 306
 307
 308         {
 309                 float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
 310                 float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
 311
 312                 float sum = 0;
 313                 for(int j=0; j<4; j++)
 314                 {
 315                         sum +=ldsCs[0].m_appliedRambdaDt[j];
 316                 }
 317                 frictionCoeff = 0.7f;
 318                 for(int j=0; j<4; j++)
 319                 {
 320                         maxRambdaDt[j] = frictionCoeff*sum;
 321                         minRambdaDt[j] = -maxRambdaDt[j];
 322                 }
 323
 324
 325 //              solveFriction( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,
 326 //                      posB, &linVelB, &angVelB, invMassB, invInertiaB, maxRambdaDt, minRambdaDt );
 327
 328
 329                 {
 330
 331                         __global Constraint4* cs = ldsCs;
 332
 333                         if( cs->m_fJacCoeffInv[0] == 0 && cs->m_fJacCoeffInv[0] == 0 ) return;
 334                         const float4 center = cs->m_center;
 335
 336                         float4 n = -cs->m_linear;
 337
 338                         float4 tangent[2];
 339                         btPlaneSpace1(&n,&tangent[0],&tangent[1]);
 340                         float4 angular0, angular1, linear;
 341                         float4 r0 = center - posA;
 342                         float4 r1 = center - posB;
 343                         for(int i=0; i<2; i++)
 344                         {
 345                                 setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 );
 346                                 float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,
 347                                                                                         linVelA, angVelA, linVelB, angVelB );
 348                                 rambdaDt *= cs->m_fJacCoeffInv[i];
 349
 350                                 {
 351                                         float prevSum = cs->m_fAppliedRambdaDt[i];
 352                                         float updated = prevSum;
 353                                         updated += rambdaDt;
 354                                         updated = max2( updated, minRambdaDt[i] );
 355                                         updated = min2( updated, maxRambdaDt[i] );
 356                                         rambdaDt = updated - prevSum;
 357                                         cs->m_fAppliedRambdaDt[i] = updated;
 358                                 }
 359
 360                                 float4 linImp0 = invMassA*linear*rambdaDt;
 361                                 float4 linImp1 = invMassB*(-linear)*rambdaDt;
 362                                 float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;
 363                                 float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;
 364
 365                                 linVelA += linImp0;
 366                                 angVelA += angImp0;
 367                                 linVelB += linImp1;
 368                                 angVelB += angImp1;
 369                         }
 370                         {       //      angular damping for point constraint
 371                                 float4 ab = normalize3( posB - posA );
 372                                 float4 ac = normalize3( center - posA );
 373                                 if( dot3F4( ab, ac ) > 0.95f  || (invMassA == 0.f || invMassB == 0.f))
 374                                 {
 375                                         float angNA = dot3F4( n, angVelA );
 376                                         float angNB = dot3F4( n, angVelB );
 377
 378                                         angVelA -= (angNA*0.1f)*n;
 379                                         angVelB -= (angNB*0.1f)*n;
 380                                 }
 381                         }
 382                 }
 383
 384
 385
 386         }
 387
 388         if (gBodies[aIdx].m_invMass)
 389         {
 390                 gBodies[aIdx].m_linVel = linVelA;
 391                 gBodies[aIdx].m_angVel = angVelA;
 392         } else
 393         {
 394                 gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0);
 395                 gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0);
 396         }
 397         if (gBodies[bIdx].m_invMass)
 398         {
 399                 gBodies[bIdx].m_linVel = linVelB;
 400                 gBodies[bIdx].m_angVel = angVelB;
 401         } else
 402         {
 403                 gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0);
 404                 gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);
 405         }
 406
 407
 408 }
 409
 410 typedef struct
 411 {
 412         int m_valInt0;
 413         int m_valInt1;
 414         int m_valInt2;
 415         int m_valInt3;
 416
 417         float m_val0;
 418         float m_val1;
 419         float m_val2;
 420         float m_val3;
 421 } SolverDebugInfo;
 422
 423
 424
 425
 426 __kernel
 427 __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
 428 void BatchSolveKernelFriction(__global Body* gBodies,
 429                       __global Shape* gShapes,
 430                       __global Constraint4* gConstraints,
 431                       __global int* gN,
 432                       __global int* gOffsets,
 433                       __global int* batchSizes,
 434                        int maxBatch1,
 435                        int cellBatch,
 436                        int4 nSplit
 437                       )
 438 {
 439         //__local int ldsBatchIdx[WG_SIZE+1];
 440         __local int ldsCurBatch;
 441         __local int ldsNextBatch;
 442         __local int ldsStart;
 443
 444         int lIdx = GET_LOCAL_IDX;
 445         int wgIdx = GET_GROUP_IDX;
 446
 447 //      int gIdx = GET_GLOBAL_IDX;
 448 //      debugInfo[gIdx].m_valInt0 = gIdx;
 449         //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;
 450
 451
 452         int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);
 453         int remain= (wgIdx%((nSplit.x*nSplit.y)/4));
 454         int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);
 455         int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1);
 456         int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y);
 457
 458
 459         if( gN[cellIdx] == 0 )
 460                 return;
 461
 462         int maxBatch = batchSizes[cellIdx];
 463
 464         const int start = gOffsets[cellIdx];
 465         const int end = start + gN[cellIdx];
 466
 467
 468         if( lIdx == 0 )
 469         {
 470                 ldsCurBatch = 0;
 471                 ldsNextBatch = 0;
 472                 ldsStart = start;
 473         }
 474
 475
 476         GROUP_LDS_BARRIER;
 477
 478         int idx=ldsStart+lIdx;
 479         while (ldsCurBatch < maxBatch)
 480         {
 481                 for(; idx<end; )
 482                 {
 483                         if (gConstraints[idx].m_batchIdx == ldsCurBatch)
 484                         {
 485
 486                                         solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] );
 487
 488                                  idx+=64;
 489                         } else
 490                         {
 491                                 break;
 492                         }
 493                 }
 494                 GROUP_LDS_BARRIER;
 495                 if( lIdx == 0 )
 496                 {
 497                         ldsCurBatch++;
 498                 }
 499                 GROUP_LDS_BARRIER;
 500         }
 501
 502
 503 }
 504
 505
 506
 507
 508
 509
 510 __kernel void solveSingleFrictionKernel(__global Body* gBodies,
 511                       __global Shape* gShapes,
 512                       __global Constraint4* gConstraints,
 513                        int cellIdx,
 514                        int batchOffset,
 515                        int numConstraintsInBatch
 516                       )
 517 {
 518
 519         int index = get_global_id(0);
 520         if (index < numConstraintsInBatch)
 521         {
 522
 523                 int idx=batchOffset+index;
 524
 525                 solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] );
 526         }
 527 }