dali-physics/third-party/bullet3/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl

   1 /*
   2 Copyright (c) 2012 Advanced Micro Devices, Inc.
   3
   4 This software is provided 'as-is', without any express or implied warranty.
   5 In no event will the authors be held liable for any damages arising from the use of this software.
   6 Permission is granted to anyone to use this software for any purpose,
   7 including commercial applications, and to alter it and redistribute it freely,
   8 subject to the following restrictions:
   9
  10 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
  11 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
  12 3. This notice may not be removed or altered from any source distribution.
  13 */
  14 //Originally written by Takahiro Harada
  15
  16
  17 #include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
  18
  19 #pragma OPENCL EXTENSION cl_amd_printf : enable
  20 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
  21 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
  22 #pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
  23 #pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
  24
  25
  26 #ifdef cl_ext_atomic_counters_32
  27 #pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
  28 #else
  29 #define counter32_t volatile global int*
  30 #endif
  31
  32 typedef unsigned int u32;
  33 typedef unsigned short u16;
  34 typedef unsigned char u8;
  35
  36 #define GET_GROUP_IDX get_group_id(0)
  37 #define GET_LOCAL_IDX get_local_id(0)
  38 #define GET_GLOBAL_IDX get_global_id(0)
  39 #define GET_GROUP_SIZE get_local_size(0)
  40 #define GET_NUM_GROUPS get_num_groups(0)
  41 #define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
  42 #define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
  43 #define AtomInc(x) atom_inc(&(x))
  44 #define AtomInc1(x, out) out = atom_inc(&(x))
  45 #define AppendInc(x, out) out = atomic_inc(x)
  46 #define AtomAdd(x, value) atom_add(&(x), value)
  47 #define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
  48 #define AtomXhg(x, value) atom_xchg ( &(x), value )
  49
  50
  51 #define SELECT_UINT4( b, a, condition ) select( b,a,condition )
  52
  53 #define make_float4 (float4)
  54 #define make_float2 (float2)
  55 #define make_uint4 (uint4)
  56 #define make_int4 (int4)
  57 #define make_uint2 (uint2)
  58 #define make_int2 (int2)
  59
  60
  61 #define max2 max
  62 #define min2 min
  63
  64
  65 ///////////////////////////////////////
  66 //      Vector
  67 ///////////////////////////////////////
  68 __inline
  69 float fastDiv(float numerator, float denominator)
  70 {
  71         return native_divide(numerator, denominator);
  72 //      return numerator/denominator;
  73 }
  74
  75 __inline
  76 float4 fastDiv4(float4 numerator, float4 denominator)
  77 {
  78         return native_divide(numerator, denominator);
  79 }
  80
  81 __inline
  82 float fastSqrtf(float f2)
  83 {
  84         return native_sqrt(f2);
  85 //      return sqrt(f2);
  86 }
  87
  88 __inline
  89 float fastRSqrt(float f2)
  90 {
  91         return native_rsqrt(f2);
  92 }
  93
  94 __inline
  95 float fastLength4(float4 v)
  96 {
  97         return fast_length(v);
  98 }
  99
 100 __inline
 101 float4 fastNormalize4(float4 v)
 102 {
 103         return fast_normalize(v);
 104 }
 105
 106
 107 __inline
 108 float sqrtf(float a)
 109 {
 110 //      return sqrt(a);
 111         return native_sqrt(a);
 112 }
 113
 114 __inline
 115 float4 cross3(float4 a, float4 b)
 116 {
 117         return cross(a,b);
 118 }
 119
 120 __inline
 121 float dot3F4(float4 a, float4 b)
 122 {
 123         float4 a1 = make_float4(a.xyz,0.f);
 124         float4 b1 = make_float4(b.xyz,0.f);
 125         return dot(a1, b1);
 126 }
 127
 128 __inline
 129 float length3(const float4 a)
 130 {
 131         return sqrtf(dot3F4(a,a));
 132 }
 133
 134 __inline
 135 float dot4(const float4 a, const float4 b)
 136 {
 137         return dot( a, b );
 138 }
 139
 140 //      for height
 141 __inline
 142 float dot3w1(const float4 point, const float4 eqn)
 143 {
 144         return dot3F4(point,eqn) + eqn.w;
 145 }
 146
 147 __inline
 148 float4 normalize3(const float4 a)
 149 {
 150         float4 n = make_float4(a.x, a.y, a.z, 0.f);
 151         return fastNormalize4( n );
 152 //      float length = sqrtf(dot3F4(a, a));
 153 //      return 1.f/length * a;
 154 }
 155
 156 __inline
 157 float4 normalize4(const float4 a)
 158 {
 159         float length = sqrtf(dot4(a, a));
 160         return 1.f/length * a;
 161 }
 162
 163 __inline
 164 float4 createEquation(const float4 a, const float4 b, const float4 c)
 165 {
 166         float4 eqn;
 167         float4 ab = b-a;
 168         float4 ac = c-a;
 169         eqn = normalize3( cross3(ab, ac) );
 170         eqn.w = -dot3F4(eqn,a);
 171         return eqn;
 172 }
 173
 174 ///////////////////////////////////////
 175 //      Matrix3x3
 176 ///////////////////////////////////////
 177
 178 typedef struct
 179 {
 180         float4 m_row[3];
 181 }Matrix3x3;
 182
 183 __inline
 184 Matrix3x3 mtZero();
 185
 186 __inline
 187 Matrix3x3 mtIdentity();
 188
 189 __inline
 190 Matrix3x3 mtTranspose(Matrix3x3 m);
 191
 192 __inline
 193 Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);
 194
 195 __inline
 196 float4 mtMul1(Matrix3x3 a, float4 b);
 197
 198 __inline
 199 float4 mtMul3(float4 a, Matrix3x3 b);
 200
 201 __inline
 202 Matrix3x3 mtZero()
 203 {
 204         Matrix3x3 m;
 205         m.m_row[0] = (float4)(0.f);
 206         m.m_row[1] = (float4)(0.f);
 207         m.m_row[2] = (float4)(0.f);
 208         return m;
 209 }
 210
 211 __inline
 212 Matrix3x3 mtIdentity()
 213 {
 214         Matrix3x3 m;
 215         m.m_row[0] = (float4)(1,0,0,0);
 216         m.m_row[1] = (float4)(0,1,0,0);
 217         m.m_row[2] = (float4)(0,0,1,0);
 218         return m;
 219 }
 220
 221 __inline
 222 Matrix3x3 mtTranspose(Matrix3x3 m)
 223 {
 224         Matrix3x3 out;
 225         out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);
 226         out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);
 227         out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);
 228         return out;
 229 }
 230
 231 __inline
 232 Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)
 233 {
 234         Matrix3x3 transB;
 235         transB = mtTranspose( b );
 236         Matrix3x3 ans;
 237         //      why this doesn't run when 0ing in the for{}
 238         a.m_row[0].w = 0.f;
 239         a.m_row[1].w = 0.f;
 240         a.m_row[2].w = 0.f;
 241         for(int i=0; i<3; i++)
 242         {
 243 //      a.m_row[i].w = 0.f;
 244                 ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);
 245                 ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);
 246                 ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);
 247                 ans.m_row[i].w = 0.f;
 248         }
 249         return ans;
 250 }
 251
 252 __inline
 253 float4 mtMul1(Matrix3x3 a, float4 b)
 254 {
 255         float4 ans;
 256         ans.x = dot3F4( a.m_row[0], b );
 257         ans.y = dot3F4( a.m_row[1], b );
 258         ans.z = dot3F4( a.m_row[2], b );
 259         ans.w = 0.f;
 260         return ans;
 261 }
 262
 263 __inline
 264 float4 mtMul3(float4 a, Matrix3x3 b)
 265 {
 266         float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);
 267         float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);
 268         float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);
 269
 270         float4 ans;
 271         ans.x = dot3F4( a, colx );
 272         ans.y = dot3F4( a, coly );
 273         ans.z = dot3F4( a, colz );
 274         return ans;
 275 }
 276
 277 ///////////////////////////////////////
 278 //      Quaternion
 279 ///////////////////////////////////////
 280
 281 typedef float4 Quaternion;
 282
 283 __inline
 284 Quaternion qtMul(Quaternion a, Quaternion b);
 285
 286 __inline
 287 Quaternion qtNormalize(Quaternion in);
 288
 289 __inline
 290 float4 qtRotate(Quaternion q, float4 vec);
 291
 292 __inline
 293 Quaternion qtInvert(Quaternion q);
 294
 295
 296
 297
 298
 299 __inline
 300 Quaternion qtMul(Quaternion a, Quaternion b)
 301 {
 302         Quaternion ans;
 303         ans = cross3( a, b );
 304         ans += a.w*b+b.w*a;
 305 //      ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);
 306         ans.w = a.w*b.w - dot3F4(a, b);
 307         return ans;
 308 }
 309
 310 __inline
 311 Quaternion qtNormalize(Quaternion in)
 312 {
 313         return fastNormalize4(in);
 314 //      in /= length( in );
 315 //      return in;
 316 }
 317 __inline
 318 float4 qtRotate(Quaternion q, float4 vec)
 319 {
 320         Quaternion qInv = qtInvert( q );
 321         float4 vcpy = vec;
 322         vcpy.w = 0.f;
 323         float4 out = qtMul(qtMul(q,vcpy),qInv);
 324         return out;
 325 }
 326
 327 __inline
 328 Quaternion qtInvert(Quaternion q)
 329 {
 330         return (Quaternion)(-q.xyz, q.w);
 331 }
 332
 333 __inline
 334 float4 qtInvRotate(const Quaternion q, float4 vec)
 335 {
 336         return qtRotate( qtInvert( q ), vec );
 337 }
 338
 339
 340
 341
 342 #define WG_SIZE 64
 343
 344 typedef struct
 345 {
 346         float4 m_pos;
 347         Quaternion m_quat;
 348         float4 m_linVel;
 349         float4 m_angVel;
 350
 351         u32 m_shapeIdx;
 352         float m_invMass;
 353         float m_restituitionCoeff;
 354         float m_frictionCoeff;
 355 } Body;
 356
 357 typedef struct
 358 {
 359         Matrix3x3 m_invInertia;
 360         Matrix3x3 m_initInvInertia;
 361 } Shape;
 362
 363 typedef struct
 364 {
 365         float4 m_linear;
 366         float4 m_worldPos[4];
 367         float4 m_center;
 368         float m_jacCoeffInv[4];
 369         float m_b[4];
 370         float m_appliedRambdaDt[4];
 371
 372         float m_fJacCoeffInv[2];
 373         float m_fAppliedRambdaDt[2];
 374
 375         u32 m_bodyA;
 376         u32 m_bodyB;
 377
 378         int m_batchIdx;
 379         u32 m_paddings[1];
 380 } Constraint4;
 381
 382
 383
 384 typedef struct
 385 {
 386         int m_nConstraints;
 387         int m_start;
 388         int m_batchIdx;
 389         int m_nSplit;
 390 //      int m_paddings[1];
 391 } ConstBuffer;
 392
 393 typedef struct
 394 {
 395         int m_solveFriction;
 396         int m_maxBatch; //      long batch really kills the performance
 397         int m_batchIdx;
 398         int m_nSplit;
 399 //      int m_paddings[1];
 400 } ConstBufferBatchSolve;
 401
 402
 403
 404
 405
 406 typedef struct
 407 {
 408         int m_valInt0;
 409         int m_valInt1;
 410         int m_valInt2;
 411         int m_valInt3;
 412
 413         float m_val0;
 414         float m_val1;
 415         float m_val2;
 416         float m_val3;
 417 } SolverDebugInfo;
 418
 419
 420
 421
 422 //      others
 423 __kernel
 424 __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
 425 void ReorderContactKernel(__global struct b3Contact4Data* in, __global struct b3Contact4Data* out, __global int2* sortData, int4 cb )
 426 {
 427         int nContacts = cb.x;
 428         int gIdx = GET_GLOBAL_IDX;
 429
 430         if( gIdx < nContacts )
 431         {
 432                 int srcIdx = sortData[gIdx].y;
 433                 out[gIdx] = in[srcIdx];
 434         }
 435 }
 436
 437 __kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
 438 void SetDeterminismSortDataChildShapeB(__global struct b3Contact4Data* contactsIn, __global int2* sortDataOut, int nContacts)
 439 {
 440         int gIdx = GET_GLOBAL_IDX;
 441
 442         if( gIdx < nContacts )
 443         {
 444                 int2 sd;
 445                 sd.x = contactsIn[gIdx].m_childIndexB;
 446                 sd.y = gIdx;
 447                 sortDataOut[gIdx] = sd;
 448         }
 449 }
 450
 451 __kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
 452 void SetDeterminismSortDataChildShapeA(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)
 453 {
 454         int gIdx = GET_GLOBAL_IDX;
 455
 456         if( gIdx < nContacts )
 457         {
 458                 int2 sdIn;
 459                 sdIn = sortDataInOut[gIdx];
 460                 int2 sdOut;
 461                 sdOut.x = contactsIn[sdIn.y].m_childIndexA;
 462                 sdOut.y = sdIn.y;
 463                 sortDataInOut[gIdx] = sdOut;
 464         }
 465 }
 466
 467 __kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
 468 void SetDeterminismSortDataBodyA(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)
 469 {
 470         int gIdx = GET_GLOBAL_IDX;
 471
 472         if( gIdx < nContacts )
 473         {
 474                 int2 sdIn;
 475                 sdIn = sortDataInOut[gIdx];
 476                 int2 sdOut;
 477                 sdOut.x = contactsIn[sdIn.y].m_bodyAPtrAndSignBit;
 478                 sdOut.y = sdIn.y;
 479                 sortDataInOut[gIdx] = sdOut;
 480         }
 481 }
 482
 483
 484 __kernel
 485 __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
 486 void SetDeterminismSortDataBodyB(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)
 487 {
 488         int gIdx = GET_GLOBAL_IDX;
 489
 490         if( gIdx < nContacts )
 491         {
 492                 int2 sdIn;
 493                 sdIn = sortDataInOut[gIdx];
 494                 int2 sdOut;
 495                 sdOut.x = contactsIn[sdIn.y].m_bodyBPtrAndSignBit;
 496                 sdOut.y = sdIn.y;
 497                 sortDataInOut[gIdx] = sdOut;
 498         }
 499 }
 500
 501
 502
 503
 504 typedef struct
 505 {
 506         int m_nContacts;
 507         int m_staticIdx;
 508         float m_scale;
 509         int m_nSplit;
 510 } ConstBufferSSD;
 511
 512
 513 __constant const int gridTable4x4[] =
 514 {
 515     0,1,17,16,
 516         1,2,18,19,
 517         17,18,32,3,
 518         16,19,3,34
 519 };
 520
 521 __constant const int gridTable8x8[] =
 522 {
 523           0,  2,  3, 16, 17, 18, 19,  1,
 524          66, 64, 80, 67, 82, 81, 65, 83,
 525         131,144,128,130,147,129,145,146,
 526         208,195,194,192,193,211,210,209,
 527          21, 22, 23,  5,  4,  6,  7, 20,
 528          86, 85, 69, 87, 70, 68, 84, 71,
 529         151,133,149,150,135,148,132,134,
 530         197,27,214,213,212,199,198,196
 531
 532 };
 533
 534
 535
 536
 537 #define USE_SPATIAL_BATCHING 1
 538 #define USE_4x4_GRID 1
 539
 540 __kernel
 541 __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
 542 void SetSortDataKernel(__global struct b3Contact4Data* gContact, __global Body* gBodies, __global int2* gSortDataOut,
 543 int nContacts,float scale,int4 nSplit,int staticIdx)
 544
 545 {
 546         int gIdx = GET_GLOBAL_IDX;
 547
 548         if( gIdx < nContacts )
 549         {
 550                 int aPtrAndSignBit  = gContact[gIdx].m_bodyAPtrAndSignBit;
 551                 int bPtrAndSignBit  = gContact[gIdx].m_bodyBPtrAndSignBit;
 552
 553                 int aIdx = abs(aPtrAndSignBit );
 554                 int bIdx = abs(bPtrAndSignBit);
 555
 556                 bool aStatic = (aPtrAndSignBit<0) ||(aPtrAndSignBit==staticIdx);
 557                 bool bStatic = (bPtrAndSignBit<0) ||(bPtrAndSignBit==staticIdx);
 558
 559 #if USE_SPATIAL_BATCHING
 560                 int idx = (aStatic)? bIdx: aIdx;
 561                 float4 p = gBodies[idx].m_pos;
 562                 int xIdx = (int)((p.x-((p.x<0.f)?1.f:0.f))*scale) & (nSplit.x-1);
 563                 int yIdx = (int)((p.y-((p.y<0.f)?1.f:0.f))*scale) & (nSplit.y-1);
 564                 int zIdx = (int)((p.z-((p.z<0.f)?1.f:0.f))*scale) & (nSplit.z-1);
 565                 int newIndex = (xIdx+yIdx*nSplit.x+zIdx*nSplit.x*nSplit.y);
 566
 567 #else//USE_SPATIAL_BATCHING
 568         #if USE_4x4_GRID
 569                 int aa = aIdx&3;
 570                 int bb = bIdx&3;
 571                 if (aStatic)
 572                         aa = bb;
 573                 if (bStatic)
 574                         bb = aa;
 575
 576                 int gridIndex = aa + bb*4;
 577                 int newIndex = gridTable4x4[gridIndex];
 578         #else//USE_4x4_GRID
 579                 int aa = aIdx&7;
 580                 int bb = bIdx&7;
 581                 if (aStatic)
 582                         aa = bb;
 583                 if (bStatic)
 584                         bb = aa;
 585
 586                 int gridIndex = aa + bb*8;
 587                 int newIndex = gridTable8x8[gridIndex];
 588         #endif//USE_4x4_GRID
 589 #endif//USE_SPATIAL_BATCHING
 590
 591
 592                 gSortDataOut[gIdx].x = newIndex;
 593                 gSortDataOut[gIdx].y = gIdx;
 594         }
 595         else
 596         {
 597                 gSortDataOut[gIdx].x = 0xffffffff;
 598         }
 599 }
 600
 601 __kernel
 602 __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
 603 void CopyConstraintKernel(__global struct b3Contact4Data* gIn, __global struct b3Contact4Data* gOut, int4 cb )
 604 {
 605         int gIdx = GET_GLOBAL_IDX;
 606         if( gIdx < cb.x )
 607         {
 608                 gOut[gIdx] = gIn[gIdx];
 609         }
 610 }
 611
 612
 613